 include/linux/list_lru.h   |   8
 include/linux/mmzone.h     |   1
 include/linux/radix-tree.h |  32
 include/linux/swap.h       |  31
 lib/radix-tree.c           |  36
 mm/filemap.c               |  90
 mm/list_lru.c              |  16
 mm/truncate.c              |  26
 mm/vmstat.c                |   1
 mm/workingset.c            | 161
 10 files changed, 359 insertions(+), 43 deletions(-)
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 3ce541753c88..f3434533fbf8 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -13,6 +13,8 @@
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
 	LRU_REMOVED,		/* item removed from list */
+	LRU_REMOVED_RETRY,	/* item removed, but lock has been
+				   dropped and reacquired */
 	LRU_ROTATE,		/* item referenced, give another pass */
 	LRU_SKIP,		/* item cannot be locked, skip */
 	LRU_RETRY,		/* item not freeable. May drop the lock
@@ -32,7 +34,11 @@ struct list_lru {
 };
 
 void list_lru_destroy(struct list_lru *lru);
-int list_lru_init(struct list_lru *lru);
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
+static inline int list_lru_init(struct list_lru *lru)
+{
+	return list_lru_init_key(lru, NULL);
+}
 
 /**
 * list_lru_add: add an element to the lru list's tail
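A minimal usage sketch of the new list_lru_init_key() entry point (the names my_lru and my_lru_key are hypothetical, not part of the patch): it behaves like list_lru_init(), but tags every per-node lru lock with the supplied lockdep class.

	static struct list_lru my_lru;
	static struct lock_class_key my_lru_key;	/* hypothetical */

	static int __init my_setup(void)
	{
		/* same as list_lru_init(&my_lru), except each
		 * lru->node[i].lock gets my_lru_key as its class */
		return list_lru_init_key(&my_lru, &my_lru_key);
	}

mm/workingset.c below uses exactly this to tell lockdep that its lru locks nest inside the IRQ-safe mapping->tree_lock.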
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f25db1d74a21..fac5509c18f0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -144,6 +144,7 @@ enum zone_stat_item {
 #endif
 	WORKINGSET_REFAULT,
 	WORKINGSET_ACTIVATE,
+	WORKINGSET_NODERECLAIM,
 	NR_ANON_TRANSPARENT_HUGEPAGES,
 	NR_FREE_CMA_PAGES,
 	NR_VM_ZONE_STAT_ITEMS };
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 13636c40bc42..33170dbd9db4 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 #define RADIX_TREE_TAG_LONGS	\
 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
+					  RADIX_TREE_MAP_SHIFT))
+
+/* Height component in node->path */
+#define RADIX_TREE_HEIGHT_SHIFT	(RADIX_TREE_MAX_PATH + 1)
+#define RADIX_TREE_HEIGHT_MASK	((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)
+
+/* Internally used bits of node->count */
+#define RADIX_TREE_COUNT_SHIFT	(RADIX_TREE_MAP_SHIFT + 1)
+#define RADIX_TREE_COUNT_MASK	((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
+
 struct radix_tree_node {
-	unsigned int	height;		/* Height from the bottom */
+	unsigned int	path;	/* Offset in parent & height from the bottom */
 	unsigned int	count;
 	union {
-		struct radix_tree_node *parent;	/* Used when ascending tree */
-		struct rcu_head	rcu_head;	/* Used when freeing node */
+		struct {
+			/* Used when ascending tree */
+			struct radix_tree_node *parent;
+			/* For tree user */
+			void *private_data;
+		};
+		/* Used when freeing node */
+		struct rcu_head	rcu_head;
 	};
+	/* For tree user */
+	struct list_head private_list;
 	void __rcu	*slots[RADIX_TREE_MAP_SIZE];
 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
-#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
-					  RADIX_TREE_MAP_SHIFT))
-
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
 	unsigned int	height;
@@ -251,7 +267,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 			  struct radix_tree_node **nodep, void ***slotp);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
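The new path field packs two values, with the encoding fixed by the macros above: the node's height in the bits below RADIX_TREE_HEIGHT_SHIFT, and its slot offset within the parent in the bits above. Restated from the lib/radix-tree.c changes below:

	/* encode, as in __radix_tree_create(): */
	node->path = height;
	node->path |= offset << RADIX_TREE_HEIGHT_SHIFT;

	/* decode: */
	height = node->path & RADIX_TREE_HEIGHT_MASK;
	offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;

Recording the offset in the node itself is what lets __radix_tree_delete_node() drop its index argument: a caller like the shadow-node shrinker can delete a node without knowing any index that maps into it.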
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b83cf61403ed..350711560753 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -264,6 +264,37 @@ struct swap_list_t {
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
+extern struct list_lru workingset_shadow_nodes;
+
+static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
+{
+	return node->count & RADIX_TREE_COUNT_MASK;
+}
+
+static inline void workingset_node_pages_inc(struct radix_tree_node *node)
+{
+	node->count++;
+}
+
+static inline void workingset_node_pages_dec(struct radix_tree_node *node)
+{
+	node->count--;
+}
+
+static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
+{
+	return node->count >> RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
+{
+	node->count += 1U << RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
+{
+	node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+}
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
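These helpers split node->count into two counters maintained under mapping->tree_lock: page entries in the low RADIX_TREE_COUNT_SHIFT bits, shadow entries in the bits above. A worked example, assuming RADIX_TREE_MAP_SHIFT == 6 and therefore RADIX_TREE_COUNT_SHIFT == 7:

	/* hypothetical leaf node holding 3 pages and 2 shadow entries */
	node->count = 3 | (2U << RADIX_TREE_COUNT_SHIFT);	/* 0x103 */

	workingset_node_pages(node);	/* 3: count & 0x7f */
	workingset_node_shadows(node);	/* 2: count >> 7  */

	workingset_node_shadows_inc(node);	/* count += 0x80: 3 pages, 3 shadows */

The radix tree's emptiness test (node->count == 0) keeps working unchanged, since a node holding any pages or shadows always has a nonzero count.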
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index d60be40c111b..9599aa72d7a0 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -342,7 +342,8 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 
 		/* Increase the height.  */
 		newheight = root->height+1;
-		node->height = newheight;
+		BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
+		node->path = newheight;
 		node->count = 1;
 		node->parent = NULL;
 		slot = root->rnode;
@@ -400,11 +401,12 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			/* Have to add a child node.  */
 			if (!(slot = radix_tree_node_alloc(root)))
 				return -ENOMEM;
-			slot->height = height;
+			slot->path = height;
 			slot->parent = node;
 			if (node) {
 				rcu_assign_pointer(node->slots[offset], slot);
 				node->count++;
+				slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
 			} else
 				rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
 		}
@@ -498,7 +500,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 	}
 	node = indirect_to_ptr(node);
 
-	height = node->height;
+	height = node->path & RADIX_TREE_HEIGHT_MASK;
 	if (index > radix_tree_maxindex(height))
 		return NULL;
 
@@ -704,7 +706,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 		return (index == 0);
 	node = indirect_to_ptr(node);
 
-	height = node->height;
+	height = node->path & RADIX_TREE_HEIGHT_MASK;
 	if (index > radix_tree_maxindex(height))
 		return 0;
 
@@ -741,7 +743,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 {
 	unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
 	struct radix_tree_node *rnode, *node;
-	unsigned long index, offset;
+	unsigned long index, offset, height;
 
 	if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
 		return NULL;
@@ -772,7 +774,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 		return NULL;
 
 restart:
-	shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT;
+	height = rnode->path & RADIX_TREE_HEIGHT_MASK;
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	offset = index >> shift;
 
 	/* Index outside of the tree */
@@ -1142,7 +1145,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
 	unsigned int shift, height;
 	unsigned long i;
 
-	height = slot->height;
+	height = slot->path & RADIX_TREE_HEIGHT_MASK;
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
 	for ( ; height > 1; height--) {
@@ -1205,7 +1208,8 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 		}
 
 		node = indirect_to_ptr(node);
-		max_index = radix_tree_maxindex(node->height);
+		max_index = radix_tree_maxindex(node->path &
+						RADIX_TREE_HEIGHT_MASK);
 		if (cur_index > max_index) {
 			rcu_read_unlock();
 			break;
@@ -1301,7 +1305,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
 *
 *	Returns %true if @node was freed, %false otherwise.
 */
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node)
 {
 	bool deleted = false;
@@ -1320,9 +1324,10 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
 
 		parent = node->parent;
 		if (parent) {
-			index >>= RADIX_TREE_MAP_SHIFT;
+			unsigned int offset;
 
-			parent->slots[index & RADIX_TREE_MAP_MASK] = NULL;
+			offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
+			parent->slots[offset] = NULL;
 			parent->count--;
 		} else {
 			root_tag_clear_all(root);
@@ -1386,7 +1391,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 	node->slots[offset] = NULL;
 	node->count--;
 
-	__radix_tree_delete_node(root, index, node);
+	__radix_tree_delete_node(root, node);
 
 	return entry;
 }
@@ -1419,9 +1424,12 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 EXPORT_SYMBOL(radix_tree_tagged);
 
 static void
-radix_tree_node_ctor(void *node)
+radix_tree_node_ctor(void *arg)
 {
-	memset(node, 0, sizeof(struct radix_tree_node));
+	struct radix_tree_node *node = arg;
+
+	memset(node, 0, sizeof(*node));
+	INIT_LIST_HEAD(&node->private_list);
 }
 
 static __init unsigned long __maxindex(unsigned int height)
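A side effect of the constructor change worth spelling out: a node comes out of the slab with private_list already in the list_empty() state, so the page cache can use list_empty(&node->private_list) as its "is this node tracked?" test without any extra initialization. The invariant the callers below maintain, restated editorially:

	/* under mapping->tree_lock:
	 *   !list_empty(&node->private_list) <=> node is on
	 *   workingset_shadow_nodes
	 * so a plain list_empty() check decides, without taking the
	 * list_lru lock, whether the tracking state changes at all */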
diff --git a/mm/filemap.c b/mm/filemap.c
index a603c4d7d3c9..d6df3bacb0fb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -110,11 +110,17 @@
 static void page_cache_tree_delete(struct address_space *mapping,
 				   struct page *page, void *shadow)
 {
-	if (shadow) {
-		void **slot;
+	struct radix_tree_node *node;
+	unsigned long index;
+	unsigned int offset;
+	unsigned int tag;
+	void **slot;
 
-		slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-		radix_tree_replace_slot(slot, shadow);
+	VM_BUG_ON(!PageLocked(page));
+
+	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+	if (shadow) {
 		mapping->nrshadows++;
 		/*
 		 * Make sure the nrshadows update is committed before
@@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping,
 		 * same time and miss a shadow entry.
 		 */
 		smp_wmb();
-	} else
-		radix_tree_delete(&mapping->page_tree, page->index);
+	}
 	mapping->nrpages--;
+
+	if (!node) {
+		/* Clear direct pointer tags in root node */
+		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+		radix_tree_replace_slot(slot, shadow);
+		return;
+	}
+
+	/* Clear tree tags for the removed page */
+	index = page->index;
+	offset = index & RADIX_TREE_MAP_MASK;
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		if (test_bit(offset, node->tags[tag]))
+			radix_tree_tag_clear(&mapping->page_tree, index, tag);
+	}
+
+	/* Delete page, swap shadow entry */
+	radix_tree_replace_slot(slot, shadow);
+	workingset_node_pages_dec(node);
+	if (shadow)
+		workingset_node_shadows_inc(node);
+	else
+		if (__radix_tree_delete_node(&mapping->page_tree, node))
+			return;
+
+	/*
+	 * Track node that only contains shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already tracked.  The
+	 * list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_pages(node) &&
+	    list_empty(&node->private_list)) {
+		node->private_data = mapping;
+		list_lru_add(&workingset_shadow_nodes, &node->private_list);
+	}
 }
 
 /*
@@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
 static int page_cache_tree_insert(struct address_space *mapping,
 				  struct page *page, void **shadowp)
 {
+	struct radix_tree_node *node;
 	void **slot;
 	int error;
 
-	slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-	if (slot) {
+	error = __radix_tree_create(&mapping->page_tree, page->index,
+				    &node, &slot);
+	if (error)
+		return error;
+	if (*slot) {
 		void *p;
 
 		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
-		radix_tree_replace_slot(slot, page);
-		mapping->nrshadows--;
-		mapping->nrpages++;
 		if (shadowp)
 			*shadowp = p;
-		return 0;
+		mapping->nrshadows--;
+		if (node)
+			workingset_node_shadows_dec(node);
 	}
-	error = radix_tree_insert(&mapping->page_tree, page->index, page);
-	if (!error)
-		mapping->nrpages++;
-	return error;
+	radix_tree_replace_slot(slot, page);
+	mapping->nrpages++;
+	if (node) {
+		workingset_node_pages_inc(node);
+		/*
+		 * Don't track node that contains actual pages.
+		 *
+		 * Avoid acquiring the list_lru lock if already
+		 * untracked.  The list_empty() test is safe as
+		 * node->private_list is protected by
+		 * mapping->tree_lock.
+		 */
+		if (!list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+				     &node->private_list);
+	}
+	return 0;
 }
 
 static int __add_to_page_cache_locked(struct page *page,
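Taken together, the insert and delete paths keep a node on the shadow LRU exactly while reclaiming it would be useful. A condensed editorial restatement of the tracking condition (not literal patch code):

	static bool tracked(struct radix_tree_node *node)
	{
		/* empty nodes are freed outright; nodes gaining a
		 * page are untracked; nodes whose last page turns
		 * into a shadow entry are tracked */
		return !workingset_node_pages(node) &&
		       workingset_node_shadows(node) > 0;
	}

Both sites test list_empty(&node->private_list) first, so the list_lru lock is only taken when this state actually flips.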
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 72f9decb0104..f1a0db194173 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -87,11 +87,20 @@ restart:
 
 		ret = isolate(item, &nlru->lock, cb_arg);
 		switch (ret) {
+		case LRU_REMOVED_RETRY:
+			assert_spin_locked(&nlru->lock);
 		case LRU_REMOVED:
 			if (--nlru->nr_items == 0)
 				node_clear(nid, lru->active_nodes);
 			WARN_ON_ONCE(nlru->nr_items < 0);
 			isolated++;
+			/*
+			 * If the lru lock has been dropped, our list
+			 * traversal is now invalid and so we have to
+			 * restart from scratch.
+			 */
+			if (ret == LRU_REMOVED_RETRY)
+				goto restart;
 			break;
 		case LRU_ROTATE:
 			list_move_tail(item, &nlru->list);
@@ -103,6 +112,7 @@ restart:
 			 * The lru lock has been dropped, our list traversal is
 			 * now invalid and so we have to restart from scratch.
 			 */
+			assert_spin_locked(&nlru->lock);
 			goto restart;
 		default:
 			BUG();
@@ -114,7 +124,7 @@ restart:
 }
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
 
-int list_lru_init(struct list_lru *lru)
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
 {
 	int i;
 	size_t size = sizeof(*lru->node) * nr_node_ids;
@@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru)
 	nodes_clear(lru->active_nodes);
 	for (i = 0; i < nr_node_ids; i++) {
 		spin_lock_init(&lru->node[i].lock);
+		if (key)
+			lockdep_set_class(&lru->node[i].lock, key);
 		INIT_LIST_HEAD(&lru->node[i].list);
 		lru->node[i].nr_items = 0;
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(list_lru_init);
+EXPORT_SYMBOL_GPL(list_lru_init_key);
 
 void list_lru_destroy(struct list_lru *lru)
 {
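The contract of the new LRU_REMOVED_RETRY return value, as the assert_spin_locked() calls above enforce: an isolate callback may drop the lru lock to honor lock ordering, but must reacquire it before returning, and the walker then restarts because its list cursor is stale. A skeleton callback (hypothetical, modeled on shadow_lru_isolate() below):

	static enum lru_status my_isolate(struct list_head *item,
					  spinlock_t *lru_lock, void *arg)
	{
		list_del_init(item);
		spin_unlock(lru_lock);	/* an inner lock must come first */

		/* ... dispose of the object outside the lru lock ... */

		spin_lock(lru_lock);	/* contract: return with it held */
		return LRU_REMOVED_RETRY;
	}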
diff --git a/mm/truncate.c b/mm/truncate.c
index 0db9258319f0..e5cc39ab0751 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -25,6 +25,9 @@
 static void clear_exceptional_entry(struct address_space *mapping,
 				    pgoff_t index, void *entry)
 {
+	struct radix_tree_node *node;
+	void **slot;
+
 	/* Handled by shmem itself */
 	if (shmem_mapping(mapping))
 		return;
@@ -35,8 +38,27 @@ static void clear_exceptional_entry(struct address_space *mapping,
 	 * without the tree itself locked.  These unlocked entries
 	 * need verification under the tree lock.
 	 */
-	if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry)
-		mapping->nrshadows--;
+	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+	radix_tree_replace_slot(slot, NULL);
+	mapping->nrshadows--;
+	if (!node)
+		goto unlock;
+	workingset_node_shadows_dec(node);
+	/*
+	 * Don't track node without shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already untracked.
+	 * The list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_shadows(node) &&
+	    !list_empty(&node->private_list))
+		list_lru_del(&workingset_shadow_nodes, &node->private_list);
+	__radix_tree_delete_node(&mapping->page_tree, node);
+unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 843125a0e255..f3155d51acfd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -772,6 +772,7 @@ const char * const vmstat_text[] = {
 #endif
 	"workingset_refault",
 	"workingset_activate",
+	"workingset_nodereclaim",
 	"nr_anon_transparent_hugepages",
 	"nr_free_cma",
 	"nr_dirty_threshold",
diff --git a/mm/workingset.c b/mm/workingset.c
index 8a6c7cff4923..f7216fa7da27 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -251,3 +251,164 @@ void workingset_activation(struct page *page)
 {
 	atomic_long_inc(&page_zone(page)->inactive_age);
 }
+
+/*
+ * Shadow entries reflect the share of the working set that does not
+ * fit into memory, so their number depends on the access pattern of
+ * the workload.  In most cases, they will refault or get reclaimed
+ * along with the inode, but a (malicious) workload that streams
+ * through files with a total size several times that of available
+ * memory, while preventing the inodes from being reclaimed, can
+ * create excessive amounts of shadow nodes.  To keep a lid on this,
+ * track shadow nodes and reclaim them when they grow way past the
+ * point where they would still be useful.
+ */
+
+struct list_lru workingset_shadow_nodes;
+
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+					struct shrink_control *sc)
+{
+	unsigned long shadow_nodes;
+	unsigned long max_nodes;
+	unsigned long pages;
+
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+	local_irq_enable();
+
+	pages = node_present_pages(sc->nid);
+	/*
+	 * Active cache pages are limited to 50% of memory, and shadow
+	 * entries that represent a refault distance bigger than that
+	 * do not have any effect.  Limit the number of shadow nodes
+	 * such that shadow entries do not exceed the number of active
+	 * cache pages, assuming a worst-case node population density
+	 * of 1/8th on average.
+	 *
+	 * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+	 * each, this will reclaim shadow entries when they consume
+	 * ~2% of available memory:
+	 *
+	 * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+	 */
+	max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+
+	if (shadow_nodes <= max_nodes)
+		return 0;
+
+	return shadow_nodes - max_nodes;
+}
+
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+					  spinlock_t *lru_lock,
+					  void *arg)
+{
+	struct address_space *mapping;
+	struct radix_tree_node *node;
+	unsigned int i;
+	int ret;
+
+	/*
+	 * Page cache insertions and deletions synchronously maintain
+	 * the shadow node LRU under the mapping->tree_lock and the
+	 * lru_lock.  Because the page cache tree is emptied before
+	 * the inode can be destroyed, holding the lru_lock pins any
+	 * address_space that has radix tree nodes on the LRU.
+	 *
+	 * We can then safely transition to the mapping->tree_lock to
+	 * pin only the address_space of the particular node we want
+	 * to reclaim, take the node off-LRU, and drop the lru_lock.
+	 */
+
+	node = container_of(item, struct radix_tree_node, private_list);
+	mapping = node->private_data;
+
+	/* Coming from the list, invert the lock order */
+	if (!spin_trylock(&mapping->tree_lock)) {
+		spin_unlock(lru_lock);
+		ret = LRU_RETRY;
+		goto out;
+	}
+
+	list_del_init(item);
+	spin_unlock(lru_lock);
+
+	/*
+	 * The nodes should only contain one or more shadow entries,
+	 * no pages, so we expect to be able to remove them all and
+	 * delete and free the empty node afterwards.
+	 */
+
+	BUG_ON(!node->count);
+	BUG_ON(node->count & RADIX_TREE_COUNT_MASK);
+
+	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+		if (node->slots[i]) {
+			BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+			node->slots[i] = NULL;
+			BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
+			node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+			BUG_ON(!mapping->nrshadows);
+			mapping->nrshadows--;
+		}
+	}
+	BUG_ON(node->count);
+	inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+	if (!__radix_tree_delete_node(&mapping->page_tree, node))
+		BUG();
+
+	spin_unlock(&mapping->tree_lock);
+	ret = LRU_REMOVED_RETRY;
+out:
+	local_irq_enable();
+	cond_resched();
+	local_irq_disable();
+	spin_lock(lru_lock);
+	return ret;
+}
+
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+				       struct shrink_control *sc)
+{
+	unsigned long ret;
+
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
+				 shadow_lru_isolate, NULL, &sc->nr_to_scan);
+	local_irq_enable();
+	return ret;
+}
+
+static struct shrinker workingset_shadow_shrinker = {
+	.count_objects = count_shadow_nodes,
+	.scan_objects = scan_shadow_nodes,
+	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
+};
+
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * mapping->tree_lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
+static int __init workingset_init(void)
+{
+	int ret;
+
+	ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+	if (ret)
+		goto err;
+	ret = register_shrinker(&workingset_shadow_shrinker);
+	if (ret)
+		goto err_list_lru;
+	return 0;
+err_list_lru:
+	list_lru_destroy(&workingset_shadow_nodes);
+err:
+	return ret;
+}
+module_init(workingset_init);
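To make the cap in count_shadow_nodes() concrete, a worked example with assumed constants (4K pages, RADIX_TREE_MAP_SHIFT == 6, a NUMA node with 16G present):

	unsigned long pages     = 4194304;		/* 16G of 4K pages */
	unsigned long max_nodes = pages >> (1 + 6 - 3);	/* 262144 nodes */

	/* 262144 nodes * 64 slots = ~16.8M slots; at the assumed
	 * 1/8th population density that is ~2M shadow entries --
	 * matching the ~50% of memory (2097152 pages) the active
	 * list may hold, beyond which refault distances carry no
	 * information anyway. */

Once the tracked node count exceeds max_nodes, count_shadow_nodes() reports the excess and scan_shadow_nodes() walks the LRU, with shadow_lru_isolate() reclaiming whole nodes oldest-first.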