author	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-12 18:01:38 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-12 18:01:38 -0400
commit	26935fb06ee88f1188789807687c03041f3c70d9 (patch)
tree	381c487716540b52348d78bee6555f8fa61d77ef /mm
parent	3cc69b638e11bfda5d013c2b75b60934aa0e88a1 (diff)
parent	bf2ba3bc185269eca274b458aac46ba1ad7c1121 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull vfs pile 4 from Al Viro:
 "list_lru pile, mostly"

This came out of Andrew's pile, Al ended up doing the merge work so
that Andrew didn't have to.

Additionally, a few fixes.

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (42 commits)
  super: fix for destroy lrus
  list_lru: dynamically adjust node arrays
  shrinker: Kill old ->shrink API.
  shrinker: convert remaining shrinkers to count/scan API
  staging/lustre/libcfs: cleanup linux-mem.h
  staging/lustre/ptlrpc: convert to new shrinker API
  staging/lustre/obdclass: convert lu_object shrinker to count/scan API
  staging/lustre/ldlm: convert to shrinkers to count/scan API
  hugepage: convert huge zero page shrinker to new shrinker API
  i915: bail out earlier when shrinker cannot acquire mutex
  drivers: convert shrinkers to new count/scan API
  fs: convert fs shrinkers to new scan/count API
  xfs: fix dquot isolation hang
  xfs-convert-dquot-cache-lru-to-list_lru-fix
  xfs: convert dquot cache lru to list_lru
  xfs: rework buffer dispose list tracking
  xfs-convert-buftarg-lru-to-generic-code-fix
  xfs: convert buftarg LRU to generic code
  fs: convert inode and dentry shrinking to be node aware
  vmscan: per-node deferred work
  ...
Diffstat (limited to 'mm')
-rw-r--r--	mm/Makefile	2
-rw-r--r--	mm/huge_memory.c	17
-rw-r--r--	mm/list_lru.c	139
-rw-r--r--	mm/memory-failure.c	2
-rw-r--r--	mm/vmscan.c	241
5 files changed, 293 insertions(+), 108 deletions(-)
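
For reference, a minimal sketch of a shrinker under the new count/scan API this pile introduces (assumptions: the my_cache_* names, counter, lock and eviction helper are hypothetical; the hook signatures, SHRINK_STOP, DEFAULT_SEEKS, SHRINKER_NUMA_AWARE and the int return of register_shrinker() are taken from the hunks below):

static atomic_long_t my_cache_nr_objects;	/* hypothetical object counter */
static DEFINE_SPINLOCK(my_cache_lock);		/* hypothetical cache lock */
static unsigned long my_cache_evict(unsigned long nr);	/* hypothetical eviction helper */

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	/* cheap, lock-free estimate of how many objects could be freed */
	return atomic_long_read(&my_cache_nr_objects);
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long freed;

	/* try to free up to sc->nr_to_scan objects; bail out cleanly if we cannot */
	if (!spin_trylock(&my_cache_lock))
		return SHRINK_STOP;
	freed = my_cache_evict(sc->nr_to_scan);
	spin_unlock(&my_cache_lock);
	return freed;				/* objects actually freed */
}

static struct shrinker my_cache_shrinker = {
	.count_objects	= my_cache_count,
	.scan_objects	= my_cache_scan,
	.seeks		= DEFAULT_SEEKS,
	/* optionally: .flags = SHRINKER_NUMA_AWARE, then honour sc->nid in the hooks */
};

static int __init my_cache_init(void)
{
	/* register_shrinker() now allocates per-node deferred counts and can fail */
	return register_shrinker(&my_cache_shrinker);
}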
diff --git a/mm/Makefile b/mm/Makefile
index f00803386a67..305d10acd081 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o balloon_compaction.o \
-			   interval_tree.o $(mmu-y)
+			   interval_tree.o list_lru.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 963e14c0486f..d66010e0049d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -211,24 +211,29 @@ static void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static int shrink_huge_zero_page(struct shrinker *shrink,
-		struct shrink_control *sc)
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+					struct shrink_control *sc)
 {
-	if (!sc->nr_to_scan)
-		/* we can free zero page only if last reference remains */
-		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+	/* we can free zero page only if last reference remains */
+	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
 
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+					struct shrink_control *sc)
+{
 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
 		struct page *zero_page = xchg(&huge_zero_page, NULL);
 		BUG_ON(zero_page == NULL);
 		__free_page(zero_page);
+		return HPAGE_PMD_NR;
 	}
 
 	return 0;
 }
 
 static struct shrinker huge_zero_page_shrinker = {
-	.shrink = shrink_huge_zero_page,
+	.count_objects = shrink_huge_zero_page_count,
+	.scan_objects = shrink_huge_zero_page_scan,
 	.seeks = DEFAULT_SEEKS,
 };
 
diff --git a/mm/list_lru.c b/mm/list_lru.c
new file mode 100644
index 000000000000..72467914b856
--- /dev/null
+++ b/mm/list_lru.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
+ * Authors: David Chinner and Glauber Costa
+ *
+ * Generic LRU infrastructure
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/list_lru.h>
+#include <linux/slab.h>
+
+bool list_lru_add(struct list_lru *lru, struct list_head *item)
+{
+	int nid = page_to_nid(virt_to_page(item));
+	struct list_lru_node *nlru = &lru->node[nid];
+
+	spin_lock(&nlru->lock);
+	WARN_ON_ONCE(nlru->nr_items < 0);
+	if (list_empty(item)) {
+		list_add_tail(item, &nlru->list);
+		if (nlru->nr_items++ == 0)
+			node_set(nid, lru->active_nodes);
+		spin_unlock(&nlru->lock);
+		return true;
+	}
+	spin_unlock(&nlru->lock);
+	return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_add);
+
+bool list_lru_del(struct list_lru *lru, struct list_head *item)
+{
+	int nid = page_to_nid(virt_to_page(item));
+	struct list_lru_node *nlru = &lru->node[nid];
+
+	spin_lock(&nlru->lock);
+	if (!list_empty(item)) {
+		list_del_init(item);
+		if (--nlru->nr_items == 0)
+			node_clear(nid, lru->active_nodes);
+		WARN_ON_ONCE(nlru->nr_items < 0);
+		spin_unlock(&nlru->lock);
+		return true;
+	}
+	spin_unlock(&nlru->lock);
+	return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_del);
+
+unsigned long
+list_lru_count_node(struct list_lru *lru, int nid)
+{
+	unsigned long count = 0;
+	struct list_lru_node *nlru = &lru->node[nid];
+
+	spin_lock(&nlru->lock);
+	WARN_ON_ONCE(nlru->nr_items < 0);
+	count += nlru->nr_items;
+	spin_unlock(&nlru->lock);
+
+	return count;
+}
+EXPORT_SYMBOL_GPL(list_lru_count_node);
+
+unsigned long
+list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
+		   void *cb_arg, unsigned long *nr_to_walk)
+{
+
+	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_head *item, *n;
+	unsigned long isolated = 0;
+
+	spin_lock(&nlru->lock);
+restart:
+	list_for_each_safe(item, n, &nlru->list) {
+		enum lru_status ret;
+
+		/*
+		 * decrement nr_to_walk first so that we don't livelock if we
+		 * get stuck on large numbers of LRU_RETRY items
+		 */
+		if (--(*nr_to_walk) == 0)
+			break;
+
+		ret = isolate(item, &nlru->lock, cb_arg);
+		switch (ret) {
+		case LRU_REMOVED:
+			if (--nlru->nr_items == 0)
+				node_clear(nid, lru->active_nodes);
+			WARN_ON_ONCE(nlru->nr_items < 0);
+			isolated++;
+			break;
+		case LRU_ROTATE:
+			list_move_tail(item, &nlru->list);
+			break;
+		case LRU_SKIP:
+			break;
+		case LRU_RETRY:
+			/*
+			 * The lru lock has been dropped, our list traversal is
+			 * now invalid and so we have to restart from scratch.
+			 */
+			goto restart;
+		default:
+			BUG();
+		}
+	}
+
+	spin_unlock(&nlru->lock);
+	return isolated;
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_node);
+
+int list_lru_init(struct list_lru *lru)
+{
+	int i;
+	size_t size = sizeof(*lru->node) * nr_node_ids;
+
+	lru->node = kzalloc(size, GFP_KERNEL);
+	if (!lru->node)
+		return -ENOMEM;
+
+	nodes_clear(lru->active_nodes);
+	for (i = 0; i < nr_node_ids; i++) {
+		spin_lock_init(&lru->node[i].lock);
+		INIT_LIST_HEAD(&lru->node[i].list);
+		lru->node[i].nr_items = 0;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(list_lru_init);
+
+void list_lru_destroy(struct list_lru *lru)
+{
+	kfree(lru->node);
+}
+EXPORT_SYMBOL_GPL(list_lru_destroy);
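
For reference, a usage sketch of the list_lru API added above (assumptions: my_lru, my_cache_shrink_node() and my_dispose_list() are hypothetical; list_lru_init(), list_lru_add(), list_lru_walk_node() and the enum lru_status protocol are taken from the file itself). Each cached object embeds a struct list_head; the isolate callback runs under the per-node lock and tells the walker what happened to the item:

static struct list_lru my_lru;			/* hypothetical cache-wide LRU */
static void my_dispose_list(struct list_head *head);	/* hypothetical */

/* isolate callback: invoked with the per-node lru lock held */
static enum lru_status my_isolate(struct list_head *item,
				  spinlock_t *lock, void *cb_arg)
{
	struct list_head *dispose = cb_arg;

	/*
	 * Move the item off the LRU onto a private dispose list; the
	 * walker decrements nr_items when we return LRU_REMOVED.
	 * Returning LRU_ROTATE would instead keep the item and move it
	 * to the tail; LRU_SKIP leaves it untouched.
	 */
	list_move(item, dispose);
	return LRU_REMOVED;
}

static unsigned long my_cache_shrink_node(int nid, unsigned long nr_to_walk)
{
	LIST_HEAD(dispose);
	unsigned long freed;

	freed = list_lru_walk_node(&my_lru, nid, my_isolate,
				   &dispose, &nr_to_walk);
	my_dispose_list(&dispose);	/* free the isolated objects outside the lock */
	return freed;
}

Objects would enter and leave the LRU with list_lru_add(&my_lru, &obj->lru) and list_lru_del(&my_lru, &obj->lru), after a one-time list_lru_init(&my_lru), which allocates the per-node lists and can fail with -ENOMEM.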
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d472e14c6808..947ed5413279 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -248,10 +248,12 @@ void shake_page(struct page *p, int access)
 	 */
 	if (access) {
 		int nr;
+		int nid = page_to_nid(p);
 		do {
 			struct shrink_control shrink = {
 				.gfp_mask = GFP_KERNEL,
 			};
+			node_set(nid, shrink.nodes_to_scan);
 
 			nr = shrink_slab(&shrink, 1000, 1000);
 			if (page_count(p) == 1)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe715daeb8bc..beb35778c69f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -174,14 +174,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-	atomic_long_set(&shrinker->nr_in_batch, 0);
+	size_t size = sizeof(*shrinker->nr_deferred);
+
+	/*
+	 * If we only have one possible node in the system anyway, save
+	 * ourselves the trouble and disable NUMA aware behavior. This way we
+	 * will save memory and some small loop time later.
+	 */
+	if (nr_node_ids == 1)
+		shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+	if (shrinker->flags & SHRINKER_NUMA_AWARE)
+		size *= nr_node_ids;
+
+	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+	if (!shrinker->nr_deferred)
+		return -ENOMEM;
+
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
@@ -196,15 +213,102 @@ void unregister_shrinker(struct shrinker *shrinker)
 }
 EXPORT_SYMBOL(unregister_shrinker);
 
-static inline int do_shrinker_shrink(struct shrinker *shrinker,
-				     struct shrink_control *sc,
-				     unsigned long nr_to_scan)
-{
-	sc->nr_to_scan = nr_to_scan;
-	return (*shrinker->shrink)(shrinker, sc);
+#define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+		 unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+	unsigned long freed = 0;
+	unsigned long long delta;
+	long total_scan;
+	long max_pass;
+	long nr;
+	long new_nr;
+	int nid = shrinkctl->nid;
+	long batch_size = shrinker->batch ? shrinker->batch
+					  : SHRINK_BATCH;
+
+	max_pass = shrinker->count_objects(shrinker, shrinkctl);
+	if (max_pass == 0)
+		return 0;
+
+	/*
+	 * copy the current shrinker scan count into a local variable
+	 * and zero it so that other concurrent shrinker invocations
+	 * don't also do this scanning work.
+	 */
+	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+	total_scan = nr;
+	delta = (4 * nr_pages_scanned) / shrinker->seeks;
+	delta *= max_pass;
+	do_div(delta, lru_pages + 1);
+	total_scan += delta;
+	if (total_scan < 0) {
+		printk(KERN_ERR
+		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+		       shrinker->scan_objects, total_scan);
+		total_scan = max_pass;
+	}
+
+	/*
+	 * We need to avoid excessive windup on filesystem shrinkers
+	 * due to large numbers of GFP_NOFS allocations causing the
+	 * shrinkers to return -1 all the time. This results in a large
+	 * nr being built up so when a shrink that can do some work
+	 * comes along it empties the entire cache due to nr >>>
+	 * max_pass. This is bad for sustaining a working set in
+	 * memory.
+	 *
+	 * Hence only allow the shrinker to scan the entire cache when
+	 * a large delta change is calculated directly.
+	 */
+	if (delta < max_pass / 4)
+		total_scan = min(total_scan, max_pass / 2);
+
+	/*
+	 * Avoid risking looping forever due to too large nr value:
+	 * never try to free more than twice the estimate number of
+	 * freeable entries.
+	 */
+	if (total_scan > max_pass * 2)
+		total_scan = max_pass * 2;
+
+	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+				nr_pages_scanned, lru_pages,
+				max_pass, delta, total_scan);
+
+	while (total_scan >= batch_size) {
+		unsigned long ret;
+
+		shrinkctl->nr_to_scan = batch_size;
+		ret = shrinker->scan_objects(shrinker, shrinkctl);
+		if (ret == SHRINK_STOP)
+			break;
+		freed += ret;
+
+		count_vm_events(SLABS_SCANNED, batch_size);
+		total_scan -= batch_size;
+
+		cond_resched();
+	}
+
+	/*
+	 * move the unused scan count back into the shrinker in a
+	 * manner that handles concurrent updates. If we exhausted the
+	 * scan, there is no need to do an update.
+	 */
+	if (total_scan > 0)
+		new_nr = atomic_long_add_return(total_scan,
+						&shrinker->nr_deferred[nid]);
+	else
+		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+	return freed;
 }
 
-#define SHRINK_BATCH 128
 /*
  * Call the shrink functions to age shrinkable caches
  *
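
To make the proportional-scan arithmetic in shrink_slab_node() above concrete (illustrative numbers, not taken from this commit): with nr_pages_scanned = 1000, lru_pages = 100000, shrinker->seeks = DEFAULT_SEEKS = 2 and count_objects() reporting max_pass = 10000, delta = (4 * 1000 / 2) * 10000 / 100001 ≈ 200, so the shrinker is asked to scan roughly 2% of its cache, i.e. the ~1% of LRU pages just scanned scaled by 4 / seeks. Because delta < max_pass / 4, the deferred total_scan is also clamped to at most max_pass / 2, which is the anti-windup behaviour the comment describes.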
@@ -224,115 +328,45 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
  *
  * Returns the number of slab objects which we shrunk.
  */
-unsigned long shrink_slab(struct shrink_control *shrink,
+unsigned long shrink_slab(struct shrink_control *shrinkctl,
 			  unsigned long nr_pages_scanned,
 			  unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
-	unsigned long ret = 0;
+	unsigned long freed = 0;
 
 	if (nr_pages_scanned == 0)
 		nr_pages_scanned = SWAP_CLUSTER_MAX;
 
 	if (!down_read_trylock(&shrinker_rwsem)) {
-		/* Assume we'll be able to shrink next time */
-		ret = 1;
+		/*
+		 * If we would return 0, our callers would understand that we
+		 * have nothing else to shrink and give up trying. By returning
+		 * 1 we keep it going and assume we'll be able to shrink next
+		 * time.
+		 */
+		freed = 1;
 		goto out;
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
-		unsigned long long delta;
-		long total_scan;
-		long max_pass;
-		int shrink_ret = 0;
-		long nr;
-		long new_nr;
-		long batch_size = shrinker->batch ? shrinker->batch
-						  : SHRINK_BATCH;
-
-		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
-		if (max_pass <= 0)
-			continue;
-
-		/*
-		 * copy the current shrinker scan count into a local variable
-		 * and zero it so that other concurrent shrinker invocations
-		 * don't also do this scanning work.
-		 */
-		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-		total_scan = nr;
-		delta = (4 * nr_pages_scanned) / shrinker->seeks;
-		delta *= max_pass;
-		do_div(delta, lru_pages + 1);
-		total_scan += delta;
-		if (total_scan < 0) {
-			printk(KERN_ERR "shrink_slab: %pF negative objects to "
-			       "delete nr=%ld\n",
-			       shrinker->shrink, total_scan);
-			total_scan = max_pass;
-		}
-
-		/*
-		 * We need to avoid excessive windup on filesystem shrinkers
-		 * due to large numbers of GFP_NOFS allocations causing the
-		 * shrinkers to return -1 all the time. This results in a large
-		 * nr being built up so when a shrink that can do some work
-		 * comes along it empties the entire cache due to nr >>>
-		 * max_pass. This is bad for sustaining a working set in
-		 * memory.
-		 *
-		 * Hence only allow the shrinker to scan the entire cache when
-		 * a large delta change is calculated directly.
-		 */
-		if (delta < max_pass / 4)
-			total_scan = min(total_scan, max_pass / 2);
-
-		/*
-		 * Avoid risking looping forever due to too large nr value:
-		 * never try to free more than twice the estimate number of
-		 * freeable entries.
-		 */
-		if (total_scan > max_pass * 2)
-			total_scan = max_pass * 2;
-
-		trace_mm_shrink_slab_start(shrinker, shrink, nr,
-					nr_pages_scanned, lru_pages,
-					max_pass, delta, total_scan);
-
-		while (total_scan >= batch_size) {
-			int nr_before;
+		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+			if (!node_online(shrinkctl->nid))
+				continue;
 
-			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
-			shrink_ret = do_shrinker_shrink(shrinker, shrink,
-							batch_size);
-			if (shrink_ret == -1)
+			if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+			    (shrinkctl->nid != 0))
 				break;
-			if (shrink_ret < nr_before)
-				ret += nr_before - shrink_ret;
-			count_vm_events(SLABS_SCANNED, batch_size);
-			total_scan -= batch_size;
 
-			cond_resched();
-		}
+			freed += shrink_slab_node(shrinkctl, shrinker,
+				 nr_pages_scanned, lru_pages);
 
-		/*
-		 * move the unused scan count back into the shrinker in a
-		 * manner that handles concurrent updates. If we exhausted the
-		 * scan, there is no need to do an update.
-		 */
-		if (total_scan > 0)
-			new_nr = atomic_long_add_return(total_scan,
-					&shrinker->nr_in_batch);
-		else
-			new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
-		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
+		}
 	}
 	up_read(&shrinker_rwsem);
 out:
 	cond_resched();
-	return ret;
+	return freed;
 }
 
 static inline int is_page_cache_freeable(struct page *page)
@@ -2368,12 +2402,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	 */
 	if (global_reclaim(sc)) {
 		unsigned long lru_pages = 0;
+
+		nodes_clear(shrink->nodes_to_scan);
 		for_each_zone_zonelist(zone, z, zonelist,
 				gfp_zone(sc->gfp_mask)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
 
 			lru_pages += zone_reclaimable_pages(zone);
+			node_set(zone_to_nid(zone),
+				 shrink->nodes_to_scan);
 		}
 
 		shrink_slab(shrink, sc->nr_scanned, lru_pages);
@@ -2829,6 +2867,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 		return true;
 
 	shrink_zone(zone, sc);
+	nodes_clear(shrink.nodes_to_scan);
+	node_set(zone_to_nid(zone), shrink.nodes_to_scan);
 
 	reclaim_state->reclaimed_slab = 0;
 	shrink_slab(&shrink, sc->nr_scanned, lru_pages);
@@ -3520,10 +3560,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * number of slab pages and shake the slab until it is reduced
 	 * by the same nr_pages that we used for reclaiming unmapped
 	 * pages.
-	 *
-	 * Note that shrink_slab will free memory on all zones and may
-	 * take a long time.
 	 */
+	nodes_clear(shrink.nodes_to_scan);
+	node_set(zone_to_nid(zone), shrink.nodes_to_scan);
 	for (;;) {
 		unsigned long lru_pages = zone_reclaimable_pages(zone);
 
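
The calling convention these vmscan.c and memory-failure.c hunks establish, collected into one sketch (assumptions: freed, nid, nr_pages_scanned and lru_pages are local variables already in scope; the shrink_control fields and the shrink_slab() call shape are taken from this diff):

	struct shrink_control shrink = {
		.gfp_mask = GFP_KERNEL,
	};
	unsigned long freed;

	/* tell shrink_slab() which NUMA node(s) this reclaim pass targets */
	nodes_clear(shrink.nodes_to_scan);
	node_set(nid, shrink.nodes_to_scan);

	freed = shrink_slab(&shrink, nr_pages_scanned, lru_pages);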