-rw-r--r--   include/linux/shrinker.h |  14
-rw-r--r--   mm/vmscan.c              | 241
2 files changed, 152 insertions(+), 103 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 76f520c4c394..8f80f243fed9 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -19,6 +19,8 @@ struct shrink_control {
 
         /* shrink from these nodes */
         nodemask_t nodes_to_scan;
+        /* current node being shrunk (for NUMA aware shrinkers) */
+        int nid;
 };
 
 #define SHRINK_STOP (~0UL)
@@ -44,6 +46,8 @@ struct shrink_control {
  * due to potential deadlocks. If SHRINK_STOP is returned, then no further
  * attempts to call the @scan_objects will be made from the current reclaim
  * context.
+ *
+ * @flags determine the shrinker abilities, like numa awareness
  */
struct shrinker {
         int (*shrink)(struct shrinker *, struct shrink_control *sc);
@@ -54,12 +58,18 @@ struct shrinker {
 
         int seeks;      /* seeks to recreate an obj */
         long batch;     /* reclaim batch size, 0 = default */
+        unsigned long flags;
 
         /* These are for internal use */
         struct list_head list;
-        atomic_long_t nr_in_batch; /* objs pending delete */
+        /* objs pending delete, per node */
+        atomic_long_t *nr_deferred;
 };
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
-extern void register_shrinker(struct shrinker *);
+
+/* Flags */
+#define SHRINKER_NUMA_AWARE     (1 << 0)
+
+extern int register_shrinker(struct shrinker *);
 extern void unregister_shrinker(struct shrinker *);
 #endif
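For orientation, here is a minimal sketch of how a cache might use the interface added above: a NUMA-aware shrinker sets SHRINKER_NUMA_AWARE, consults sc->nid in its callbacks, and checks the now-fallible return value of register_shrinker(). The count_objects/scan_objects callbacks come from the same shrinker rework series (they are referenced by the vmscan.c changes below); the example cache and its *_node() helpers are hypothetical and not part of this patch.

/* Hypothetical cache with per-node object lists; only the shrinker glue
 * is shown.  example_cache_count_node()/example_cache_free_node() are
 * stand-ins, not real kernel functions. */
static unsigned long example_count(struct shrinker *s, struct shrink_control *sc)
{
        /* A NUMA-aware shrinker reports freeable objects for sc->nid only. */
        return example_cache_count_node(sc->nid);
}

static unsigned long example_scan(struct shrinker *s, struct shrink_control *sc)
{
        /* Nothing reclaimable from this allocation context: back off. */
        if (!(sc->gfp_mask & __GFP_FS))
                return SHRINK_STOP;
        /* Free up to sc->nr_to_scan objects from node sc->nid. */
        return example_cache_free_node(sc->nid, sc->nr_to_scan);
}

static struct shrinker example_shrinker = {
        .count_objects  = example_count,
        .scan_objects   = example_scan,
        .seeks          = DEFAULT_SEEKS,
        .flags          = SHRINKER_NUMA_AWARE,
};

static int __init example_cache_init(void)
{
        /* register_shrinker() can now fail allocating ->nr_deferred. */
        return register_shrinker(&example_shrinker);
}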
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe0d5c458440..799ebceeb4f7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-        atomic_long_set(&shrinker->nr_in_batch, 0);
+        size_t size = sizeof(*shrinker->nr_deferred);
+
+        /*
+         * If we only have one possible node in the system anyway, save
+         * ourselves the trouble and disable NUMA aware behavior. This way we
+         * will save memory and some small loop time later.
+         */
+        if (nr_node_ids == 1)
+                shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+        if (shrinker->flags & SHRINKER_NUMA_AWARE)
+                size *= nr_node_ids;
+
+        shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+        if (!shrinker->nr_deferred)
+                return -ENOMEM;
+
         down_write(&shrinker_rwsem);
         list_add_tail(&shrinker->list, &shrinker_list);
         up_write(&shrinker_rwsem);
+        return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
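The next hunk moves the per-shrinker scan accounting into a new shrink_slab_node() helper so it can be applied per NUMA node. The proportional-pressure calculation it carries over is easiest to see with concrete numbers; the following user-space sketch mirrors that arithmetic with made-up input values (not taken from the patch):

/* Standalone illustration of shrink_slab_node()'s delta calculation;
 * all input values below are invented for the example. */
#include <stdio.h>

int main(void)
{
        unsigned long nr_pages_scanned = 1024;   /* LRU pages just scanned */
        unsigned long lru_pages = 100000;        /* reclaimable LRU pages */
        unsigned long max_pass = 10000;          /* freeable slab objects */
        int seeks = 2;                           /* DEFAULT_SEEKS */
        unsigned long long delta;

        delta = (4ULL * nr_pages_scanned) / seeks;   /* base scan pressure */
        delta *= max_pass;                           /* scale by cache size... */
        delta /= lru_pages + 1;                      /* ...relative to the LRUs */

        /* Prints 204: with DEFAULT_SEEKS the cache is asked to scan about
         * twice the fraction of its objects that the page LRUs were
         * scanned (the 4/seeks factor). */
        printf("objects to scan: %llu\n", delta);
        return 0;
}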
@@ -186,6 +203,118 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
 }
 
 #define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+                 unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+        unsigned long freed = 0;
+        unsigned long long delta;
+        long total_scan;
+        long max_pass;
+        long nr;
+        long new_nr;
+        int nid = shrinkctl->nid;
+        long batch_size = shrinker->batch ? shrinker->batch
+                                          : SHRINK_BATCH;
+
+        if (shrinker->count_objects)
+                max_pass = shrinker->count_objects(shrinker, shrinkctl);
+        else
+                max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+        if (max_pass == 0)
+                return 0;
+
+        /*
+         * copy the current shrinker scan count into a local variable
+         * and zero it so that other concurrent shrinker invocations
+         * don't also do this scanning work.
+         */
+        nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+        total_scan = nr;
+        delta = (4 * nr_pages_scanned) / shrinker->seeks;
+        delta *= max_pass;
+        do_div(delta, lru_pages + 1);
+        total_scan += delta;
+        if (total_scan < 0) {
+                printk(KERN_ERR
+                "shrink_slab: %pF negative objects to delete nr=%ld\n",
+                       shrinker->shrink, total_scan);
+                total_scan = max_pass;
+        }
+
+        /*
+         * We need to avoid excessive windup on filesystem shrinkers
+         * due to large numbers of GFP_NOFS allocations causing the
+         * shrinkers to return -1 all the time. This results in a large
+         * nr being built up so when a shrink that can do some work
+         * comes along it empties the entire cache due to nr >>>
+         * max_pass.  This is bad for sustaining a working set in
+         * memory.
+         *
+         * Hence only allow the shrinker to scan the entire cache when
+         * a large delta change is calculated directly.
+         */
+        if (delta < max_pass / 4)
+                total_scan = min(total_scan, max_pass / 2);
+
+        /*
+         * Avoid risking looping forever due to too large nr value:
+         * never try to free more than twice the estimate number of
+         * freeable entries.
+         */
+        if (total_scan > max_pass * 2)
+                total_scan = max_pass * 2;
+
+        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+                                nr_pages_scanned, lru_pages,
+                                max_pass, delta, total_scan);
+
+        while (total_scan >= batch_size) {
+
+                if (shrinker->scan_objects) {
+                        unsigned long ret;
+                        shrinkctl->nr_to_scan = batch_size;
+                        ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+                        if (ret == SHRINK_STOP)
+                                break;
+                        freed += ret;
+                } else {
+                        int nr_before;
+                        long ret;
+
+                        nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+                        ret = do_shrinker_shrink(shrinker, shrinkctl,
+                                                        batch_size);
+                        if (ret == -1)
+                                break;
+                        if (ret < nr_before)
+                                freed += nr_before - ret;
+                }
+
+                count_vm_events(SLABS_SCANNED, batch_size);
+                total_scan -= batch_size;
+
+                cond_resched();
+        }
+
+        /*
+         * move the unused scan count back into the shrinker in a
+         * manner that handles concurrent updates. If we exhausted the
+         * scan, there is no need to do an update.
+         */
+        if (total_scan > 0)
+                new_nr = atomic_long_add_return(total_scan,
+                                                &shrinker->nr_deferred[nid]);
+        else
+                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+        trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+        return freed;
+}
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -227,108 +356,18 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
         }
 
         list_for_each_entry(shrinker, &shrinker_list, list) {
-                unsigned long long delta;
-                long total_scan;
-                long max_pass;
-                long nr;
-                long new_nr;
-                long batch_size = shrinker->batch ? shrinker->batch
-                                                  : SHRINK_BATCH;
-
-                if (shrinker->count_objects)
-                        max_pass = shrinker->count_objects(shrinker, shrinkctl);
-                else
-                        max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
-                if (max_pass == 0)
-                        continue;
-
-                /*
-                 * copy the current shrinker scan count into a local variable
-                 * and zero it so that other concurrent shrinker invocations
-                 * don't also do this scanning work.
-                 */
-                nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-                total_scan = nr;
-                delta = (4 * nr_pages_scanned) / shrinker->seeks;
-                delta *= max_pass;
-                do_div(delta, lru_pages + 1);
-                total_scan += delta;
-                if (total_scan < 0) {
-                        printk(KERN_ERR
-                        "shrink_slab: %pF negative objects to delete nr=%ld\n",
-                               shrinker->shrink, total_scan);
-                        total_scan = max_pass;
-                }
-
-                /*
-                 * We need to avoid excessive windup on filesystem shrinkers
-                 * due to large numbers of GFP_NOFS allocations causing the
-                 * shrinkers to return -1 all the time. This results in a large
-                 * nr being built up so when a shrink that can do some work
-                 * comes along it empties the entire cache due to nr >>>
-                 * max_pass.  This is bad for sustaining a working set in
-                 * memory.
-                 *
-                 * Hence only allow the shrinker to scan the entire cache when
-                 * a large delta change is calculated directly.
-                 */
-                if (delta < max_pass / 4)
-                        total_scan = min(total_scan, max_pass / 2);
-
-                /*
-                 * Avoid risking looping forever due to too large nr value:
-                 * never try to free more than twice the estimate number of
-                 * freeable entries.
-                 */
-                if (total_scan > max_pass * 2)
-                        total_scan = max_pass * 2;
-
-                trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-                                        nr_pages_scanned, lru_pages,
-                                        max_pass, delta, total_scan);
-
-                while (total_scan >= batch_size) {
-
-                        if (shrinker->scan_objects) {
-                                unsigned long ret;
-                                shrinkctl->nr_to_scan = batch_size;
-                                ret = shrinker->scan_objects(shrinker, shrinkctl);
+                for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+                        if (!node_online(shrinkctl->nid))
+                                continue;
 
-                                if (ret == SHRINK_STOP)
-                                        break;
-                                freed += ret;
-                        } else {
-                                int nr_before;
-                                long ret;
-
-                                nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
-                                ret = do_shrinker_shrink(shrinker, shrinkctl,
-                                                                batch_size);
-                                if (ret == -1)
-                                        break;
-                                if (ret < nr_before)
-                                        freed += nr_before - ret;
-                        }
+                        if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+                            (shrinkctl->nid != 0))
+                                break;
 
-                        count_vm_events(SLABS_SCANNED, batch_size);
-                        total_scan -= batch_size;
+                        freed += shrink_slab_node(shrinkctl, shrinker,
+                                 nr_pages_scanned, lru_pages);
 
-                        cond_resched();
                 }
-
-                /*
-                 * move the unused scan count back into the shrinker in a
-                 * manner that handles concurrent updates. If we exhausted the
-                 * scan, there is no need to do an update.
-                 */
-                if (total_scan > 0)
-                        new_nr = atomic_long_add_return(total_scan,
-                                        &shrinker->nr_in_batch);
-                else
-                        new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
-                trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
         }
         up_read(&shrinker_rwsem);
 out: