Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   6
-rw-r--r--  mm/Makefile         |   2
-rw-r--r--  mm/backing-dev.c    | 388
-rw-r--r--  mm/bootmem.c        |  10
-rw-r--r--  mm/hugetlb.c        |   2
-rw-r--r--  mm/kmemleak.c       | 546
-rw-r--r--  mm/memcontrol.c     |  25
-rw-r--r--  mm/memory.c         |  11
-rw-r--r--  mm/mempolicy.c      |  84
-rw-r--r--  mm/mempool.c        |   4
-rw-r--r--  mm/mmap.c           |   3
-rw-r--r--  mm/nommu.c          |  10
-rw-r--r--  mm/oom_kill.c       |  64
-rw-r--r--  mm/page-writeback.c | 184
-rw-r--r--  mm/page_alloc.c     |  49
-rw-r--r--  mm/pdflush.c        | 269
-rw-r--r--  mm/percpu.c         |  50
-rw-r--r--  mm/rmap.c           |   1
-rw-r--r--  mm/shmem.c          |   6
-rw-r--r--  mm/shmem_acl.c      |  11
-rw-r--r--  mm/slob.c           |   5
-rw-r--r--  mm/slub.c           |  19
-rw-r--r--  mm/swap_state.c     |   1
-rw-r--r--  mm/swapfile.c       |   4
-rw-r--r--  mm/vmscan.c         |  19
25 files changed, 1043 insertions(+), 730 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index c948d4ca8bde..fe5f674d7a7d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR
 	  For most ia64, ppc64 and x86 users with lots of address space
 	  a value of 65536 is reasonable and should cause no problems.
 	  On arm and other archs it should not be higher than 32768.
-	  Programs which use vm86 functionality would either need additional
-	  permissions from either the LSM or the capabilities module or have
-	  this protection disabled.
+	  Programs which use vm86 functionality or have some need to map
+	  this low address space will need CAP_SYS_RAWIO or disable this
+	  protection by setting the value to 0.
 
 	  This value can be changed after boot using the
 	  /proc/sys/vm/mmap_min_addr tunable.
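
As a concrete illustration of the protection this help text describes (addresses and values here are invented, not part of the patch), an unprivileged attempt to map below the tunable fails:

/* Hypothetical user-space sketch: with vm.mmap_min_addr = 65536 and no
 * CAP_SYS_RAWIO, a MAP_FIXED mapping at a low address is refused. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	void *p = mmap((void *)0x1000, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	if (p == MAP_FAILED)
		perror("mmap");	/* expected: Operation not permitted */
	return 0;
}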
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..147a7a7873c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   maccess.o page_alloc.o page-writeback.o pdflush.o \
+			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 493b468a5035..d3ca0dac1111 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
 
 #include <linux/wait.h>
 #include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 EXPORT_SYMBOL(default_unplug_io_fn);
 
 struct backing_dev_info default_backing_dev_info = {
+	.name		= "default",
 	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
 	.state		= 0,
 	.capabilities	= BDI_CAP_MAP_COPY,
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = {
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 static struct class *bdi_class;
+DEFINE_SPINLOCK(bdi_lock);
+LIST_HEAD(bdi_list);
+LIST_HEAD(bdi_pending_list);
+
+static struct task_struct *sync_supers_tsk;
+static struct timer_list sync_supers_timer;
+
+static int bdi_sync_supers(void *);
+static void sync_supers_timer_fn(unsigned long);
+static void arm_supers_timer(void);
+
+static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
 
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
@@ -37,9 +53,29 @@ static void bdi_debug_init(void)
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
 {
 	struct backing_dev_info *bdi = m->private;
+	struct bdi_writeback *wb;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
+	unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
+	struct inode *inode;
+
+	/*
+	 * inode lock is enough here, the bdi->wb_list is protected by
+	 * RCU on the reader side
+	 */
+	nr_wb = nr_dirty = nr_io = nr_more_io = 0;
+	spin_lock(&inode_lock);
+	list_for_each_entry(wb, &bdi->wb_list, list) {
+		nr_wb++;
+		list_for_each_entry(inode, &wb->b_dirty, i_list)
+			nr_dirty++;
+		list_for_each_entry(inode, &wb->b_io, i_list)
+			nr_io++;
+		list_for_each_entry(inode, &wb->b_more_io, i_list)
+			nr_more_io++;
+	}
+	spin_unlock(&inode_lock);
 
 	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
 
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "BdiReclaimable:   %8lu kB\n"
 		   "BdiDirtyThresh:   %8lu kB\n"
 		   "DirtyThresh:      %8lu kB\n"
-		   "BackgroundThresh: %8lu kB\n",
+		   "BackgroundThresh: %8lu kB\n"
+		   "WriteBack threads:%8lu\n"
+		   "b_dirty:          %8lu\n"
+		   "b_io:             %8lu\n"
+		   "b_more_io:        %8lu\n"
+		   "bdi_list:         %8u\n"
+		   "state:            %8lx\n"
+		   "wb_mask:          %8lx\n"
+		   "wb_list:          %8u\n"
+		   "wb_cnt:           %8u\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
 		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-		   K(bdi_thresh),
-		   K(dirty_thresh),
-		   K(background_thresh));
+		   K(bdi_thresh), K(dirty_thresh),
+		   K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
+		   !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
+		   !list_empty(&bdi->wb_list), bdi->wb_cnt);
 #undef K
 
 	return 0;
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void)
 {
 	int err;
 
+	sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
+	BUG_ON(IS_ERR(sync_supers_tsk));
+
+	init_timer(&sync_supers_timer);
+	setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
+	arm_supers_timer();
+
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
 		bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +246,248 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+	memset(wb, 0, sizeof(*wb));
+
+	wb->bdi = bdi;
+	wb->last_old_flush = jiffies;
+	INIT_LIST_HEAD(&wb->b_dirty);
+	INIT_LIST_HEAD(&wb->b_io);
+	INIT_LIST_HEAD(&wb->b_more_io);
+}
+
+static void bdi_task_init(struct backing_dev_info *bdi,
+			  struct bdi_writeback *wb)
+{
+	struct task_struct *tsk = current;
+
+	spin_lock(&bdi->wb_lock);
+	list_add_tail_rcu(&wb->list, &bdi->wb_list);
+	spin_unlock(&bdi->wb_lock);
+
+	tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(tsk, 0);
+}
+
+static int bdi_start_fn(void *ptr)
+{
+	struct bdi_writeback *wb = ptr;
+	struct backing_dev_info *bdi = wb->bdi;
+	int ret;
+
+	/*
+	 * Add us to the active bdi_list
+	 */
+	spin_lock(&bdi_lock);
+	list_add(&bdi->bdi_list, &bdi_list);
+	spin_unlock(&bdi_lock);
+
+	bdi_task_init(bdi, wb);
+
+	/*
+	 * Clear pending bit and wakeup anybody waiting to tear us down
+	 */
+	clear_bit(BDI_pending, &bdi->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&bdi->state, BDI_pending);
+
+	ret = bdi_writeback_task(wb);
+
+	/*
+	 * Remove us from the list
+	 */
+	spin_lock(&bdi->wb_lock);
+	list_del_rcu(&wb->list);
+	spin_unlock(&bdi->wb_lock);
+
+	/*
+	 * Flush any work that raced with us exiting. No new work
+	 * will be added, since this bdi isn't discoverable anymore.
+	 */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	wb->task = NULL;
+	return ret;
+}
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+	return wb_has_dirty_io(&bdi->wb);
+}
+
+static void bdi_flush_io(struct backing_dev_info *bdi)
+{
+	struct writeback_control wbc = {
+		.bdi			= bdi,
+		.sync_mode		= WB_SYNC_NONE,
+		.older_than_this	= NULL,
+		.range_cyclic		= 1,
+		.nr_to_write		= 1024,
+	};
+
+	writeback_inodes_wbc(&wbc);
+}
+
+/*
+ * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * or we risk deadlocking on ->s_umount. The longer term solution would be
+ * to implement sync_supers_bdi() or similar and simply do it from the
+ * bdi writeback tasks individually.
+ */
+static int bdi_sync_supers(void *unused)
+{
+	set_user_nice(current, 0);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+
+		/*
+		 * Do this periodically, like kupdated() did before.
+		 */
+		sync_supers();
+	}
+
+	return 0;
+}
+
+static void arm_supers_timer(void)
+{
+	unsigned long next;
+
+	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
+	mod_timer(&sync_supers_timer, round_jiffies_up(next));
+}
+
+static void sync_supers_timer_fn(unsigned long unused)
+{
+	wake_up_process(sync_supers_tsk);
+	arm_supers_timer();
+}
+
+static int bdi_forker_task(void *ptr)
+{
+	struct bdi_writeback *me = ptr;
+
+	bdi_task_init(me->bdi, me);
+
+	for (;;) {
+		struct backing_dev_info *bdi, *tmp;
+		struct bdi_writeback *wb;
+
+		/*
+		 * Temporary measure, we want to make sure we don't see
+		 * dirty data on the default backing_dev_info
+		 */
+		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+			wb_do_writeback(me, 0);
+
+		spin_lock(&bdi_lock);
+
+		/*
+		 * Check if any existing bdi's have dirty data without
+		 * a thread registered. If so, set that up.
+		 */
+		list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
+			if (bdi->wb.task)
+				continue;
+			if (list_empty(&bdi->work_list) &&
+			    !bdi_has_dirty_io(bdi))
+				continue;
+
+			bdi_add_default_flusher_task(bdi);
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (list_empty(&bdi_pending_list)) {
+			unsigned long wait;
+
+			spin_unlock(&bdi_lock);
+			wait = msecs_to_jiffies(dirty_writeback_interval * 10);
+			schedule_timeout(wait);
+			try_to_freeze();
+			continue;
+		}
+
+		__set_current_state(TASK_RUNNING);
+
+		/*
+		 * This is our real job - check for pending entries in
+		 * bdi_pending_list, and create the tasks that got added
+		 */
+		bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
+				 bdi_list);
+		list_del_init(&bdi->bdi_list);
+		spin_unlock(&bdi_lock);
+
+		wb = &bdi->wb;
+		wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
+					dev_name(bdi->dev));
+		/*
+		 * If task creation fails, then readd the bdi to
+		 * the pending list and force writeout of the bdi
+		 * from this forker thread. That will free some memory
+		 * and we can try again.
+		 */
+		if (IS_ERR(wb->task)) {
+			wb->task = NULL;
+
+			/*
+			 * Add this 'bdi' to the back, so we get
+			 * a chance to flush other bdi's to free
+			 * memory.
+			 */
+			spin_lock(&bdi_lock);
+			list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+			spin_unlock(&bdi_lock);
+
+			bdi_flush_io(bdi);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Add the default flusher task that gets created for any bdi
+ * that has dirty data pending writeout
+ */
+void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
+{
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
+	if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
+		printk(KERN_ERR "bdi %p/%s is not registered!\n",
+							bdi, bdi->name);
+		return;
+	}
+
+	/*
+	 * Check with the helper whether to proceed adding a task. Will only
+	 * abort if we two or more simultanous calls to
+	 * bdi_add_default_flusher_task() occured, further additions will block
+	 * waiting for previous additions to finish.
+	 */
+	if (!test_and_set_bit(BDI_pending, &bdi->state)) {
+		list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+
+		/*
+		 * We are now on the pending list, wake up bdi_forker_task()
+		 * to finish the job and add us back to the active bdi_list
+		 */
+		wake_up_process(default_backing_dev_info.wb.task);
+	}
+}
+
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...)
 {
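
The BDI_pending handshake used by bdi_start_fn() above (and waited on in bdi_wb_shutdown() further down) is the stock bit-wait pattern; a minimal sketch of the two sides, with made-up names:

/* Illustrative only: "thing", MY_BIT and my_sched_wait are invented.
 * Waker side: publish completion, then wake any bit-waiters. */
clear_bit(MY_BIT, &thing->state);
smp_mb__after_clear_bit();	/* order the clear before the waiters' re-check */
wake_up_bit(&thing->state, MY_BIT);

/* Waiter side: sleep until the bit is cleared. */
wait_on_bit(&thing->state, MY_BIT, my_sched_wait, TASK_UNINTERRUPTIBLE);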
@@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		goto exit;
 	}
 
+	spin_lock(&bdi_lock);
+	list_add_tail(&bdi->bdi_list, &bdi_list);
+	spin_unlock(&bdi_lock);
+
 	bdi->dev = dev;
-	bdi_debug_register(bdi, dev_name(dev));
 
+	/*
+	 * Just start the forker thread for our default backing_dev_info,
+	 * and add other bdi's to the list. They will get a thread created
+	 * on-demand when they need it.
+	 */
+	if (bdi_cap_flush_forker(bdi)) {
+		struct bdi_writeback *wb = &bdi->wb;
+
+		wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+					dev_name(dev));
+		if (IS_ERR(wb->task)) {
+			wb->task = NULL;
+			ret = -ENOMEM;
+
+			spin_lock(&bdi_lock);
+			list_del(&bdi->bdi_list);
+			spin_unlock(&bdi_lock);
+			goto exit;
+		}
+	}
+
+	bdi_debug_register(bdi, dev_name(dev));
+	set_bit(BDI_registered, &bdi->state);
 exit:
 	return ret;
 }
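
For context, a hypothetical driver-side bring-up that exercises this registration path (none of this is in the patch; error handling trimmed to the essentials):

/* Hypothetical caller sketch: a device registers its bdi so the forker
 * can later spawn a "flush-mydev" thread for it on demand. */
static struct backing_dev_info my_bdi = {
	.name		= "mydev",
	.capabilities	= BDI_CAP_MAP_COPY,
};

static int my_setup(struct device *parent)
{
	int err = bdi_init(&my_bdi);
	if (err)
		return err;
	err = bdi_register(&my_bdi, parent, "mydev");
	if (err)
		bdi_destroy(&my_bdi);
	return err;
}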
@@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
 }
 EXPORT_SYMBOL(bdi_register_dev);
 
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+{
+	struct bdi_writeback *wb;
+
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
+	/*
+	 * If setup is pending, wait for that to complete first
+	 */
+	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+			TASK_UNINTERRUPTIBLE);
+
+	/*
+	 * Make sure nobody finds us on the bdi_list anymore
+	 */
+	spin_lock(&bdi_lock);
+	list_del(&bdi->bdi_list);
+	spin_unlock(&bdi_lock);
+
+	/*
+	 * Finally, kill the kernel threads. We don't need to be RCU
+	 * safe anymore, since the bdi is gone from visibility.
+	 */
+	list_for_each_entry(wb, &bdi->wb_list, list)
+		kthread_stop(wb->task);
+}
+
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
+		if (!bdi_cap_flush_forker(bdi))
+			bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
 		device_unregister(bdi->dev);
 		bdi->dev = NULL;
@@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister);
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-	int i;
-	int err;
+	int i, err;
 
 	bdi->dev = NULL;
 
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
+	spin_lock_init(&bdi->wb_lock);
+	INIT_LIST_HEAD(&bdi->bdi_list);
+	INIT_LIST_HEAD(&bdi->wb_list);
+	INIT_LIST_HEAD(&bdi->work_list);
+
+	bdi_wb_init(&bdi->wb, bdi);
+
+	/*
+	 * Just one thread support for now, hard code mask and count
+	 */
+	bdi->wb_mask = 1;
+	bdi->wb_cnt = 1;
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
 {
 	int i;
 
+	WARN_ON(bdi_has_dirty_io(bdi));
+
 	bdi_unregister(bdi);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
@@ -283,7 +650,6 @@ static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
 
-
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
 	enum bdi_state bit;
@@ -308,18 +674,18 @@ EXPORT_SYMBOL(set_bdi_congested);
 
 /**
  * congestion_wait - wait for a backing_dev to become uncongested
- * @rw: READ or WRITE
+ * @sync: SYNC or ASYNC IO
  * @timeout: timeout in jiffies
  *
  * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
  * write congestion. If no backing_devs are congested then just wait for the
  * next write to be completed.
  */
-long congestion_wait(int rw, long timeout)
+long congestion_wait(int sync, long timeout)
 {
 	long ret;
 	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
+	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 	ret = io_schedule_timeout(timeout);
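
A sketch of a typical caller after the @rw -> @sync rename (the call site is invented; BLK_RW_SYNC/BLK_RW_ASYNC are the values callers of this era pass as @sync):

/* Illustrative only: back off briefly when async writeback is congested. */
if (bdi_write_congested(bdi))
	congestion_wait(BLK_RW_ASYNC, HZ / 10);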
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d2a9ce952768..555d5d2731c6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,6 +12,7 @@
 #include <linux/pfn.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
+#include <linux/kmemleak.h>
 
 #include <asm/bug.h>
 #include <asm/io.h>
@@ -335,6 +336,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 {
 	unsigned long start, end;
 
+	kmemleak_free_part(__va(physaddr), size);
+
 	start = PFN_UP(physaddr);
 	end = PFN_DOWN(physaddr + size);
 
@@ -354,6 +357,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 {
 	unsigned long start, end;
 
+	kmemleak_free_part(__va(addr), size);
+
 	start = PFN_UP(addr);
 	end = PFN_DOWN(addr + size);
 
@@ -516,6 +521,11 @@ find_block:
 		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
 				start_off);
 		memset(region, 0, size);
+		/*
+		 * The min_count is set to 0 so that bootmem allocated blocks
+		 * are never reported as leaks.
+		 */
+		kmemleak_alloc(region, size, 0, 0);
 		return region;
 	}
 
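
A minimal sketch of how these hooks pair up (sizes invented): the allocation side registers the block as gray via min_count 0, so it is tracked but never reported, and a later partial free splits the tracked object:

/* Illustrative pairing of the hunks above; 8192/4096 are made-up sizes. */
void *region = alloc_bootmem(8192);	/* -> kmemleak_alloc(region, 8192, 0, 0) */

/* later: give back the second half of the block */
free_bootmem(virt_to_phys(region) + 4096, 4096);
					/* -> kmemleak_free_part(__va(...), 4096) */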
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0351e31f474..cafdcee154e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2370,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
 
 	spin_lock(&inode->i_lock);
-	inode->i_blocks -= blocks_per_huge_page(h);
+	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
 	hugetlb_put_quota(inode->i_mapping, (chg - freed));
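
The one-line fix scales the accounting by the number of pages actually freed; a quick worked example with invented numbers:

/* Illustrative: a 2 MB huge page is 4096 512-byte blocks, so truncating
 * freed == 3 such pages must drop i_blocks by 3 * 4096 = 12288. The old
 * code always subtracted one page's worth (4096), regardless of freed. */
inode->i_blocks -= blocks_per_huge_page(h) * freed;	/* 12288 when freed == 3 */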
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e766e1da09d2..4ea4510e2996 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,21 +92,24 @@
 #include <linux/string.h>
 #include <linux/nodemask.h>
 #include <linux/mm.h>
+#include <linux/workqueue.h>
 
 #include <asm/sections.h>
 #include <asm/processor.h>
 #include <asm/atomic.h>
 
+#include <linux/kmemcheck.h>
 #include <linux/kmemleak.h>
 
 /*
  * Kmemleak configuration and common defines.
  */
 #define MAX_TRACE		16	/* stack trace length */
-#define REPORTS_NR		50	/* maximum number of reported leaks */
 #define MSECS_MIN_AGE		5000	/* minimum object age for reporting */
 #define SECS_FIRST_SCAN		60	/* delay before the first scan */
 #define SECS_SCAN_WAIT		600	/* subsequent auto scanning delay */
+#define GRAY_LIST_PASSES	25	/* maximum number of gray list scans */
+#define MAX_SCAN_SIZE		4096	/* maximum size of a scanned block */
 
 #define BYTES_PER_POINTER	sizeof(void *)
 
@@ -120,6 +123,9 @@ struct kmemleak_scan_area {
 	size_t length;
 };
 
+#define KMEMLEAK_GREY	0
+#define KMEMLEAK_BLACK	-1
+
 /*
  * Structure holding the metadata for each allocated memory block.
  * Modifications to such objects should be made while holding the
@@ -158,6 +164,17 @@ struct kmemleak_object {
 #define OBJECT_REPORTED		(1 << 1)
 /* flag set to not scan the object */
 #define OBJECT_NO_SCAN		(1 << 2)
+/* flag set on newly allocated objects */
+#define OBJECT_NEW		(1 << 3)
+
+/* number of bytes to print per line; must be 16 or 32 */
+#define HEX_ROW_SIZE		16
+/* number of bytes to print at a time (1, 2, 4, 8) */
+#define HEX_GROUP_SIZE		1
+/* include ASCII after the hex output */
+#define HEX_ASCII		1
+/* max number of lines to be printed */
+#define HEX_MAX_LINES		2
 
 /* the list of all allocated objects */
 static LIST_HEAD(object_list);
@@ -196,9 +213,6 @@ static int kmemleak_stack_scan = 1;
 /* protects the memory scanning, parameters and debug/kmemleak file access */
 static DEFINE_MUTEX(scan_mutex);
 
-/* number of leaks reported (for limitation purposes) */
-static int reported_leaks;
-
 /*
  * Early object allocation/freeing logging. Kmemleak is initialized after the
  * kernel allocator. However, both the kernel allocator and kmemleak may
@@ -211,6 +225,7 @@ static int reported_leaks;
 enum {
 	KMEMLEAK_ALLOC,
 	KMEMLEAK_FREE,
+	KMEMLEAK_FREE_PART,
 	KMEMLEAK_NOT_LEAK,
 	KMEMLEAK_IGNORE,
 	KMEMLEAK_SCAN_AREA,
@@ -228,11 +243,14 @@ struct early_log {
 	int min_count;			/* minimum reference count */
 	unsigned long offset;		/* scan area offset */
 	size_t length;			/* scan area length */
+	unsigned long trace[MAX_TRACE];	/* stack trace */
+	unsigned int trace_len;		/* stack trace length */
 };
 
 /* early logging buffer and current position */
-static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE];
-static int crt_early_log;
+static struct early_log
+	early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
+static int crt_early_log __initdata;
 
 static void kmemleak_disable(void);
 
@@ -255,6 +273,35 @@ static void kmemleak_disable(void);
 } while (0)
 
 /*
+ * Printing of the objects hex dump to the seq file. The number of lines to be
+ * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
+ * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
+ * with the object->lock held.
+ */
+static void hex_dump_object(struct seq_file *seq,
+			    struct kmemleak_object *object)
+{
+	const u8 *ptr = (const u8 *)object->pointer;
+	int i, len, remaining;
+	unsigned char linebuf[HEX_ROW_SIZE * 5];
+
+	/* limit the number of lines to HEX_MAX_LINES */
+	remaining = len =
+		min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
+
+	seq_printf(seq, "  hex dump (first %d bytes):\n", len);
+	for (i = 0; i < len; i += HEX_ROW_SIZE) {
+		int linelen = min(remaining, HEX_ROW_SIZE);
+
+		remaining -= HEX_ROW_SIZE;
+		hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
+				   HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
+				   HEX_ASCII);
+		seq_printf(seq, "    %s\n", linebuf);
+	}
+}
+
+/*
  * Object colors, encoded with count and min_count:
  * - white - orphan object, not enough references to it (count < min_count)
  * - gray  - not orphan, not marked as false positive (min_count == 0) or
@@ -264,14 +311,21 @@ static void kmemleak_disable(void);
  * Newly created objects don't have any color assigned (object->count == -1)
  * before the next memory scan when they become white.
  */
-static int color_white(const struct kmemleak_object *object)
+static bool color_white(const struct kmemleak_object *object)
+{
+	return object->count != KMEMLEAK_BLACK &&
+		object->count < object->min_count;
+}
+
+static bool color_gray(const struct kmemleak_object *object)
 {
-	return object->count != -1 && object->count < object->min_count;
+	return object->min_count != KMEMLEAK_BLACK &&
+		object->count >= object->min_count;
 }
 
-static int color_gray(const struct kmemleak_object *object)
+static bool color_black(const struct kmemleak_object *object)
 {
-	return object->min_count != -1 && object->count >= object->min_count;
+	return object->min_count == KMEMLEAK_BLACK;
 }
 
 /*
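
To make the count/min_count encoding concrete, a stand-alone user-space sketch (values invented) that mirrors the three predicates:

/* Illustration only: (count, min_count) pairs and the color they map to. */
#include <stdbool.h>
#include <stdio.h>

#define KMEMLEAK_GREY	0
#define KMEMLEAK_BLACK	-1

struct obj { int count, min_count; };

static bool white(struct obj o) { return o.count != KMEMLEAK_BLACK && o.count < o.min_count; }
static bool gray(struct obj o)  { return o.min_count != KMEMLEAK_BLACK && o.count >= o.min_count; }
static bool black(struct obj o) { return o.min_count == KMEMLEAK_BLACK; }

int main(void)
{
	struct obj leak = { .count = 0, .min_count = 1 };	/* white: reportable */
	struct obj ok   = { .count = 2, .min_count = 1 };	/* gray: referenced */
	struct obj skip = { .count = 0, .min_count = KMEMLEAK_BLACK };	/* black */

	printf("%d %d %d\n", white(leak), gray(ok), black(skip));	/* 1 1 1 */
	return 0;
}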
@@ -279,7 +333,7 @@ static int color_gray(const struct kmemleak_object *object)
  * not be deleted and have a minimum age to avoid false positives caused by
  * pointers temporarily stored in CPU registers.
  */
-static int unreferenced_object(struct kmemleak_object *object)
+static bool unreferenced_object(struct kmemleak_object *object)
 {
 	return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
 		time_before_eq(object->jiffies + jiffies_min_age,
@@ -299,6 +353,7 @@ static void print_unreferenced(struct seq_file *seq,
 		   object->pointer, object->size);
 	seq_printf(seq, "  comm \"%s\", pid %d, jiffies %lu\n",
 		   object->comm, object->pid, object->jiffies);
+	hex_dump_object(seq, object);
 	seq_printf(seq, "  backtrace:\n");
 
 	for (i = 0; i < object->trace_len; i++) {
@@ -325,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
 		  object->comm, object->pid, object->jiffies);
 	pr_notice("  min_count = %d\n", object->min_count);
 	pr_notice("  count = %d\n", object->count);
+	pr_notice("  flags = 0x%lx\n", object->flags);
 	pr_notice("  backtrace:\n");
 	print_stack_trace(&trace, 4);
 }
@@ -429,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
 }
 
 /*
+ * Save stack trace to the given array of MAX_TRACE size.
+ */
+static int __save_stack_trace(unsigned long *trace)
+{
+	struct stack_trace stack_trace;
+
+	stack_trace.max_entries = MAX_TRACE;
+	stack_trace.nr_entries = 0;
+	stack_trace.entries = trace;
+	stack_trace.skip = 2;
+	save_stack_trace(&stack_trace);
+
+	return stack_trace.nr_entries;
+}
+
+/*
  * Create the metadata (struct kmemleak_object) corresponding to an allocated
  * memory block and add it to the object_list and object_tree_root.
  */
-static void create_object(unsigned long ptr, size_t size, int min_count,
-			  gfp_t gfp)
+static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
+					     int min_count, gfp_t gfp)
 {
 	unsigned long flags;
 	struct kmemleak_object *object;
 	struct prio_tree_node *node;
-	struct stack_trace trace;
 
 	object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
 	if (!object) {
 		kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
-		return;
+		return NULL;
 	}
 
 	INIT_LIST_HEAD(&object->object_list);
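
__save_stack_trace() wraps the generic <linux/stacktrace.h> interface of this era; a minimal sketch of capturing and printing a trace with it (DEPTH is an arbitrary choice):

/* Sketch of the save_stack_trace()/print_stack_trace() pairing. */
#define DEPTH 16
unsigned long entries[DEPTH];
struct stack_trace trace = {
	.max_entries	= DEPTH,
	.entries	= entries,
	.skip		= 2,	/* drop the capture helpers themselves */
};

save_stack_trace(&trace);	/* fills entries[0..nr_entries) */
print_stack_trace(&trace, 4);	/* print with 4-space indentation */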
@@ -451,7 +522,7 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
 	INIT_HLIST_HEAD(&object->area_list);
 	spin_lock_init(&object->lock);
 	atomic_set(&object->use_count, 1);
-	object->flags = OBJECT_ALLOCATED;
+	object->flags = OBJECT_ALLOCATED | OBJECT_NEW;
 	object->pointer = ptr;
 	object->size = size;
 	object->min_count = min_count;
@@ -477,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
 	}
 
 	/* kernel backtrace */
-	trace.max_entries = MAX_TRACE;
-	trace.nr_entries = 0;
-	trace.entries = object->trace;
-	trace.skip = 1;
-	save_stack_trace(&trace);
-	object->trace_len = trace.nr_entries;
+	object->trace_len = __save_stack_trace(object->trace);
 
 	INIT_PRIO_TREE_NODE(&object->tree_node);
 	object->tree_node.start = ptr;
 	object->tree_node.last = ptr + size - 1;
 
 	write_lock_irqsave(&kmemleak_lock, flags);
+
 	min_addr = min(min_addr, ptr);
 	max_addr = max(max_addr, ptr + size);
 	node = prio_tree_insert(&object_tree_root, &object->tree_node);
@@ -499,47 +566,36 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
 	 * random memory blocks.
 	 */
 	if (node != &object->tree_node) {
-		unsigned long flags;
-
 		kmemleak_stop("Cannot insert 0x%lx into the object search tree "
 			      "(already existing)\n", ptr);
 		object = lookup_object(ptr, 1);
-		spin_lock_irqsave(&object->lock, flags);
+		spin_lock(&object->lock);
 		dump_object_info(object);
-		spin_unlock_irqrestore(&object->lock, flags);
+		spin_unlock(&object->lock);
 
 		goto out;
 	}
 	list_add_tail_rcu(&object->object_list, &object_list);
 out:
 	write_unlock_irqrestore(&kmemleak_lock, flags);
+	return object;
 }
 
 /*
  * Remove the metadata (struct kmemleak_object) for a memory block from the
  * object_list and object_tree_root and decrement its use_count.
  */
-static void delete_object(unsigned long ptr)
+static void __delete_object(struct kmemleak_object *object)
 {
 	unsigned long flags;
-	struct kmemleak_object *object;
 
 	write_lock_irqsave(&kmemleak_lock, flags);
-	object = lookup_object(ptr, 0);
-	if (!object) {
-#ifdef DEBUG
-		kmemleak_warn("Freeing unknown object at 0x%08lx\n",
-			      ptr);
-#endif
-		write_unlock_irqrestore(&kmemleak_lock, flags);
-		return;
-	}
 	prio_tree_remove(&object_tree_root, &object->tree_node);
 	list_del_rcu(&object->object_list);
 	write_unlock_irqrestore(&kmemleak_lock, flags);
 
 	WARN_ON(!(object->flags & OBJECT_ALLOCATED));
-	WARN_ON(atomic_read(&object->use_count) < 1);
+	WARN_ON(atomic_read(&object->use_count) < 2);
 
 	/*
 	 * Locking here also ensures that the corresponding memory block
@@ -552,48 +608,115 @@ static void delete_object(unsigned long ptr)
 }
 
 /*
- * Make a object permanently as gray-colored so that it can no longer be
- * reported as a leak. This is used in general to mark a false positive.
+ * Look up the metadata (struct kmemleak_object) corresponding to ptr and
+ * delete it.
  */
-static void make_gray_object(unsigned long ptr)
+static void delete_object_full(unsigned long ptr)
 {
-	unsigned long flags;
 	struct kmemleak_object *object;
 
 	object = find_and_get_object(ptr, 0);
 	if (!object) {
-		kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr);
+#ifdef DEBUG
+		kmemleak_warn("Freeing unknown object at 0x%08lx\n",
+			      ptr);
+#endif
 		return;
 	}
-
-	spin_lock_irqsave(&object->lock, flags);
-	object->min_count = 0;
-	spin_unlock_irqrestore(&object->lock, flags);
+	__delete_object(object);
 	put_object(object);
 }
 
 /*
- * Mark the object as black-colored so that it is ignored from scans and
- * reporting.
+ * Look up the metadata (struct kmemleak_object) corresponding to ptr and
+ * delete it. If the memory block is partially freed, the function may create
+ * additional metadata for the remaining parts of the block.
  */
-static void make_black_object(unsigned long ptr)
+static void delete_object_part(unsigned long ptr, size_t size)
 {
-	unsigned long flags;
 	struct kmemleak_object *object;
+	unsigned long start, end;
 
-	object = find_and_get_object(ptr, 0);
+	object = find_and_get_object(ptr, 1);
 	if (!object) {
-		kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr);
+#ifdef DEBUG
+		kmemleak_warn("Partially freeing unknown object at 0x%08lx "
+			      "(size %zu)\n", ptr, size);
+#endif
 		return;
 	}
+	__delete_object(object);
+
+	/*
+	 * Create one or two objects that may result from the memory block
+	 * split. Note that partial freeing is only done by free_bootmem() and
+	 * this happens before kmemleak_init() is called. The path below is
+	 * only executed during early log recording in kmemleak_init(), so
+	 * GFP_KERNEL is enough.
+	 */
+	start = object->pointer;
+	end = object->pointer + object->size;
+	if (ptr > start)
+		create_object(start, ptr - start, object->min_count,
+			      GFP_KERNEL);
+	if (ptr + size < end)
+		create_object(ptr + size, end - ptr - size, object->min_count,
+			      GFP_KERNEL);
+
+	put_object(object);
+}
+
+static void __paint_it(struct kmemleak_object *object, int color)
+{
+	object->min_count = color;
+	if (color == KMEMLEAK_BLACK)
+		object->flags |= OBJECT_NO_SCAN;
+}
+
+static void paint_it(struct kmemleak_object *object, int color)
+{
+	unsigned long flags;
 
 	spin_lock_irqsave(&object->lock, flags);
-	object->min_count = -1;
+	__paint_it(object, color);
 	spin_unlock_irqrestore(&object->lock, flags);
+}
+
+static void paint_ptr(unsigned long ptr, int color)
+{
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("Trying to color unknown object "
+			      "at 0x%08lx as %s\n", ptr,
+			      (color == KMEMLEAK_GREY) ? "Grey" :
+			      (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
+		return;
+	}
+	paint_it(object, color);
 	put_object(object);
 }
 
 /*
+ * Make a object permanently as gray-colored so that it can no longer be
+ * reported as a leak. This is used in general to mark a false positive.
+ */
+static void make_gray_object(unsigned long ptr)
+{
+	paint_ptr(ptr, KMEMLEAK_GREY);
+}
+
+/*
+ * Mark the object as black-colored so that it is ignored from scans and
+ * reporting.
+ */
+static void make_black_object(unsigned long ptr)
+{
+	paint_ptr(ptr, KMEMLEAK_BLACK);
+}
+
+/*
  * Add a scanning area to the object. If at least one such area is added,
 * kmemleak will only scan these ranges rather than the whole memory block.
 */
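
The remnant creation in delete_object_part() is plain interval arithmetic; a worked example with invented numbers:

/* Worked example: object covers [0x1000, 0x3000); free_bootmem() drops
 * [0x1800, 0x2000). Two remnants get recreated. */
unsigned long start = 0x1000, end = 0x3000;	/* original object */
unsigned long ptr = 0x1800, size = 0x800;	/* freed part */

if (ptr > start)	/* left remnant: [0x1000, 0x1800), 0x800 bytes */
	create_object(start, ptr - start, min_count, GFP_KERNEL);
if (ptr + size < end)	/* right remnant: [0x2000, 0x3000), 0x1000 bytes */
	create_object(ptr + size, end - ptr - size, min_count, GFP_KERNEL);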
@@ -662,14 +785,15 @@ static void object_no_scan(unsigned long ptr) | |||
662 | * Log an early kmemleak_* call to the early_log buffer. These calls will be | 785 | * Log an early kmemleak_* call to the early_log buffer. These calls will be |
663 | * processed later once kmemleak is fully initialized. | 786 | * processed later once kmemleak is fully initialized. |
664 | */ | 787 | */ |
665 | static void log_early(int op_type, const void *ptr, size_t size, | 788 | static void __init log_early(int op_type, const void *ptr, size_t size, |
666 | int min_count, unsigned long offset, size_t length) | 789 | int min_count, unsigned long offset, size_t length) |
667 | { | 790 | { |
668 | unsigned long flags; | 791 | unsigned long flags; |
669 | struct early_log *log; | 792 | struct early_log *log; |
670 | 793 | ||
671 | if (crt_early_log >= ARRAY_SIZE(early_log)) { | 794 | if (crt_early_log >= ARRAY_SIZE(early_log)) { |
672 | pr_warning("Early log buffer exceeded\n"); | 795 | pr_warning("Early log buffer exceeded, " |
796 | "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n"); | ||
673 | kmemleak_disable(); | 797 | kmemleak_disable(); |
674 | return; | 798 | return; |
675 | } | 799 | } |
@@ -686,16 +810,45 @@ static void log_early(int op_type, const void *ptr, size_t size, | |||
686 | log->min_count = min_count; | 810 | log->min_count = min_count; |
687 | log->offset = offset; | 811 | log->offset = offset; |
688 | log->length = length; | 812 | log->length = length; |
813 | if (op_type == KMEMLEAK_ALLOC) | ||
814 | log->trace_len = __save_stack_trace(log->trace); | ||
689 | crt_early_log++; | 815 | crt_early_log++; |
690 | local_irq_restore(flags); | 816 | local_irq_restore(flags); |
691 | } | 817 | } |
692 | 818 | ||
693 | /* | 819 | /* |
820 | * Log an early allocated block and populate the stack trace. | ||
821 | */ | ||
822 | static void early_alloc(struct early_log *log) | ||
823 | { | ||
824 | struct kmemleak_object *object; | ||
825 | unsigned long flags; | ||
826 | int i; | ||
827 | |||
828 | if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) | ||
829 | return; | ||
830 | |||
831 | /* | ||
832 | * RCU locking needed to ensure object is not freed via put_object(). | ||
833 | */ | ||
834 | rcu_read_lock(); | ||
835 | object = create_object((unsigned long)log->ptr, log->size, | ||
836 | log->min_count, GFP_KERNEL); | ||
837 | spin_lock_irqsave(&object->lock, flags); | ||
838 | for (i = 0; i < log->trace_len; i++) | ||
839 | object->trace[i] = log->trace[i]; | ||
840 | object->trace_len = log->trace_len; | ||
841 | spin_unlock_irqrestore(&object->lock, flags); | ||
842 | rcu_read_unlock(); | ||
843 | } | ||
844 | |||
845 | /* | ||
694 | * Memory allocation function callback. This function is called from the | 846 | * Memory allocation function callback. This function is called from the |
695 | * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, | 847 | * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, |
696 | * vmalloc etc.). | 848 | * vmalloc etc.). |
697 | */ | 849 | */ |
698 | void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) | 850 | void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, |
851 | gfp_t gfp) | ||
699 | { | 852 | { |
700 | pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); | 853 | pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); |
701 | 854 | ||
@@ -710,22 +863,37 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc); | |||
710 | * Memory freeing function callback. This function is called from the kernel | 863 | * Memory freeing function callback. This function is called from the kernel |
711 | * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). | 864 | * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). |
712 | */ | 865 | */ |
713 | void kmemleak_free(const void *ptr) | 866 | void __ref kmemleak_free(const void *ptr) |
714 | { | 867 | { |
715 | pr_debug("%s(0x%p)\n", __func__, ptr); | 868 | pr_debug("%s(0x%p)\n", __func__, ptr); |
716 | 869 | ||
717 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 870 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
718 | delete_object((unsigned long)ptr); | 871 | delete_object_full((unsigned long)ptr); |
719 | else if (atomic_read(&kmemleak_early_log)) | 872 | else if (atomic_read(&kmemleak_early_log)) |
720 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); | 873 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); |
721 | } | 874 | } |
722 | EXPORT_SYMBOL_GPL(kmemleak_free); | 875 | EXPORT_SYMBOL_GPL(kmemleak_free); |
723 | 876 | ||
724 | /* | 877 | /* |
878 | * Partial memory freeing function callback. This function is usually called | ||
879 | * from bootmem allocator when (part of) a memory block is freed. | ||
880 | */ | ||
881 | void __ref kmemleak_free_part(const void *ptr, size_t size) | ||
882 | { | ||
883 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
884 | |||
885 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
886 | delete_object_part((unsigned long)ptr, size); | ||
887 | else if (atomic_read(&kmemleak_early_log)) | ||
888 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); | ||
889 | } | ||
890 | EXPORT_SYMBOL_GPL(kmemleak_free_part); | ||
891 | |||
892 | /* | ||
725 | * Mark an already allocated memory block as a false positive. This will cause | 893 | * Mark an already allocated memory block as a false positive. This will cause |
726 | * the block to no longer be reported as leak and always be scanned. | 894 | * the block to no longer be reported as leak and always be scanned. |
727 | */ | 895 | */ |
728 | void kmemleak_not_leak(const void *ptr) | 896 | void __ref kmemleak_not_leak(const void *ptr) |
729 | { | 897 | { |
730 | pr_debug("%s(0x%p)\n", __func__, ptr); | 898 | pr_debug("%s(0x%p)\n", __func__, ptr); |
731 | 899 | ||
@@ -741,7 +909,7 @@ EXPORT_SYMBOL(kmemleak_not_leak); | |||
741 | * corresponding block is not a leak and does not contain any references to | 909 | * corresponding block is not a leak and does not contain any references to |
742 | * other allocated memory blocks. | 910 | * other allocated memory blocks. |
743 | */ | 911 | */ |
744 | void kmemleak_ignore(const void *ptr) | 912 | void __ref kmemleak_ignore(const void *ptr) |
745 | { | 913 | { |
746 | pr_debug("%s(0x%p)\n", __func__, ptr); | 914 | pr_debug("%s(0x%p)\n", __func__, ptr); |
747 | 915 | ||
@@ -755,8 +923,8 @@ EXPORT_SYMBOL(kmemleak_ignore); | |||
755 | /* | 923 | /* |
756 | * Limit the range to be scanned in an allocated memory block. | 924 | * Limit the range to be scanned in an allocated memory block. |
757 | */ | 925 | */ |
758 | void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, | 926 | void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, |
759 | gfp_t gfp) | 927 | size_t length, gfp_t gfp) |
760 | { | 928 | { |
761 | pr_debug("%s(0x%p)\n", __func__, ptr); | 929 | pr_debug("%s(0x%p)\n", __func__, ptr); |
762 | 930 | ||
@@ -770,7 +938,7 @@ EXPORT_SYMBOL(kmemleak_scan_area); | |||
770 | /* | 938 | /* |
771 | * Inform kmemleak not to scan the given memory block. | 939 | * Inform kmemleak not to scan the given memory block. |
772 | */ | 940 | */ |
773 | void kmemleak_no_scan(const void *ptr) | 941 | void __ref kmemleak_no_scan(const void *ptr) |
774 | { | 942 | { |
775 | pr_debug("%s(0x%p)\n", __func__, ptr); | 943 | pr_debug("%s(0x%p)\n", __func__, ptr); |
776 | 944 | ||
@@ -807,20 +975,29 @@ static int scan_should_stop(void) | |||
807 | * found to the gray list. | 975 | * found to the gray list. |
808 | */ | 976 | */ |
809 | static void scan_block(void *_start, void *_end, | 977 | static void scan_block(void *_start, void *_end, |
810 | struct kmemleak_object *scanned) | 978 | struct kmemleak_object *scanned, int allow_resched) |
811 | { | 979 | { |
812 | unsigned long *ptr; | 980 | unsigned long *ptr; |
813 | unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); | 981 | unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); |
814 | unsigned long *end = _end - (BYTES_PER_POINTER - 1); | 982 | unsigned long *end = _end - (BYTES_PER_POINTER - 1); |
815 | 983 | ||
816 | for (ptr = start; ptr < end; ptr++) { | 984 | for (ptr = start; ptr < end; ptr++) { |
817 | unsigned long flags; | ||
818 | unsigned long pointer = *ptr; | ||
819 | struct kmemleak_object *object; | 985 | struct kmemleak_object *object; |
986 | unsigned long flags; | ||
987 | unsigned long pointer; | ||
820 | 988 | ||
989 | if (allow_resched) | ||
990 | cond_resched(); | ||
821 | if (scan_should_stop()) | 991 | if (scan_should_stop()) |
822 | break; | 992 | break; |
823 | 993 | ||
994 | /* don't scan uninitialized memory */ | ||
995 | if (!kmemcheck_is_obj_initialized((unsigned long)ptr, | ||
996 | BYTES_PER_POINTER)) | ||
997 | continue; | ||
998 | |||
999 | pointer = *ptr; | ||
1000 | |||
824 | object = find_and_get_object(pointer, 1); | 1001 | object = find_and_get_object(pointer, 1); |
825 | if (!object) | 1002 | if (!object) |
826 | continue; | 1003 | continue; |
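scan_block() reads the block one machine word at a time, so the bounds arithmetic is load-bearing: the start is rounded up to a pointer boundary and the end is pulled in by BYTES_PER_POINTER - 1 so the final full-word read stays inside the block. A small user-space analogue of that arithmetic, assuming only standard C:

        #include <stdint.h>
        #include <stdio.h>

        #define BYTES_PER_POINTER sizeof(void *)

        int main(void)
        {
                char buf[37];
                /* round the start up to a pointer boundary ... */
                uintptr_t start = ((uintptr_t)buf + BYTES_PER_POINTER - 1) &
                                  ~(uintptr_t)(BYTES_PER_POINTER - 1);
                /* ... and stop while a full word still fits before the end */
                uintptr_t end = (uintptr_t)buf + sizeof(buf) -
                                (BYTES_PER_POINTER - 1);

                for (uintptr_t p = start; p < end; p += BYTES_PER_POINTER)
                        printf("would read the word at offset %zu\n",
                               (size_t)(p - (uintptr_t)buf));
                return 0;
        }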
@@ -879,14 +1056,25 @@ static void scan_object(struct kmemleak_object *object) | |||
879 | if (!(object->flags & OBJECT_ALLOCATED)) | 1056 | if (!(object->flags & OBJECT_ALLOCATED)) |
880 | /* already freed object */ | 1057 | /* already freed object */ |
881 | goto out; | 1058 | goto out; |
882 | if (hlist_empty(&object->area_list)) | 1059 | if (hlist_empty(&object->area_list)) { |
883 | scan_block((void *)object->pointer, | 1060 | void *start = (void *)object->pointer; |
884 | (void *)(object->pointer + object->size), object); | 1061 | void *end = (void *)(object->pointer + object->size); |
885 | else | 1062 | |
1063 | while (start < end && (object->flags & OBJECT_ALLOCATED) && | ||
1064 | !(object->flags & OBJECT_NO_SCAN)) { | ||
1065 | scan_block(start, min(start + MAX_SCAN_SIZE, end), | ||
1066 | object, 0); | ||
1067 | start += MAX_SCAN_SIZE; | ||
1068 | |||
1069 | spin_unlock_irqrestore(&object->lock, flags); | ||
1070 | cond_resched(); | ||
1071 | spin_lock_irqsave(&object->lock, flags); | ||
1072 | } | ||
1073 | } else | ||
886 | hlist_for_each_entry(area, elem, &object->area_list, node) | 1074 | hlist_for_each_entry(area, elem, &object->area_list, node) |
887 | scan_block((void *)(object->pointer + area->offset), | 1075 | scan_block((void *)(object->pointer + area->offset), |
888 | (void *)(object->pointer + area->offset | 1076 | (void *)(object->pointer + area->offset |
889 | + area->length), object); | 1077 | + area->length), object, 0); |
890 | out: | 1078 | out: |
891 | spin_unlock_irqrestore(&object->lock, flags); | 1079 | spin_unlock_irqrestore(&object->lock, flags); |
892 | } | 1080 | } |
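The loop added above bounds how long object->lock is held with interrupts disabled: the block is scanned in MAX_SCAN_SIZE chunks, the lock is dropped between chunks, and the flags are re-tested because the object may have been freed or marked no-scan while the lock was released. Reduced to a sketch, where object_still_scannable() is shorthand for the OBJECT_ALLOCATED/OBJECT_NO_SCAN tests, not a real helper:

        while (start < end && object_still_scannable(object)) {
                scan_block(start, min(start + MAX_SCAN_SIZE, end), object, 0);
                start += MAX_SCAN_SIZE;

                spin_unlock_irqrestore(&object->lock, flags);
                cond_resched();         /* bounded lock hold times */
                spin_lock_irqsave(&object->lock, flags);
        }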
@@ -900,9 +1088,9 @@ static void kmemleak_scan(void) | |||
900 | { | 1088 | { |
901 | unsigned long flags; | 1089 | unsigned long flags; |
902 | struct kmemleak_object *object, *tmp; | 1090 | struct kmemleak_object *object, *tmp; |
903 | struct task_struct *task; | ||
904 | int i; | 1091 | int i; |
905 | int new_leaks = 0; | 1092 | int new_leaks = 0; |
1093 | int gray_list_pass = 0; | ||
906 | 1094 | ||
907 | jiffies_last_scan = jiffies; | 1095 | jiffies_last_scan = jiffies; |
908 | 1096 | ||
@@ -923,6 +1111,7 @@ static void kmemleak_scan(void) | |||
923 | #endif | 1111 | #endif |
924 | /* reset the reference count (whiten the object) */ | 1112 | /* reset the reference count (whiten the object) */ |
925 | object->count = 0; | 1113 | object->count = 0; |
1114 | object->flags &= ~OBJECT_NEW; | ||
926 | if (color_gray(object) && get_object(object)) | 1115 | if (color_gray(object) && get_object(object)) |
927 | list_add_tail(&object->gray_list, &gray_list); | 1116 | list_add_tail(&object->gray_list, &gray_list); |
928 | 1117 | ||
@@ -931,14 +1120,14 @@ static void kmemleak_scan(void) | |||
931 | rcu_read_unlock(); | 1120 | rcu_read_unlock(); |
932 | 1121 | ||
933 | /* data/bss scanning */ | 1122 | /* data/bss scanning */ |
934 | scan_block(_sdata, _edata, NULL); | 1123 | scan_block(_sdata, _edata, NULL, 1); |
935 | scan_block(__bss_start, __bss_stop, NULL); | 1124 | scan_block(__bss_start, __bss_stop, NULL, 1); |
936 | 1125 | ||
937 | #ifdef CONFIG_SMP | 1126 | #ifdef CONFIG_SMP |
938 | /* per-cpu sections scanning */ | 1127 | /* per-cpu sections scanning */ |
939 | for_each_possible_cpu(i) | 1128 | for_each_possible_cpu(i) |
940 | scan_block(__per_cpu_start + per_cpu_offset(i), | 1129 | scan_block(__per_cpu_start + per_cpu_offset(i), |
941 | __per_cpu_end + per_cpu_offset(i), NULL); | 1130 | __per_cpu_end + per_cpu_offset(i), NULL, 1); |
942 | #endif | 1131 | #endif |
943 | 1132 | ||
944 | /* | 1133 | /* |
@@ -960,19 +1149,21 @@ static void kmemleak_scan(void) | |||
960 | /* only scan if page is in use */ | 1149 | /* only scan if page is in use */ |
961 | if (page_count(page) == 0) | 1150 | if (page_count(page) == 0) |
962 | continue; | 1151 | continue; |
963 | scan_block(page, page + 1, NULL); | 1152 | scan_block(page, page + 1, NULL, 1); |
964 | } | 1153 | } |
965 | } | 1154 | } |
966 | 1155 | ||
967 | /* | 1156 | /* |
968 | * Scanning the task stacks may introduce false negatives and it is | 1157 | * Scanning the task stacks (this may introduce false negatives). |
969 | * not enabled by default. | ||
970 | */ | 1158 | */ |
971 | if (kmemleak_stack_scan) { | 1159 | if (kmemleak_stack_scan) { |
1160 | struct task_struct *p, *g; | ||
1161 | |||
972 | read_lock(&tasklist_lock); | 1162 | read_lock(&tasklist_lock); |
973 | for_each_process(task) | 1163 | do_each_thread(g, p) { |
974 | scan_block(task_stack_page(task), | 1164 | scan_block(task_stack_page(p), task_stack_page(p) + |
975 | task_stack_page(task) + THREAD_SIZE, NULL); | 1165 | THREAD_SIZE, NULL, 0); |
1166 | } while_each_thread(g, p); | ||
976 | read_unlock(&tasklist_lock); | 1167 | read_unlock(&tasklist_lock); |
977 | } | 1168 | } |
978 | 1169 | ||
@@ -984,6 +1175,7 @@ static void kmemleak_scan(void) | |||
984 | * kmemleak objects cannot be freed from outside the loop because their | 1175 | * kmemleak objects cannot be freed from outside the loop because their |
985 | * use_count was increased. | 1176 | * use_count was increased. |
986 | */ | 1177 | */ |
1178 | repeat: | ||
987 | object = list_entry(gray_list.next, typeof(*object), gray_list); | 1179 | object = list_entry(gray_list.next, typeof(*object), gray_list); |
988 | while (&object->gray_list != &gray_list) { | 1180 | while (&object->gray_list != &gray_list) { |
989 | cond_resched(); | 1181 | cond_resched(); |
@@ -1001,12 +1193,38 @@ static void kmemleak_scan(void) | |||
1001 | 1193 | ||
1002 | object = tmp; | 1194 | object = tmp; |
1003 | } | 1195 | } |
1196 | |||
1197 | if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) | ||
1198 | goto scan_end; | ||
1199 | |||
1200 | /* | ||
1201 | * Check for new objects allocated during this scanning and add them | ||
1202 | * to the gray list. | ||
1203 | */ | ||
1204 | rcu_read_lock(); | ||
1205 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
1206 | spin_lock_irqsave(&object->lock, flags); | ||
1207 | if ((object->flags & OBJECT_NEW) && !color_black(object) && | ||
1208 | get_object(object)) { | ||
1209 | object->flags &= ~OBJECT_NEW; | ||
1210 | list_add_tail(&object->gray_list, &gray_list); | ||
1211 | } | ||
1212 | spin_unlock_irqrestore(&object->lock, flags); | ||
1213 | } | ||
1214 | rcu_read_unlock(); | ||
1215 | |||
1216 | if (!list_empty(&gray_list)) | ||
1217 | goto repeat; | ||
1218 | |||
1219 | scan_end: | ||
1004 | WARN_ON(!list_empty(&gray_list)); | 1220 | WARN_ON(!list_empty(&gray_list)); |
1005 | 1221 | ||
1006 | /* | 1222 | /* |
1007 | * If scanning was stopped do not report any new unreferenced objects. | 1223 | * If scanning was stopped or new objects were being allocated at a |
1224 | * higher rate than gray list scanning, do not report any new | ||
1225 | * unreferenced objects. | ||
1008 | */ | 1226 | */ |
1009 | if (scan_should_stop()) | 1227 | if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) |
1010 | return; | 1228 | return; |
1011 | 1229 | ||
1012 | /* | 1230 | /* |
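Objects allocated while a scan is in progress are now tagged OBJECT_NEW and swept up in extra passes, capped at GRAY_LIST_PASSES so a heavy allocation load cannot keep the scan spinning forever. The control flow in outline, with drain_gray_list() and collect_new_gray() standing in for the loops above:

        int pass = 0;
repeat:
        drain_gray_list();      /* scan_object() on every gray entry */
        if (scan_should_stop() || ++pass >= GRAY_LIST_PASSES)
                goto scan_end;
        collect_new_gray();     /* OBJECT_NEW && !black -> gray list */
        if (!list_empty(&gray_list))
                goto repeat;
scan_end:
        /* if we bailed out early, report no new leaks this cycle */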
@@ -1039,6 +1257,7 @@ static int kmemleak_scan_thread(void *arg) | |||
1039 | static int first_run = 1; | 1257 | static int first_run = 1; |
1040 | 1258 | ||
1041 | pr_info("Automatic memory scanning thread started\n"); | 1259 | pr_info("Automatic memory scanning thread started\n"); |
1260 | set_user_nice(current, 10); | ||
1042 | 1261 | ||
1043 | /* | 1262 | /* |
1044 | * Wait before the first scan to allow the system to fully initialize. | 1263 | * Wait before the first scan to allow the system to fully initialize. |
@@ -1069,7 +1288,7 @@ static int kmemleak_scan_thread(void *arg) | |||
1069 | * Start the automatic memory scanning thread. This function must be called | 1288 | * Start the automatic memory scanning thread. This function must be called |
1070 | * with the scan_mutex held. | 1289 | * with the scan_mutex held. |
1071 | */ | 1290 | */ |
1072 | void start_scan_thread(void) | 1291 | static void start_scan_thread(void) |
1073 | { | 1292 | { |
1074 | if (scan_thread) | 1293 | if (scan_thread) |
1075 | return; | 1294 | return; |
@@ -1084,7 +1303,7 @@ void start_scan_thread(void) | |||
1084 | * Stop the automatic memory scanning thread. This function must be called | 1303 | * Stop the automatic memory scanning thread. This function must be called |
1085 | * with the scan_mutex held. | 1304 | * with the scan_mutex held. |
1086 | */ | 1305 | */ |
1087 | void stop_scan_thread(void) | 1306 | static void stop_scan_thread(void) |
1088 | { | 1307 | { |
1089 | if (scan_thread) { | 1308 | if (scan_thread) { |
1090 | kthread_stop(scan_thread); | 1309 | kthread_stop(scan_thread); |
@@ -1101,11 +1320,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) | |||
1101 | { | 1320 | { |
1102 | struct kmemleak_object *object; | 1321 | struct kmemleak_object *object; |
1103 | loff_t n = *pos; | 1322 | loff_t n = *pos; |
1323 | int err; | ||
1104 | 1324 | ||
1105 | if (!n) | 1325 | err = mutex_lock_interruptible(&scan_mutex); |
1106 | reported_leaks = 0; | 1326 | if (err < 0) |
1107 | if (reported_leaks >= REPORTS_NR) | 1327 | return ERR_PTR(err); |
1108 | return NULL; | ||
1109 | 1328 | ||
1110 | rcu_read_lock(); | 1329 | rcu_read_lock(); |
1111 | list_for_each_entry_rcu(object, &object_list, object_list) { | 1330 | list_for_each_entry_rcu(object, &object_list, object_list) { |
@@ -1116,7 +1335,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) | |||
1116 | } | 1335 | } |
1117 | object = NULL; | 1336 | object = NULL; |
1118 | out: | 1337 | out: |
1119 | rcu_read_unlock(); | ||
1120 | return object; | 1338 | return object; |
1121 | } | 1339 | } |
1122 | 1340 | ||
@@ -1131,17 +1349,13 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
1131 | struct list_head *n = &prev_obj->object_list; | 1349 | struct list_head *n = &prev_obj->object_list; |
1132 | 1350 | ||
1133 | ++(*pos); | 1351 | ++(*pos); |
1134 | if (reported_leaks >= REPORTS_NR) | ||
1135 | goto out; | ||
1136 | 1352 | ||
1137 | rcu_read_lock(); | ||
1138 | list_for_each_continue_rcu(n, &object_list) { | 1353 | list_for_each_continue_rcu(n, &object_list) { |
1139 | next_obj = list_entry(n, struct kmemleak_object, object_list); | 1354 | next_obj = list_entry(n, struct kmemleak_object, object_list); |
1140 | if (get_object(next_obj)) | 1355 | if (get_object(next_obj)) |
1141 | break; | 1356 | break; |
1142 | } | 1357 | } |
1143 | rcu_read_unlock(); | 1358 | |
1144 | out: | ||
1145 | put_object(prev_obj); | 1359 | put_object(prev_obj); |
1146 | return next_obj; | 1360 | return next_obj; |
1147 | } | 1361 | } |
@@ -1151,8 +1365,16 @@ out: | |||
1151 | */ | 1365 | */ |
1152 | static void kmemleak_seq_stop(struct seq_file *seq, void *v) | 1366 | static void kmemleak_seq_stop(struct seq_file *seq, void *v) |
1153 | { | 1367 | { |
1154 | if (v) | 1368 | if (!IS_ERR(v)) { |
1155 | put_object(v); | 1369 | /* |
1370 | * kmemleak_seq_start may return ERR_PTR if waiting for the | ||
1371 | * scan_mutex was interrupted, so only release it if !IS_ERR. | ||
1372 | */ | ||
1373 | rcu_read_unlock(); | ||
1374 | mutex_unlock(&scan_mutex); | ||
1375 | if (v) | ||
1376 | put_object(v); | ||
1377 | } | ||
1156 | } | 1378 | } |
1157 | 1379 | ||
1158 | /* | 1380 | /* |
@@ -1164,10 +1386,8 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v) | |||
1164 | unsigned long flags; | 1386 | unsigned long flags; |
1165 | 1387 | ||
1166 | spin_lock_irqsave(&object->lock, flags); | 1388 | spin_lock_irqsave(&object->lock, flags); |
1167 | if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) { | 1389 | if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) |
1168 | print_unreferenced(seq, object); | 1390 | print_unreferenced(seq, object); |
1169 | reported_leaks++; | ||
1170 | } | ||
1171 | spin_unlock_irqrestore(&object->lock, flags); | 1391 | spin_unlock_irqrestore(&object->lock, flags); |
1172 | return 0; | 1392 | return 0; |
1173 | } | 1393 | } |
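With the REPORTS_NR cap gone, the seq_file iterator now holds its locks across the whole traversal instead of retaking them per element: ->start acquires scan_mutex and rcu_read_lock, ->next runs entirely inside that window, and ->stop releases both, unless ->start failed and returned an ERR_PTR without taking anything. A sketch of the bracketing (demo_* and first_object() are illustrative names):

        static void *demo_start(struct seq_file *seq, loff_t *pos)
        {
                int err = mutex_lock_interruptible(&scan_mutex);

                if (err < 0)
                        return ERR_PTR(err);    /* interrupted: nothing held */
                rcu_read_lock();
                return first_object(*pos);
        }

        static void demo_stop(struct seq_file *seq, void *v)
        {
                if (IS_ERR(v))
                        return;         /* start failed: nothing to drop */
                rcu_read_unlock();
                mutex_unlock(&scan_mutex);
        }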
@@ -1181,36 +1401,58 @@ static const struct seq_operations kmemleak_seq_ops = { | |||
1181 | 1401 | ||
1182 | static int kmemleak_open(struct inode *inode, struct file *file) | 1402 | static int kmemleak_open(struct inode *inode, struct file *file) |
1183 | { | 1403 | { |
1184 | int ret = 0; | ||
1185 | |||
1186 | if (!atomic_read(&kmemleak_enabled)) | 1404 | if (!atomic_read(&kmemleak_enabled)) |
1187 | return -EBUSY; | 1405 | return -EBUSY; |
1188 | 1406 | ||
1189 | ret = mutex_lock_interruptible(&scan_mutex); | 1407 | return seq_open(file, &kmemleak_seq_ops); |
1190 | if (ret < 0) | ||
1191 | goto out; | ||
1192 | if (file->f_mode & FMODE_READ) { | ||
1193 | ret = seq_open(file, &kmemleak_seq_ops); | ||
1194 | if (ret < 0) | ||
1195 | goto scan_unlock; | ||
1196 | } | ||
1197 | return ret; | ||
1198 | |||
1199 | scan_unlock: | ||
1200 | mutex_unlock(&scan_mutex); | ||
1201 | out: | ||
1202 | return ret; | ||
1203 | } | 1408 | } |
1204 | 1409 | ||
1205 | static int kmemleak_release(struct inode *inode, struct file *file) | 1410 | static int kmemleak_release(struct inode *inode, struct file *file) |
1206 | { | 1411 | { |
1207 | int ret = 0; | 1412 | return seq_release(inode, file); |
1413 | } | ||
1208 | 1414 | ||
1209 | if (file->f_mode & FMODE_READ) | 1415 | static int dump_str_object_info(const char *str) |
1210 | seq_release(inode, file); | 1416 | { |
1211 | mutex_unlock(&scan_mutex); | 1417 | unsigned long flags; |
1418 | struct kmemleak_object *object; | ||
1419 | unsigned long addr; | ||
1212 | 1420 | ||
1213 | return ret; | 1421 | addr = simple_strtoul(str, NULL, 0); |
1422 | object = find_and_get_object(addr, 0); | ||
1423 | if (!object) { | ||
1424 | pr_info("Unknown object at 0x%08lx\n", addr); | ||
1425 | return -EINVAL; | ||
1426 | } | ||
1427 | |||
1428 | spin_lock_irqsave(&object->lock, flags); | ||
1429 | dump_object_info(object); | ||
1430 | spin_unlock_irqrestore(&object->lock, flags); | ||
1431 | |||
1432 | put_object(object); | ||
1433 | return 0; | ||
1434 | } | ||
1435 | |||
1436 | /* | ||
1437 | * We use grey instead of black to ensure we can do future scans on the same | ||
1438 | * objects. If we did not do future scans, these black objects could | ||
1439 | * potentially contain references to newly allocated objects in the future and | ||
1440 | * we'd end up with false positives. | ||
1441 | */ | ||
1442 | static void kmemleak_clear(void) | ||
1443 | { | ||
1444 | struct kmemleak_object *object; | ||
1445 | unsigned long flags; | ||
1446 | |||
1447 | rcu_read_lock(); | ||
1448 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
1449 | spin_lock_irqsave(&object->lock, flags); | ||
1450 | if ((object->flags & OBJECT_REPORTED) && | ||
1451 | unreferenced_object(object)) | ||
1452 | __paint_it(object, KMEMLEAK_GREY); | ||
1453 | spin_unlock_irqrestore(&object->lock, flags); | ||
1454 | } | ||
1455 | rcu_read_unlock(); | ||
1214 | } | 1456 | } |
1215 | 1457 | ||
1216 | /* | 1458 | /* |
@@ -1224,21 +1466,26 @@ static int kmemleak_release(struct inode *inode, struct file *file) | |||
1224 | * scan=... - set the automatic memory scanning period in seconds (0 to | 1466 | * scan=... - set the automatic memory scanning period in seconds (0 to |
1225 | * disable it) | 1467 | * disable it) |
1226 | * scan - trigger a memory scan | 1468 | * scan - trigger a memory scan |
1469 | * clear - mark all current reported unreferenced kmemleak objects as | ||
1470 | * grey to ignore printing them | ||
1471 | * dump=... - dump information about the object found at the given address | ||
1227 | */ | 1472 | */ |
1228 | static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, | 1473 | static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, |
1229 | size_t size, loff_t *ppos) | 1474 | size_t size, loff_t *ppos) |
1230 | { | 1475 | { |
1231 | char buf[64]; | 1476 | char buf[64]; |
1232 | int buf_size; | 1477 | int buf_size; |
1233 | 1478 | int ret; | |
1234 | if (!atomic_read(&kmemleak_enabled)) | ||
1235 | return -EBUSY; | ||
1236 | 1479 | ||
1237 | buf_size = min(size, (sizeof(buf) - 1)); | 1480 | buf_size = min(size, (sizeof(buf) - 1)); |
1238 | if (strncpy_from_user(buf, user_buf, buf_size) < 0) | 1481 | if (strncpy_from_user(buf, user_buf, buf_size) < 0) |
1239 | return -EFAULT; | 1482 | return -EFAULT; |
1240 | buf[buf_size] = 0; | 1483 | buf[buf_size] = 0; |
1241 | 1484 | ||
1485 | ret = mutex_lock_interruptible(&scan_mutex); | ||
1486 | if (ret < 0) | ||
1487 | return ret; | ||
1488 | |||
1242 | if (strncmp(buf, "off", 3) == 0) | 1489 | if (strncmp(buf, "off", 3) == 0) |
1243 | kmemleak_disable(); | 1490 | kmemleak_disable(); |
1244 | else if (strncmp(buf, "stack=on", 8) == 0) | 1491 | else if (strncmp(buf, "stack=on", 8) == 0) |
@@ -1251,11 +1498,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, | |||
1251 | stop_scan_thread(); | 1498 | stop_scan_thread(); |
1252 | else if (strncmp(buf, "scan=", 5) == 0) { | 1499 | else if (strncmp(buf, "scan=", 5) == 0) { |
1253 | unsigned long secs; | 1500 | unsigned long secs; |
1254 | int err; | ||
1255 | 1501 | ||
1256 | err = strict_strtoul(buf + 5, 0, &secs); | 1502 | ret = strict_strtoul(buf + 5, 0, &secs); |
1257 | if (err < 0) | 1503 | if (ret < 0) |
1258 | return err; | 1504 | goto out; |
1259 | stop_scan_thread(); | 1505 | stop_scan_thread(); |
1260 | if (secs) { | 1506 | if (secs) { |
1261 | jiffies_scan_wait = msecs_to_jiffies(secs * 1000); | 1507 | jiffies_scan_wait = msecs_to_jiffies(secs * 1000); |
@@ -1263,8 +1509,17 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, | |||
1263 | } | 1509 | } |
1264 | } else if (strncmp(buf, "scan", 4) == 0) | 1510 | } else if (strncmp(buf, "scan", 4) == 0) |
1265 | kmemleak_scan(); | 1511 | kmemleak_scan(); |
1512 | else if (strncmp(buf, "clear", 5) == 0) | ||
1513 | kmemleak_clear(); | ||
1514 | else if (strncmp(buf, "dump=", 5) == 0) | ||
1515 | ret = dump_str_object_info(buf + 5); | ||
1266 | else | 1516 | else |
1267 | return -EINVAL; | 1517 | ret = -EINVAL; |
1518 | |||
1519 | out: | ||
1520 | mutex_unlock(&scan_mutex); | ||
1521 | if (ret < 0) | ||
1522 | return ret; | ||
1268 | 1523 | ||
1269 | /* ignore the rest of the buffer, only one command at a time */ | 1524 | /* ignore the rest of the buffer, only one command at a time */ |
1270 | *ppos += size; | 1525 | *ppos += size; |
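Taken together, the command set can be driven from user space by writing short strings to the debugfs file. A usage sketch in C, assuming debugfs is mounted at the conventional /sys/kernel/debug; the command string is one of those listed above:

        #include <fcntl.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/sys/kernel/debug/kmemleak", O_WRONLY);

                if (fd < 0)
                        return 1;
                /* also accepted: "clear", "scan=600", "dump=<addr>", "off" */
                if (write(fd, "scan", 4) != 4) {
                        close(fd);
                        return 1;
                }
                close(fd);
                return 0;
        }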
@@ -1284,7 +1539,7 @@ static const struct file_operations kmemleak_fops = { | |||
1284 | * Perform the freeing of the kmemleak internal objects after waiting for any | 1539 | * Perform the freeing of the kmemleak internal objects after waiting for any |
1285 | * current memory scan to complete. | 1540 | * current memory scan to complete. |
1286 | */ | 1541 | */ |
1287 | static int kmemleak_cleanup_thread(void *arg) | 1542 | static void kmemleak_do_cleanup(struct work_struct *work) |
1288 | { | 1543 | { |
1289 | struct kmemleak_object *object; | 1544 | struct kmemleak_object *object; |
1290 | 1545 | ||
@@ -1293,25 +1548,12 @@ static int kmemleak_cleanup_thread(void *arg) | |||
1293 | 1548 | ||
1294 | rcu_read_lock(); | 1549 | rcu_read_lock(); |
1295 | list_for_each_entry_rcu(object, &object_list, object_list) | 1550 | list_for_each_entry_rcu(object, &object_list, object_list) |
1296 | delete_object(object->pointer); | 1551 | delete_object_full(object->pointer); |
1297 | rcu_read_unlock(); | 1552 | rcu_read_unlock(); |
1298 | mutex_unlock(&scan_mutex); | 1553 | mutex_unlock(&scan_mutex); |
1299 | |||
1300 | return 0; | ||
1301 | } | 1554 | } |
1302 | 1555 | ||
1303 | /* | 1556 | static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); |
1304 | * Start the clean-up thread. | ||
1305 | */ | ||
1306 | static void kmemleak_cleanup(void) | ||
1307 | { | ||
1308 | struct task_struct *cleanup_thread; | ||
1309 | |||
1310 | cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, | ||
1311 | "kmemleak-clean"); | ||
1312 | if (IS_ERR(cleanup_thread)) | ||
1313 | pr_warning("Failed to create the clean-up thread\n"); | ||
1314 | } | ||
1315 | 1557 | ||
1316 | /* | 1558 | /* |
1317 | * Disable kmemleak. No memory allocation/freeing will be traced once this | 1559 | * Disable kmemleak. No memory allocation/freeing will be traced once this |
@@ -1329,7 +1571,7 @@ static void kmemleak_disable(void) | |||
1329 | 1571 | ||
1330 | /* check whether it is too early for a kernel thread */ | 1572 | /* check whether it is too early for a kernel thread */ |
1331 | if (atomic_read(&kmemleak_initialized)) | 1573 | if (atomic_read(&kmemleak_initialized)) |
1332 | kmemleak_cleanup(); | 1574 | schedule_work(&cleanup_work); |
1333 | 1575 | ||
1334 | pr_info("Kernel memory leak detector disabled\n"); | 1576 | pr_info("Kernel memory leak detector disabled\n"); |
1335 | } | 1577 | } |
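Replacing the one-shot clean-up thread with a statically declared work item removes the kthread_run() failure path entirely: schedule_work() allocates nothing, and queueing an already-pending item is simply a no-op. The pattern in isolation:

        static void my_cleanup(struct work_struct *work)
        {
                /* runs once, later, in process context */
        }
        static DECLARE_WORK(cleanup_work, my_cleanup);

        /* was: kthread_run(...) plus IS_ERR() handling */
        schedule_work(&cleanup_work);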
@@ -1382,12 +1624,14 @@ void __init kmemleak_init(void) | |||
1382 | 1624 | ||
1383 | switch (log->op_type) { | 1625 | switch (log->op_type) { |
1384 | case KMEMLEAK_ALLOC: | 1626 | case KMEMLEAK_ALLOC: |
1385 | kmemleak_alloc(log->ptr, log->size, log->min_count, | 1627 | early_alloc(log); |
1386 | GFP_KERNEL); | ||
1387 | break; | 1628 | break; |
1388 | case KMEMLEAK_FREE: | 1629 | case KMEMLEAK_FREE: |
1389 | kmemleak_free(log->ptr); | 1630 | kmemleak_free(log->ptr); |
1390 | break; | 1631 | break; |
1632 | case KMEMLEAK_FREE_PART: | ||
1633 | kmemleak_free_part(log->ptr, log->size); | ||
1634 | break; | ||
1391 | case KMEMLEAK_NOT_LEAK: | 1635 | case KMEMLEAK_NOT_LEAK: |
1392 | kmemleak_not_leak(log->ptr); | 1636 | kmemleak_not_leak(log->ptr); |
1393 | break; | 1637 | break; |
@@ -1423,7 +1667,7 @@ static int __init kmemleak_late_init(void) | |||
1423 | * after setting kmemleak_initialized and we may end up with | 1667 | * after setting kmemleak_initialized and we may end up with |
1424 | * two clean-up threads but serialized by scan_mutex. | 1668 | * two clean-up threads but serialized by scan_mutex. |
1425 | */ | 1669 | */ |
1426 | kmemleak_cleanup(); | 1670 | schedule_work(&cleanup_work); |
1427 | return -ENOMEM; | 1671 | return -ENOMEM; |
1428 | } | 1672 | } |
1429 | 1673 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2fa20dadf40..fd4529d86de5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1207 | ret = 0; | 1207 | ret = 0; |
1208 | out: | 1208 | out: |
1209 | unlock_page_cgroup(pc); | 1209 | unlock_page_cgroup(pc); |
1210 | /* | ||
1211 | * We charge against "to", which may not have any tasks. Then, "to" | ||
1212 | * can be under rmdir(). But in the current implementation, the only | ||
1213 | * caller of this function is force_empty() and it is guaranteed that | ||
1214 | * "to" is never removed. So, we don't check the rmdir status here. | ||
1215 | */ | ||
1210 | return ret; | 1216 | return ret; |
1211 | } | 1217 | } |
1212 | 1218 | ||
@@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
1428 | return; | 1434 | return; |
1429 | if (!ptr) | 1435 | if (!ptr) |
1430 | return; | 1436 | return; |
1437 | cgroup_exclude_rmdir(&ptr->css); | ||
1431 | pc = lookup_page_cgroup(page); | 1438 | pc = lookup_page_cgroup(page); |
1432 | mem_cgroup_lru_del_before_commit_swapcache(page); | 1439 | mem_cgroup_lru_del_before_commit_swapcache(page); |
1433 | __mem_cgroup_commit_charge(ptr, pc, ctype); | 1440 | __mem_cgroup_commit_charge(ptr, pc, ctype); |
@@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
1457 | } | 1464 | } |
1458 | rcu_read_unlock(); | 1465 | rcu_read_unlock(); |
1459 | } | 1466 | } |
1460 | /* add this page(page_cgroup) to the LRU we want. */ | 1467 | /* |
1461 | 1468 | * At swapin, we may charge against a cgroup which has no tasks. | |
1469 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
1470 | * In that case, we need to call pre_destroy() again. Check it here. | ||
1471 | */ | ||
1472 | cgroup_release_and_wakeup_rmdir(&ptr->css); | ||
1462 | } | 1473 | } |
1463 | 1474 | ||
1464 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | 1475 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) |
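The exclude/release pair brackets a window in which a charge may land on a cgroup that has no tasks: cgroup_exclude_rmdir() holds off rmdir()->pre_destroy() for the duration, and cgroup_release_and_wakeup_rmdir() lets a waiting rmdir() retry once the charge is committed. The shape of the bracket:

        cgroup_exclude_rmdir(&mem->css);        /* block rmdir()->pre_destroy() */
        /* ... commit the charge; the group may be task-less ... */
        cgroup_release_and_wakeup_rmdir(&mem->css);     /* wake a waiting rmdir() */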
@@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
1664 | 1675 | ||
1665 | if (!mem) | 1676 | if (!mem) |
1666 | return; | 1677 | return; |
1667 | 1678 | cgroup_exclude_rmdir(&mem->css); | |
1668 | /* at migration success, oldpage->mapping is NULL. */ | 1679 | /* at migration success, oldpage->mapping is NULL. */ |
1669 | if (oldpage->mapping) { | 1680 | if (oldpage->mapping) { |
1670 | target = oldpage; | 1681 | target = oldpage; |
@@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
1704 | */ | 1715 | */ |
1705 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 1716 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
1706 | mem_cgroup_uncharge_page(target); | 1717 | mem_cgroup_uncharge_page(target); |
1718 | /* | ||
1719 | * At migration, we may charge against a cgroup which has no tasks. | ||
1720 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
1721 | * In that case, we need to call pre_destroy() again. Check it here. | ||
1722 | */ | ||
1723 | cgroup_release_and_wakeup_rmdir(&mem->css); | ||
1707 | } | 1724 | } |
1708 | 1725 | ||
1709 | /* | 1726 | /* |
@@ -1973,7 +1990,7 @@ try_to_free: | |||
1973 | if (!progress) { | 1990 | if (!progress) { |
1974 | nr_retries--; | 1991 | nr_retries--; |
1975 | /* maybe some writeback is necessary */ | 1992 | /* maybe some writeback is necessary */ |
1976 | congestion_wait(WRITE, HZ/10); | 1993 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1977 | } | 1994 | } |
1978 | 1995 | ||
1979 | } | 1996 | } |
diff --git a/mm/memory.c b/mm/memory.c index 65216194eb8d..aede2ce3aba4 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd) | |||
135 | * Note: this doesn't free the actual pages themselves. That | 135 | * Note: this doesn't free the actual pages themselves. That |
136 | * has been handled earlier when unmapping all the memory regions. | 136 | * has been handled earlier when unmapping all the memory regions. |
137 | */ | 137 | */ |
138 | static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) | 138 | static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, |
139 | unsigned long addr) | ||
139 | { | 140 | { |
140 | pgtable_t token = pmd_pgtable(*pmd); | 141 | pgtable_t token = pmd_pgtable(*pmd); |
141 | pmd_clear(pmd); | 142 | pmd_clear(pmd); |
142 | pte_free_tlb(tlb, token); | 143 | pte_free_tlb(tlb, token, addr); |
143 | tlb->mm->nr_ptes--; | 144 | tlb->mm->nr_ptes--; |
144 | } | 145 | } |
145 | 146 | ||
@@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
157 | next = pmd_addr_end(addr, end); | 158 | next = pmd_addr_end(addr, end); |
158 | if (pmd_none_or_clear_bad(pmd)) | 159 | if (pmd_none_or_clear_bad(pmd)) |
159 | continue; | 160 | continue; |
160 | free_pte_range(tlb, pmd); | 161 | free_pte_range(tlb, pmd, addr); |
161 | } while (pmd++, addr = next, addr != end); | 162 | } while (pmd++, addr = next, addr != end); |
162 | 163 | ||
163 | start &= PUD_MASK; | 164 | start &= PUD_MASK; |
@@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
173 | 174 | ||
174 | pmd = pmd_offset(pud, start); | 175 | pmd = pmd_offset(pud, start); |
175 | pud_clear(pud); | 176 | pud_clear(pud); |
176 | pmd_free_tlb(tlb, pmd); | 177 | pmd_free_tlb(tlb, pmd, start); |
177 | } | 178 | } |
178 | 179 | ||
179 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 180 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, |
@@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
206 | 207 | ||
207 | pud = pud_offset(pgd, start); | 208 | pud = pud_offset(pgd, start); |
208 | pgd_clear(pgd); | 209 | pgd_clear(pgd); |
209 | pud_free_tlb(tlb, pud); | 210 | pud_free_tlb(tlb, pud, start); |
210 | } | 211 | } |
211 | 212 | ||
212 | /* | 213 | /* |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08e2c4da63a..7dd9d9f80694 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) | |||
191 | * Must be called holding task's alloc_lock to protect task's mems_allowed | 191 | * Must be called holding task's alloc_lock to protect task's mems_allowed |
192 | * and mempolicy. May also be called holding the mmap_semaphore for write. | 192 | * and mempolicy. May also be called holding the mmap_semaphore for write. |
193 | */ | 193 | */ |
194 | static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) | 194 | static int mpol_set_nodemask(struct mempolicy *pol, |
195 | const nodemask_t *nodes, struct nodemask_scratch *nsc) | ||
195 | { | 196 | { |
196 | nodemask_t cpuset_context_nmask; | ||
197 | int ret; | 197 | int ret; |
198 | 198 | ||
199 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | 199 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ |
200 | if (pol == NULL) | 200 | if (pol == NULL) |
201 | return 0; | 201 | return 0; |
202 | /* Check N_HIGH_MEMORY */ | ||
203 | nodes_and(nsc->mask1, | ||
204 | cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); | ||
202 | 205 | ||
203 | VM_BUG_ON(!nodes); | 206 | VM_BUG_ON(!nodes); |
204 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | 207 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) |
205 | nodes = NULL; /* explicit local allocation */ | 208 | nodes = NULL; /* explicit local allocation */ |
206 | else { | 209 | else { |
207 | if (pol->flags & MPOL_F_RELATIVE_NODES) | 210 | if (pol->flags & MPOL_F_RELATIVE_NODES) |
208 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | 211 | mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); |
209 | &cpuset_current_mems_allowed); | ||
210 | else | 212 | else |
211 | nodes_and(cpuset_context_nmask, *nodes, | 213 | nodes_and(nsc->mask2, *nodes, nsc->mask1); |
212 | cpuset_current_mems_allowed); | 214 | |
213 | if (mpol_store_user_nodemask(pol)) | 215 | if (mpol_store_user_nodemask(pol)) |
214 | pol->w.user_nodemask = *nodes; | 216 | pol->w.user_nodemask = *nodes; |
215 | else | 217 | else |
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) | |||
217 | cpuset_current_mems_allowed; | 219 | cpuset_current_mems_allowed; |
218 | } | 220 | } |
219 | 221 | ||
220 | ret = mpol_ops[pol->mode].create(pol, | 222 | if (nodes) |
221 | nodes ? &cpuset_context_nmask : NULL); | 223 | ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); |
224 | else | ||
225 | ret = mpol_ops[pol->mode].create(pol, NULL); | ||
222 | return ret; | 226 | return ret; |
223 | } | 227 | } |
224 | 228 | ||
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
620 | { | 624 | { |
621 | struct mempolicy *new, *old; | 625 | struct mempolicy *new, *old; |
622 | struct mm_struct *mm = current->mm; | 626 | struct mm_struct *mm = current->mm; |
627 | NODEMASK_SCRATCH(scratch); | ||
623 | int ret; | 628 | int ret; |
624 | 629 | ||
625 | new = mpol_new(mode, flags, nodes); | 630 | if (!scratch) |
626 | if (IS_ERR(new)) | 631 | return -ENOMEM; |
627 | return PTR_ERR(new); | ||
628 | 632 | ||
633 | new = mpol_new(mode, flags, nodes); | ||
634 | if (IS_ERR(new)) { | ||
635 | ret = PTR_ERR(new); | ||
636 | goto out; | ||
637 | } | ||
629 | /* | 638 | /* |
630 | * prevent changing our mempolicy while show_numa_maps() | 639 | * prevent changing our mempolicy while show_numa_maps() |
631 | * is using it. | 640 | * is using it. |
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
635 | if (mm) | 644 | if (mm) |
636 | down_write(&mm->mmap_sem); | 645 | down_write(&mm->mmap_sem); |
637 | task_lock(current); | 646 | task_lock(current); |
638 | ret = mpol_set_nodemask(new, nodes); | 647 | ret = mpol_set_nodemask(new, nodes, scratch); |
639 | if (ret) { | 648 | if (ret) { |
640 | task_unlock(current); | 649 | task_unlock(current); |
641 | if (mm) | 650 | if (mm) |
642 | up_write(&mm->mmap_sem); | 651 | up_write(&mm->mmap_sem); |
643 | mpol_put(new); | 652 | mpol_put(new); |
644 | return ret; | 653 | goto out; |
645 | } | 654 | } |
646 | old = current->mempolicy; | 655 | old = current->mempolicy; |
647 | current->mempolicy = new; | 656 | current->mempolicy = new; |
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
654 | up_write(&mm->mmap_sem); | 663 | up_write(&mm->mmap_sem); |
655 | 664 | ||
656 | mpol_put(old); | 665 | mpol_put(old); |
657 | return 0; | 666 | ret = 0; |
667 | out: | ||
668 | NODEMASK_SCRATCH_FREE(scratch); | ||
669 | return ret; | ||
658 | } | 670 | } |
659 | 671 | ||
660 | /* | 672 | /* |
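NODEMASK_SCRATCH moves the two working nodemasks off the stack; on configurations with many nodes it becomes a kmalloc() that can fail, which is why every converted caller grows a NULL check and a paired free. The calling convention each site now follows:

        NODEMASK_SCRATCH(scratch);      /* may be NULL: the allocation can fail */

        if (!scratch)
                return -ENOMEM;
        /* ... */
        err = mpol_set_nodemask(new, nodes, scratch);
        /* ... */
        NODEMASK_SCRATCH_FREE(scratch); /* always pair the free */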
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1014 | if (err) | 1026 | if (err) |
1015 | return err; | 1027 | return err; |
1016 | } | 1028 | } |
1017 | down_write(&mm->mmap_sem); | 1029 | { |
1018 | task_lock(current); | 1030 | NODEMASK_SCRATCH(scratch); |
1019 | err = mpol_set_nodemask(new, nmask); | 1031 | if (scratch) { |
1020 | task_unlock(current); | 1032 | down_write(&mm->mmap_sem); |
1033 | task_lock(current); | ||
1034 | err = mpol_set_nodemask(new, nmask, scratch); | ||
1035 | task_unlock(current); | ||
1036 | if (err) | ||
1037 | up_write(&mm->mmap_sem); | ||
1038 | } else | ||
1039 | err = -ENOMEM; | ||
1040 | NODEMASK_SCRATCH_FREE(scratch); | ||
1041 | } | ||
1021 | if (err) { | 1042 | if (err) { |
1022 | up_write(&mm->mmap_sem); | ||
1023 | mpol_put(new); | 1043 | mpol_put(new); |
1024 | return err; | 1044 | return err; |
1025 | } | 1045 | } |
@@ -1891,6 +1911,7 @@ restart: | |||
1891 | * Install non-NULL @mpol in inode's shared policy rb-tree. | 1911 | * Install non-NULL @mpol in inode's shared policy rb-tree. |
1892 | * On entry, the current task has a reference on a non-NULL @mpol. | 1912 | * On entry, the current task has a reference on a non-NULL @mpol. |
1893 | * This must be released on exit. | 1913 | * This must be released on exit. |
1914 | * This is called at get_inode() calls and we can use GFP_KERNEL. | ||
1894 | */ | 1915 | */ |
1895 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | 1916 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) |
1896 | { | 1917 | { |
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
1902 | if (mpol) { | 1923 | if (mpol) { |
1903 | struct vm_area_struct pvma; | 1924 | struct vm_area_struct pvma; |
1904 | struct mempolicy *new; | 1925 | struct mempolicy *new; |
1926 | NODEMASK_SCRATCH(scratch); | ||
1905 | 1927 | ||
1928 | if (!scratch) | ||
1929 | return; | ||
1906 | /* contextualize the tmpfs mount point mempolicy */ | 1930 | /* contextualize the tmpfs mount point mempolicy */ |
1907 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); | 1931 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); |
1908 | if (IS_ERR(new)) { | 1932 | if (IS_ERR(new)) { |
1909 | mpol_put(mpol); /* drop our ref on sb mpol */ | 1933 | mpol_put(mpol); /* drop our ref on sb mpol */ |
1934 | NODEMASK_SCRATCH_FREE(scratch); | ||
1910 | return; /* no valid nodemask intersection */ | 1935 | return; /* no valid nodemask intersection */ |
1911 | } | 1936 | } |
1912 | 1937 | ||
1913 | task_lock(current); | 1938 | task_lock(current); |
1914 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); | 1939 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); |
1915 | task_unlock(current); | 1940 | task_unlock(current); |
1916 | mpol_put(mpol); /* drop our ref on sb mpol */ | 1941 | mpol_put(mpol); /* drop our ref on sb mpol */ |
1917 | if (ret) { | 1942 | if (ret) { |
1943 | NODEMASK_SCRATCH_FREE(scratch); | ||
1918 | mpol_put(new); | 1944 | mpol_put(new); |
1919 | return; | 1945 | return; |
1920 | } | 1946 | } |
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
1924 | pvma.vm_end = TASK_SIZE; /* policy covers entire file */ | 1950 | pvma.vm_end = TASK_SIZE; /* policy covers entire file */ |
1925 | mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ | 1951 | mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ |
1926 | mpol_put(new); /* drop initial ref */ | 1952 | mpol_put(new); /* drop initial ref */ |
1953 | NODEMASK_SCRATCH_FREE(scratch); | ||
1927 | } | 1954 | } |
1928 | } | 1955 | } |
1929 | 1956 | ||
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2140 | err = 1; | 2167 | err = 1; |
2141 | else { | 2168 | else { |
2142 | int ret; | 2169 | int ret; |
2143 | 2170 | NODEMASK_SCRATCH(scratch); | |
2144 | task_lock(current); | 2171 | if (scratch) { |
2145 | ret = mpol_set_nodemask(new, &nodes); | 2172 | task_lock(current); |
2146 | task_unlock(current); | 2173 | ret = mpol_set_nodemask(new, &nodes, scratch); |
2147 | if (ret) | 2174 | task_unlock(current); |
2175 | } else | ||
2176 | ret = -ENOMEM; | ||
2177 | NODEMASK_SCRATCH_FREE(scratch); | ||
2178 | if (ret) { | ||
2148 | err = 1; | 2179 | err = 1; |
2149 | else if (no_context) { | 2180 | mpol_put(new); |
2181 | } else if (no_context) { | ||
2150 | /* save for contextualization */ | 2182 | /* save for contextualization */ |
2151 | new->w.user_nodemask = nodes; | 2183 | new->w.user_nodemask = nodes; |
2152 | } | 2184 | } |
diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b4bb66..32e75d400503 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab); | |||
303 | */ | 303 | */ |
304 | void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) | 304 | void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) |
305 | { | 305 | { |
306 | size_t size = (size_t)(long)pool_data; | 306 | size_t size = (size_t)pool_data; |
307 | return kmalloc(size, gfp_mask); | 307 | return kmalloc(size, gfp_mask); |
308 | } | 308 | } |
309 | EXPORT_SYMBOL(mempool_kmalloc); | 309 | EXPORT_SYMBOL(mempool_kmalloc); |
310 | 310 | ||
311 | void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) | 311 | void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) |
312 | { | 312 | { |
313 | size_t size = (size_t) pool_data; | 313 | size_t size = (size_t)pool_data; |
314 | return kzalloc(size, gfp_mask); | 314 | return kzalloc(size, gfp_mask); |
315 | } | 315 | } |
316 | EXPORT_SYMBOL(mempool_kzalloc); | 316 | EXPORT_SYMBOL(mempool_kzalloc); |
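Both helpers smuggle the element size through the opaque pool_data argument, which is what the normalized (size_t) casts above decode. A usage sketch of the convention; the pool sizing is illustrative:

        /* a pool of at least 16 elements, each a 256-byte kmalloc() */
        mempool_t *pool = mempool_create(16, mempool_kmalloc, mempool_kfree,
                                         (void *)(size_t)256);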
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ | |||
88 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 88 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
89 | struct percpu_counter vm_committed_as; | 89 | struct percpu_counter vm_committed_as; |
90 | 90 | ||
91 | /* amount of vm to protect from userspace access */ | ||
92 | unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; | ||
93 | |||
94 | /* | 91 | /* |
95 | * Check that a process has enough memory to allocate a new virtual | 92 | * Check that a process has enough memory to allocate a new virtual |
96 | * mapping. 0 means there is enough memory for the allocation to | 93 | * mapping. 0 means there is enough memory for the allocation to |
diff --git a/mm/nommu.c b/mm/nommu.c index 53cab10fece4..66e81e7e9fe9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | |||
69 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 69 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
70 | int heap_stack_gap = 0; | 70 | int heap_stack_gap = 0; |
71 | 71 | ||
72 | /* amount of vm to protect from userspace access */ | ||
73 | unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; | ||
74 | |||
75 | atomic_long_t mmap_pages_allocated; | 72 | atomic_long_t mmap_pages_allocated; |
76 | 73 | ||
77 | EXPORT_SYMBOL(mem_map); | 74 | EXPORT_SYMBOL(mem_map); |
@@ -922,6 +919,10 @@ static int validate_mmap_request(struct file *file, | |||
922 | if (!file->f_op->read) | 919 | if (!file->f_op->read) |
923 | capabilities &= ~BDI_CAP_MAP_COPY; | 920 | capabilities &= ~BDI_CAP_MAP_COPY; |
924 | 921 | ||
922 | /* The file shall have been opened with read permission. */ | ||
923 | if (!(file->f_mode & FMODE_READ)) | ||
924 | return -EACCES; | ||
925 | |||
925 | if (flags & MAP_SHARED) { | 926 | if (flags & MAP_SHARED) { |
926 | /* do checks for writing, appending and locking */ | 927 | /* do checks for writing, appending and locking */ |
927 | if ((prot & PROT_WRITE) && | 928 | if ((prot & PROT_WRITE) && |
@@ -1351,6 +1352,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1351 | } | 1352 | } |
1352 | 1353 | ||
1353 | vma->vm_region = region; | 1354 | vma->vm_region = region; |
1355 | add_nommu_region(region); | ||
1354 | 1356 | ||
1355 | /* set up the mapping */ | 1357 | /* set up the mapping */ |
1356 | if (file && vma->vm_flags & VM_SHARED) | 1358 | if (file && vma->vm_flags & VM_SHARED) |
@@ -1360,8 +1362,6 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1360 | if (ret < 0) | 1362 | if (ret < 0) |
1361 | goto error_put_region; | 1363 | goto error_put_region; |
1362 | 1364 | ||
1363 | add_nommu_region(region); | ||
1364 | |||
1365 | /* okay... we have a mapping; now we have to register it */ | 1365 | /* okay... we have a mapping; now we have to register it */ |
1366 | result = vma->vm_start; | 1366 | result = vma->vm_start; |
1367 | 1367 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a99..a7b2460e922b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
58 | unsigned long points, cpu_time, run_time; | 58 | unsigned long points, cpu_time, run_time; |
59 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
60 | struct task_struct *child; | 60 | struct task_struct *child; |
61 | int oom_adj; | ||
62 | 61 | ||
63 | task_lock(p); | 62 | task_lock(p); |
64 | mm = p->mm; | 63 | mm = p->mm; |
@@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
66 | task_unlock(p); | 65 | task_unlock(p); |
67 | return 0; | 66 | return 0; |
68 | } | 67 | } |
69 | oom_adj = mm->oom_adj; | ||
70 | if (oom_adj == OOM_DISABLE) { | ||
71 | task_unlock(p); | ||
72 | return 0; | ||
73 | } | ||
74 | 68 | ||
75 | /* | 69 | /* |
76 | * The memory size of the process is the basis for the badness. | 70 | * The memory size of the process is the basis for the badness. |
@@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
154 | points /= 8; | 148 | points /= 8; |
155 | 149 | ||
156 | /* | 150 | /* |
157 | * Adjust the score by oom_adj. | 151 | * Adjust the score by oomkilladj. |
158 | */ | 152 | */ |
159 | if (oom_adj) { | 153 | if (p->oomkilladj) { |
160 | if (oom_adj > 0) { | 154 | if (p->oomkilladj > 0) { |
161 | if (!points) | 155 | if (!points) |
162 | points = 1; | 156 | points = 1; |
163 | points <<= oom_adj; | 157 | points <<= p->oomkilladj; |
164 | } else | 158 | } else |
165 | points >>= -(oom_adj); | 159 | points >>= -(p->oomkilladj); |
166 | } | 160 | } |
167 | 161 | ||
168 | #ifdef DEBUG | 162 | #ifdef DEBUG |
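Since the adjustment is a plain bit shift, each step of oomkilladj doubles or halves the badness score. Worked through for a base score of 1000:

        unsigned long points = 1000;

        if (p->oomkilladj > 0)
                points <<= p->oomkilladj;       /* +2 -> 4000: 4x more likely */
        else if (p->oomkilladj < 0)
                points >>= -(p->oomkilladj);    /* -3 ->  125: 8x less likely */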
@@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
257 | *ppoints = ULONG_MAX; | 251 | *ppoints = ULONG_MAX; |
258 | } | 252 | } |
259 | 253 | ||
254 | if (p->oomkilladj == OOM_DISABLE) | ||
255 | continue; | ||
256 | |||
260 | points = badness(p, uptime.tv_sec); | 257 | points = badness(p, uptime.tv_sec); |
261 | if (points > *ppoints) { | 258 | if (points > *ppoints || !chosen) { |
262 | chosen = p; | 259 | chosen = p; |
263 | *ppoints = points; | 260 | *ppoints = points; |
264 | } | 261 | } |
@@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
307 | } | 304 | } |
308 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 305 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
309 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, | 306 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
310 | get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); | 307 | get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, |
308 | p->comm); | ||
311 | task_unlock(p); | 309 | task_unlock(p); |
312 | } while_each_thread(g, p); | 310 | } while_each_thread(g, p); |
313 | } | 311 | } |
@@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
325 | return; | 323 | return; |
326 | } | 324 | } |
327 | 325 | ||
328 | if (!p->mm) | 326 | if (!p->mm) { |
327 | WARN_ON(1); | ||
328 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | ||
329 | return; | 329 | return; |
330 | } | ||
330 | 331 | ||
331 | if (verbose) | 332 | if (verbose) |
332 | printk(KERN_ERR "Killed process %d (%s)\n", | 333 | printk(KERN_ERR "Killed process %d (%s)\n", |
@@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p) | |||
348 | struct mm_struct *mm; | 349 | struct mm_struct *mm; |
349 | struct task_struct *g, *q; | 350 | struct task_struct *g, *q; |
350 | 351 | ||
351 | task_lock(p); | ||
352 | mm = p->mm; | 352 | mm = p->mm; |
353 | if (!mm || mm->oom_adj == OOM_DISABLE) { | 353 | |
354 | task_unlock(p); | 354 | /* WARNING: mm may not be dereferenced since we did not obtain its |
355 | * value from get_task_mm(p). This is OK since all we need to do is | ||
356 | * compare mm to q->mm below. | ||
357 | * | ||
358 | * Furthermore, even if mm contains a non-NULL value, p->mm may | ||
359 | * change to NULL at any time since we do not hold task_lock(p). | ||
360 | * However, this is of no concern to us. | ||
361 | */ | ||
362 | |||
363 | if (mm == NULL) | ||
355 | return 1; | 364 | return 1; |
356 | } | 365 | |
357 | task_unlock(p); | 366 | /* |
367 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
368 | */ | ||
369 | do_each_thread(g, q) { | ||
370 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | ||
371 | return 1; | ||
372 | } while_each_thread(g, q); | ||
373 | |||
358 | __oom_kill_task(p, 1); | 374 | __oom_kill_task(p, 1); |
359 | 375 | ||
360 | /* | 376 | /* |
@@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
377 | struct task_struct *c; | 393 | struct task_struct *c; |
378 | 394 | ||
379 | if (printk_ratelimit()) { | 395 | if (printk_ratelimit()) { |
380 | task_lock(current); | ||
381 | printk(KERN_WARNING "%s invoked oom-killer: " | 396 | printk(KERN_WARNING "%s invoked oom-killer: " |
382 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | 397 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", |
383 | current->comm, gfp_mask, order, | 398 | current->comm, gfp_mask, order, current->oomkilladj); |
384 | current->mm ? current->mm->oom_adj : OOM_DISABLE); | 399 | task_lock(current); |
385 | cpuset_print_task_mems_allowed(current); | 400 | cpuset_print_task_mems_allowed(current); |
386 | task_unlock(current); | 401 | task_unlock(current); |
387 | dump_stack(); | 402 | dump_stack(); |
@@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
394 | /* | 409 | /* |
395 | * If the task is already exiting, don't alarm the sysadmin or kill | 410 | * If the task is already exiting, don't alarm the sysadmin or kill |
396 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 411 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
397 | * if its mm is still attached. | ||
398 | */ | 412 | */ |
399 | if (p->mm && (p->flags & PF_EXITING)) { | 413 | if (p->flags & PF_EXITING) { |
400 | __oom_kill_task(p, 0); | 414 | __oom_kill_task(p, 0); |
401 | return 0; | 415 | return 0; |
402 | } | 416 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7687879253b9..25e7770309b8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -36,15 +36,6 @@ | |||
36 | #include <linux/pagevec.h> | 36 | #include <linux/pagevec.h> |
37 | 37 | ||
38 | /* | 38 | /* |
39 | * The maximum number of pages to writeout in a single bdflush/kupdate | ||
40 | * operation. We do this so we don't hold I_SYNC against an inode for | ||
41 | * enormous amounts of time, which would block a userspace task which has | ||
42 | * been forced to throttle against that inode. Also, the code reevaluates | ||
44 | * the dirty limits each time it has written this many pages. | ||
44 | */ | ||
45 | #define MAX_WRITEBACK_PAGES 1024 | ||
46 | |||
47 | /* | ||
48 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 39 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
49 | * will look to see if it needs to force writeback or throttling. | 40 | * will look to see if it needs to force writeback or throttling. |
50 | */ | 41 | */ |
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode); | |||
117 | /* End of sysctl-exported parameters */ | 108 | /* End of sysctl-exported parameters */ |
118 | 109 | ||
119 | 110 | ||
120 | static void background_writeout(unsigned long _min_pages); | ||
121 | |||
122 | /* | 111 | /* |
123 | * Scale the writeback cache size proportional to the relative writeout speeds. | 112 | * Scale the writeback cache size proportional to the relative writeout speeds. |
124 | * | 113 | * |
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) | |||
320 | /* | 309 | /* |
321 | * | 310 | * |
322 | */ | 311 | */ |
323 | static DEFINE_SPINLOCK(bdi_lock); | ||
324 | static unsigned int bdi_min_ratio; | 312 | static unsigned int bdi_min_ratio; |
325 | 313 | ||
326 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) | 314 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) |
327 | { | 315 | { |
328 | int ret = 0; | 316 | int ret = 0; |
329 | unsigned long flags; | ||
330 | 317 | ||
331 | spin_lock_irqsave(&bdi_lock, flags); | 318 | spin_lock(&bdi_lock); |
332 | if (min_ratio > bdi->max_ratio) { | 319 | if (min_ratio > bdi->max_ratio) { |
333 | ret = -EINVAL; | 320 | ret = -EINVAL; |
334 | } else { | 321 | } else { |
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) | |||
340 | ret = -EINVAL; | 327 | ret = -EINVAL; |
341 | } | 328 | } |
342 | } | 329 | } |
343 | spin_unlock_irqrestore(&bdi_lock, flags); | 330 | spin_unlock(&bdi_lock); |
344 | 331 | ||
345 | return ret; | 332 | return ret; |
346 | } | 333 | } |
347 | 334 | ||
348 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) | 335 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) |
349 | { | 336 | { |
350 | unsigned long flags; | ||
351 | int ret = 0; | 337 | int ret = 0; |
352 | 338 | ||
353 | if (max_ratio > 100) | 339 | if (max_ratio > 100) |
354 | return -EINVAL; | 340 | return -EINVAL; |
355 | 341 | ||
356 | spin_lock_irqsave(&bdi_lock, flags); | 342 | spin_lock(&bdi_lock); |
357 | if (bdi->min_ratio > max_ratio) { | 343 | if (bdi->min_ratio > max_ratio) { |
358 | ret = -EINVAL; | 344 | ret = -EINVAL; |
359 | } else { | 345 | } else { |
360 | bdi->max_ratio = max_ratio; | 346 | bdi->max_ratio = max_ratio; |
361 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; | 347 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; |
362 | } | 348 | } |
363 | spin_unlock_irqrestore(&bdi_lock, flags); | 349 | spin_unlock(&bdi_lock); |
364 | 350 | ||
365 | return ret; | 351 | return ret; |
366 | } | 352 | } |
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
546 | * up. | 532 | * up. |
547 | */ | 533 | */ |
548 | if (bdi_nr_reclaimable > bdi_thresh) { | 534 | if (bdi_nr_reclaimable > bdi_thresh) { |
549 | writeback_inodes(&wbc); | 535 | writeback_inodes_wbc(&wbc); |
550 | pages_written += write_chunk - wbc.nr_to_write; | 536 | pages_written += write_chunk - wbc.nr_to_write; |
551 | get_dirty_limits(&background_thresh, &dirty_thresh, | 537 | get_dirty_limits(&background_thresh, &dirty_thresh, |
552 | &bdi_thresh, bdi); | 538 | &bdi_thresh, bdi); |
@@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
575 | if (pages_written >= write_chunk) | 561 | if (pages_written >= write_chunk) |
576 | break; /* We've done our duty */ | 562 | break; /* We've done our duty */ |
577 | 563 | ||
578 | congestion_wait(WRITE, HZ/10); | 564 | schedule_timeout(1); |
579 | } | 565 | } |
580 | 566 | ||
581 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && | 567 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && |
@@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
594 | * background_thresh, to keep the amount of dirty memory low. | 580 | * background_thresh, to keep the amount of dirty memory low. |
595 | */ | 581 | */ |
596 | if ((laptop_mode && pages_written) || | 582 | if ((laptop_mode && pages_written) || |
597 | (!laptop_mode && (global_page_state(NR_FILE_DIRTY) | 583 | (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) |
598 | + global_page_state(NR_UNSTABLE_NFS) | 584 | + global_page_state(NR_UNSTABLE_NFS)) |
599 | > background_thresh))) | 585 | > background_thresh))) { |
600 | pdflush_operation(background_writeout, 0); | 586 | struct writeback_control wbc = { |
587 | .bdi = bdi, | ||
588 | .sync_mode = WB_SYNC_NONE, | ||
589 | .nr_to_write = nr_writeback, | ||
590 | }; | ||
591 | |||
592 | |||
593 | bdi_start_writeback(&wbc); | ||
594 | } | ||
601 | } | 595 | } |
602 | 596 | ||
603 | void set_page_dirty_balance(struct page *page, int page_mkwrite) | 597 | void set_page_dirty_balance(struct page *page, int page_mkwrite) |
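Instead of handing background writeout to the shared pdflush pool, the caller now targets its own backing device: it fills in a writeback_control naming the bdi and passes it to bdi_start_writeback(), which wakes that device's flusher. The kick in outline, where nr_dirty stands for the dirty + unstable total computed above:

        struct writeback_control wbc = {
                .bdi            = bdi,          /* this device only */
                .sync_mode      = WB_SYNC_NONE, /* don't wait on pages */
                .nr_to_write    = nr_dirty,
        };

        bdi_start_writeback(&wbc);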
@@ -669,7 +663,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
669 | if (global_page_state(NR_UNSTABLE_NFS) + | 663 | if (global_page_state(NR_UNSTABLE_NFS) + |
670 | global_page_state(NR_WRITEBACK) <= dirty_thresh) | 664 | global_page_state(NR_WRITEBACK) <= dirty_thresh) |
671 | break; | 665 | break; |
672 | congestion_wait(WRITE, HZ/10); | 666 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
673 | 667 | ||
674 | /* | 668 | /* |
675 | * The caller might hold locks which can prevent IO completion | 669 | * The caller might hold locks which can prevent IO completion |
@@ -681,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
681 | } | 675 | } |
682 | } | 676 | } |
683 | 677 | ||
684 | /* | ||
685 | * writeback at least _min_pages, and keep writing until the amount of dirty | ||
686 | * memory is less than the background threshold, or until we're all clean. | ||
687 | */ | ||
688 | static void background_writeout(unsigned long _min_pages) | ||
689 | { | ||
690 | long min_pages = _min_pages; | ||
691 | struct writeback_control wbc = { | ||
692 | .bdi = NULL, | ||
693 | .sync_mode = WB_SYNC_NONE, | ||
694 | .older_than_this = NULL, | ||
695 | .nr_to_write = 0, | ||
696 | .nonblocking = 1, | ||
697 | .range_cyclic = 1, | ||
698 | }; | ||
699 | |||
700 | for ( ; ; ) { | ||
701 | unsigned long background_thresh; | ||
702 | unsigned long dirty_thresh; | ||
703 | |||
704 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | ||
705 | if (global_page_state(NR_FILE_DIRTY) + | ||
706 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | ||
707 | && min_pages <= 0) | ||
708 | break; | ||
709 | wbc.more_io = 0; | ||
710 | wbc.encountered_congestion = 0; | ||
711 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | ||
712 | wbc.pages_skipped = 0; | ||
713 | writeback_inodes(&wbc); | ||
714 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
715 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | ||
716 | /* Wrote less than expected */ | ||
717 | if (wbc.encountered_congestion || wbc.more_io) | ||
718 | congestion_wait(WRITE, HZ/10); | ||
719 | else | ||
720 | break; | ||
721 | } | ||
722 | } | ||
723 | } | ||
724 | |||
725 | /* | ||
726 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back | ||
727 | * the whole world. Returns 0 if a pdflush thread was dispatched. Returns | ||
728 | * -1 if all pdflush threads were busy. | ||
729 | */ | ||
730 | int wakeup_pdflush(long nr_pages) | ||
731 | { | ||
732 | if (nr_pages == 0) | ||
733 | nr_pages = global_page_state(NR_FILE_DIRTY) + | ||
734 | global_page_state(NR_UNSTABLE_NFS); | ||
735 | return pdflush_operation(background_writeout, nr_pages); | ||
736 | } | ||
737 | |||
738 | static void wb_timer_fn(unsigned long unused); | ||
739 | static void laptop_timer_fn(unsigned long unused); | 678 | static void laptop_timer_fn(unsigned long unused); |
740 | 679 | ||
741 | static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); | ||
742 | static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); | 680 | static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); |
743 | 681 | ||
744 | /* | 682 | /* |
745 | * Periodic writeback of "old" data. | ||
746 | * | ||
747 | * Define "old": the first time one of an inode's pages is dirtied, we mark the | ||
748 | * dirtying-time in the inode's address_space. So this periodic writeback code | ||
749 | * just walks the superblock inode list, writing back any inodes which are | ||
750 | * older than a specific point in time. | ||
751 | * | ||
752 | * Try to run once per dirty_writeback_interval. But if a writeback event | ||
753 | * takes longer than one dirty_writeback_interval period, then leave a | ||
754 | * one-second gap. | ||
755 | * | ||
756 | * older_than_this takes precedence over nr_to_write. So we'll only write back | ||
757 | * all dirty pages if they are all attached to "old" mappings. | ||
758 | */ | ||
759 | static void wb_kupdate(unsigned long arg) | ||
760 | { | ||
761 | unsigned long oldest_jif; | ||
762 | unsigned long start_jif; | ||
763 | unsigned long next_jif; | ||
764 | long nr_to_write; | ||
765 | struct writeback_control wbc = { | ||
766 | .bdi = NULL, | ||
767 | .sync_mode = WB_SYNC_NONE, | ||
768 | .older_than_this = &oldest_jif, | ||
769 | .nr_to_write = 0, | ||
770 | .nonblocking = 1, | ||
771 | .for_kupdate = 1, | ||
772 | .range_cyclic = 1, | ||
773 | }; | ||
774 | |||
775 | sync_supers(); | ||
776 | |||
777 | oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); | ||
778 | start_jif = jiffies; | ||
779 | next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); | ||
780 | nr_to_write = global_page_state(NR_FILE_DIRTY) + | ||
781 | global_page_state(NR_UNSTABLE_NFS) + | ||
782 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | ||
783 | while (nr_to_write > 0) { | ||
784 | wbc.more_io = 0; | ||
785 | wbc.encountered_congestion = 0; | ||
786 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | ||
787 | writeback_inodes(&wbc); | ||
788 | if (wbc.nr_to_write > 0) { | ||
789 | if (wbc.encountered_congestion || wbc.more_io) | ||
790 | congestion_wait(WRITE, HZ/10); | ||
791 | else | ||
792 | break; /* All the old data is written */ | ||
793 | } | ||
794 | nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
795 | } | ||
796 | if (time_before(next_jif, jiffies + HZ)) | ||
797 | next_jif = jiffies + HZ; | ||
798 | if (dirty_writeback_interval) | ||
799 | mod_timer(&wb_timer, next_jif); | ||
800 | } | ||
801 | |||
802 | /* | ||
803 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 683 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
804 | */ | 684 | */ |
805 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, | 685 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, |
806 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 686 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
807 | { | 687 | { |
808 | proc_dointvec(table, write, file, buffer, length, ppos); | 688 | proc_dointvec(table, write, file, buffer, length, ppos); |
809 | if (dirty_writeback_interval) | ||
810 | mod_timer(&wb_timer, jiffies + | ||
811 | msecs_to_jiffies(dirty_writeback_interval * 10)); | ||
812 | else | ||
813 | del_timer(&wb_timer); | ||
814 | return 0; | 689 | return 0; |
815 | } | 690 | } |
816 | 691 | ||
817 | static void wb_timer_fn(unsigned long unused) | 692 | static void do_laptop_sync(struct work_struct *work) |
818 | { | ||
819 | if (pdflush_operation(wb_kupdate, 0) < 0) | ||
820 | mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ | ||
821 | } | ||
822 | |||
823 | static void laptop_flush(unsigned long unused) | ||
824 | { | 693 | { |
825 | sys_sync(); | 694 | wakeup_flusher_threads(0); |
695 | kfree(work); | ||
826 | } | 696 | } |
827 | 697 | ||
828 | static void laptop_timer_fn(unsigned long unused) | 698 | static void laptop_timer_fn(unsigned long unused) |
829 | { | 699 | { |
830 | pdflush_operation(laptop_flush, 0); | 700 | struct work_struct *work; |
701 | |||
702 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | ||
703 | if (work) { | ||
704 | INIT_WORK(work, do_laptop_sync); | ||
705 | schedule_work(work); | ||
706 | } | ||
831 | } | 707 | } |
832 | 708 | ||
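laptop_timer_fn() above runs in timer (softirq) context, where sleeping is off limits, so the actual sync is bounced to process context through a one-shot, self-freeing work item. The same pattern in isolation, with illustrative names:

#include <linux/workqueue.h>
#include <linux/slab.h>

static void one_shot_fn(struct work_struct *work)
{
        /* ... sleeping work happens here, in process context ... */
        kfree(work);                    /* the handler owns and frees the item */
}

static void fire_one_shot(void)         /* callable from atomic context */
{
        struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

        if (work) {                     /* best effort: OK to skip on OOM */
                INIT_WORK(work, one_shot_fn);
                schedule_work(work);
        }
}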
833 | /* | 709 | /* |
@@ -910,8 +786,6 @@ void __init page_writeback_init(void) | |||
910 | { | 786 | { |
911 | int shift; | 787 | int shift; |
912 | 788 | ||
913 | mod_timer(&wb_timer, | ||
914 | jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); | ||
915 | writeback_set_ratelimit(); | 789 | writeback_set_ratelimit(); |
916 | register_cpu_notifier(&ratelimit_nb); | 790 | register_cpu_notifier(&ratelimit_nb); |
917 | 791 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad7cd1c56b07..a0de15f46987 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -817,13 +817,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
817 | * aggressive about taking ownership of free pages | 817 | * aggressive about taking ownership of free pages |
818 | */ | 818 | */ |
819 | if (unlikely(current_order >= (pageblock_order >> 1)) || | 819 | if (unlikely(current_order >= (pageblock_order >> 1)) || |
820 | start_migratetype == MIGRATE_RECLAIMABLE) { | 820 | start_migratetype == MIGRATE_RECLAIMABLE || |
821 | page_group_by_mobility_disabled) { | ||
821 | unsigned long pages; | 822 | unsigned long pages; |
822 | pages = move_freepages_block(zone, page, | 823 | pages = move_freepages_block(zone, page, |
823 | start_migratetype); | 824 | start_migratetype); |
824 | 825 | ||
825 | /* Claim the whole block if over half of it is free */ | 826 | /* Claim the whole block if over half of it is free */ |
826 | if (pages >= (1 << (pageblock_order-1))) | 827 | if (pages >= (1 << (pageblock_order-1)) || |
828 | page_group_by_mobility_disabled) | ||
827 | set_pageblock_migratetype(page, | 829 | set_pageblock_migratetype(page, |
828 | start_migratetype); | 830 | start_migratetype); |
829 | 831 | ||
@@ -882,7 +884,7 @@ retry_reserve: | |||
882 | */ | 884 | */ |
883 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 885 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
884 | unsigned long count, struct list_head *list, | 886 | unsigned long count, struct list_head *list, |
885 | int migratetype) | 887 | int migratetype, int cold) |
886 | { | 888 | { |
887 | int i; | 889 | int i; |
888 | 890 | ||
@@ -901,7 +903,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
901 | * merge IO requests if the physical pages are ordered | 903 | * merge IO requests if the physical pages are ordered |
902 | * properly. | 904 | * properly. |
903 | */ | 905 | */ |
904 | list_add(&page->lru, list); | 906 | if (likely(cold == 0)) |
907 | list_add(&page->lru, list); | ||
908 | else | ||
909 | list_add_tail(&page->lru, list); | ||
905 | set_page_private(page, migratetype); | 910 | set_page_private(page, migratetype); |
906 | list = &page->lru; | 911 | list = &page->lru; |
907 | } | 912 | } |
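The new cold parameter decides which end of the per-cpu free list a page lands on: cache-hot pages go to the head so they are handed out first, cold pages to the tail so the hot ones keep priority. A reduced sketch of that placement, extracted from the hunk above:

#include <linux/list.h>
#include <linux/mm_types.h>

/* Reduced sketch: queue a freshly taken page hot (head) or cold (tail). */
static void queue_pcp_page(struct page *page, struct list_head *list, int cold)
{
        if (likely(cold == 0))
                list_add(&page->lru, list);        /* hot: reused first */
        else
                list_add_tail(&page->lru, list);   /* cold: reused last */
}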
@@ -1119,7 +1124,8 @@ again: | |||
1119 | local_irq_save(flags); | 1124 | local_irq_save(flags); |
1120 | if (!pcp->count) { | 1125 | if (!pcp->count) { |
1121 | pcp->count = rmqueue_bulk(zone, 0, | 1126 | pcp->count = rmqueue_bulk(zone, 0, |
1122 | pcp->batch, &pcp->list, migratetype); | 1127 | pcp->batch, &pcp->list, |
1128 | migratetype, cold); | ||
1123 | if (unlikely(!pcp->count)) | 1129 | if (unlikely(!pcp->count)) |
1124 | goto failed; | 1130 | goto failed; |
1125 | } | 1131 | } |
@@ -1138,7 +1144,8 @@ again: | |||
1138 | /* Allocate more to the pcp list if necessary */ | 1144 | /* Allocate more to the pcp list if necessary */ |
1139 | if (unlikely(&page->lru == &pcp->list)) { | 1145 | if (unlikely(&page->lru == &pcp->list)) { |
1140 | pcp->count += rmqueue_bulk(zone, 0, | 1146 | pcp->count += rmqueue_bulk(zone, 0, |
1141 | pcp->batch, &pcp->list, migratetype); | 1147 | pcp->batch, &pcp->list, |
1148 | migratetype, cold); | ||
1142 | page = list_entry(pcp->list.next, struct page, lru); | 1149 | page = list_entry(pcp->list.next, struct page, lru); |
1143 | } | 1150 | } |
1144 | 1151 | ||
@@ -1666,7 +1673,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
1666 | preferred_zone, migratetype); | 1673 | preferred_zone, migratetype); |
1667 | 1674 | ||
1668 | if (!page && gfp_mask & __GFP_NOFAIL) | 1675 | if (!page && gfp_mask & __GFP_NOFAIL) |
1669 | congestion_wait(WRITE, HZ/50); | 1676 | congestion_wait(BLK_RW_ASYNC, HZ/50); |
1670 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 1677 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
1671 | 1678 | ||
1672 | return page; | 1679 | return page; |
@@ -1740,8 +1747,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1740 | * be using allocators in order of preference for an area that is | 1747 | * be using allocators in order of preference for an area that is |
1741 | * too large. | 1748 | * too large. |
1742 | */ | 1749 | */ |
1743 | if (WARN_ON_ONCE(order >= MAX_ORDER)) | 1750 | if (order >= MAX_ORDER) { |
1751 | WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); | ||
1744 | return NULL; | 1752 | return NULL; |
1753 | } | ||
1745 | 1754 | ||
1746 | /* | 1755 | /* |
1747 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1756 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
@@ -1789,6 +1798,10 @@ rebalance: | |||
1789 | if (p->flags & PF_MEMALLOC) | 1798 | if (p->flags & PF_MEMALLOC) |
1790 | goto nopage; | 1799 | goto nopage; |
1791 | 1800 | ||
1801 | /* Avoid allocations with no watermarks from looping endlessly */ | ||
1802 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | ||
1803 | goto nopage; | ||
1804 | |||
1792 | /* Try direct reclaim and then allocating */ | 1805 | /* Try direct reclaim and then allocating */ |
1793 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 1806 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
1794 | zonelist, high_zoneidx, | 1807 | zonelist, high_zoneidx, |
@@ -1831,7 +1844,7 @@ rebalance: | |||
1831 | pages_reclaimed += did_some_progress; | 1844 | pages_reclaimed += did_some_progress; |
1832 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { | 1845 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { |
1833 | /* Wait for some write requests to complete then retry */ | 1846 | /* Wait for some write requests to complete then retry */ |
1834 | congestion_wait(WRITE, HZ/50); | 1847 | congestion_wait(BLK_RW_ASYNC, HZ/50); |
1835 | goto rebalance; | 1848 | goto rebalance; |
1836 | } | 1849 | } |
1837 | 1850 | ||
@@ -2533,7 +2546,6 @@ static void build_zonelists(pg_data_t *pgdat) | |||
2533 | prev_node = local_node; | 2546 | prev_node = local_node; |
2534 | nodes_clear(used_mask); | 2547 | nodes_clear(used_mask); |
2535 | 2548 | ||
2536 | memset(node_load, 0, sizeof(node_load)); | ||
2537 | memset(node_order, 0, sizeof(node_order)); | 2549 | memset(node_order, 0, sizeof(node_order)); |
2538 | j = 0; | 2550 | j = 0; |
2539 | 2551 | ||
@@ -2642,6 +2654,9 @@ static int __build_all_zonelists(void *dummy) | |||
2642 | { | 2654 | { |
2643 | int nid; | 2655 | int nid; |
2644 | 2656 | ||
2657 | #ifdef CONFIG_NUMA | ||
2658 | memset(node_load, 0, sizeof(node_load)); | ||
2659 | #endif | ||
2645 | for_each_online_node(nid) { | 2660 | for_each_online_node(nid) { |
2646 | pg_data_t *pgdat = NODE_DATA(nid); | 2661 | pg_data_t *pgdat = NODE_DATA(nid); |
2647 | 2662 | ||
@@ -4745,8 +4760,10 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4745 | * some pages at the end of hash table which | 4760 | * some pages at the end of hash table which |
4746 | * alloc_pages_exact() automatically does | 4761 | * alloc_pages_exact() automatically does |
4747 | */ | 4762 | */ |
4748 | if (get_order(size) < MAX_ORDER) | 4763 | if (get_order(size) < MAX_ORDER) { |
4749 | table = alloc_pages_exact(size, GFP_ATOMIC); | 4764 | table = alloc_pages_exact(size, GFP_ATOMIC); |
4765 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); | ||
4766 | } | ||
4750 | } | 4767 | } |
4751 | } while (!table && size > PAGE_SIZE && --log2qty); | 4768 | } while (!table && size > PAGE_SIZE && --log2qty); |
4752 | 4769 | ||
@@ -4764,16 +4781,6 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4764 | if (_hash_mask) | 4781 | if (_hash_mask) |
4765 | *_hash_mask = (1 << log2qty) - 1; | 4782 | *_hash_mask = (1 << log2qty) - 1; |
4766 | 4783 | ||
4767 | /* | ||
4768 | * If hashdist is set, the table allocation is done with __vmalloc() | ||
4769 | * which invokes the kmemleak_alloc() callback. This function may also | ||
4770 | * be called before the slab and kmemleak are initialised when | ||
4771 | * kmemleak simply buffers the request to be executed later | ||
4772 | * (GFP_ATOMIC flag ignored in this case). | ||
4773 | */ | ||
4774 | if (!hashdist) | ||
4775 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); | ||
4776 | |||
4777 | return table; | 4784 | return table; |
4778 | } | 4785 | } |
4779 | 4786 | ||
diff --git a/mm/pdflush.c b/mm/pdflush.c deleted file mode 100644 index 235ac440c44e..000000000000 --- a/mm/pdflush.c +++ /dev/null | |||
@@ -1,269 +0,0 @@ | |||
1 | /* | ||
2 | * mm/pdflush.c - worker threads for writing back filesystem data | ||
3 | * | ||
4 | * Copyright (C) 2002, Linus Torvalds. | ||
5 | * | ||
6 | * 09Apr2002 Andrew Morton | ||
7 | * Initial version | ||
8 | * 29Feb2004 kaos@sgi.com | ||
9 | * Move worker thread creation to kthread to avoid chewing | ||
10 | * up stack space with nested calls to kernel_thread. | ||
11 | */ | ||
12 | |||
13 | #include <linux/sched.h> | ||
14 | #include <linux/list.h> | ||
15 | #include <linux/signal.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/gfp.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/fs.h> /* Needed by writeback.h */ | ||
21 | #include <linux/writeback.h> /* Prototypes pdflush_operation() */ | ||
22 | #include <linux/kthread.h> | ||
23 | #include <linux/cpuset.h> | ||
24 | #include <linux/freezer.h> | ||
25 | |||
26 | |||
27 | /* | ||
28 | * Minimum and maximum number of pdflush instances | ||
29 | */ | ||
30 | #define MIN_PDFLUSH_THREADS 2 | ||
31 | #define MAX_PDFLUSH_THREADS 8 | ||
32 | |||
33 | static void start_one_pdflush_thread(void); | ||
34 | |||
35 | |||
36 | /* | ||
37 | * The pdflush threads are worker threads for writing back dirty data. | ||
38 | * Ideally, we'd like one thread per active disk spindle. But the disk | ||
39 | * topology is very hard to divine at this level. Instead, we take | ||
40 | * care in various places to prevent more than one pdflush thread from | ||
41 | * performing writeback against a single filesystem. pdflush threads | ||
42 | * have the PF_FLUSHER flag set in current->flags to aid in this. | ||
43 | */ | ||
44 | |||
45 | /* | ||
46 | * All the pdflush threads. Protected by pdflush_lock | ||
47 | */ | ||
48 | static LIST_HEAD(pdflush_list); | ||
49 | static DEFINE_SPINLOCK(pdflush_lock); | ||
50 | |||
51 | /* | ||
52 | * The count of currently-running pdflush threads. Protected | ||
53 | * by pdflush_lock. | ||
54 | * | ||
55 | * Readable by sysctl, but not writable. Published to userspace at | ||
56 | * /proc/sys/vm/nr_pdflush_threads. | ||
57 | */ | ||
58 | int nr_pdflush_threads = 0; | ||
59 | |||
60 | /* | ||
61 | * The time at which the pdflush thread pool last went empty | ||
62 | */ | ||
63 | static unsigned long last_empty_jifs; | ||
64 | |||
65 | /* | ||
66 | * The pdflush thread. | ||
67 | * | ||
68 | * Thread pool management algorithm: | ||
69 | * | ||
70 | * - The minimum and maximum number of pdflush instances are bound | ||
71 | * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. | ||
72 | * | ||
73 | * - If there have been no idle pdflush instances for 1 second, create | ||
74 | * a new one. | ||
75 | * | ||
76 | * - If the least-recently-went-to-sleep pdflush thread has been asleep | ||
77 | * for more than one second, terminate a thread. | ||
78 | */ | ||
79 | |||
80 | /* | ||
81 | * A structure for passing work to a pdflush thread. Also for passing | ||
82 | * state information between pdflush threads. Protected by pdflush_lock. | ||
83 | */ | ||
84 | struct pdflush_work { | ||
85 | struct task_struct *who; /* The thread */ | ||
86 | void (*fn)(unsigned long); /* A callback function */ | ||
87 | unsigned long arg0; /* An argument to the callback */ | ||
88 | struct list_head list; /* On pdflush_list, when idle */ | ||
89 | unsigned long when_i_went_to_sleep; | ||
90 | }; | ||
91 | |||
92 | static int __pdflush(struct pdflush_work *my_work) | ||
93 | { | ||
94 | current->flags |= PF_FLUSHER | PF_SWAPWRITE; | ||
95 | set_freezable(); | ||
96 | my_work->fn = NULL; | ||
97 | my_work->who = current; | ||
98 | INIT_LIST_HEAD(&my_work->list); | ||
99 | |||
100 | spin_lock_irq(&pdflush_lock); | ||
101 | for ( ; ; ) { | ||
102 | struct pdflush_work *pdf; | ||
103 | |||
104 | set_current_state(TASK_INTERRUPTIBLE); | ||
105 | list_move(&my_work->list, &pdflush_list); | ||
106 | my_work->when_i_went_to_sleep = jiffies; | ||
107 | spin_unlock_irq(&pdflush_lock); | ||
108 | schedule(); | ||
109 | try_to_freeze(); | ||
110 | spin_lock_irq(&pdflush_lock); | ||
111 | if (!list_empty(&my_work->list)) { | ||
112 | /* | ||
113 | * Someone woke us up, but without removing our control | ||
114 | * structure from the global list. swsusp will do this | ||
115 | * in try_to_freeze()->refrigerator(). Handle it. | ||
116 | */ | ||
117 | my_work->fn = NULL; | ||
118 | continue; | ||
119 | } | ||
120 | if (my_work->fn == NULL) { | ||
121 | printk("pdflush: bogus wakeup\n"); | ||
122 | continue; | ||
123 | } | ||
124 | spin_unlock_irq(&pdflush_lock); | ||
125 | |||
126 | (*my_work->fn)(my_work->arg0); | ||
127 | |||
128 | spin_lock_irq(&pdflush_lock); | ||
129 | |||
130 | /* | ||
131 | * Thread creation: For how long have there been zero | ||
132 | * available threads? | ||
133 | * | ||
134 | * To throttle creation, we reset last_empty_jifs. | ||
135 | */ | ||
136 | if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { | ||
137 | if (list_empty(&pdflush_list)) { | ||
138 | if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { | ||
139 | last_empty_jifs = jiffies; | ||
140 | nr_pdflush_threads++; | ||
141 | spin_unlock_irq(&pdflush_lock); | ||
142 | start_one_pdflush_thread(); | ||
143 | spin_lock_irq(&pdflush_lock); | ||
144 | } | ||
145 | } | ||
146 | } | ||
147 | |||
148 | my_work->fn = NULL; | ||
149 | |||
150 | /* | ||
151 | * Thread destruction: For how long has the sleepiest | ||
152 | * thread slept? | ||
153 | */ | ||
154 | if (list_empty(&pdflush_list)) | ||
155 | continue; | ||
156 | if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) | ||
157 | continue; | ||
158 | pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); | ||
159 | if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { | ||
160 | /* Limit exit rate */ | ||
161 | pdf->when_i_went_to_sleep = jiffies; | ||
162 | break; /* exeunt */ | ||
163 | } | ||
164 | } | ||
165 | nr_pdflush_threads--; | ||
166 | spin_unlock_irq(&pdflush_lock); | ||
167 | return 0; | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * Of course, my_work wants to be just a local in __pdflush(). It is | ||
172 | * separated out in this manner to hopefully prevent the compiler from | ||
173 | * performing unfortunate optimisations against the auto variables, because | ||
174 | * these are visible to other tasks and CPUs. (No problem has actually | ||
175 | * been observed. This is just paranoia). | ||
176 | */ | ||
177 | static int pdflush(void *dummy) | ||
178 | { | ||
179 | struct pdflush_work my_work; | ||
180 | cpumask_var_t cpus_allowed; | ||
181 | |||
182 | /* | ||
183 | * Since the caller doesn't even check kthread_run() worked, let's not | ||
184 | * freak out too much if this fails. | ||
185 | */ | ||
186 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { | ||
187 | printk(KERN_WARNING "pdflush failed to allocate cpumask\n"); | ||
188 | return 0; | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * pdflush can spend a lot of time doing encryption via dm-crypt. We | ||
193 | * don't want to do that at keventd's priority. | ||
194 | */ | ||
195 | set_user_nice(current, 0); | ||
196 | |||
197 | /* | ||
198 | * Some configs put our parent kthread in a limited cpuset, | ||
199 | * which kthread() overrides, forcing cpus_allowed == cpu_all_mask. | ||
200 | * Our needs are more modest - cut back to our cpusets cpus_allowed. | ||
201 | * This is needed as pdflush threads are dynamically created and destroyed. | ||
202 | * The boot-time pdflush threads are easily placed without these two lines. | ||
203 | */ | ||
204 | cpuset_cpus_allowed(current, cpus_allowed); | ||
205 | set_cpus_allowed_ptr(current, cpus_allowed); | ||
206 | free_cpumask_var(cpus_allowed); | ||
207 | |||
208 | return __pdflush(&my_work); | ||
209 | } | ||
210 | |||
211 | /* | ||
212 | * Attempt to wake up a pdflush thread, and get it to do some work for you. | ||
213 | * Returns zero if it indeed managed to find a worker thread, and passed your | ||
214 | * payload to it. | ||
215 | */ | ||
216 | int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) | ||
217 | { | ||
218 | unsigned long flags; | ||
219 | int ret = 0; | ||
220 | |||
221 | BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ | ||
222 | |||
223 | spin_lock_irqsave(&pdflush_lock, flags); | ||
224 | if (list_empty(&pdflush_list)) { | ||
225 | ret = -1; | ||
226 | } else { | ||
227 | struct pdflush_work *pdf; | ||
228 | |||
229 | pdf = list_entry(pdflush_list.next, struct pdflush_work, list); | ||
230 | list_del_init(&pdf->list); | ||
231 | if (list_empty(&pdflush_list)) | ||
232 | last_empty_jifs = jiffies; | ||
233 | pdf->fn = fn; | ||
234 | pdf->arg0 = arg0; | ||
235 | wake_up_process(pdf->who); | ||
236 | } | ||
237 | spin_unlock_irqrestore(&pdflush_lock, flags); | ||
238 | |||
239 | return ret; | ||
240 | } | ||
241 | |||
242 | static void start_one_pdflush_thread(void) | ||
243 | { | ||
244 | struct task_struct *k; | ||
245 | |||
246 | k = kthread_run(pdflush, NULL, "pdflush"); | ||
247 | if (unlikely(IS_ERR(k))) { | ||
248 | spin_lock_irq(&pdflush_lock); | ||
249 | nr_pdflush_threads--; | ||
250 | spin_unlock_irq(&pdflush_lock); | ||
251 | } | ||
252 | } | ||
253 | |||
254 | static int __init pdflush_init(void) | ||
255 | { | ||
256 | int i; | ||
257 | |||
258 | /* | ||
259 | * Pre-set nr_pdflush_threads... If we fail to create, | ||
260 | * the count will be decremented. | ||
261 | */ | ||
262 | nr_pdflush_threads = MIN_PDFLUSH_THREADS; | ||
263 | |||
264 | for (i = 0; i < MIN_PDFLUSH_THREADS; i++) | ||
265 | start_one_pdflush_thread(); | ||
266 | return 0; | ||
267 | } | ||
268 | |||
269 | module_init(pdflush_init); | ||
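With pdflush.c deleted, the public entry point changes as well: callers that used wakeup_pdflush(nr_pages), itself a thin wrapper around pdflush_operation(background_writeout, ...) as removed in the page-writeback.c hunks above, switch to wakeup_flusher_threads(), as the vmscan.c hunk further below does. Before and after, taken from this diff:

/* Before: try to hand nr_pages to an idle pdflush thread. */
wakeup_pdflush(laptop_mode ? 0 : total_scanned);

/* After: wake the per-bdi flusher threads; 0 means "write everything". */
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);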
diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd8853..3311c8919f37 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -8,12 +8,12 @@ | |||
8 | * | 8 | * |
9 | * This is a percpu allocator which can handle both static and dynamic | 9 | * This is a percpu allocator which can handle both static and dynamic |
10 | * areas. Percpu areas are allocated in chunks in the vmalloc area. Each | 10 | * areas. Percpu areas are allocated in chunks in the vmalloc area. Each |
11 | * chunk consists of num_possible_cpus() units and the first chunk | 11 | * chunk consists of nr_cpu_ids units and the first chunk is used |
12 | * is used for static percpu variables in the kernel image (special | 12 | * for static percpu variables in the kernel image (special boot time |
13 | * boot time alloc/init handling necessary as these areas need to be | 13 | * alloc/init handling necessary as these areas need to be brought up |
14 | * brought up before allocation services are running). Unit grows as | 14 | * before allocation services are running). Unit grows as necessary |
15 | * necessary and all units grow or shrink in unison. When a chunk is | 15 | * and all units grow or shrink in unison. When a chunk is filled up, |
16 | * filled up, another chunk is allocated, i.e. in the vmalloc area | 16 | * another chunk is allocated, i.e. in the vmalloc area |
17 | * | 17 | * |
18 | * c0 c1 c2 | 18 | * c0 c1 c2 |
19 | * ------------------- ------------------- ------------ | 19 | * ------------------- ------------------- ------------ |
@@ -197,7 +197,12 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, | |||
197 | static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, | 197 | static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, |
198 | int page_idx) | 198 | int page_idx) |
199 | { | 199 | { |
200 | return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; | 200 | /* |
201 | * Any possible cpu id can be used here, so there's no need to | ||
202 | * worry about preemption or cpu hotplug. | ||
203 | */ | ||
204 | return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(), | ||
205 | page_idx) != NULL; | ||
201 | } | 206 | } |
202 | 207 | ||
203 | /* set the pointer to a chunk in a page struct */ | 208 | /* set the pointer to a chunk in a page struct */ |
@@ -297,6 +302,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) | |||
297 | return pcpu_first_chunk; | 302 | return pcpu_first_chunk; |
298 | } | 303 | } |
299 | 304 | ||
305 | /* | ||
306 | * The address is relative to unit0 which might be unused and | ||
307 | * thus unmapped. Offset the address to the unit space of the | ||
308 | * current processor before looking it up in the vmalloc | ||
309 | * space. Note that any possible cpu id can be used here, so | ||
310 | * there's no need to worry about preemption or cpu hotplug. | ||
311 | */ | ||
312 | addr += raw_smp_processor_id() * pcpu_unit_size; | ||
300 | return pcpu_get_page_chunk(vmalloc_to_page(addr)); | 313 | return pcpu_get_page_chunk(vmalloc_to_page(addr)); |
301 | } | 314 | } |
302 | 315 | ||
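The rebase in pcpu_chunk_addr_search() works because a chunk lays its units out contiguously: the unit for cpu n starts n * pcpu_unit_size bytes into the chunk, so a unit0-relative address can be shifted into any populated unit before the vmalloc page lookup. The arithmetic in isolation (the wrapper name is hypothetical; pcpu_unit_size is the static from this file):

/* Hypothetical wrapper: rebase a unit0-relative percpu address into
 * the unit of whatever cpu we happen to be running on. */
static void *pcpu_rebase_to_local_unit(void *addr)
{
        return addr + raw_smp_processor_id() * pcpu_unit_size;
}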
@@ -558,7 +571,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) | |||
558 | static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, | 571 | static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, |
559 | bool flush_tlb) | 572 | bool flush_tlb) |
560 | { | 573 | { |
561 | unsigned int last = num_possible_cpus() - 1; | 574 | unsigned int last = nr_cpu_ids - 1; |
562 | unsigned int cpu; | 575 | unsigned int cpu; |
563 | 576 | ||
564 | /* unmap must not be done on immutable chunk */ | 577 | /* unmap must not be done on immutable chunk */ |
@@ -643,7 +656,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, | |||
643 | */ | 656 | */ |
644 | static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) | 657 | static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) |
645 | { | 658 | { |
646 | unsigned int last = num_possible_cpus() - 1; | 659 | unsigned int last = nr_cpu_ids - 1; |
647 | unsigned int cpu; | 660 | unsigned int cpu; |
648 | int err; | 661 | int err; |
649 | 662 | ||
@@ -749,7 +762,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
749 | chunk->map[chunk->map_used++] = pcpu_unit_size; | 762 | chunk->map[chunk->map_used++] = pcpu_unit_size; |
750 | chunk->page = chunk->page_ar; | 763 | chunk->page = chunk->page_ar; |
751 | 764 | ||
752 | chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); | 765 | chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); |
753 | if (!chunk->vm) { | 766 | if (!chunk->vm) { |
754 | free_pcpu_chunk(chunk); | 767 | free_pcpu_chunk(chunk); |
755 | return NULL; | 768 | return NULL; |
@@ -1067,9 +1080,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | |||
1067 | PFN_UP(size_sum)); | 1080 | PFN_UP(size_sum)); |
1068 | 1081 | ||
1069 | pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; | 1082 | pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; |
1070 | pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; | 1083 | pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; |
1071 | pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) | 1084 | pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) |
1072 | + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); | 1085 | + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); |
1073 | 1086 | ||
1074 | if (dyn_size < 0) | 1087 | if (dyn_size < 0) |
1075 | dyn_size = pcpu_unit_size - static_size - reserved_size; | 1088 | dyn_size = pcpu_unit_size - static_size - reserved_size; |
@@ -1248,7 +1261,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, | |||
1248 | } else | 1261 | } else |
1249 | pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); | 1262 | pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); |
1250 | 1263 | ||
1251 | chunk_size = pcpue_unit_size * num_possible_cpus(); | 1264 | chunk_size = pcpue_unit_size * nr_cpu_ids; |
1252 | 1265 | ||
1253 | pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, | 1266 | pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, |
1254 | __pa(MAX_DMA_ADDRESS)); | 1267 | __pa(MAX_DMA_ADDRESS)); |
@@ -1259,12 +1272,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, | |||
1259 | } | 1272 | } |
1260 | 1273 | ||
1261 | /* return the leftover and copy */ | 1274 | /* return the leftover and copy */ |
1262 | for_each_possible_cpu(cpu) { | 1275 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) { |
1263 | void *ptr = pcpue_ptr + cpu * pcpue_unit_size; | 1276 | void *ptr = pcpue_ptr + cpu * pcpue_unit_size; |
1264 | 1277 | ||
1265 | free_bootmem(__pa(ptr + pcpue_size), | 1278 | if (cpu_possible(cpu)) { |
1266 | pcpue_unit_size - pcpue_size); | 1279 | free_bootmem(__pa(ptr + pcpue_size), |
1267 | memcpy(ptr, __per_cpu_load, static_size); | 1280 | pcpue_unit_size - pcpue_size); |
1281 | memcpy(ptr, __per_cpu_load, static_size); | ||
1282 | } else | ||
1283 | free_bootmem(__pa(ptr), pcpue_unit_size); | ||
1268 | } | 1284 | } |
1269 | 1285 | ||
1270 | /* we're ready, commit */ | 1286 | /* we're ready, commit */ |
@@ -358,6 +358,7 @@ static int page_referenced_one(struct page *page, | |||
358 | */ | 358 | */ |
359 | if (vma->vm_flags & VM_LOCKED) { | 359 | if (vma->vm_flags & VM_LOCKED) { |
360 | *mapcount = 1; /* break early from loop */ | 360 | *mapcount = 1; /* break early from loop */ |
361 | *vm_flags |= VM_LOCKED; | ||
361 | goto out_unmap; | 362 | goto out_unmap; |
362 | } | 363 | } |
363 | 364 | ||
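The vm_flags out-parameter lets page_referenced() tell reclaim why it broke out early: when the VM_LOCKED bail-out above fires, the flag is reported so that shrink_page_list() skips re-activation and leaves the page for try_to_unmap() to park on the unevictable list (see the vmscan.c hunk below). A reduced sketch of the consumer side; the sc->order check from the real hunk is omitted here:

unsigned long vm_flags;
int referenced;

referenced = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
/* Don't re-activate a page that turned out to be mlocked. */
if (referenced && page_mapping_inuse(page) && !(vm_flags & VM_LOCKED))
        goto activate_locked;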
diff --git a/mm/shmem.c b/mm/shmem.c index d713239ce2ce..5a0b3d4055f3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -2446,7 +2446,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
2446 | .getxattr = generic_getxattr, | 2446 | .getxattr = generic_getxattr, |
2447 | .listxattr = generic_listxattr, | 2447 | .listxattr = generic_listxattr, |
2448 | .removexattr = generic_removexattr, | 2448 | .removexattr = generic_removexattr, |
2449 | .permission = shmem_permission, | 2449 | .check_acl = shmem_check_acl, |
2450 | #endif | 2450 | #endif |
2451 | 2451 | ||
2452 | }; | 2452 | }; |
@@ -2469,7 +2469,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2469 | .getxattr = generic_getxattr, | 2469 | .getxattr = generic_getxattr, |
2470 | .listxattr = generic_listxattr, | 2470 | .listxattr = generic_listxattr, |
2471 | .removexattr = generic_removexattr, | 2471 | .removexattr = generic_removexattr, |
2472 | .permission = shmem_permission, | 2472 | .check_acl = shmem_check_acl, |
2473 | #endif | 2473 | #endif |
2474 | }; | 2474 | }; |
2475 | 2475 | ||
@@ -2480,7 +2480,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2480 | .getxattr = generic_getxattr, | 2480 | .getxattr = generic_getxattr, |
2481 | .listxattr = generic_listxattr, | 2481 | .listxattr = generic_listxattr, |
2482 | .removexattr = generic_removexattr, | 2482 | .removexattr = generic_removexattr, |
2483 | .permission = shmem_permission, | 2483 | .check_acl = shmem_check_acl, |
2484 | #endif | 2484 | #endif |
2485 | }; | 2485 | }; |
2486 | 2486 | ||
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 606a8e757a42..df2c87fdae50 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c | |||
@@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir) | |||
157 | /** | 157 | /** |
158 | * shmem_check_acl - check_acl() callback for generic_permission() | 158 | * shmem_check_acl - check_acl() callback for generic_permission() |
159 | */ | 159 | */ |
160 | static int | 160 | int |
161 | shmem_check_acl(struct inode *inode, int mask) | 161 | shmem_check_acl(struct inode *inode, int mask) |
162 | { | 162 | { |
163 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | 163 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); |
@@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask) | |||
169 | } | 169 | } |
170 | return -EAGAIN; | 170 | return -EAGAIN; |
171 | } | 171 | } |
172 | |||
173 | /** | ||
174 | * shmem_permission - permission() inode operation | ||
175 | */ | ||
176 | int | ||
177 | shmem_permission(struct inode *inode, int mask) | ||
178 | { | ||
179 | return generic_permission(inode, mask, shmem_check_acl); | ||
180 | } | ||
@@ -692,3 +692,8 @@ void __init kmem_cache_init(void) | |||
692 | { | 692 | { |
693 | slob_ready = 1; | 693 | slob_ready = 1; |
694 | } | 694 | } |
695 | |||
696 | void __init kmem_cache_init_late(void) | ||
697 | { | ||
698 | /* Nothing to do */ | ||
699 | } | ||
@@ -21,7 +21,6 @@ | |||
21 | #include <linux/kmemcheck.h> | 21 | #include <linux/kmemcheck.h> |
22 | #include <linux/cpu.h> | 22 | #include <linux/cpu.h> |
23 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | #include <linux/kmemleak.h> | ||
25 | #include <linux/mempolicy.h> | 24 | #include <linux/mempolicy.h> |
26 | #include <linux/ctype.h> | 25 | #include <linux/ctype.h> |
27 | #include <linux/debugobjects.h> | 26 | #include <linux/debugobjects.h> |
@@ -1127,8 +1126,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1127 | } | 1126 | } |
1128 | 1127 | ||
1129 | if (kmemcheck_enabled | 1128 | if (kmemcheck_enabled |
1130 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) | 1129 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1131 | { | ||
1132 | int pages = 1 << oo_order(oo); | 1130 | int pages = 1 << oo_order(oo); |
1133 | 1131 | ||
1134 | kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); | 1132 | kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); |
@@ -2023,7 +2021,7 @@ static inline int calculate_order(int size) | |||
2023 | return order; | 2021 | return order; |
2024 | fraction /= 2; | 2022 | fraction /= 2; |
2025 | } | 2023 | } |
2026 | min_objects --; | 2024 | min_objects--; |
2027 | } | 2025 | } |
2028 | 2026 | ||
2029 | /* | 2027 | /* |
@@ -2629,8 +2627,6 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
2629 | */ | 2627 | */ |
2630 | void kmem_cache_destroy(struct kmem_cache *s) | 2628 | void kmem_cache_destroy(struct kmem_cache *s) |
2631 | { | 2629 | { |
2632 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
2633 | rcu_barrier(); | ||
2634 | down_write(&slub_lock); | 2630 | down_write(&slub_lock); |
2635 | s->refcount--; | 2631 | s->refcount--; |
2636 | if (!s->refcount) { | 2632 | if (!s->refcount) { |
@@ -2641,6 +2637,8 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
2641 | "still has objects.\n", s->name, __func__); | 2637 | "still has objects.\n", s->name, __func__); |
2642 | dump_stack(); | 2638 | dump_stack(); |
2643 | } | 2639 | } |
2640 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
2641 | rcu_barrier(); | ||
2644 | sysfs_slab_remove(s); | 2642 | sysfs_slab_remove(s); |
2645 | } else | 2643 | } else |
2646 | up_write(&slub_lock); | 2644 | up_write(&slub_lock); |
@@ -2874,13 +2872,15 @@ EXPORT_SYMBOL(__kmalloc); | |||
2874 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | 2872 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) |
2875 | { | 2873 | { |
2876 | struct page *page; | 2874 | struct page *page; |
2875 | void *ptr = NULL; | ||
2877 | 2876 | ||
2878 | flags |= __GFP_COMP | __GFP_NOTRACK; | 2877 | flags |= __GFP_COMP | __GFP_NOTRACK; |
2879 | page = alloc_pages_node(node, flags, get_order(size)); | 2878 | page = alloc_pages_node(node, flags, get_order(size)); |
2880 | if (page) | 2879 | if (page) |
2881 | return page_address(page); | 2880 | ptr = page_address(page); |
2882 | else | 2881 | |
2883 | return NULL; | 2882 | kmemleak_alloc(ptr, size, 1, flags); |
2883 | return ptr; | ||
2884 | } | 2884 | } |
2885 | 2885 | ||
2886 | #ifdef CONFIG_NUMA | 2886 | #ifdef CONFIG_NUMA |
@@ -2965,6 +2965,7 @@ void kfree(const void *x) | |||
2965 | page = virt_to_head_page(x); | 2965 | page = virt_to_head_page(x); |
2966 | if (unlikely(!PageSlab(page))) { | 2966 | if (unlikely(!PageSlab(page))) { |
2967 | BUG_ON(!PageCompound(page)); | 2967 | BUG_ON(!PageCompound(page)); |
2968 | kmemleak_free(x); | ||
2968 | put_page(page); | 2969 | put_page(page); |
2969 | return; | 2970 | return; |
2970 | } | 2971 | } |
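Large kmalloc requests in SLUB bypass the slab layer and come straight from the page allocator, so kmemleak never sees them unless told by hand; the two hunks above pair kmemleak_alloc() in kmalloc_large_node() with kmemleak_free() on kfree()'s !PageSlab path. The allocation side, condensed from the hunk:

#include <linux/gfp.h>
#include <linux/kmemleak.h>

static void *kmalloc_large_node_sketch(size_t size, gfp_t flags, int node)
{
        struct page *page;
        void *ptr = NULL;

        flags |= __GFP_COMP | __GFP_NOTRACK;
        page = alloc_pages_node(node, flags, get_order(size));
        if (page)
                ptr = page_address(page);

        /* kmemleak_alloc() ignores NULL, so report unconditionally. */
        kmemleak_alloc(ptr, size, 1, flags);
        return ptr;
}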
diff --git a/mm/swap_state.c b/mm/swap_state.c index 42cd38eba79f..5ae6b8b78c80 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = { | |||
34 | }; | 34 | }; |
35 | 35 | ||
36 | static struct backing_dev_info swap_backing_dev_info = { | 36 | static struct backing_dev_info swap_backing_dev_info = { |
37 | .name = "swap", | ||
37 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | 38 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
38 | .unplug_io_fn = swap_unplug_io_fn, | 39 | .unplug_io_fn = swap_unplug_io_fn, |
39 | }; | 40 | }; |
diff --git a/mm/swapfile.c b/mm/swapfile.c index d1ade1a48ee7..8ffdc0d23c53 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
753 | 753 | ||
754 | if (!bdev) { | 754 | if (!bdev) { |
755 | if (bdev_p) | 755 | if (bdev_p) |
756 | *bdev_p = bdget(sis->bdev->bd_dev); | 756 | *bdev_p = bdgrab(sis->bdev); |
757 | 757 | ||
758 | spin_unlock(&swap_lock); | 758 | spin_unlock(&swap_lock); |
759 | return i; | 759 | return i; |
@@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
765 | struct swap_extent, list); | 765 | struct swap_extent, list); |
766 | if (se->start_block == offset) { | 766 | if (se->start_block == offset) { |
767 | if (bdev_p) | 767 | if (bdev_p) |
768 | *bdev_p = bdget(sis->bdev->bd_dev); | 768 | *bdev_p = bdgrab(sis->bdev); |
769 | 769 | ||
770 | spin_unlock(&swap_lock); | 770 | spin_unlock(&swap_lock); |
771 | bdput(bdev); | 771 | bdput(bdev); |
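bdget() looks a block device up by dev_t and can sleep while allocating a new inode, whereas bdgrab() only bumps the reference count on a block_device the caller already holds, which is safe under the swap_lock spinlock held here. The substitution made in both hunks:

/* Before: lookup by device number (may allocate/sleep). */
*bdev_p = bdget(sis->bdev->bd_dev);

/* After: take an extra reference on the pointer already in hand. */
*bdev_p = bdgrab(sis->bdev);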
diff --git a/mm/vmscan.c b/mm/vmscan.c index 54155268dfca..ba8228e0a806 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -630,9 +630,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
630 | 630 | ||
631 | referenced = page_referenced(page, 1, | 631 | referenced = page_referenced(page, 1, |
632 | sc->mem_cgroup, &vm_flags); | 632 | sc->mem_cgroup, &vm_flags); |
633 | /* In active use or really unfreeable? Activate it. */ | 633 | /* |
634 | * In active use or really unfreeable? Activate it. | ||
635 | * If a page with PG_mlocked lost the isolation race, | ||
636 | * try_to_unmap() moves it to the unevictable list. | ||
637 | */ | ||
634 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | 638 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && |
635 | referenced && page_mapping_inuse(page)) | 639 | referenced && page_mapping_inuse(page) |
640 | && !(vm_flags & VM_LOCKED)) | ||
636 | goto activate_locked; | 641 | goto activate_locked; |
637 | 642 | ||
638 | /* | 643 | /* |
@@ -1104,7 +1109,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1104 | */ | 1109 | */ |
1105 | if (nr_freed < nr_taken && !current_is_kswapd() && | 1110 | if (nr_freed < nr_taken && !current_is_kswapd() && |
1106 | lumpy_reclaim) { | 1111 | lumpy_reclaim) { |
1107 | congestion_wait(WRITE, HZ/10); | 1112 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1108 | 1113 | ||
1109 | /* | 1114 | /* |
1110 | * The attempt at page out may have made some | 1115 | * The attempt at page out may have made some |
@@ -1715,13 +1720,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1715 | */ | 1720 | */ |
1716 | if (total_scanned > sc->swap_cluster_max + | 1721 | if (total_scanned > sc->swap_cluster_max + |
1717 | sc->swap_cluster_max / 2) { | 1722 | sc->swap_cluster_max / 2) { |
1718 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); | 1723 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); |
1719 | sc->may_writepage = 1; | 1724 | sc->may_writepage = 1; |
1720 | } | 1725 | } |
1721 | 1726 | ||
1722 | /* Take a nap, wait for some writeback to complete */ | 1727 | /* Take a nap, wait for some writeback to complete */ |
1723 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1728 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) |
1724 | congestion_wait(WRITE, HZ/10); | 1729 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1725 | } | 1730 | } |
1726 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1731 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
1727 | if (!sc->all_unreclaimable && scanning_global_lru(sc)) | 1732 | if (!sc->all_unreclaimable && scanning_global_lru(sc)) |
@@ -1960,7 +1965,7 @@ loop_again: | |||
1960 | * another pass across the zones. | 1965 | * another pass across the zones. |
1961 | */ | 1966 | */ |
1962 | if (total_scanned && priority < DEF_PRIORITY - 2) | 1967 | if (total_scanned && priority < DEF_PRIORITY - 2) |
1963 | congestion_wait(WRITE, HZ/10); | 1968 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1964 | 1969 | ||
1965 | /* | 1970 | /* |
1966 | * We do this so kswapd doesn't build up large priorities for | 1971 | * We do this so kswapd doesn't build up large priorities for |
@@ -2233,7 +2238,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
2233 | goto out; | 2238 | goto out; |
2234 | 2239 | ||
2235 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | 2240 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) |
2236 | congestion_wait(WRITE, HZ / 10); | 2241 | congestion_wait(BLK_RW_ASYNC, HZ / 10); |
2237 | } | 2242 | } |
2238 | } | 2243 | } |
2239 | 2244 | ||