Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              6
-rw-r--r--  mm/Makefile             2
-rw-r--r--  mm/backing-dev.c      388
-rw-r--r--  mm/bootmem.c           10
-rw-r--r--  mm/hugetlb.c            2
-rw-r--r--  mm/kmemleak.c         546
-rw-r--r--  mm/memcontrol.c        25
-rw-r--r--  mm/memory.c            11
-rw-r--r--  mm/mempolicy.c         84
-rw-r--r--  mm/mempool.c            4
-rw-r--r--  mm/mmap.c               3
-rw-r--r--  mm/nommu.c             10
-rw-r--r--  mm/oom_kill.c          64
-rw-r--r--  mm/page-writeback.c   184
-rw-r--r--  mm/page_alloc.c        49
-rw-r--r--  mm/pdflush.c          269
-rw-r--r--  mm/percpu.c            50
-rw-r--r--  mm/rmap.c               1
-rw-r--r--  mm/shmem.c              6
-rw-r--r--  mm/shmem_acl.c         11
-rw-r--r--  mm/slob.c               5
-rw-r--r--  mm/slub.c              19
-rw-r--r--  mm/swap_state.c         1
-rw-r--r--  mm/swapfile.c           4
-rw-r--r--  mm/vmscan.c            19
25 files changed, 1043 insertions, 730 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c948d4ca8bde..fe5f674d7a7d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR
225 For most ia64, ppc64 and x86 users with lots of address space 225 For most ia64, ppc64 and x86 users with lots of address space
226 a value of 65536 is reasonable and should cause no problems. 226 a value of 65536 is reasonable and should cause no problems.
227 On arm and other archs it should not be higher than 32768. 227 On arm and other archs it should not be higher than 32768.
228 Programs which use vm86 functionality would either need additional 228 Programs which use vm86 functionality or have some need to map
229 permissions from either the LSM or the capabilities module or have 229 this low address space will need CAP_SYS_RAWIO or disable this
230 this protection disabled. 230 protection by setting the value to 0.
231 231
232 This value can be changed after boot using the 232 This value can be changed after boot using the
233 /proc/sys/vm/mmap_min_addr tunable. 233 /proc/sys/vm/mmap_min_addr tunable.
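
The help text above documents the /proc/sys/vm/mmap_min_addr tunable: low fixed
mappings need CAP_SYS_RAWIO unless the value is set to 0. As an illustration
only (not part of this patch), a userspace program can query the current floor
before attempting such a mapping; this assumes a Linux system exposing the
tunable under /proc:

#include <stdio.h>
#include <stdlib.h>

/*
 * Illustrative sketch: print the lowest address a process without
 * CAP_SYS_RAWIO may use for a fixed mapping.
 */
int main(void)
{
	unsigned long min_addr;
	FILE *f = fopen("/proc/sys/vm/mmap_min_addr", "r");

	if (!f || fscanf(f, "%lu", &min_addr) != 1) {
		perror("mmap_min_addr");
		return EXIT_FAILURE;
	}
	fclose(f);

	printf("lowest mmap()-able fixed address: 0x%lx\n", min_addr);
	return 0;
}
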
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..147a7a7873c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
8 vmalloc.o 8 vmalloc.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 493b468a5035..d3ca0dac1111 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
1 1
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/kthread.h>
5#include <linux/freezer.h>
4#include <linux/fs.h> 6#include <linux/fs.h>
5#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/mm.h>
6#include <linux/sched.h> 9#include <linux/sched.h>
7#include <linux/module.h> 10#include <linux/module.h>
8#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
14EXPORT_SYMBOL(default_unplug_io_fn); 17EXPORT_SYMBOL(default_unplug_io_fn);
15 18
16struct backing_dev_info default_backing_dev_info = { 19struct backing_dev_info default_backing_dev_info = {
20 .name = "default",
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 21 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0, 22 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY, 23 .capabilities = BDI_CAP_MAP_COPY,
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = {
22EXPORT_SYMBOL_GPL(default_backing_dev_info); 26EXPORT_SYMBOL_GPL(default_backing_dev_info);
23 27
24static struct class *bdi_class; 28static struct class *bdi_class;
29DEFINE_SPINLOCK(bdi_lock);
30LIST_HEAD(bdi_list);
31LIST_HEAD(bdi_pending_list);
32
33static struct task_struct *sync_supers_tsk;
34static struct timer_list sync_supers_timer;
35
36static int bdi_sync_supers(void *);
37static void sync_supers_timer_fn(unsigned long);
38static void arm_supers_timer(void);
39
40static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
25 41
26#ifdef CONFIG_DEBUG_FS 42#ifdef CONFIG_DEBUG_FS
27#include <linux/debugfs.h> 43#include <linux/debugfs.h>
@@ -37,9 +53,29 @@ static void bdi_debug_init(void)
37static int bdi_debug_stats_show(struct seq_file *m, void *v) 53static int bdi_debug_stats_show(struct seq_file *m, void *v)
38{ 54{
39 struct backing_dev_info *bdi = m->private; 55 struct backing_dev_info *bdi = m->private;
56 struct bdi_writeback *wb;
40 unsigned long background_thresh; 57 unsigned long background_thresh;
41 unsigned long dirty_thresh; 58 unsigned long dirty_thresh;
42 unsigned long bdi_thresh; 59 unsigned long bdi_thresh;
60 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
61 struct inode *inode;
62
63 /*
64 * inode lock is enough here, the bdi->wb_list is protected by
65 * RCU on the reader side
66 */
67 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
68 spin_lock(&inode_lock);
69 list_for_each_entry(wb, &bdi->wb_list, list) {
70 nr_wb++;
71 list_for_each_entry(inode, &wb->b_dirty, i_list)
72 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_list)
74 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_list)
76 nr_more_io++;
77 }
78 spin_unlock(&inode_lock);
43 79
44 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 80 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
45 81
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
49 "BdiReclaimable: %8lu kB\n" 85 "BdiReclaimable: %8lu kB\n"
50 "BdiDirtyThresh: %8lu kB\n" 86 "BdiDirtyThresh: %8lu kB\n"
51 "DirtyThresh: %8lu kB\n" 87 "DirtyThresh: %8lu kB\n"
52 "BackgroundThresh: %8lu kB\n", 88 "BackgroundThresh: %8lu kB\n"
89 "WriteBack threads:%8lu\n"
90 "b_dirty: %8lu\n"
91 "b_io: %8lu\n"
92 "b_more_io: %8lu\n"
93 "bdi_list: %8u\n"
94 "state: %8lx\n"
95 "wb_mask: %8lx\n"
96 "wb_list: %8u\n"
97 "wb_cnt: %8u\n",
53 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 98 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
54 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 99 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
55 K(bdi_thresh), 100 K(bdi_thresh), K(dirty_thresh),
56 K(dirty_thresh), 101 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
57 K(background_thresh)); 102 !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
103 !list_empty(&bdi->wb_list), bdi->wb_cnt);
58#undef K 104#undef K
59 105
60 return 0; 106 return 0;
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void)
185{ 231{
186 int err; 232 int err;
187 233
234 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
235 BUG_ON(IS_ERR(sync_supers_tsk));
236
237 init_timer(&sync_supers_timer);
238 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
239 arm_supers_timer();
240
188 err = bdi_init(&default_backing_dev_info); 241 err = bdi_init(&default_backing_dev_info);
189 if (!err) 242 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default"); 243 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +246,248 @@ static int __init default_bdi_init(void)
193} 246}
194subsys_initcall(default_bdi_init); 247subsys_initcall(default_bdi_init);
195 248
249static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
250{
251 memset(wb, 0, sizeof(*wb));
252
253 wb->bdi = bdi;
254 wb->last_old_flush = jiffies;
255 INIT_LIST_HEAD(&wb->b_dirty);
256 INIT_LIST_HEAD(&wb->b_io);
257 INIT_LIST_HEAD(&wb->b_more_io);
258}
259
260static void bdi_task_init(struct backing_dev_info *bdi,
261 struct bdi_writeback *wb)
262{
263 struct task_struct *tsk = current;
264
265 spin_lock(&bdi->wb_lock);
266 list_add_tail_rcu(&wb->list, &bdi->wb_list);
267 spin_unlock(&bdi->wb_lock);
268
269 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
270 set_freezable();
271
272 /*
273 * Our parent may run at a different priority, just set us to normal
274 */
275 set_user_nice(tsk, 0);
276}
277
278static int bdi_start_fn(void *ptr)
279{
280 struct bdi_writeback *wb = ptr;
281 struct backing_dev_info *bdi = wb->bdi;
282 int ret;
283
284 /*
285 * Add us to the active bdi_list
286 */
287 spin_lock(&bdi_lock);
288 list_add(&bdi->bdi_list, &bdi_list);
289 spin_unlock(&bdi_lock);
290
291 bdi_task_init(bdi, wb);
292
293 /*
294 * Clear pending bit and wakeup anybody waiting to tear us down
295 */
296 clear_bit(BDI_pending, &bdi->state);
297 smp_mb__after_clear_bit();
298 wake_up_bit(&bdi->state, BDI_pending);
299
300 ret = bdi_writeback_task(wb);
301
302 /*
303 * Remove us from the list
304 */
305 spin_lock(&bdi->wb_lock);
306 list_del_rcu(&wb->list);
307 spin_unlock(&bdi->wb_lock);
308
309 /*
310 * Flush any work that raced with us exiting. No new work
311 * will be added, since this bdi isn't discoverable anymore.
312 */
313 if (!list_empty(&bdi->work_list))
314 wb_do_writeback(wb, 1);
315
316 wb->task = NULL;
317 return ret;
318}
319
320int bdi_has_dirty_io(struct backing_dev_info *bdi)
321{
322 return wb_has_dirty_io(&bdi->wb);
323}
324
325static void bdi_flush_io(struct backing_dev_info *bdi)
326{
327 struct writeback_control wbc = {
328 .bdi = bdi,
329 .sync_mode = WB_SYNC_NONE,
330 .older_than_this = NULL,
331 .range_cyclic = 1,
332 .nr_to_write = 1024,
333 };
334
335 writeback_inodes_wbc(&wbc);
336}
337
338/*
339 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
340 * or we risk deadlocking on ->s_umount. The longer term solution would be
341 * to implement sync_supers_bdi() or similar and simply do it from the
342 * bdi writeback tasks individually.
343 */
344static int bdi_sync_supers(void *unused)
345{
346 set_user_nice(current, 0);
347
348 while (!kthread_should_stop()) {
349 set_current_state(TASK_INTERRUPTIBLE);
350 schedule();
351
352 /*
353 * Do this periodically, like kupdated() did before.
354 */
355 sync_supers();
356 }
357
358 return 0;
359}
360
361static void arm_supers_timer(void)
362{
363 unsigned long next;
364
365 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
366 mod_timer(&sync_supers_timer, round_jiffies_up(next));
367}
368
369static void sync_supers_timer_fn(unsigned long unused)
370{
371 wake_up_process(sync_supers_tsk);
372 arm_supers_timer();
373}
374
375static int bdi_forker_task(void *ptr)
376{
377 struct bdi_writeback *me = ptr;
378
379 bdi_task_init(me->bdi, me);
380
381 for (;;) {
382 struct backing_dev_info *bdi, *tmp;
383 struct bdi_writeback *wb;
384
385 /*
386 * Temporary measure, we want to make sure we don't see
387 * dirty data on the default backing_dev_info
388 */
389 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
390 wb_do_writeback(me, 0);
391
392 spin_lock(&bdi_lock);
393
394 /*
395 * Check if any existing bdi's have dirty data without
396 * a thread registered. If so, set that up.
397 */
398 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
399 if (bdi->wb.task)
400 continue;
401 if (list_empty(&bdi->work_list) &&
402 !bdi_has_dirty_io(bdi))
403 continue;
404
405 bdi_add_default_flusher_task(bdi);
406 }
407
408 set_current_state(TASK_INTERRUPTIBLE);
409
410 if (list_empty(&bdi_pending_list)) {
411 unsigned long wait;
412
413 spin_unlock(&bdi_lock);
414 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
415 schedule_timeout(wait);
416 try_to_freeze();
417 continue;
418 }
419
420 __set_current_state(TASK_RUNNING);
421
422 /*
423 * This is our real job - check for pending entries in
424 * bdi_pending_list, and create the tasks that got added
425 */
426 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
427 bdi_list);
428 list_del_init(&bdi->bdi_list);
429 spin_unlock(&bdi_lock);
430
431 wb = &bdi->wb;
432 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
433 dev_name(bdi->dev));
434 /*
435 * If task creation fails, then readd the bdi to
436 * the pending list and force writeout of the bdi
437 * from this forker thread. That will free some memory
438 * and we can try again.
439 */
440 if (IS_ERR(wb->task)) {
441 wb->task = NULL;
442
443 /*
444 * Add this 'bdi' to the back, so we get
445 * a chance to flush other bdi's to free
446 * memory.
447 */
448 spin_lock(&bdi_lock);
449 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
450 spin_unlock(&bdi_lock);
451
452 bdi_flush_io(bdi);
453 }
454 }
455
456 return 0;
457}
458
459/*
460 * Add the default flusher task that gets created for any bdi
461 * that has dirty data pending writeout
462 */
 463static void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
464{
465 if (!bdi_cap_writeback_dirty(bdi))
466 return;
467
468 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
469 printk(KERN_ERR "bdi %p/%s is not registered!\n",
470 bdi, bdi->name);
471 return;
472 }
473
474 /*
 475	 * Check with the helper whether to proceed adding a task. It will only
 476	 * abort if two or more simultaneous calls to
 477	 * bdi_add_default_flusher_task() occurred; further additions will block
 478	 * waiting for previous additions to finish.
479 */
480 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
481 list_move_tail(&bdi->bdi_list, &bdi_pending_list);
482
483 /*
484 * We are now on the pending list, wake up bdi_forker_task()
485 * to finish the job and add us back to the active bdi_list
486 */
487 wake_up_process(default_backing_dev_info.wb.task);
488 }
489}
490
196int bdi_register(struct backing_dev_info *bdi, struct device *parent, 491int bdi_register(struct backing_dev_info *bdi, struct device *parent,
197 const char *fmt, ...) 492 const char *fmt, ...)
198{ 493{
@@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
211 goto exit; 506 goto exit;
212 } 507 }
213 508
509 spin_lock(&bdi_lock);
510 list_add_tail(&bdi->bdi_list, &bdi_list);
511 spin_unlock(&bdi_lock);
512
214 bdi->dev = dev; 513 bdi->dev = dev;
215 bdi_debug_register(bdi, dev_name(dev));
216 514
515 /*
516 * Just start the forker thread for our default backing_dev_info,
517 * and add other bdi's to the list. They will get a thread created
518 * on-demand when they need it.
519 */
520 if (bdi_cap_flush_forker(bdi)) {
521 struct bdi_writeback *wb = &bdi->wb;
522
523 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
524 dev_name(dev));
525 if (IS_ERR(wb->task)) {
526 wb->task = NULL;
527 ret = -ENOMEM;
528
529 spin_lock(&bdi_lock);
530 list_del(&bdi->bdi_list);
531 spin_unlock(&bdi_lock);
532 goto exit;
533 }
534 }
535
536 bdi_debug_register(bdi, dev_name(dev));
537 set_bit(BDI_registered, &bdi->state);
217exit: 538exit:
218 return ret; 539 return ret;
219} 540}
@@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
225} 546}
226EXPORT_SYMBOL(bdi_register_dev); 547EXPORT_SYMBOL(bdi_register_dev);
227 548
549/*
550 * Remove bdi from the global list and shutdown any threads we have running
551 */
552static void bdi_wb_shutdown(struct backing_dev_info *bdi)
553{
554 struct bdi_writeback *wb;
555
556 if (!bdi_cap_writeback_dirty(bdi))
557 return;
558
559 /*
560 * If setup is pending, wait for that to complete first
561 */
562 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
563 TASK_UNINTERRUPTIBLE);
564
565 /*
566 * Make sure nobody finds us on the bdi_list anymore
567 */
568 spin_lock(&bdi_lock);
569 list_del(&bdi->bdi_list);
570 spin_unlock(&bdi_lock);
571
572 /*
573 * Finally, kill the kernel threads. We don't need to be RCU
574 * safe anymore, since the bdi is gone from visibility.
575 */
576 list_for_each_entry(wb, &bdi->wb_list, list)
577 kthread_stop(wb->task);
578}
579
228void bdi_unregister(struct backing_dev_info *bdi) 580void bdi_unregister(struct backing_dev_info *bdi)
229{ 581{
230 if (bdi->dev) { 582 if (bdi->dev) {
583 if (!bdi_cap_flush_forker(bdi))
584 bdi_wb_shutdown(bdi);
231 bdi_debug_unregister(bdi); 585 bdi_debug_unregister(bdi);
232 device_unregister(bdi->dev); 586 device_unregister(bdi->dev);
233 bdi->dev = NULL; 587 bdi->dev = NULL;
@@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister);
237 591
238int bdi_init(struct backing_dev_info *bdi) 592int bdi_init(struct backing_dev_info *bdi)
239{ 593{
240 int i; 594 int i, err;
241 int err;
242 595
243 bdi->dev = NULL; 596 bdi->dev = NULL;
244 597
245 bdi->min_ratio = 0; 598 bdi->min_ratio = 0;
246 bdi->max_ratio = 100; 599 bdi->max_ratio = 100;
247 bdi->max_prop_frac = PROP_FRAC_BASE; 600 bdi->max_prop_frac = PROP_FRAC_BASE;
601 spin_lock_init(&bdi->wb_lock);
602 INIT_LIST_HEAD(&bdi->bdi_list);
603 INIT_LIST_HEAD(&bdi->wb_list);
604 INIT_LIST_HEAD(&bdi->work_list);
605
606 bdi_wb_init(&bdi->wb, bdi);
607
608 /*
609 * Just one thread support for now, hard code mask and count
610 */
611 bdi->wb_mask = 1;
612 bdi->wb_cnt = 1;
248 613
249 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 614 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
250 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 615 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
269{ 634{
270 int i; 635 int i;
271 636
637 WARN_ON(bdi_has_dirty_io(bdi));
638
272 bdi_unregister(bdi); 639 bdi_unregister(bdi);
273 640
274 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 641 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
@@ -283,7 +650,6 @@ static wait_queue_head_t congestion_wqh[2] = {
283 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 650 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
284 }; 651 };
285 652
286
287void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 653void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
288{ 654{
289 enum bdi_state bit; 655 enum bdi_state bit;
@@ -308,18 +674,18 @@ EXPORT_SYMBOL(set_bdi_congested);
308 674
309/** 675/**
310 * congestion_wait - wait for a backing_dev to become uncongested 676 * congestion_wait - wait for a backing_dev to become uncongested
311 * @rw: READ or WRITE 677 * @sync: SYNC or ASYNC IO
312 * @timeout: timeout in jiffies 678 * @timeout: timeout in jiffies
313 * 679 *
314 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit 680 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
315 * write congestion. If no backing_devs are congested then just wait for the 681 * write congestion. If no backing_devs are congested then just wait for the
316 * next write to be completed. 682 * next write to be completed.
317 */ 683 */
318long congestion_wait(int rw, long timeout) 684long congestion_wait(int sync, long timeout)
319{ 685{
320 long ret; 686 long ret;
321 DEFINE_WAIT(wait); 687 DEFINE_WAIT(wait);
322 wait_queue_head_t *wqh = &congestion_wqh[rw]; 688 wait_queue_head_t *wqh = &congestion_wqh[sync];
323 689
324 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 690 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
325 ret = io_schedule_timeout(timeout); 691 ret = io_schedule_timeout(timeout);
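
The flusher-thread machinery added above coordinates startup and teardown
through the BDI_pending bit: bdi_add_default_flusher_task() sets it when a bdi
is queued, bdi_start_fn() clears it and calls wake_up_bit() once the new thread
is running, and bdi_wb_shutdown() uses wait_on_bit() so teardown cannot race
with a thread that is still being created. A rough userspace analogue of that
handshake, using a pthread condition variable in place of the bit waitqueue
(a sketch, not kernel code; build with cc -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pending_cleared = PTHREAD_COND_INITIALIZER;
static bool pending = true;			/* stands in for BDI_pending */

/* stands in for bdi_start_fn(): announce we are up, then do "writeback" */
static void *flusher(void *arg)
{
	pthread_mutex_lock(&lock);
	pending = false;			/* clear_bit(BDI_pending) */
	pthread_cond_broadcast(&pending_cleared); /* wake_up_bit() */
	pthread_mutex_unlock(&lock);

	sleep(1);				/* pretend to flush dirty data */
	return NULL;
}

int main(void)
{
	pthread_t task;

	if (pthread_create(&task, NULL, flusher, NULL) != 0)
		return 1;

	/* stands in for bdi_wb_shutdown(): wait until setup has finished */
	pthread_mutex_lock(&lock);
	while (pending)				/* wait_on_bit(BDI_pending) */
		pthread_cond_wait(&pending_cleared, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(task, NULL);		/* kthread_stop() stand-in */
	puts("flusher setup completed before shutdown");
	return 0;
}
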
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d2a9ce952768..555d5d2731c6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,6 +12,7 @@
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kmemleak.h>
15 16
16#include <asm/bug.h> 17#include <asm/bug.h>
17#include <asm/io.h> 18#include <asm/io.h>
@@ -335,6 +336,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
335{ 336{
336 unsigned long start, end; 337 unsigned long start, end;
337 338
339 kmemleak_free_part(__va(physaddr), size);
340
338 start = PFN_UP(physaddr); 341 start = PFN_UP(physaddr);
339 end = PFN_DOWN(physaddr + size); 342 end = PFN_DOWN(physaddr + size);
340 343
@@ -354,6 +357,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
354{ 357{
355 unsigned long start, end; 358 unsigned long start, end;
356 359
360 kmemleak_free_part(__va(addr), size);
361
357 start = PFN_UP(addr); 362 start = PFN_UP(addr);
358 end = PFN_DOWN(addr + size); 363 end = PFN_DOWN(addr + size);
359 364
@@ -516,6 +521,11 @@ find_block:
516 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + 521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
517 start_off); 522 start_off);
518 memset(region, 0, size); 523 memset(region, 0, size);
524 /*
525 * The min_count is set to 0 so that bootmem allocated blocks
526 * are never reported as leaks.
527 */
528 kmemleak_alloc(region, size, 0, 0);
519 return region; 529 return region;
520 } 530 }
521 531
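
The bootmem hooks above pass min_count == 0 to kmemleak_alloc() so that bootmem
blocks can never turn "white" and be reported as leaks. The colour rules that
make this work are added to kmemleak.c later in this diff (color_white() and
color_gray(), with KMEMLEAK_GREY == 0 and KMEMLEAK_BLACK == -1). A standalone
model of just that colour decision (illustrative only, not the kmemleak
implementation):

#include <stdbool.h>
#include <stdio.h>

#define KMEMLEAK_GREY	0
#define KMEMLEAK_BLACK	-1

struct object { int count; int min_count; };

/* same colour rules as the kmemleak.c hunks in this diff */
static bool color_white(const struct object *o)
{
	return o->count != KMEMLEAK_BLACK && o->count < o->min_count;
}

static bool color_gray(const struct object *o)
{
	return o->min_count != KMEMLEAK_BLACK && o->count >= o->min_count;
}

int main(void)
{
	/* count holds the number of references found by the last scan pass */
	struct object bootmem = { .count = 0, .min_count = KMEMLEAK_GREY };
	struct object kmalloc_leak = { .count = 0, .min_count = 1 };

	printf("bootmem block:      white=%d gray=%d\n",
	       color_white(&bootmem), color_gray(&bootmem));	/* 0 1 */
	printf("unreferenced block: white=%d gray=%d\n",
	       color_white(&kmalloc_leak), color_gray(&kmalloc_leak)); /* 1 0 */
	return 0;
}
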
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0351e31f474..cafdcee154e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2370,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2370 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2370 long chg = region_truncate(&inode->i_mapping->private_list, offset);
2371 2371
2372 spin_lock(&inode->i_lock); 2372 spin_lock(&inode->i_lock);
2373 inode->i_blocks -= blocks_per_huge_page(h); 2373 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2374 spin_unlock(&inode->i_lock); 2374 spin_unlock(&inode->i_lock);
2375 2375
2376 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2376 hugetlb_put_quota(inode->i_mapping, (chg - freed));
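
The one-line hugetlb fix above scales the i_blocks adjustment by the number of
huge pages actually freed instead of subtracting a single page's worth. A
worked illustration of the difference, assuming 2 MB huge pages and the usual
512-byte i_blocks unit (so blocks_per_huge_page is 4096):

#include <stdio.h>

int main(void)
{
	unsigned long huge_page_size = 2UL << 20;	/* 2 MB huge page */
	unsigned long blocks_per_huge_page = huge_page_size / 512;
	unsigned long freed = 3;			/* huge pages released */

	/* decrement applied to inode->i_blocks before and after the fix */
	printf("old decrement: %lu blocks\n", blocks_per_huge_page);
	printf("new decrement: %lu blocks\n", blocks_per_huge_page * freed);
	return 0;
}
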
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e766e1da09d2..4ea4510e2996 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,21 +92,24 @@
92#include <linux/string.h> 92#include <linux/string.h>
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h>
95 96
96#include <asm/sections.h> 97#include <asm/sections.h>
97#include <asm/processor.h> 98#include <asm/processor.h>
98#include <asm/atomic.h> 99#include <asm/atomic.h>
99 100
101#include <linux/kmemcheck.h>
100#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
101 103
102/* 104/*
103 * Kmemleak configuration and common defines. 105 * Kmemleak configuration and common defines.
104 */ 106 */
105#define MAX_TRACE 16 /* stack trace length */ 107#define MAX_TRACE 16 /* stack trace length */
106#define REPORTS_NR 50 /* maximum number of reported leaks */
107#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ 108#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
108#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 109#define SECS_FIRST_SCAN 60 /* delay before the first scan */
109#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
110 113
111#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
112 115
@@ -120,6 +123,9 @@ struct kmemleak_scan_area {
120 size_t length; 123 size_t length;
121}; 124};
122 125
126#define KMEMLEAK_GREY 0
127#define KMEMLEAK_BLACK -1
128
123/* 129/*
124 * Structure holding the metadata for each allocated memory block. 130 * Structure holding the metadata for each allocated memory block.
125 * Modifications to such objects should be made while holding the 131 * Modifications to such objects should be made while holding the
@@ -158,6 +164,17 @@ struct kmemleak_object {
158#define OBJECT_REPORTED (1 << 1) 164#define OBJECT_REPORTED (1 << 1)
159/* flag set to not scan the object */ 165/* flag set to not scan the object */
160#define OBJECT_NO_SCAN (1 << 2) 166#define OBJECT_NO_SCAN (1 << 2)
167/* flag set on newly allocated objects */
168#define OBJECT_NEW (1 << 3)
169
170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16
172/* number of bytes to print at a time (1, 2, 4, 8) */
173#define HEX_GROUP_SIZE 1
174/* include ASCII after the hex output */
175#define HEX_ASCII 1
176/* max number of lines to be printed */
177#define HEX_MAX_LINES 2
161 178
162/* the list of all allocated objects */ 179/* the list of all allocated objects */
163static LIST_HEAD(object_list); 180static LIST_HEAD(object_list);
@@ -196,9 +213,6 @@ static int kmemleak_stack_scan = 1;
196/* protects the memory scanning, parameters and debug/kmemleak file access */ 213/* protects the memory scanning, parameters and debug/kmemleak file access */
197static DEFINE_MUTEX(scan_mutex); 214static DEFINE_MUTEX(scan_mutex);
198 215
199/* number of leaks reported (for limitation purposes) */
200static int reported_leaks;
201
202/* 216/*
203 * Early object allocation/freeing logging. Kmemleak is initialized after the 217 * Early object allocation/freeing logging. Kmemleak is initialized after the
204 * kernel allocator. However, both the kernel allocator and kmemleak may 218 * kernel allocator. However, both the kernel allocator and kmemleak may
@@ -211,6 +225,7 @@ static int reported_leaks;
211enum { 225enum {
212 KMEMLEAK_ALLOC, 226 KMEMLEAK_ALLOC,
213 KMEMLEAK_FREE, 227 KMEMLEAK_FREE,
228 KMEMLEAK_FREE_PART,
214 KMEMLEAK_NOT_LEAK, 229 KMEMLEAK_NOT_LEAK,
215 KMEMLEAK_IGNORE, 230 KMEMLEAK_IGNORE,
216 KMEMLEAK_SCAN_AREA, 231 KMEMLEAK_SCAN_AREA,
@@ -228,11 +243,14 @@ struct early_log {
228 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
229 unsigned long offset; /* scan area offset */ 244 unsigned long offset; /* scan area offset */
230 size_t length; /* scan area length */ 245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */
231}; 248};
232 249
233/* early logging buffer and current position */ 250/* early logging buffer and current position */
234static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; 251static struct early_log
235static int crt_early_log; 252 early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
253static int crt_early_log __initdata;
236 254
237static void kmemleak_disable(void); 255static void kmemleak_disable(void);
238 256
@@ -255,6 +273,35 @@ static void kmemleak_disable(void);
255} while (0) 273} while (0)
256 274
257/* 275/*
 276 * Printing of the object's hex dump to the seq file. The number of lines to be
277 * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
278 * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
279 * with the object->lock held.
280 */
281static void hex_dump_object(struct seq_file *seq,
282 struct kmemleak_object *object)
283{
284 const u8 *ptr = (const u8 *)object->pointer;
285 int i, len, remaining;
286 unsigned char linebuf[HEX_ROW_SIZE * 5];
287
288 /* limit the number of lines to HEX_MAX_LINES */
289 remaining = len =
290 min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
291
292 seq_printf(seq, " hex dump (first %d bytes):\n", len);
293 for (i = 0; i < len; i += HEX_ROW_SIZE) {
294 int linelen = min(remaining, HEX_ROW_SIZE);
295
296 remaining -= HEX_ROW_SIZE;
297 hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
298 HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
299 HEX_ASCII);
300 seq_printf(seq, " %s\n", linebuf);
301 }
302}
303
304/*
258 * Object colors, encoded with count and min_count: 305 * Object colors, encoded with count and min_count:
259 * - white - orphan object, not enough references to it (count < min_count) 306 * - white - orphan object, not enough references to it (count < min_count)
260 * - gray - not orphan, not marked as false positive (min_count == 0) or 307 * - gray - not orphan, not marked as false positive (min_count == 0) or
@@ -264,14 +311,21 @@ static void kmemleak_disable(void);
264 * Newly created objects don't have any color assigned (object->count == -1) 311 * Newly created objects don't have any color assigned (object->count == -1)
265 * before the next memory scan when they become white. 312 * before the next memory scan when they become white.
266 */ 313 */
267static int color_white(const struct kmemleak_object *object) 314static bool color_white(const struct kmemleak_object *object)
315{
316 return object->count != KMEMLEAK_BLACK &&
317 object->count < object->min_count;
318}
319
320static bool color_gray(const struct kmemleak_object *object)
268{ 321{
269 return object->count != -1 && object->count < object->min_count; 322 return object->min_count != KMEMLEAK_BLACK &&
323 object->count >= object->min_count;
270} 324}
271 325
272static int color_gray(const struct kmemleak_object *object) 326static bool color_black(const struct kmemleak_object *object)
273{ 327{
274 return object->min_count != -1 && object->count >= object->min_count; 328 return object->min_count == KMEMLEAK_BLACK;
275} 329}
276 330
277/* 331/*
@@ -279,7 +333,7 @@ static int color_gray(const struct kmemleak_object *object)
279 * not be deleted and have a minimum age to avoid false positives caused by 333 * not be deleted and have a minimum age to avoid false positives caused by
280 * pointers temporarily stored in CPU registers. 334 * pointers temporarily stored in CPU registers.
281 */ 335 */
282static int unreferenced_object(struct kmemleak_object *object) 336static bool unreferenced_object(struct kmemleak_object *object)
283{ 337{
284 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
285 time_before_eq(object->jiffies + jiffies_min_age, 339 time_before_eq(object->jiffies + jiffies_min_age,
@@ -299,6 +353,7 @@ static void print_unreferenced(struct seq_file *seq,
299 object->pointer, object->size); 353 object->pointer, object->size);
300 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
301 object->comm, object->pid, object->jiffies); 355 object->comm, object->pid, object->jiffies);
356 hex_dump_object(seq, object);
302 seq_printf(seq, " backtrace:\n"); 357 seq_printf(seq, " backtrace:\n");
303 358
304 for (i = 0; i < object->trace_len; i++) { 359 for (i = 0; i < object->trace_len; i++) {
@@ -325,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
325 object->comm, object->pid, object->jiffies); 380 object->comm, object->pid, object->jiffies);
326 pr_notice(" min_count = %d\n", object->min_count); 381 pr_notice(" min_count = %d\n", object->min_count);
327 pr_notice(" count = %d\n", object->count); 382 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags);
328 pr_notice(" backtrace:\n"); 384 pr_notice(" backtrace:\n");
329 print_stack_trace(&trace, 4); 385 print_stack_trace(&trace, 4);
330} 386}
@@ -429,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
429} 485}
430 486
431/* 487/*
488 * Save stack trace to the given array of MAX_TRACE size.
489 */
490static int __save_stack_trace(unsigned long *trace)
491{
492 struct stack_trace stack_trace;
493
494 stack_trace.max_entries = MAX_TRACE;
495 stack_trace.nr_entries = 0;
496 stack_trace.entries = trace;
497 stack_trace.skip = 2;
498 save_stack_trace(&stack_trace);
499
500 return stack_trace.nr_entries;
501}
502
503/*
432 * Create the metadata (struct kmemleak_object) corresponding to an allocated 504 * Create the metadata (struct kmemleak_object) corresponding to an allocated
433 * memory block and add it to the object_list and object_tree_root. 505 * memory block and add it to the object_list and object_tree_root.
434 */ 506 */
435static void create_object(unsigned long ptr, size_t size, int min_count, 507static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
436 gfp_t gfp) 508 int min_count, gfp_t gfp)
437{ 509{
438 unsigned long flags; 510 unsigned long flags;
439 struct kmemleak_object *object; 511 struct kmemleak_object *object;
440 struct prio_tree_node *node; 512 struct prio_tree_node *node;
441 struct stack_trace trace;
442 513
443 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
444 if (!object) { 515 if (!object) {
445 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
446 return; 517 return NULL;
447 } 518 }
448 519
449 INIT_LIST_HEAD(&object->object_list); 520 INIT_LIST_HEAD(&object->object_list);
@@ -451,7 +522,7 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
451 INIT_HLIST_HEAD(&object->area_list); 522 INIT_HLIST_HEAD(&object->area_list);
452 spin_lock_init(&object->lock); 523 spin_lock_init(&object->lock);
453 atomic_set(&object->use_count, 1); 524 atomic_set(&object->use_count, 1);
454 object->flags = OBJECT_ALLOCATED; 525 object->flags = OBJECT_ALLOCATED | OBJECT_NEW;
455 object->pointer = ptr; 526 object->pointer = ptr;
456 object->size = size; 527 object->size = size;
457 object->min_count = min_count; 528 object->min_count = min_count;
@@ -477,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
477 } 548 }
478 549
479 /* kernel backtrace */ 550 /* kernel backtrace */
480 trace.max_entries = MAX_TRACE; 551 object->trace_len = __save_stack_trace(object->trace);
481 trace.nr_entries = 0;
482 trace.entries = object->trace;
483 trace.skip = 1;
484 save_stack_trace(&trace);
485 object->trace_len = trace.nr_entries;
486 552
487 INIT_PRIO_TREE_NODE(&object->tree_node); 553 INIT_PRIO_TREE_NODE(&object->tree_node);
488 object->tree_node.start = ptr; 554 object->tree_node.start = ptr;
489 object->tree_node.last = ptr + size - 1; 555 object->tree_node.last = ptr + size - 1;
490 556
491 write_lock_irqsave(&kmemleak_lock, flags); 557 write_lock_irqsave(&kmemleak_lock, flags);
558
492 min_addr = min(min_addr, ptr); 559 min_addr = min(min_addr, ptr);
493 max_addr = max(max_addr, ptr + size); 560 max_addr = max(max_addr, ptr + size);
494 node = prio_tree_insert(&object_tree_root, &object->tree_node); 561 node = prio_tree_insert(&object_tree_root, &object->tree_node);
@@ -499,47 +566,36 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
499 * random memory blocks. 566 * random memory blocks.
500 */ 567 */
501 if (node != &object->tree_node) { 568 if (node != &object->tree_node) {
502 unsigned long flags;
503
504 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 569 kmemleak_stop("Cannot insert 0x%lx into the object search tree "
505 "(already existing)\n", ptr); 570 "(already existing)\n", ptr);
506 object = lookup_object(ptr, 1); 571 object = lookup_object(ptr, 1);
507 spin_lock_irqsave(&object->lock, flags); 572 spin_lock(&object->lock);
508 dump_object_info(object); 573 dump_object_info(object);
509 spin_unlock_irqrestore(&object->lock, flags); 574 spin_unlock(&object->lock);
510 575
511 goto out; 576 goto out;
512 } 577 }
513 list_add_tail_rcu(&object->object_list, &object_list); 578 list_add_tail_rcu(&object->object_list, &object_list);
514out: 579out:
515 write_unlock_irqrestore(&kmemleak_lock, flags); 580 write_unlock_irqrestore(&kmemleak_lock, flags);
581 return object;
516} 582}
517 583
518/* 584/*
519 * Remove the metadata (struct kmemleak_object) for a memory block from the 585 * Remove the metadata (struct kmemleak_object) for a memory block from the
520 * object_list and object_tree_root and decrement its use_count. 586 * object_list and object_tree_root and decrement its use_count.
521 */ 587 */
522static void delete_object(unsigned long ptr) 588static void __delete_object(struct kmemleak_object *object)
523{ 589{
524 unsigned long flags; 590 unsigned long flags;
525 struct kmemleak_object *object;
526 591
527 write_lock_irqsave(&kmemleak_lock, flags); 592 write_lock_irqsave(&kmemleak_lock, flags);
528 object = lookup_object(ptr, 0);
529 if (!object) {
530#ifdef DEBUG
531 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
532 ptr);
533#endif
534 write_unlock_irqrestore(&kmemleak_lock, flags);
535 return;
536 }
537 prio_tree_remove(&object_tree_root, &object->tree_node); 593 prio_tree_remove(&object_tree_root, &object->tree_node);
538 list_del_rcu(&object->object_list); 594 list_del_rcu(&object->object_list);
539 write_unlock_irqrestore(&kmemleak_lock, flags); 595 write_unlock_irqrestore(&kmemleak_lock, flags);
540 596
541 WARN_ON(!(object->flags & OBJECT_ALLOCATED)); 597 WARN_ON(!(object->flags & OBJECT_ALLOCATED));
542 WARN_ON(atomic_read(&object->use_count) < 1); 598 WARN_ON(atomic_read(&object->use_count) < 2);
543 599
544 /* 600 /*
545 * Locking here also ensures that the corresponding memory block 601 * Locking here also ensures that the corresponding memory block
@@ -552,48 +608,115 @@ static void delete_object(unsigned long ptr)
552} 608}
553 609
554/* 610/*
555 * Make a object permanently as gray-colored so that it can no longer be 611 * Look up the metadata (struct kmemleak_object) corresponding to ptr and
556 * reported as a leak. This is used in general to mark a false positive. 612 * delete it.
557 */ 613 */
558static void make_gray_object(unsigned long ptr) 614static void delete_object_full(unsigned long ptr)
559{ 615{
560 unsigned long flags;
561 struct kmemleak_object *object; 616 struct kmemleak_object *object;
562 617
563 object = find_and_get_object(ptr, 0); 618 object = find_and_get_object(ptr, 0);
564 if (!object) { 619 if (!object) {
565 kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); 620#ifdef DEBUG
621 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
622 ptr);
623#endif
566 return; 624 return;
567 } 625 }
568 626 __delete_object(object);
569 spin_lock_irqsave(&object->lock, flags);
570 object->min_count = 0;
571 spin_unlock_irqrestore(&object->lock, flags);
572 put_object(object); 627 put_object(object);
573} 628}
574 629
575/* 630/*
576 * Mark the object as black-colored so that it is ignored from scans and 631 * Look up the metadata (struct kmemleak_object) corresponding to ptr and
577 * reporting. 632 * delete it. If the memory block is partially freed, the function may create
633 * additional metadata for the remaining parts of the block.
578 */ 634 */
579static void make_black_object(unsigned long ptr) 635static void delete_object_part(unsigned long ptr, size_t size)
580{ 636{
581 unsigned long flags;
582 struct kmemleak_object *object; 637 struct kmemleak_object *object;
638 unsigned long start, end;
583 639
584 object = find_and_get_object(ptr, 0); 640 object = find_and_get_object(ptr, 1);
585 if (!object) { 641 if (!object) {
586 kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr); 642#ifdef DEBUG
643 kmemleak_warn("Partially freeing unknown object at 0x%08lx "
644 "(size %zu)\n", ptr, size);
645#endif
587 return; 646 return;
588 } 647 }
648 __delete_object(object);
649
650 /*
651 * Create one or two objects that may result from the memory block
652 * split. Note that partial freeing is only done by free_bootmem() and
653 * this happens before kmemleak_init() is called. The path below is
654 * only executed during early log recording in kmemleak_init(), so
655 * GFP_KERNEL is enough.
656 */
657 start = object->pointer;
658 end = object->pointer + object->size;
659 if (ptr > start)
660 create_object(start, ptr - start, object->min_count,
661 GFP_KERNEL);
662 if (ptr + size < end)
663 create_object(ptr + size, end - ptr - size, object->min_count,
664 GFP_KERNEL);
665
666 put_object(object);
667}
668
669static void __paint_it(struct kmemleak_object *object, int color)
670{
671 object->min_count = color;
672 if (color == KMEMLEAK_BLACK)
673 object->flags |= OBJECT_NO_SCAN;
674}
675
676static void paint_it(struct kmemleak_object *object, int color)
677{
678 unsigned long flags;
589 679
590 spin_lock_irqsave(&object->lock, flags); 680 spin_lock_irqsave(&object->lock, flags);
591 object->min_count = -1; 681 __paint_it(object, color);
592 spin_unlock_irqrestore(&object->lock, flags); 682 spin_unlock_irqrestore(&object->lock, flags);
683}
684
685static void paint_ptr(unsigned long ptr, int color)
686{
687 struct kmemleak_object *object;
688
689 object = find_and_get_object(ptr, 0);
690 if (!object) {
691 kmemleak_warn("Trying to color unknown object "
692 "at 0x%08lx as %s\n", ptr,
693 (color == KMEMLEAK_GREY) ? "Grey" :
694 (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
695 return;
696 }
697 paint_it(object, color);
593 put_object(object); 698 put_object(object);
594} 699}
595 700
596/* 701/*
 702 * Make an object permanently gray-colored so that it can no longer be
703 * reported as a leak. This is used in general to mark a false positive.
704 */
705static void make_gray_object(unsigned long ptr)
706{
707 paint_ptr(ptr, KMEMLEAK_GREY);
708}
709
710/*
711 * Mark the object as black-colored so that it is ignored from scans and
712 * reporting.
713 */
714static void make_black_object(unsigned long ptr)
715{
716 paint_ptr(ptr, KMEMLEAK_BLACK);
717}
718
719/*
597 * Add a scanning area to the object. If at least one such area is added, 720 * Add a scanning area to the object. If at least one such area is added,
598 * kmemleak will only scan these ranges rather than the whole memory block. 721 * kmemleak will only scan these ranges rather than the whole memory block.
599 */ 722 */
@@ -662,14 +785,15 @@ static void object_no_scan(unsigned long ptr)
662 * Log an early kmemleak_* call to the early_log buffer. These calls will be 785 * Log an early kmemleak_* call to the early_log buffer. These calls will be
663 * processed later once kmemleak is fully initialized. 786 * processed later once kmemleak is fully initialized.
664 */ 787 */
665static void log_early(int op_type, const void *ptr, size_t size, 788static void __init log_early(int op_type, const void *ptr, size_t size,
666 int min_count, unsigned long offset, size_t length) 789 int min_count, unsigned long offset, size_t length)
667{ 790{
668 unsigned long flags; 791 unsigned long flags;
669 struct early_log *log; 792 struct early_log *log;
670 793
671 if (crt_early_log >= ARRAY_SIZE(early_log)) { 794 if (crt_early_log >= ARRAY_SIZE(early_log)) {
672 pr_warning("Early log buffer exceeded\n"); 795 pr_warning("Early log buffer exceeded, "
796 "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n");
673 kmemleak_disable(); 797 kmemleak_disable();
674 return; 798 return;
675 } 799 }
@@ -686,16 +810,45 @@ static void log_early(int op_type, const void *ptr, size_t size,
686 log->min_count = min_count; 810 log->min_count = min_count;
687 log->offset = offset; 811 log->offset = offset;
688 log->length = length; 812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace);
689 crt_early_log++; 815 crt_early_log++;
690 local_irq_restore(flags); 816 local_irq_restore(flags);
691} 817}
692 818
693/* 819/*
820 * Log an early allocated block and populate the stack trace.
821 */
822static void early_alloc(struct early_log *log)
823{
824 struct kmemleak_object *object;
825 unsigned long flags;
826 int i;
827
828 if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
829 return;
830
831 /*
832 * RCU locking needed to ensure object is not freed via put_object().
833 */
834 rcu_read_lock();
835 object = create_object((unsigned long)log->ptr, log->size,
836 log->min_count, GFP_KERNEL);
837 spin_lock_irqsave(&object->lock, flags);
838 for (i = 0; i < log->trace_len; i++)
839 object->trace[i] = log->trace[i];
840 object->trace_len = log->trace_len;
841 spin_unlock_irqrestore(&object->lock, flags);
842 rcu_read_unlock();
843}
844
845/*
694 * Memory allocation function callback. This function is called from the 846 * Memory allocation function callback. This function is called from the
695 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, 847 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
696 * vmalloc etc.). 848 * vmalloc etc.).
697 */ 849 */
698void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) 850void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
851 gfp_t gfp)
699{ 852{
700 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); 853 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
701 854
@@ -710,22 +863,37 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
710 * Memory freeing function callback. This function is called from the kernel 863 * Memory freeing function callback. This function is called from the kernel
711 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). 864 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
712 */ 865 */
713void kmemleak_free(const void *ptr) 866void __ref kmemleak_free(const void *ptr)
714{ 867{
715 pr_debug("%s(0x%p)\n", __func__, ptr); 868 pr_debug("%s(0x%p)\n", __func__, ptr);
716 869
717 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 870 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
718 delete_object((unsigned long)ptr); 871 delete_object_full((unsigned long)ptr);
719 else if (atomic_read(&kmemleak_early_log)) 872 else if (atomic_read(&kmemleak_early_log))
720 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); 873 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
721} 874}
722EXPORT_SYMBOL_GPL(kmemleak_free); 875EXPORT_SYMBOL_GPL(kmemleak_free);
723 876
724/* 877/*
878 * Partial memory freeing function callback. This function is usually called
 879 * from the bootmem allocator when (part of) a memory block is freed.
880 */
881void __ref kmemleak_free_part(const void *ptr, size_t size)
882{
883 pr_debug("%s(0x%p)\n", __func__, ptr);
884
885 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
886 delete_object_part((unsigned long)ptr, size);
887 else if (atomic_read(&kmemleak_early_log))
888 log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0);
889}
890EXPORT_SYMBOL_GPL(kmemleak_free_part);
891
892/*
725 * Mark an already allocated memory block as a false positive. This will cause 893 * Mark an already allocated memory block as a false positive. This will cause
726 * the block to no longer be reported as leak and always be scanned. 894 * the block to no longer be reported as leak and always be scanned.
727 */ 895 */
728void kmemleak_not_leak(const void *ptr) 896void __ref kmemleak_not_leak(const void *ptr)
729{ 897{
730 pr_debug("%s(0x%p)\n", __func__, ptr); 898 pr_debug("%s(0x%p)\n", __func__, ptr);
731 899
@@ -741,7 +909,7 @@ EXPORT_SYMBOL(kmemleak_not_leak);
741 * corresponding block is not a leak and does not contain any references to 909 * corresponding block is not a leak and does not contain any references to
742 * other allocated memory blocks. 910 * other allocated memory blocks.
743 */ 911 */
744void kmemleak_ignore(const void *ptr) 912void __ref kmemleak_ignore(const void *ptr)
745{ 913{
746 pr_debug("%s(0x%p)\n", __func__, ptr); 914 pr_debug("%s(0x%p)\n", __func__, ptr);
747 915
@@ -755,8 +923,8 @@ EXPORT_SYMBOL(kmemleak_ignore);
755/* 923/*
756 * Limit the range to be scanned in an allocated memory block. 924 * Limit the range to be scanned in an allocated memory block.
757 */ 925 */
758void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, 926void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
759 gfp_t gfp) 927 size_t length, gfp_t gfp)
760{ 928{
761 pr_debug("%s(0x%p)\n", __func__, ptr); 929 pr_debug("%s(0x%p)\n", __func__, ptr);
762 930
@@ -770,7 +938,7 @@ EXPORT_SYMBOL(kmemleak_scan_area);
770/* 938/*
771 * Inform kmemleak not to scan the given memory block. 939 * Inform kmemleak not to scan the given memory block.
772 */ 940 */
773void kmemleak_no_scan(const void *ptr) 941void __ref kmemleak_no_scan(const void *ptr)
774{ 942{
775 pr_debug("%s(0x%p)\n", __func__, ptr); 943 pr_debug("%s(0x%p)\n", __func__, ptr);
776 944
@@ -807,20 +975,29 @@ static int scan_should_stop(void)
807 * found to the gray list. 975 * found to the gray list.
808 */ 976 */
809static void scan_block(void *_start, void *_end, 977static void scan_block(void *_start, void *_end,
810 struct kmemleak_object *scanned) 978 struct kmemleak_object *scanned, int allow_resched)
811{ 979{
812 unsigned long *ptr; 980 unsigned long *ptr;
813 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); 981 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
814 unsigned long *end = _end - (BYTES_PER_POINTER - 1); 982 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
815 983
816 for (ptr = start; ptr < end; ptr++) { 984 for (ptr = start; ptr < end; ptr++) {
817 unsigned long flags;
818 unsigned long pointer = *ptr;
819 struct kmemleak_object *object; 985 struct kmemleak_object *object;
986 unsigned long flags;
987 unsigned long pointer;
820 988
989 if (allow_resched)
990 cond_resched();
821 if (scan_should_stop()) 991 if (scan_should_stop())
822 break; 992 break;
823 993
994 /* don't scan uninitialized memory */
995 if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
996 BYTES_PER_POINTER))
997 continue;
998
999 pointer = *ptr;
1000
824 object = find_and_get_object(pointer, 1); 1001 object = find_and_get_object(pointer, 1);
825 if (!object) 1002 if (!object)
826 continue; 1003 continue;
@@ -879,14 +1056,25 @@ static void scan_object(struct kmemleak_object *object)
879 if (!(object->flags & OBJECT_ALLOCATED)) 1056 if (!(object->flags & OBJECT_ALLOCATED))
880 /* already freed object */ 1057 /* already freed object */
881 goto out; 1058 goto out;
882 if (hlist_empty(&object->area_list)) 1059 if (hlist_empty(&object->area_list)) {
883 scan_block((void *)object->pointer, 1060 void *start = (void *)object->pointer;
884 (void *)(object->pointer + object->size), object); 1061 void *end = (void *)(object->pointer + object->size);
885 else 1062
1063 while (start < end && (object->flags & OBJECT_ALLOCATED) &&
1064 !(object->flags & OBJECT_NO_SCAN)) {
1065 scan_block(start, min(start + MAX_SCAN_SIZE, end),
1066 object, 0);
1067 start += MAX_SCAN_SIZE;
1068
1069 spin_unlock_irqrestore(&object->lock, flags);
1070 cond_resched();
1071 spin_lock_irqsave(&object->lock, flags);
1072 }
1073 } else
886 hlist_for_each_entry(area, elem, &object->area_list, node) 1074 hlist_for_each_entry(area, elem, &object->area_list, node)
887 scan_block((void *)(object->pointer + area->offset), 1075 scan_block((void *)(object->pointer + area->offset),
888 (void *)(object->pointer + area->offset 1076 (void *)(object->pointer + area->offset
889 + area->length), object); 1077 + area->length), object, 0);
890out: 1078out:
891 spin_unlock_irqrestore(&object->lock, flags); 1079 spin_unlock_irqrestore(&object->lock, flags);
892} 1080}
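
The scan_object() change above walks large blocks in MAX_SCAN_SIZE chunks,
dropping object->lock and rescheduling between chunks so that one huge object
cannot hold the lock (and the CPU) for its whole length. A userspace analogue
of the same chunking pattern, with a pthread mutex and sched_yield() standing
in for the kernel primitives (a sketch, not kmemleak code; build with
cc -pthread):

#include <pthread.h>
#include <sched.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define MAX_SCAN_SIZE 4096	/* bytes scanned per lock hold, as in the patch */

static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;

/* counting zero bytes stands in for "look for pointers" */
static size_t scan_chunk(const unsigned char *start, const unsigned char *end)
{
	size_t hits = 0;

	while (start < end)
		hits += (*start++ == 0);
	return hits;
}

static size_t scan_object(const unsigned char *obj, size_t size)
{
	const unsigned char *start = obj, *end = obj + size;
	size_t hits = 0;

	pthread_mutex_lock(&object_lock);
	while (start < end) {
		const unsigned char *chunk_end = start + MAX_SCAN_SIZE;

		if (chunk_end > end)
			chunk_end = end;
		hits += scan_chunk(start, chunk_end);
		start = chunk_end;

		/* give others a chance between chunks, like cond_resched() */
		pthread_mutex_unlock(&object_lock);
		sched_yield();
		pthread_mutex_lock(&object_lock);
	}
	pthread_mutex_unlock(&object_lock);
	return hits;
}

int main(void)
{
	static unsigned char blob[64 * 1024];

	memset(blob, 0, sizeof(blob));
	printf("zero bytes found: %zu\n", scan_object(blob, sizeof(blob)));
	return 0;
}
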
@@ -900,9 +1088,9 @@ static void kmemleak_scan(void)
900{ 1088{
901 unsigned long flags; 1089 unsigned long flags;
902 struct kmemleak_object *object, *tmp; 1090 struct kmemleak_object *object, *tmp;
903 struct task_struct *task;
904 int i; 1091 int i;
905 int new_leaks = 0; 1092 int new_leaks = 0;
1093 int gray_list_pass = 0;
906 1094
907 jiffies_last_scan = jiffies; 1095 jiffies_last_scan = jiffies;
908 1096
@@ -923,6 +1111,7 @@ static void kmemleak_scan(void)
923#endif 1111#endif
924 /* reset the reference count (whiten the object) */ 1112 /* reset the reference count (whiten the object) */
925 object->count = 0; 1113 object->count = 0;
1114 object->flags &= ~OBJECT_NEW;
926 if (color_gray(object) && get_object(object)) 1115 if (color_gray(object) && get_object(object))
927 list_add_tail(&object->gray_list, &gray_list); 1116 list_add_tail(&object->gray_list, &gray_list);
928 1117
@@ -931,14 +1120,14 @@ static void kmemleak_scan(void)
931 rcu_read_unlock(); 1120 rcu_read_unlock();
932 1121
933 /* data/bss scanning */ 1122 /* data/bss scanning */
934 scan_block(_sdata, _edata, NULL); 1123 scan_block(_sdata, _edata, NULL, 1);
935 scan_block(__bss_start, __bss_stop, NULL); 1124 scan_block(__bss_start, __bss_stop, NULL, 1);
936 1125
937#ifdef CONFIG_SMP 1126#ifdef CONFIG_SMP
938 /* per-cpu sections scanning */ 1127 /* per-cpu sections scanning */
939 for_each_possible_cpu(i) 1128 for_each_possible_cpu(i)
940 scan_block(__per_cpu_start + per_cpu_offset(i), 1129 scan_block(__per_cpu_start + per_cpu_offset(i),
941 __per_cpu_end + per_cpu_offset(i), NULL); 1130 __per_cpu_end + per_cpu_offset(i), NULL, 1);
942#endif 1131#endif
943 1132
944 /* 1133 /*
@@ -960,19 +1149,21 @@ static void kmemleak_scan(void)
960 /* only scan if page is in use */ 1149 /* only scan if page is in use */
961 if (page_count(page) == 0) 1150 if (page_count(page) == 0)
962 continue; 1151 continue;
963 scan_block(page, page + 1, NULL); 1152 scan_block(page, page + 1, NULL, 1);
964 } 1153 }
965 } 1154 }
966 1155
967 /* 1156 /*
968 * Scanning the task stacks may introduce false negatives and it is 1157 * Scanning the task stacks (may introduce false negatives).
969 * not enabled by default.
970 */ 1158 */
971 if (kmemleak_stack_scan) { 1159 if (kmemleak_stack_scan) {
1160 struct task_struct *p, *g;
1161
972 read_lock(&tasklist_lock); 1162 read_lock(&tasklist_lock);
973 for_each_process(task) 1163 do_each_thread(g, p) {
974 scan_block(task_stack_page(task), 1164 scan_block(task_stack_page(p), task_stack_page(p) +
975 task_stack_page(task) + THREAD_SIZE, NULL); 1165 THREAD_SIZE, NULL, 0);
1166 } while_each_thread(g, p);
976 read_unlock(&tasklist_lock); 1167 read_unlock(&tasklist_lock);
977 } 1168 }
978 1169
@@ -984,6 +1175,7 @@ static void kmemleak_scan(void)
984 * kmemleak objects cannot be freed from outside the loop because their 1175 * kmemleak objects cannot be freed from outside the loop because their
985 * use_count was increased. 1176 * use_count was increased.
986 */ 1177 */
1178repeat:
987 object = list_entry(gray_list.next, typeof(*object), gray_list); 1179 object = list_entry(gray_list.next, typeof(*object), gray_list);
988 while (&object->gray_list != &gray_list) { 1180 while (&object->gray_list != &gray_list) {
989 cond_resched(); 1181 cond_resched();
@@ -1001,12 +1193,38 @@ static void kmemleak_scan(void)
1001 1193
1002 object = tmp; 1194 object = tmp;
1003 } 1195 }
1196
1197 if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
1198 goto scan_end;
1199
1200 /*
1201 * Check for new objects allocated during this scanning and add them
1202 * to the gray list.
1203 */
1204 rcu_read_lock();
1205 list_for_each_entry_rcu(object, &object_list, object_list) {
1206 spin_lock_irqsave(&object->lock, flags);
1207 if ((object->flags & OBJECT_NEW) && !color_black(object) &&
1208 get_object(object)) {
1209 object->flags &= ~OBJECT_NEW;
1210 list_add_tail(&object->gray_list, &gray_list);
1211 }
1212 spin_unlock_irqrestore(&object->lock, flags);
1213 }
1214 rcu_read_unlock();
1215
1216 if (!list_empty(&gray_list))
1217 goto repeat;
1218
1219scan_end:
1004 WARN_ON(!list_empty(&gray_list)); 1220 WARN_ON(!list_empty(&gray_list));
1005 1221
1006 /* 1222 /*
1007 * If scanning was stopped do not report any new unreferenced objects. 1223 * If scanning was stopped or new objects were being allocated at a
1224 * higher rate than gray list scanning, do not report any new
1225 * unreferenced objects.
1008 */ 1226 */
1009 if (scan_should_stop()) 1227 if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES)
1010 return; 1228 return;
1011 1229
1012 /* 1230 /*
@@ -1039,6 +1257,7 @@ static int kmemleak_scan_thread(void *arg)
1039 static int first_run = 1; 1257 static int first_run = 1;
1040 1258
1041 pr_info("Automatic memory scanning thread started\n"); 1259 pr_info("Automatic memory scanning thread started\n");
1260 set_user_nice(current, 10);
1042 1261
1043 /* 1262 /*
1044 * Wait before the first scan to allow the system to fully initialize. 1263 * Wait before the first scan to allow the system to fully initialize.
@@ -1069,7 +1288,7 @@ static int kmemleak_scan_thread(void *arg)
1069 * Start the automatic memory scanning thread. This function must be called 1288 * Start the automatic memory scanning thread. This function must be called
1070 * with the scan_mutex held. 1289 * with the scan_mutex held.
1071 */ 1290 */
1072void start_scan_thread(void) 1291static void start_scan_thread(void)
1073{ 1292{
1074 if (scan_thread) 1293 if (scan_thread)
1075 return; 1294 return;
@@ -1084,7 +1303,7 @@ void start_scan_thread(void)
1084 * Stop the automatic memory scanning thread. This function must be called 1303 * Stop the automatic memory scanning thread. This function must be called
1085 * with the scan_mutex held. 1304 * with the scan_mutex held.
1086 */ 1305 */
1087void stop_scan_thread(void) 1306static void stop_scan_thread(void)
1088{ 1307{
1089 if (scan_thread) { 1308 if (scan_thread) {
1090 kthread_stop(scan_thread); 1309 kthread_stop(scan_thread);
@@ -1101,11 +1320,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
1101{ 1320{
1102 struct kmemleak_object *object; 1321 struct kmemleak_object *object;
1103 loff_t n = *pos; 1322 loff_t n = *pos;
1323 int err;
1104 1324
1105 if (!n) 1325 err = mutex_lock_interruptible(&scan_mutex);
1106 reported_leaks = 0; 1326 if (err < 0)
1107 if (reported_leaks >= REPORTS_NR) 1327 return ERR_PTR(err);
1108 return NULL;
1109 1328
1110 rcu_read_lock(); 1329 rcu_read_lock();
1111 list_for_each_entry_rcu(object, &object_list, object_list) { 1330 list_for_each_entry_rcu(object, &object_list, object_list) {
@@ -1116,7 +1335,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
1116 } 1335 }
1117 object = NULL; 1336 object = NULL;
1118out: 1337out:
1119 rcu_read_unlock();
1120 return object; 1338 return object;
1121} 1339}
1122 1340
@@ -1131,17 +1349,13 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1131 struct list_head *n = &prev_obj->object_list; 1349 struct list_head *n = &prev_obj->object_list;
1132 1350
1133 ++(*pos); 1351 ++(*pos);
1134 if (reported_leaks >= REPORTS_NR)
1135 goto out;
1136 1352
1137 rcu_read_lock();
1138 list_for_each_continue_rcu(n, &object_list) { 1353 list_for_each_continue_rcu(n, &object_list) {
1139 next_obj = list_entry(n, struct kmemleak_object, object_list); 1354 next_obj = list_entry(n, struct kmemleak_object, object_list);
1140 if (get_object(next_obj)) 1355 if (get_object(next_obj))
1141 break; 1356 break;
1142 } 1357 }
1143 rcu_read_unlock(); 1358
1144out:
1145 put_object(prev_obj); 1359 put_object(prev_obj);
1146 return next_obj; 1360 return next_obj;
1147} 1361}
@@ -1151,8 +1365,16 @@ out:
1151 */ 1365 */
1152static void kmemleak_seq_stop(struct seq_file *seq, void *v) 1366static void kmemleak_seq_stop(struct seq_file *seq, void *v)
1153{ 1367{
1154 if (v) 1368 if (!IS_ERR(v)) {
1155 put_object(v); 1369 /*
1370	 * kmemleak_seq_start may return ERR_PTR if waiting for the
1371	 * scan_mutex was interrupted, so only release it if !IS_ERR.
1372 */
1373 rcu_read_unlock();
1374 mutex_unlock(&scan_mutex);
1375 if (v)
1376 put_object(v);
1377 }
1156} 1378}
1157 1379
1158/* 1380/*
@@ -1164,10 +1386,8 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
1164 unsigned long flags; 1386 unsigned long flags;
1165 1387
1166 spin_lock_irqsave(&object->lock, flags); 1388 spin_lock_irqsave(&object->lock, flags);
1167 if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) { 1389 if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
1168 print_unreferenced(seq, object); 1390 print_unreferenced(seq, object);
1169 reported_leaks++;
1170 }
1171 spin_unlock_irqrestore(&object->lock, flags); 1391 spin_unlock_irqrestore(&object->lock, flags);
1172 return 0; 1392 return 0;
1173} 1393}
@@ -1181,36 +1401,58 @@ static const struct seq_operations kmemleak_seq_ops = {
1181 1401
1182static int kmemleak_open(struct inode *inode, struct file *file) 1402static int kmemleak_open(struct inode *inode, struct file *file)
1183{ 1403{
1184 int ret = 0;
1185
1186 if (!atomic_read(&kmemleak_enabled)) 1404 if (!atomic_read(&kmemleak_enabled))
1187 return -EBUSY; 1405 return -EBUSY;
1188 1406
1189 ret = mutex_lock_interruptible(&scan_mutex); 1407 return seq_open(file, &kmemleak_seq_ops);
1190 if (ret < 0)
1191 goto out;
1192 if (file->f_mode & FMODE_READ) {
1193 ret = seq_open(file, &kmemleak_seq_ops);
1194 if (ret < 0)
1195 goto scan_unlock;
1196 }
1197 return ret;
1198
1199scan_unlock:
1200 mutex_unlock(&scan_mutex);
1201out:
1202 return ret;
1203} 1408}
1204 1409
1205static int kmemleak_release(struct inode *inode, struct file *file) 1410static int kmemleak_release(struct inode *inode, struct file *file)
1206{ 1411{
1207 int ret = 0; 1412 return seq_release(inode, file);
1413}
1208 1414
1209 if (file->f_mode & FMODE_READ) 1415static int dump_str_object_info(const char *str)
1210 seq_release(inode, file); 1416{
1211 mutex_unlock(&scan_mutex); 1417 unsigned long flags;
1418 struct kmemleak_object *object;
1419 unsigned long addr;
1212 1420
1213	return ret;	1421	addr = simple_strtoul(str, NULL, 0);
1422 object = find_and_get_object(addr, 0);
1423 if (!object) {
1424 pr_info("Unknown object at 0x%08lx\n", addr);
1425 return -EINVAL;
1426 }
1427
1428 spin_lock_irqsave(&object->lock, flags);
1429 dump_object_info(object);
1430 spin_unlock_irqrestore(&object->lock, flags);
1431
1432 put_object(object);
1433 return 0;
1434}
1435
1436/*
1437 * We use grey instead of black to ensure we can do future scans on the same
1438	 * objects. If we did not do future scans, these black objects could
1439 * potentially contain references to newly allocated objects in the future and
1440 * we'd end up with false positives.
1441 */
1442static void kmemleak_clear(void)
1443{
1444 struct kmemleak_object *object;
1445 unsigned long flags;
1446
1447 rcu_read_lock();
1448 list_for_each_entry_rcu(object, &object_list, object_list) {
1449 spin_lock_irqsave(&object->lock, flags);
1450 if ((object->flags & OBJECT_REPORTED) &&
1451 unreferenced_object(object))
1452 __paint_it(object, KMEMLEAK_GREY);
1453 spin_unlock_irqrestore(&object->lock, flags);
1454 }
1455 rcu_read_unlock();
1214} 1456}
1215 1457
1216/* 1458/*
@@ -1224,21 +1466,26 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1224 * scan=... - set the automatic memory scanning period in seconds (0 to 1466 * scan=... - set the automatic memory scanning period in seconds (0 to
1225 * disable it) 1467 * disable it)
1226 * scan - trigger a memory scan 1468 * scan - trigger a memory scan
1469	 *   clear	- mark all currently reported unreferenced kmemleak objects
1470	 *		  as grey so that they are no longer printed
1471 * dump=... - dump information about the object found at the given address
1227 */ 1472 */
1228static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, 1473static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1229 size_t size, loff_t *ppos) 1474 size_t size, loff_t *ppos)
1230{ 1475{
1231 char buf[64]; 1476 char buf[64];
1232 int buf_size; 1477 int buf_size;
1233 1478 int ret;
1234 if (!atomic_read(&kmemleak_enabled))
1235 return -EBUSY;
1236 1479
1237 buf_size = min(size, (sizeof(buf) - 1)); 1480 buf_size = min(size, (sizeof(buf) - 1));
1238 if (strncpy_from_user(buf, user_buf, buf_size) < 0) 1481 if (strncpy_from_user(buf, user_buf, buf_size) < 0)
1239 return -EFAULT; 1482 return -EFAULT;
1240 buf[buf_size] = 0; 1483 buf[buf_size] = 0;
1241 1484
1485 ret = mutex_lock_interruptible(&scan_mutex);
1486 if (ret < 0)
1487 return ret;
1488
1242 if (strncmp(buf, "off", 3) == 0) 1489 if (strncmp(buf, "off", 3) == 0)
1243 kmemleak_disable(); 1490 kmemleak_disable();
1244 else if (strncmp(buf, "stack=on", 8) == 0) 1491 else if (strncmp(buf, "stack=on", 8) == 0)
@@ -1251,11 +1498,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1251 stop_scan_thread(); 1498 stop_scan_thread();
1252 else if (strncmp(buf, "scan=", 5) == 0) { 1499 else if (strncmp(buf, "scan=", 5) == 0) {
1253 unsigned long secs; 1500 unsigned long secs;
1254 int err;
1255 1501
1256 err = strict_strtoul(buf + 5, 0, &secs); 1502 ret = strict_strtoul(buf + 5, 0, &secs);
1257 if (err < 0) 1503 if (ret < 0)
1258 return err; 1504 goto out;
1259 stop_scan_thread(); 1505 stop_scan_thread();
1260 if (secs) { 1506 if (secs) {
1261 jiffies_scan_wait = msecs_to_jiffies(secs * 1000); 1507 jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
@@ -1263,8 +1509,17 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1263 } 1509 }
1264 } else if (strncmp(buf, "scan", 4) == 0) 1510 } else if (strncmp(buf, "scan", 4) == 0)
1265 kmemleak_scan(); 1511 kmemleak_scan();
1512 else if (strncmp(buf, "clear", 5) == 0)
1513 kmemleak_clear();
1514 else if (strncmp(buf, "dump=", 5) == 0)
1515 ret = dump_str_object_info(buf + 5);
1266 else 1516 else
1267 return -EINVAL; 1517 ret = -EINVAL;
1518
1519out:
1520 mutex_unlock(&scan_mutex);
1521 if (ret < 0)
1522 return ret;
1268 1523
1269 /* ignore the rest of the buffer, only one command at a time */ 1524 /* ignore the rest of the buffer, only one command at a time */
1270 *ppos += size; 1525 *ppos += size;
@@ -1284,7 +1539,7 @@ static const struct file_operations kmemleak_fops = {
1284 * Perform the freeing of the kmemleak internal objects after waiting for any 1539 * Perform the freeing of the kmemleak internal objects after waiting for any
1285 * current memory scan to complete. 1540 * current memory scan to complete.
1286 */ 1541 */
1287static int kmemleak_cleanup_thread(void *arg) 1542static void kmemleak_do_cleanup(struct work_struct *work)
1288{ 1543{
1289 struct kmemleak_object *object; 1544 struct kmemleak_object *object;
1290 1545
@@ -1293,25 +1548,12 @@ static int kmemleak_cleanup_thread(void *arg)
1293 1548
1294 rcu_read_lock(); 1549 rcu_read_lock();
1295 list_for_each_entry_rcu(object, &object_list, object_list) 1550 list_for_each_entry_rcu(object, &object_list, object_list)
1296 delete_object(object->pointer); 1551 delete_object_full(object->pointer);
1297 rcu_read_unlock(); 1552 rcu_read_unlock();
1298 mutex_unlock(&scan_mutex); 1553 mutex_unlock(&scan_mutex);
1299
1300 return 0;
1301} 1554}
1302 1555
1303/* 1556static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
1304 * Start the clean-up thread.
1305 */
1306static void kmemleak_cleanup(void)
1307{
1308 struct task_struct *cleanup_thread;
1309
1310 cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
1311 "kmemleak-clean");
1312 if (IS_ERR(cleanup_thread))
1313 pr_warning("Failed to create the clean-up thread\n");
1314}
1315 1557
1316/* 1558/*
1317 * Disable kmemleak. No memory allocation/freeing will be traced once this 1559 * Disable kmemleak. No memory allocation/freeing will be traced once this
@@ -1329,7 +1571,7 @@ static void kmemleak_disable(void)
1329 1571
1330 /* check whether it is too early for a kernel thread */ 1572 /* check whether it is too early for a kernel thread */
1331 if (atomic_read(&kmemleak_initialized)) 1573 if (atomic_read(&kmemleak_initialized))
1332 kmemleak_cleanup(); 1574 schedule_work(&cleanup_work);
1333 1575
1334 pr_info("Kernel memory leak detector disabled\n"); 1576 pr_info("Kernel memory leak detector disabled\n");
1335} 1577}
@@ -1382,12 +1624,14 @@ void __init kmemleak_init(void)
1382 1624
1383 switch (log->op_type) { 1625 switch (log->op_type) {
1384 case KMEMLEAK_ALLOC: 1626 case KMEMLEAK_ALLOC:
1385 kmemleak_alloc(log->ptr, log->size, log->min_count, 1627 early_alloc(log);
1386 GFP_KERNEL);
1387 break; 1628 break;
1388 case KMEMLEAK_FREE: 1629 case KMEMLEAK_FREE:
1389 kmemleak_free(log->ptr); 1630 kmemleak_free(log->ptr);
1390 break; 1631 break;
1632 case KMEMLEAK_FREE_PART:
1633 kmemleak_free_part(log->ptr, log->size);
1634 break;
1391 case KMEMLEAK_NOT_LEAK: 1635 case KMEMLEAK_NOT_LEAK:
1392 kmemleak_not_leak(log->ptr); 1636 kmemleak_not_leak(log->ptr);
1393 break; 1637 break;
@@ -1423,7 +1667,7 @@ static int __init kmemleak_late_init(void)
1423 * after setting kmemleak_initialized and we may end up with 1667 * after setting kmemleak_initialized and we may end up with
1424 * two clean-up threads but serialized by scan_mutex. 1668 * two clean-up threads but serialized by scan_mutex.
1425 */ 1669 */
1426 kmemleak_cleanup(); 1670 schedule_work(&cleanup_work);
1427 return -ENOMEM; 1671 return -ENOMEM;
1428 } 1672 }
1429 1673
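
The kmemleak hunks above add two debugfs commands, "clear" and "dump=<addr>", and move scan_mutex handling out of open/release and into the seq_file start/stop callbacks and kmemleak_write(). A minimal userspace sketch of driving that interface follows; it is illustrative only, not part of this diff, and assumes debugfs is mounted at /sys/kernel/debug:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/sys/kernel/debug/kmemleak", O_RDWR);

	if (fd < 0) {
		perror("open /sys/kernel/debug/kmemleak");
		return 1;
	}

	/* trigger an immediate memory scan */
	if (write(fd, "scan", 4) < 0)
		perror("write scan");

	/* print any unreferenced object reports */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	/* mark everything reported so far grey; later reads show only new leaks */
	if (write(fd, "clear", 5) < 0)
		perror("write clear");

	close(fd);
	return 0;
}

Writing "dump=<address>" instead logs the metadata of a single tracked object (via dump_object_info()), and "off" disables the detector entirely.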
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2fa20dadf40..fd4529d86de5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1207 ret = 0; 1207 ret = 0;
1208out: 1208out:
1209 unlock_page_cgroup(pc); 1209 unlock_page_cgroup(pc);
1210 /*
1211	 * We charge against "to", which may not have any tasks. Then "to"
1212	 * can be under rmdir(). But in the current implementation, the only
1213	 * caller of this function is force_empty() and it is guaranteed that
1214	 * "to" is never removed. So we don't check the rmdir status here.
1215 */
1210 return ret; 1216 return ret;
1211} 1217}
1212 1218
@@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1428 return; 1434 return;
1429 if (!ptr) 1435 if (!ptr)
1430 return; 1436 return;
1437 cgroup_exclude_rmdir(&ptr->css);
1431 pc = lookup_page_cgroup(page); 1438 pc = lookup_page_cgroup(page);
1432 mem_cgroup_lru_del_before_commit_swapcache(page); 1439 mem_cgroup_lru_del_before_commit_swapcache(page);
1433 __mem_cgroup_commit_charge(ptr, pc, ctype); 1440 __mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1457 } 1464 }
1458 rcu_read_unlock(); 1465 rcu_read_unlock();
1459 } 1466 }
1460 /* add this page(page_cgroup) to the LRU we want. */ 1467 /*
1461	1468	 * At swapin, we may charge against a cgroup which has no tasks.
1469	 * So rmdir()->pre_destroy() can be called while we do this charge.
1470	 * In that case, we need to call pre_destroy() again; check it here.
1471 */
1472 cgroup_release_and_wakeup_rmdir(&ptr->css);
1462} 1473}
1463 1474
1464void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1475void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
@@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1664 1675
1665 if (!mem) 1676 if (!mem)
1666 return; 1677 return;
1667 1678 cgroup_exclude_rmdir(&mem->css);
1668 /* at migration success, oldpage->mapping is NULL. */ 1679 /* at migration success, oldpage->mapping is NULL. */
1669 if (oldpage->mapping) { 1680 if (oldpage->mapping) {
1670 target = oldpage; 1681 target = oldpage;
@@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1704 */ 1715 */
1705 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 1716 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1706 mem_cgroup_uncharge_page(target); 1717 mem_cgroup_uncharge_page(target);
1718 /*
1719	 * At migration, we may charge against a cgroup which has no tasks.
1720	 * So rmdir()->pre_destroy() can be called while we do this charge.
1721	 * In that case, we need to call pre_destroy() again; check it here.
1722 */
1723 cgroup_release_and_wakeup_rmdir(&mem->css);
1707} 1724}
1708 1725
1709/* 1726/*
@@ -1973,7 +1990,7 @@ try_to_free:
1973 if (!progress) { 1990 if (!progress) {
1974 nr_retries--; 1991 nr_retries--;
1975 /* maybe some writeback is necessary */ 1992 /* maybe some writeback is necessary */
1976 congestion_wait(WRITE, HZ/10); 1993 congestion_wait(BLK_RW_ASYNC, HZ/10);
1977 } 1994 }
1978 1995
1979 } 1996 }
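
The memcontrol changes bracket the swap-in commit and migration-end paths with cgroup_exclude_rmdir()/cgroup_release_and_wakeup_rmdir(), so that an rmdir() of a task-less cgroup retries pre_destroy() once the in-flight charge finishes. The sketch below is a userspace analogue of that pairing discipline, not the cgroup implementation itself; the function names are made up for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static int charges_in_flight;

static void exclude_rmdir(void)
{
	pthread_mutex_lock(&lock);
	charges_in_flight++;			/* hold off "rmdir" */
	pthread_mutex_unlock(&lock);
}

static void release_and_wakeup_rmdir(void)
{
	pthread_mutex_lock(&lock);
	if (--charges_in_flight == 0)
		pthread_cond_broadcast(&idle);	/* let "rmdir" retry */
	pthread_mutex_unlock(&lock);
}

static void *charger(void *arg)
{
	(void)arg;
	exclude_rmdir();
	puts("charging a group that may have no tasks");
	release_and_wakeup_rmdir();
	return NULL;
}

static void *destroyer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (charges_in_flight)
		pthread_cond_wait(&idle, &lock);
	puts("no charge in flight, safe to tear the group down");
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, charger, NULL);
	pthread_create(&b, NULL, destroyer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

The property mirrored here is that every exclude is matched by a release-and-wakeup on every return path; otherwise the waiter would never make progress.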
diff --git a/mm/memory.c b/mm/memory.c
index 65216194eb8d..aede2ce3aba4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd)
135 * Note: this doesn't free the actual pages themselves. That 135 * Note: this doesn't free the actual pages themselves. That
136 * has been handled earlier when unmapping all the memory regions. 136 * has been handled earlier when unmapping all the memory regions.
137 */ 137 */
138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) 138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
139 unsigned long addr)
139{ 140{
140 pgtable_t token = pmd_pgtable(*pmd); 141 pgtable_t token = pmd_pgtable(*pmd);
141 pmd_clear(pmd); 142 pmd_clear(pmd);
142 pte_free_tlb(tlb, token); 143 pte_free_tlb(tlb, token, addr);
143 tlb->mm->nr_ptes--; 144 tlb->mm->nr_ptes--;
144} 145}
145 146
@@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
157 next = pmd_addr_end(addr, end); 158 next = pmd_addr_end(addr, end);
158 if (pmd_none_or_clear_bad(pmd)) 159 if (pmd_none_or_clear_bad(pmd))
159 continue; 160 continue;
160 free_pte_range(tlb, pmd); 161 free_pte_range(tlb, pmd, addr);
161 } while (pmd++, addr = next, addr != end); 162 } while (pmd++, addr = next, addr != end);
162 163
163 start &= PUD_MASK; 164 start &= PUD_MASK;
@@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
173 174
174 pmd = pmd_offset(pud, start); 175 pmd = pmd_offset(pud, start);
175 pud_clear(pud); 176 pud_clear(pud);
176 pmd_free_tlb(tlb, pmd); 177 pmd_free_tlb(tlb, pmd, start);
177} 178}
178 179
179static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 180static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
206 207
207 pud = pud_offset(pgd, start); 208 pud = pud_offset(pgd, start);
208 pgd_clear(pgd); 209 pgd_clear(pgd);
209 pud_free_tlb(tlb, pud); 210 pud_free_tlb(tlb, pud, start);
210} 211}
211 212
212/* 213/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08e2c4da63a..7dd9d9f80694 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
191 * Must be called holding task's alloc_lock to protect task's mems_allowed 191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy. May also be called holding the mmap_semaphore for write. 192 * and mempolicy. May also be called holding the mmap_semaphore for write.
193 */ 193 */
194static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 194static int mpol_set_nodemask(struct mempolicy *pol,
195 const nodemask_t *nodes, struct nodemask_scratch *nsc)
195{ 196{
196 nodemask_t cpuset_context_nmask;
197 int ret; 197 int ret;
198 198
199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 if (pol == NULL) 200 if (pol == NULL)
201 return 0; 201 return 0;
202 /* Check N_HIGH_MEMORY */
203 nodes_and(nsc->mask1,
204 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
202 205
203 VM_BUG_ON(!nodes); 206 VM_BUG_ON(!nodes);
204 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 207 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205 nodes = NULL; /* explicit local allocation */ 208 nodes = NULL; /* explicit local allocation */
206 else { 209 else {
207 if (pol->flags & MPOL_F_RELATIVE_NODES) 210 if (pol->flags & MPOL_F_RELATIVE_NODES)
208 mpol_relative_nodemask(&cpuset_context_nmask, nodes, 211 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
209 &cpuset_current_mems_allowed);
210 else 212 else
211 nodes_and(cpuset_context_nmask, *nodes, 213 nodes_and(nsc->mask2, *nodes, nsc->mask1);
212 cpuset_current_mems_allowed); 214
213 if (mpol_store_user_nodemask(pol)) 215 if (mpol_store_user_nodemask(pol))
214 pol->w.user_nodemask = *nodes; 216 pol->w.user_nodemask = *nodes;
215 else 217 else
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
217 cpuset_current_mems_allowed; 219 cpuset_current_mems_allowed;
218 } 220 }
219 221
220 ret = mpol_ops[pol->mode].create(pol, 222 if (nodes)
221 nodes ? &cpuset_context_nmask : NULL); 223 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
224 else
225 ret = mpol_ops[pol->mode].create(pol, NULL);
222 return ret; 226 return ret;
223} 227}
224 228
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
620{ 624{
621 struct mempolicy *new, *old; 625 struct mempolicy *new, *old;
622 struct mm_struct *mm = current->mm; 626 struct mm_struct *mm = current->mm;
627 NODEMASK_SCRATCH(scratch);
623 int ret; 628 int ret;
624 629
625 new = mpol_new(mode, flags, nodes); 630 if (!scratch)
626 if (IS_ERR(new)) 631 return -ENOMEM;
627 return PTR_ERR(new);
628 632
633 new = mpol_new(mode, flags, nodes);
634 if (IS_ERR(new)) {
635 ret = PTR_ERR(new);
636 goto out;
637 }
629 /* 638 /*
630 * prevent changing our mempolicy while show_numa_maps() 639 * prevent changing our mempolicy while show_numa_maps()
631 * is using it. 640 * is using it.
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
635 if (mm) 644 if (mm)
636 down_write(&mm->mmap_sem); 645 down_write(&mm->mmap_sem);
637 task_lock(current); 646 task_lock(current);
638 ret = mpol_set_nodemask(new, nodes); 647 ret = mpol_set_nodemask(new, nodes, scratch);
639 if (ret) { 648 if (ret) {
640 task_unlock(current); 649 task_unlock(current);
641 if (mm) 650 if (mm)
642 up_write(&mm->mmap_sem); 651 up_write(&mm->mmap_sem);
643 mpol_put(new); 652 mpol_put(new);
644 return ret; 653 goto out;
645 } 654 }
646 old = current->mempolicy; 655 old = current->mempolicy;
647 current->mempolicy = new; 656 current->mempolicy = new;
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
654 up_write(&mm->mmap_sem); 663 up_write(&mm->mmap_sem);
655 664
656 mpol_put(old); 665 mpol_put(old);
657 return 0; 666 ret = 0;
667out:
668 NODEMASK_SCRATCH_FREE(scratch);
669 return ret;
658} 670}
659 671
660/* 672/*
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len,
1014 if (err) 1026 if (err)
1015 return err; 1027 return err;
1016 } 1028 }
1017 down_write(&mm->mmap_sem); 1029 {
1018 task_lock(current); 1030 NODEMASK_SCRATCH(scratch);
1019 err = mpol_set_nodemask(new, nmask); 1031 if (scratch) {
1020 task_unlock(current); 1032 down_write(&mm->mmap_sem);
1033 task_lock(current);
1034 err = mpol_set_nodemask(new, nmask, scratch);
1035 task_unlock(current);
1036 if (err)
1037 up_write(&mm->mmap_sem);
1038 } else
1039 err = -ENOMEM;
1040 NODEMASK_SCRATCH_FREE(scratch);
1041 }
1021 if (err) { 1042 if (err) {
1022 up_write(&mm->mmap_sem);
1023 mpol_put(new); 1043 mpol_put(new);
1024 return err; 1044 return err;
1025 } 1045 }
@@ -1891,6 +1911,7 @@ restart:
1891 * Install non-NULL @mpol in inode's shared policy rb-tree. 1911 * Install non-NULL @mpol in inode's shared policy rb-tree.
1892 * On entry, the current task has a reference on a non-NULL @mpol. 1912 * On entry, the current task has a reference on a non-NULL @mpol.
1893 * This must be released on exit. 1913 * This must be released on exit.
1914	 * This is called during get_inode() calls, so we can use GFP_KERNEL.
1894 */ 1915 */
1895void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1916void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1896{ 1917{
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1902 if (mpol) { 1923 if (mpol) {
1903 struct vm_area_struct pvma; 1924 struct vm_area_struct pvma;
1904 struct mempolicy *new; 1925 struct mempolicy *new;
1926 NODEMASK_SCRATCH(scratch);
1905 1927
1928 if (!scratch)
1929 return;
1906 /* contextualize the tmpfs mount point mempolicy */ 1930 /* contextualize the tmpfs mount point mempolicy */
1907 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1931 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1908 if (IS_ERR(new)) { 1932 if (IS_ERR(new)) {
1909 mpol_put(mpol); /* drop our ref on sb mpol */ 1933 mpol_put(mpol); /* drop our ref on sb mpol */
1934 NODEMASK_SCRATCH_FREE(scratch);
1910 return; /* no valid nodemask intersection */ 1935 return; /* no valid nodemask intersection */
1911 } 1936 }
1912 1937
1913 task_lock(current); 1938 task_lock(current);
1914 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); 1939 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1915 task_unlock(current); 1940 task_unlock(current);
1916 mpol_put(mpol); /* drop our ref on sb mpol */ 1941 mpol_put(mpol); /* drop our ref on sb mpol */
1917 if (ret) { 1942 if (ret) {
1943 NODEMASK_SCRATCH_FREE(scratch);
1918 mpol_put(new); 1944 mpol_put(new);
1919 return; 1945 return;
1920 } 1946 }
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1924 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 1950 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
1925 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 1951 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1926 mpol_put(new); /* drop initial ref */ 1952 mpol_put(new); /* drop initial ref */
1953 NODEMASK_SCRATCH_FREE(scratch);
1927 } 1954 }
1928} 1955}
1929 1956
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2140 err = 1; 2167 err = 1;
2141 else { 2168 else {
2142 int ret; 2169 int ret;
2143 2170 NODEMASK_SCRATCH(scratch);
2144 task_lock(current); 2171 if (scratch) {
2145 ret = mpol_set_nodemask(new, &nodes); 2172 task_lock(current);
2146 task_unlock(current); 2173 ret = mpol_set_nodemask(new, &nodes, scratch);
2147 if (ret) 2174 task_unlock(current);
2175 } else
2176 ret = -ENOMEM;
2177 NODEMASK_SCRATCH_FREE(scratch);
2178 if (ret) {
2148 err = 1; 2179 err = 1;
2149 else if (no_context) { 2180 mpol_put(new);
2181 } else if (no_context) {
2150 /* save for contextualization */ 2182 /* save for contextualization */
2151 new->w.user_nodemask = nodes; 2183 new->w.user_nodemask = nodes;
2152 } 2184 }
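
The mempolicy hunks pass a NODEMASK_SCRATCH area into mpol_set_nodemask() so the contextualized masks are built in pre-allocated scratch space rather than as nodemask_t locals, which can be large when the node count is high. A rough userspace analogue of the pattern (hypothetical types and sizes, not the kernel's):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NODE_WORDS 64	/* big enough that two masks on the stack would hurt */

struct nodemask_scratch {
	unsigned long mask1[NODE_WORDS];
	unsigned long mask2[NODE_WORDS];
};

/* build the contextualized mask in caller-provided scratch space */
static int set_nodemask(const unsigned long *nodes, const unsigned long *allowed,
			struct nodemask_scratch *nsc)
{
	size_t i;

	for (i = 0; i < NODE_WORDS; i++)
		nsc->mask1[i] = nodes[i] & allowed[i];
	memcpy(nsc->mask2, nsc->mask1, sizeof(nsc->mask2));
	return 0;
}

int main(void)
{
	unsigned long nodes[NODE_WORDS] = { 0x7 };	/* nodes 0-2 requested */
	unsigned long allowed[NODE_WORDS] = { 0x5 };	/* nodes 0 and 2 allowed */
	struct nodemask_scratch *scratch = calloc(1, sizeof(*scratch));
	int ret;

	if (!scratch)
		return 1;			/* mirrors the -ENOMEM paths above */

	ret = set_nodemask(nodes, allowed, scratch);
	printf("ret=%d effective mask=%#lx\n", ret, scratch->mask2[0]);

	free(scratch);				/* NODEMASK_SCRATCH_FREE analogue */
	return ret;
}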
diff --git a/mm/mempool.c b/mm/mempool.c
index a46eb1b4bb66..32e75d400503 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab);
303 */ 303 */
304void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) 304void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
305{ 305{
306 size_t size = (size_t)(long)pool_data; 306 size_t size = (size_t)pool_data;
307 return kmalloc(size, gfp_mask); 307 return kmalloc(size, gfp_mask);
308} 308}
309EXPORT_SYMBOL(mempool_kmalloc); 309EXPORT_SYMBOL(mempool_kmalloc);
310 310
311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) 311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
312{ 312{
313 size_t size = (size_t) pool_data; 313 size_t size = (size_t)pool_data;
314 return kzalloc(size, gfp_mask); 314 return kzalloc(size, gfp_mask);
315} 315}
316EXPORT_SYMBOL(mempool_kzalloc); 316EXPORT_SYMBOL(mempool_kzalloc);
diff --git a/mm/mmap.c b/mm/mmap.c
index 34579b23ebd5..8101de490c73 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
88int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 88int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
89struct percpu_counter vm_committed_as; 89struct percpu_counter vm_committed_as;
90 90
91/* amount of vm to protect from userspace access */
92unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
93
94/* 91/*
95 * Check that a process has enough memory to allocate a new virtual 92 * Check that a process has enough memory to allocate a new virtual
96 * mapping. 0 means there is enough memory for the allocation to 93 * mapping. 0 means there is enough memory for the allocation to
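
The mm/mmap.c hunk only removes the mmap_min_addr definition; the variable itself moves out of mm/. As a quick illustration of what the tunable guards, the sketch below (unprivileged, assuming a non-zero /proc/sys/vm/mmap_min_addr) tries to map page zero and expects the request to be refused:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	void *p = mmap((void *)0, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	if (p == MAP_FAILED) {
		printf("mapping page zero refused: %s\n", strerror(errno));
		return 0;
	}
	printf("mapped page zero; mmap_min_addr is probably 0\n");
	munmap(p, 4096);
	return 0;
}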
diff --git a/mm/nommu.c b/mm/nommu.c
index 53cab10fece4..66e81e7e9fe9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
70int heap_stack_gap = 0; 70int heap_stack_gap = 0;
71 71
72/* amount of vm to protect from userspace access */
73unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
74
75atomic_long_t mmap_pages_allocated; 72atomic_long_t mmap_pages_allocated;
76 73
77EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
@@ -922,6 +919,10 @@ static int validate_mmap_request(struct file *file,
922 if (!file->f_op->read) 919 if (!file->f_op->read)
923 capabilities &= ~BDI_CAP_MAP_COPY; 920 capabilities &= ~BDI_CAP_MAP_COPY;
924 921
922 /* The file shall have been opened with read permission. */
923 if (!(file->f_mode & FMODE_READ))
924 return -EACCES;
925
925 if (flags & MAP_SHARED) { 926 if (flags & MAP_SHARED) {
926 /* do checks for writing, appending and locking */ 927 /* do checks for writing, appending and locking */
927 if ((prot & PROT_WRITE) && 928 if ((prot & PROT_WRITE) &&
@@ -1351,6 +1352,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1351 } 1352 }
1352 1353
1353 vma->vm_region = region; 1354 vma->vm_region = region;
1355 add_nommu_region(region);
1354 1356
1355 /* set up the mapping */ 1357 /* set up the mapping */
1356 if (file && vma->vm_flags & VM_SHARED) 1358 if (file && vma->vm_flags & VM_SHARED)
@@ -1360,8 +1362,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1360 if (ret < 0) 1362 if (ret < 0)
1361 goto error_put_region; 1363 goto error_put_region;
1362 1364
1363 add_nommu_region(region);
1364
1365 /* okay... we have a mapping; now we have to register it */ 1365 /* okay... we have a mapping; now we have to register it */
1366 result = vma->vm_start; 1366 result = vma->vm_start;
1367 1367
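
The validate_mmap_request() hunk rejects mappings backed by a file descriptor that was not opened for reading, returning -EACCES before any of the sharing flags are inspected. MMU kernels enforce the same rule elsewhere, so the expected failure can be reproduced on an ordinary system with a sketch like this (the scratch file path is arbitrary):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/wronly-mmap-demo";	/* arbitrary scratch file */
	int fd = open(path, O_CREAT | O_WRONLY, 0600);
	void *p;

	if (fd < 0 || ftruncate(fd, 4096) < 0) {
		perror(path);
		return 1;
	}

	p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		printf("mmap of a write-only fd failed as expected: %s\n",
		       strerror(errno));
	else
		munmap(p, 4096);

	close(fd);
	unlink(path);
	return 0;
}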
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 175a67a78a99..a7b2460e922b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 int oom_adj;
62 61
63 task_lock(p); 62 task_lock(p);
64 mm = p->mm; 63 mm = p->mm;
@@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
66 task_unlock(p); 65 task_unlock(p);
67 return 0; 66 return 0;
68 } 67 }
69 oom_adj = mm->oom_adj;
70 if (oom_adj == OOM_DISABLE) {
71 task_unlock(p);
72 return 0;
73 }
74 68
75 /* 69 /*
76 * The memory size of the process is the basis for the badness. 70 * The memory size of the process is the basis for the badness.
@@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
154 points /= 8; 148 points /= 8;
155 149
156 /* 150 /*
157 * Adjust the score by oom_adj. 151 * Adjust the score by oomkilladj.
158 */ 152 */
159 if (oom_adj) { 153 if (p->oomkilladj) {
160 if (oom_adj > 0) { 154 if (p->oomkilladj > 0) {
161 if (!points) 155 if (!points)
162 points = 1; 156 points = 1;
163 points <<= oom_adj; 157 points <<= p->oomkilladj;
164 } else 158 } else
165 points >>= -(oom_adj); 159 points >>= -(p->oomkilladj);
166 } 160 }
167 161
168#ifdef DEBUG 162#ifdef DEBUG
@@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
257 *ppoints = ULONG_MAX; 251 *ppoints = ULONG_MAX;
258 } 252 }
259 253
254 if (p->oomkilladj == OOM_DISABLE)
255 continue;
256
260 points = badness(p, uptime.tv_sec); 257 points = badness(p, uptime.tv_sec);
261 if (points > *ppoints) { 258 if (points > *ppoints || !chosen) {
262 chosen = p; 259 chosen = p;
263 *ppoints = points; 260 *ppoints = points;
264 } 261 }
@@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
307 } 304 }
308 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
309 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
310 get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); 307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
308 p->comm);
311 task_unlock(p); 309 task_unlock(p);
312 } while_each_thread(g, p); 310 } while_each_thread(g, p);
313} 311}
@@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
325 return; 323 return;
326 } 324 }
327 325
328 if (!p->mm) 326 if (!p->mm) {
327 WARN_ON(1);
328 printk(KERN_WARNING "tried to kill an mm-less task!\n");
329 return; 329 return;
330 }
330 331
331 if (verbose) 332 if (verbose)
332 printk(KERN_ERR "Killed process %d (%s)\n", 333 printk(KERN_ERR "Killed process %d (%s)\n",
@@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p)
348 struct mm_struct *mm; 349 struct mm_struct *mm;
349 struct task_struct *g, *q; 350 struct task_struct *g, *q;
350 351
351 task_lock(p);
352 mm = p->mm; 352 mm = p->mm;
353 if (!mm || mm->oom_adj == OOM_DISABLE) { 353
354 task_unlock(p); 354 /* WARNING: mm may not be dereferenced since we did not obtain its
355 * value from get_task_mm(p). This is OK since all we need to do is
356 * compare mm to q->mm below.
357 *
358 * Furthermore, even if mm contains a non-NULL value, p->mm may
359 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us.
361 */
362
363 if (mm == NULL)
355 return 1; 364 return 1;
356 } 365
357 task_unlock(p); 366 /*
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
358 __oom_kill_task(p, 1); 374 __oom_kill_task(p, 1);
359 375
360 /* 376 /*
@@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
377 struct task_struct *c; 393 struct task_struct *c;
378 394
379 if (printk_ratelimit()) { 395 if (printk_ratelimit()) {
380 task_lock(current);
381 printk(KERN_WARNING "%s invoked oom-killer: " 396 printk(KERN_WARNING "%s invoked oom-killer: "
382 "gfp_mask=0x%x, order=%d, oom_adj=%d\n", 397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
383 current->comm, gfp_mask, order, 398 current->comm, gfp_mask, order, current->oomkilladj);
384 current->mm ? current->mm->oom_adj : OOM_DISABLE); 399 task_lock(current);
385 cpuset_print_task_mems_allowed(current); 400 cpuset_print_task_mems_allowed(current);
386 task_unlock(current); 401 task_unlock(current);
387 dump_stack(); 402 dump_stack();
@@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
394 /* 409 /*
395 * If the task is already exiting, don't alarm the sysadmin or kill 410 * If the task is already exiting, don't alarm the sysadmin or kill
396 * its children or threads, just set TIF_MEMDIE so it can die quickly 411 * its children or threads, just set TIF_MEMDIE so it can die quickly
397 * if its mm is still attached.
398 */ 412 */
399 if (p->mm && (p->flags & PF_EXITING)) { 413 if (p->flags & PF_EXITING) {
400 __oom_kill_task(p, 0); 414 __oom_kill_task(p, 0);
401 return 0; 415 return 0;
402 } 416 }
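
These hunks return the badness() heuristic to the per-task oomkilladj value: a positive adjustment left-shifts the score, a negative one right-shifts it, and threads marked OOM_DISABLE are skipped before scoring. The arithmetic is easy to see in isolation; the OOM_DISABLE value below is quoted from include/linux/oom.h, not from this diff:

#include <stdio.h>

#define OOM_DISABLE (-17)	/* value from include/linux/oom.h */

static unsigned long adjust_points(unsigned long points, int oomkilladj)
{
	if (!oomkilladj)
		return points;
	if (oomkilladj > 0) {
		if (!points)
			points = 1;
		return points << oomkilladj;
	}
	return points >> -oomkilladj;
}

int main(void)
{
	int adj;

	for (adj = -4; adj <= 4; adj += 2)
		printf("points=1000 oomkilladj=%+d -> %lu\n",
		       adj, adjust_points(1000, adj));
	printf("tasks with oomkilladj == OOM_DISABLE (%d) are skipped\n",
	       OOM_DISABLE);
	return 0;
}

Because the adjustment is a shift rather than a multiplier, each step halves or doubles the score, which is why oomkilladj has such a coarse effect.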
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7687879253b9..25e7770309b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37 37
38/* 38/*
39 * The maximum number of pages to writeout in a single bdflush/kupdate
40 * operation. We do this so we don't hold I_SYNC against an inode for
41 * enormous amounts of time, which would block a userspace task which has
42 * been forced to throttle against that inode. Also, the code reevaluates
43 * the dirty each time it has written this many pages.
44 */
45#define MAX_WRITEBACK_PAGES 1024
46
47/*
48 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
49 * will look to see if it needs to force writeback or throttling. 40 * will look to see if it needs to force writeback or throttling.
50 */ 41 */
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
117/* End of sysctl-exported parameters */ 108/* End of sysctl-exported parameters */
118 109
119 110
120static void background_writeout(unsigned long _min_pages);
121
122/* 111/*
123 * Scale the writeback cache size proportional to the relative writeout speeds. 112 * Scale the writeback cache size proportional to the relative writeout speeds.
124 * 113 *
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
320/* 309/*
321 * 310 *
322 */ 311 */
323static DEFINE_SPINLOCK(bdi_lock);
324static unsigned int bdi_min_ratio; 312static unsigned int bdi_min_ratio;
325 313
326int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 314int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
327{ 315{
328 int ret = 0; 316 int ret = 0;
329 unsigned long flags;
330 317
331 spin_lock_irqsave(&bdi_lock, flags); 318 spin_lock(&bdi_lock);
332 if (min_ratio > bdi->max_ratio) { 319 if (min_ratio > bdi->max_ratio) {
333 ret = -EINVAL; 320 ret = -EINVAL;
334 } else { 321 } else {
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
340 ret = -EINVAL; 327 ret = -EINVAL;
341 } 328 }
342 } 329 }
343 spin_unlock_irqrestore(&bdi_lock, flags); 330 spin_unlock(&bdi_lock);
344 331
345 return ret; 332 return ret;
346} 333}
347 334
348int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 335int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
349{ 336{
350 unsigned long flags;
351 int ret = 0; 337 int ret = 0;
352 338
353 if (max_ratio > 100) 339 if (max_ratio > 100)
354 return -EINVAL; 340 return -EINVAL;
355 341
356 spin_lock_irqsave(&bdi_lock, flags); 342 spin_lock(&bdi_lock);
357 if (bdi->min_ratio > max_ratio) { 343 if (bdi->min_ratio > max_ratio) {
358 ret = -EINVAL; 344 ret = -EINVAL;
359 } else { 345 } else {
360 bdi->max_ratio = max_ratio; 346 bdi->max_ratio = max_ratio;
361 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 347 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
362 } 348 }
363 spin_unlock_irqrestore(&bdi_lock, flags); 349 spin_unlock(&bdi_lock);
364 350
365 return ret; 351 return ret;
366} 352}
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
546 * up. 532 * up.
547 */ 533 */
548 if (bdi_nr_reclaimable > bdi_thresh) { 534 if (bdi_nr_reclaimable > bdi_thresh) {
549 writeback_inodes(&wbc); 535 writeback_inodes_wbc(&wbc);
550 pages_written += write_chunk - wbc.nr_to_write; 536 pages_written += write_chunk - wbc.nr_to_write;
551 get_dirty_limits(&background_thresh, &dirty_thresh, 537 get_dirty_limits(&background_thresh, &dirty_thresh,
552 &bdi_thresh, bdi); 538 &bdi_thresh, bdi);
@@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
575 if (pages_written >= write_chunk) 561 if (pages_written >= write_chunk)
576 break; /* We've done our duty */ 562 break; /* We've done our duty */
577 563
578 congestion_wait(WRITE, HZ/10); 564 schedule_timeout(1);
579 } 565 }
580 566
581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 567 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
594 * background_thresh, to keep the amount of dirty memory low. 580 * background_thresh, to keep the amount of dirty memory low.
595 */ 581 */
596 if ((laptop_mode && pages_written) || 582 if ((laptop_mode && pages_written) ||
597 (!laptop_mode && (global_page_state(NR_FILE_DIRTY) 583 (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS) 584 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 585 > background_thresh))) {
600 pdflush_operation(background_writeout, 0); 586 struct writeback_control wbc = {
587 .bdi = bdi,
588 .sync_mode = WB_SYNC_NONE,
589 .nr_to_write = nr_writeback,
590 };
591
592
593 bdi_start_writeback(&wbc);
594 }
601} 595}
602 596
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 597void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -669,7 +663,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
669 if (global_page_state(NR_UNSTABLE_NFS) + 663 if (global_page_state(NR_UNSTABLE_NFS) +
670 global_page_state(NR_WRITEBACK) <= dirty_thresh) 664 global_page_state(NR_WRITEBACK) <= dirty_thresh)
671 break; 665 break;
672 congestion_wait(WRITE, HZ/10); 666 congestion_wait(BLK_RW_ASYNC, HZ/10);
673 667
674 /* 668 /*
675 * The caller might hold locks which can prevent IO completion 669 * The caller might hold locks which can prevent IO completion
@@ -681,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
681 } 675 }
682} 676}
683 677
684/*
685 * writeback at least _min_pages, and keep writing until the amount of dirty
686 * memory is less than the background threshold, or until we're all clean.
687 */
688static void background_writeout(unsigned long _min_pages)
689{
690 long min_pages = _min_pages;
691 struct writeback_control wbc = {
692 .bdi = NULL,
693 .sync_mode = WB_SYNC_NONE,
694 .older_than_this = NULL,
695 .nr_to_write = 0,
696 .nonblocking = 1,
697 .range_cyclic = 1,
698 };
699
700 for ( ; ; ) {
701 unsigned long background_thresh;
702 unsigned long dirty_thresh;
703
704 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
705 if (global_page_state(NR_FILE_DIRTY) +
706 global_page_state(NR_UNSTABLE_NFS) < background_thresh
707 && min_pages <= 0)
708 break;
709 wbc.more_io = 0;
710 wbc.encountered_congestion = 0;
711 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
712 wbc.pages_skipped = 0;
713 writeback_inodes(&wbc);
714 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
715 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
716 /* Wrote less than expected */
717 if (wbc.encountered_congestion || wbc.more_io)
718 congestion_wait(WRITE, HZ/10);
719 else
720 break;
721 }
722 }
723}
724
725/*
726 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
727 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
728 * -1 if all pdflush threads were busy.
729 */
730int wakeup_pdflush(long nr_pages)
731{
732 if (nr_pages == 0)
733 nr_pages = global_page_state(NR_FILE_DIRTY) +
734 global_page_state(NR_UNSTABLE_NFS);
735 return pdflush_operation(background_writeout, nr_pages);
736}
737
738static void wb_timer_fn(unsigned long unused);
739static void laptop_timer_fn(unsigned long unused); 678static void laptop_timer_fn(unsigned long unused);
740 679
741static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
742static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); 680static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
743 681
744/* 682/*
745 * Periodic writeback of "old" data.
746 *
747 * Define "old": the first time one of an inode's pages is dirtied, we mark the
748 * dirtying-time in the inode's address_space. So this periodic writeback code
749 * just walks the superblock inode list, writing back any inodes which are
750 * older than a specific point in time.
751 *
752 * Try to run once per dirty_writeback_interval. But if a writeback event
753 * takes longer than a dirty_writeback_interval interval, then leave a
754 * one-second gap.
755 *
756 * older_than_this takes precedence over nr_to_write. So we'll only write back
757 * all dirty pages if they are all attached to "old" mappings.
758 */
759static void wb_kupdate(unsigned long arg)
760{
761 unsigned long oldest_jif;
762 unsigned long start_jif;
763 unsigned long next_jif;
764 long nr_to_write;
765 struct writeback_control wbc = {
766 .bdi = NULL,
767 .sync_mode = WB_SYNC_NONE,
768 .older_than_this = &oldest_jif,
769 .nr_to_write = 0,
770 .nonblocking = 1,
771 .for_kupdate = 1,
772 .range_cyclic = 1,
773 };
774
775 sync_supers();
776
777 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
778 start_jif = jiffies;
779 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
780 nr_to_write = global_page_state(NR_FILE_DIRTY) +
781 global_page_state(NR_UNSTABLE_NFS) +
782 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
783 while (nr_to_write > 0) {
784 wbc.more_io = 0;
785 wbc.encountered_congestion = 0;
786 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
787 writeback_inodes(&wbc);
788 if (wbc.nr_to_write > 0) {
789 if (wbc.encountered_congestion || wbc.more_io)
790 congestion_wait(WRITE, HZ/10);
791 else
792 break; /* All the old data is written */
793 }
794 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
795 }
796 if (time_before(next_jif, jiffies + HZ))
797 next_jif = jiffies + HZ;
798 if (dirty_writeback_interval)
799 mod_timer(&wb_timer, next_jif);
800}
801
802/*
803 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 683 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
804 */ 684 */
805int dirty_writeback_centisecs_handler(ctl_table *table, int write, 685int dirty_writeback_centisecs_handler(ctl_table *table, int write,
806 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 686 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
807{ 687{
808 proc_dointvec(table, write, file, buffer, length, ppos); 688 proc_dointvec(table, write, file, buffer, length, ppos);
809 if (dirty_writeback_interval)
810 mod_timer(&wb_timer, jiffies +
811 msecs_to_jiffies(dirty_writeback_interval * 10));
812 else
813 del_timer(&wb_timer);
814 return 0; 689 return 0;
815} 690}
816 691
817static void wb_timer_fn(unsigned long unused) 692static void do_laptop_sync(struct work_struct *work)
818{
819 if (pdflush_operation(wb_kupdate, 0) < 0)
820 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
821}
822
823static void laptop_flush(unsigned long unused)
824{ 693{
825 sys_sync(); 694 wakeup_flusher_threads(0);
695 kfree(work);
826} 696}
827 697
828static void laptop_timer_fn(unsigned long unused) 698static void laptop_timer_fn(unsigned long unused)
829{ 699{
830 pdflush_operation(laptop_flush, 0); 700 struct work_struct *work;
701
702 work = kmalloc(sizeof(*work), GFP_ATOMIC);
703 if (work) {
704 INIT_WORK(work, do_laptop_sync);
705 schedule_work(work);
706 }
831} 707}
832 708
833/* 709/*
@@ -910,8 +786,6 @@ void __init page_writeback_init(void)
910{ 786{
911 int shift; 787 int shift;
912 788
913 mod_timer(&wb_timer,
914 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
915 writeback_set_ratelimit(); 789 writeback_set_ratelimit();
916 register_cpu_notifier(&ratelimit_nb); 790 register_cpu_notifier(&ratelimit_nb);
917 791
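
With the pdflush paths removed, balance_dirty_pages() now builds a writeback_control and calls bdi_start_writeback() on the per-bdi flusher whenever the dirty plus unstable-NFS page counts exceed background_thresh (or, in laptop mode, whenever pages were written), and laptop_timer_fn() defers its sync to a workqueue item that calls wakeup_flusher_threads(). The userspace sketch below only approximates that trigger condition from /proc/meminfo and the dirty_background_ratio sysctl; the kernel computes the threshold against dirtyable memory, not MemTotal:

#include <stdio.h>
#include <string.h>

/* return a field from /proc/meminfo in kB, or -1 if it is missing */
static long meminfo_kb(const char *field)
{
	char line[128];
	long val = -1;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, field, strlen(field))) {
			sscanf(line + strlen(field), " %ld", &val);
			break;
		}
	}
	fclose(f);
	return val;
}

int main(void)
{
	long dirty = meminfo_kb("Dirty:");
	long nfs = meminfo_kb("NFS_Unstable:");
	long total = meminfo_kb("MemTotal:");
	long ratio = 10;	/* default dirty_background_ratio */
	FILE *f = fopen("/proc/sys/vm/dirty_background_ratio", "r");

	if (f) {
		if (fscanf(f, "%ld", &ratio) != 1)
			ratio = 10;
		fclose(f);
	}
	if (dirty < 0 || total < 0)
		return 1;
	if (nfs < 0)
		nfs = 0;	/* the field is absent on newer kernels */

	printf("dirty+unstable = %ld kB, approx background threshold = %ld kB\n",
	       dirty + nfs, total * ratio / 100);
	printf("background writeback %s be kicked\n",
	       dirty + nfs > total * ratio / 100 ? "would" : "would not");
	return 0;
}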
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ad7cd1c56b07..a0de15f46987 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -817,13 +817,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
817	 * aggressive about taking ownership of free pages	817	 * aggressive about taking ownership of free pages
818 */ 818 */
819 if (unlikely(current_order >= (pageblock_order >> 1)) || 819 if (unlikely(current_order >= (pageblock_order >> 1)) ||
820 start_migratetype == MIGRATE_RECLAIMABLE) { 820 start_migratetype == MIGRATE_RECLAIMABLE ||
821 page_group_by_mobility_disabled) {
821 unsigned long pages; 822 unsigned long pages;
822 pages = move_freepages_block(zone, page, 823 pages = move_freepages_block(zone, page,
823 start_migratetype); 824 start_migratetype);
824 825
825 /* Claim the whole block if over half of it is free */ 826 /* Claim the whole block if over half of it is free */
826 if (pages >= (1 << (pageblock_order-1))) 827 if (pages >= (1 << (pageblock_order-1)) ||
828 page_group_by_mobility_disabled)
827 set_pageblock_migratetype(page, 829 set_pageblock_migratetype(page,
828 start_migratetype); 830 start_migratetype);
829 831
@@ -882,7 +884,7 @@ retry_reserve:
882 */ 884 */
883static int rmqueue_bulk(struct zone *zone, unsigned int order, 885static int rmqueue_bulk(struct zone *zone, unsigned int order,
884 unsigned long count, struct list_head *list, 886 unsigned long count, struct list_head *list,
885 int migratetype) 887 int migratetype, int cold)
886{ 888{
887 int i; 889 int i;
888 890
@@ -901,7 +903,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
901 * merge IO requests if the physical pages are ordered 903 * merge IO requests if the physical pages are ordered
902 * properly. 904 * properly.
903 */ 905 */
904 list_add(&page->lru, list); 906 if (likely(cold == 0))
907 list_add(&page->lru, list);
908 else
909 list_add_tail(&page->lru, list);
905 set_page_private(page, migratetype); 910 set_page_private(page, migratetype);
906 list = &page->lru; 911 list = &page->lru;
907 } 912 }
@@ -1119,7 +1124,8 @@ again:
1119 local_irq_save(flags); 1124 local_irq_save(flags);
1120 if (!pcp->count) { 1125 if (!pcp->count) {
1121 pcp->count = rmqueue_bulk(zone, 0, 1126 pcp->count = rmqueue_bulk(zone, 0,
1122 pcp->batch, &pcp->list, migratetype); 1127 pcp->batch, &pcp->list,
1128 migratetype, cold);
1123 if (unlikely(!pcp->count)) 1129 if (unlikely(!pcp->count))
1124 goto failed; 1130 goto failed;
1125 } 1131 }
@@ -1138,7 +1144,8 @@ again:
1138 /* Allocate more to the pcp list if necessary */ 1144 /* Allocate more to the pcp list if necessary */
1139 if (unlikely(&page->lru == &pcp->list)) { 1145 if (unlikely(&page->lru == &pcp->list)) {
1140 pcp->count += rmqueue_bulk(zone, 0, 1146 pcp->count += rmqueue_bulk(zone, 0,
1141 pcp->batch, &pcp->list, migratetype); 1147 pcp->batch, &pcp->list,
1148 migratetype, cold);
1142 page = list_entry(pcp->list.next, struct page, lru); 1149 page = list_entry(pcp->list.next, struct page, lru);
1143 } 1150 }
1144 1151
@@ -1666,7 +1673,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1666 preferred_zone, migratetype); 1673 preferred_zone, migratetype);
1667 1674
1668 if (!page && gfp_mask & __GFP_NOFAIL) 1675 if (!page && gfp_mask & __GFP_NOFAIL)
1669 congestion_wait(WRITE, HZ/50); 1676 congestion_wait(BLK_RW_ASYNC, HZ/50);
1670 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1677 } while (!page && (gfp_mask & __GFP_NOFAIL));
1671 1678
1672 return page; 1679 return page;
@@ -1740,8 +1747,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1740 * be using allocators in order of preference for an area that is 1747 * be using allocators in order of preference for an area that is
1741 * too large. 1748 * too large.
1742 */ 1749 */
1743 if (WARN_ON_ONCE(order >= MAX_ORDER)) 1750 if (order >= MAX_ORDER) {
1751 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
1744 return NULL; 1752 return NULL;
1753 }
1745 1754
1746 /* 1755 /*
1747 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1756 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1789,6 +1798,10 @@ rebalance:
1789 if (p->flags & PF_MEMALLOC) 1798 if (p->flags & PF_MEMALLOC)
1790 goto nopage; 1799 goto nopage;
1791 1800
1801 /* Avoid allocations with no watermarks from looping endlessly */
1802 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1803 goto nopage;
1804
1792 /* Try direct reclaim and then allocating */ 1805 /* Try direct reclaim and then allocating */
1793 page = __alloc_pages_direct_reclaim(gfp_mask, order, 1806 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1794 zonelist, high_zoneidx, 1807 zonelist, high_zoneidx,
@@ -1831,7 +1844,7 @@ rebalance:
1831 pages_reclaimed += did_some_progress; 1844 pages_reclaimed += did_some_progress;
1832 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 1845 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1833 /* Wait for some write requests to complete then retry */ 1846 /* Wait for some write requests to complete then retry */
1834 congestion_wait(WRITE, HZ/50); 1847 congestion_wait(BLK_RW_ASYNC, HZ/50);
1835 goto rebalance; 1848 goto rebalance;
1836 } 1849 }
1837 1850
@@ -2533,7 +2546,6 @@ static void build_zonelists(pg_data_t *pgdat)
2533 prev_node = local_node; 2546 prev_node = local_node;
2534 nodes_clear(used_mask); 2547 nodes_clear(used_mask);
2535 2548
2536 memset(node_load, 0, sizeof(node_load));
2537 memset(node_order, 0, sizeof(node_order)); 2549 memset(node_order, 0, sizeof(node_order));
2538 j = 0; 2550 j = 0;
2539 2551
@@ -2642,6 +2654,9 @@ static int __build_all_zonelists(void *dummy)
2642{ 2654{
2643 int nid; 2655 int nid;
2644 2656
2657#ifdef CONFIG_NUMA
2658 memset(node_load, 0, sizeof(node_load));
2659#endif
2645 for_each_online_node(nid) { 2660 for_each_online_node(nid) {
2646 pg_data_t *pgdat = NODE_DATA(nid); 2661 pg_data_t *pgdat = NODE_DATA(nid);
2647 2662
@@ -4745,8 +4760,10 @@ void *__init alloc_large_system_hash(const char *tablename,
4745 * some pages at the end of hash table which 4760 * some pages at the end of hash table which
4746 * alloc_pages_exact() automatically does 4761 * alloc_pages_exact() automatically does
4747 */ 4762 */
4748 if (get_order(size) < MAX_ORDER) 4763 if (get_order(size) < MAX_ORDER) {
4749 table = alloc_pages_exact(size, GFP_ATOMIC); 4764 table = alloc_pages_exact(size, GFP_ATOMIC);
4765 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4766 }
4750 } 4767 }
4751 } while (!table && size > PAGE_SIZE && --log2qty); 4768 } while (!table && size > PAGE_SIZE && --log2qty);
4752 4769
@@ -4764,16 +4781,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4764 if (_hash_mask) 4781 if (_hash_mask)
4765 *_hash_mask = (1 << log2qty) - 1; 4782 *_hash_mask = (1 << log2qty) - 1;
4766 4783
4767 /*
4768 * If hashdist is set, the table allocation is done with __vmalloc()
4769 * which invokes the kmemleak_alloc() callback. This function may also
4770 * be called before the slab and kmemleak are initialised when
4771 * kmemleak simply buffers the request to be executed later
4772 * (GFP_ATOMIC flag ignored in this case).
4773 */
4774 if (!hashdist)
4775 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4776
4777 return table; 4784 return table;
4778} 4785}
4779 4786
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * mm/pdflush.c - worker threads for writing back filesystem data
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * 09Apr2002 Andrew Morton
7 * Initial version
8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread.
11 */
12
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/signal.h>
16#include <linux/spinlock.h>
17#include <linux/gfp.h>
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/fs.h> /* Needed by writeback.h */
21#include <linux/writeback.h> /* Prototypes pdflush_operation() */
22#include <linux/kthread.h>
23#include <linux/cpuset.h>
24#include <linux/freezer.h>
25
26
27/*
28 * Minimum and maximum number of pdflush instances
29 */
30#define MIN_PDFLUSH_THREADS 2
31#define MAX_PDFLUSH_THREADS 8
32
33static void start_one_pdflush_thread(void);
34
35
36/*
37 * The pdflush threads are worker threads for writing back dirty data.
38 * Ideally, we'd like one thread per active disk spindle. But the disk
39 * topology is very hard to divine at this level. Instead, we take
40 * care in various places to prevent more than one pdflush thread from
41 * performing writeback against a single filesystem. pdflush threads
42 * have the PF_FLUSHER flag set in current->flags to aid in this.
43 */
44
45/*
46 * All the pdflush threads. Protected by pdflush_lock
47 */
48static LIST_HEAD(pdflush_list);
49static DEFINE_SPINLOCK(pdflush_lock);
50
51/*
52 * The count of currently-running pdflush threads. Protected
53 * by pdflush_lock.
54 *
55 * Readable by sysctl, but not writable. Published to userspace at
56 * /proc/sys/vm/nr_pdflush_threads.
57 */
58int nr_pdflush_threads = 0;
59
60/*
61 * The time at which the pdflush thread pool last went empty
62 */
63static unsigned long last_empty_jifs;
64
65/*
66 * The pdflush thread.
67 *
68 * Thread pool management algorithm:
69 *
70 * - The minimum and maximum number of pdflush instances are bound
71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
72 *
73 * - If there have been no idle pdflush instances for 1 second, create
74 * a new one.
75 *
76 * - If the least-recently-went-to-sleep pdflush thread has been asleep
77 * for more than one second, terminate a thread.
78 */
79
80/*
81 * A structure for passing work to a pdflush thread. Also for passing
82 * state information between pdflush threads. Protected by pdflush_lock.
83 */
84struct pdflush_work {
85 struct task_struct *who; /* The thread */
86 void (*fn)(unsigned long); /* A callback function */
87 unsigned long arg0; /* An argument to the callback */
88 struct list_head list; /* On pdflush_list, when idle */
89 unsigned long when_i_went_to_sleep;
90};
91
92static int __pdflush(struct pdflush_work *my_work)
93{
94 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
95 set_freezable();
96 my_work->fn = NULL;
97 my_work->who = current;
98 INIT_LIST_HEAD(&my_work->list);
99
100 spin_lock_irq(&pdflush_lock);
101 for ( ; ; ) {
102 struct pdflush_work *pdf;
103
104 set_current_state(TASK_INTERRUPTIBLE);
105 list_move(&my_work->list, &pdflush_list);
106 my_work->when_i_went_to_sleep = jiffies;
107 spin_unlock_irq(&pdflush_lock);
108 schedule();
109 try_to_freeze();
110 spin_lock_irq(&pdflush_lock);
111 if (!list_empty(&my_work->list)) {
112 /*
113 * Someone woke us up, but without removing our control
114 * structure from the global list. swsusp will do this
115 * in try_to_freeze()->refrigerator(). Handle it.
116 */
117 my_work->fn = NULL;
118 continue;
119 }
120 if (my_work->fn == NULL) {
121 printk("pdflush: bogus wakeup\n");
122 continue;
123 }
124 spin_unlock_irq(&pdflush_lock);
125
126 (*my_work->fn)(my_work->arg0);
127
128 spin_lock_irq(&pdflush_lock);
129
130 /*
131 * Thread creation: For how long have there been zero
132 * available threads?
133 *
134 * To throttle creation, we reset last_empty_jifs.
135 */
136 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
137 if (list_empty(&pdflush_list)) {
138 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
139 last_empty_jifs = jiffies;
140 nr_pdflush_threads++;
141 spin_unlock_irq(&pdflush_lock);
142 start_one_pdflush_thread();
143 spin_lock_irq(&pdflush_lock);
144 }
145 }
146 }
147
148 my_work->fn = NULL;
149
150 /*
151 * Thread destruction: For how long has the sleepiest
152 * thread slept?
153 */
154 if (list_empty(&pdflush_list))
155 continue;
156 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
157 continue;
158 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
159 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
160 /* Limit exit rate */
161 pdf->when_i_went_to_sleep = jiffies;
162 break; /* exeunt */
163 }
164 }
165 nr_pdflush_threads--;
166 spin_unlock_irq(&pdflush_lock);
167 return 0;
168}
169
170/*
171 * Of course, my_work wants to be just a local in __pdflush(). It is
172 * separated out in this manner to hopefully prevent the compiler from
173 * performing unfortunate optimisations against the auto variables, because
174 * these are visible to other tasks and CPUs. (No problem has actually
175 * been observed. This is just paranoia).
176 */
177static int pdflush(void *dummy)
178{
179 struct pdflush_work my_work;
180 cpumask_var_t cpus_allowed;
181
182 /*
183 * Since the caller doesn't even check kthread_run() worked, let's not
184 * freak out too much if this fails.
185 */
186 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
187 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
188 return 0;
189 }
190
191 /*
192 * pdflush can spend a lot of time doing encryption via dm-crypt. We
193 * don't want to do that at keventd's priority.
194 */
195 set_user_nice(current, 0);
196
197 /*
198 * Some configs put our parent kthread in a limited cpuset,
199 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
200 * Our needs are more modest - cut back to our cpuset's cpus_allowed.
201 * This is needed as pdflushes are dynamically created and destroyed.
202 * The boot-time pdflushes are easily placed without these two lines.
203 */
204 cpuset_cpus_allowed(current, cpus_allowed);
205 set_cpus_allowed_ptr(current, cpus_allowed);
206 free_cpumask_var(cpus_allowed);
207
208 return __pdflush(&my_work);
209}
210
211/*
212 * Attempt to wake up a pdflush thread, and get it to do some work for you.
213 * Returns zero if it indeed managed to find a worker thread, and passed your
214 * payload to it.
215 */
216int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
217{
218 unsigned long flags;
219 int ret = 0;
220
221 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
222
223 spin_lock_irqsave(&pdflush_lock, flags);
224 if (list_empty(&pdflush_list)) {
225 ret = -1;
226 } else {
227 struct pdflush_work *pdf;
228
229 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
230 list_del_init(&pdf->list);
231 if (list_empty(&pdflush_list))
232 last_empty_jifs = jiffies;
233 pdf->fn = fn;
234 pdf->arg0 = arg0;
235 wake_up_process(pdf->who);
236 }
237 spin_unlock_irqrestore(&pdflush_lock, flags);
238
239 return ret;
240}
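A hypothetical caller, to illustrate the calling convention: 0 means an idle worker took the payload, -1 means every worker was busy, so the caller has to cope (here by doing the work itself). Both example_* names are invented, and the pdflush_operation() prototype is assumed to be in scope:

static void example_writeback_fn(unsigned long nr_pages)
{
	/* ... write back up to nr_pages dirty pages ... */
}

static void example_kick_writeback(unsigned long nr_pages)
{
	if (pdflush_operation(example_writeback_fn, nr_pages) < 0)
		example_writeback_fn(nr_pages);	/* no idle worker available */
}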
241
242static void start_one_pdflush_thread(void)
243{
244 struct task_struct *k;
245
246 k = kthread_run(pdflush, NULL, "pdflush");
247 if (unlikely(IS_ERR(k))) {
248 spin_lock_irq(&pdflush_lock);
249 nr_pdflush_threads--;
250 spin_unlock_irq(&pdflush_lock);
251 }
252}
253
254static int __init pdflush_init(void)
255{
256 int i;
257
258 /*
259 * Pre-set nr_pdflush_threads... If we fail to create,
260 * the count will be decremented.
261 */
262 nr_pdflush_threads = MIN_PDFLUSH_THREADS;
263
264 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
265 start_one_pdflush_thread();
266 return 0;
267}
268
269module_init(pdflush_init);
diff --git a/mm/percpu.c b/mm/percpu.c
index b70f2acd8853..3311c8919f37 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,12 @@
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk consists of num_possible_cpus() units and the first chunk 11 * chunk consists of nr_cpu_ids units and the first chunk is used
12 * is used for static percpu variables in the kernel image (special 12 * for static percpu variables in the kernel image (special boot time
13 * boot time alloc/init handling necessary as these areas need to be 13 * alloc/init handling necessary as these areas need to be brought up
14 * brought up before allocation services are running). Unit grows as 14 * before allocation services are running). Unit grows as necessary
15 * necessary and all units grow or shrink in unison. When a chunk is 15 * and all units grow or shrink in unison. When a chunk is filled up,
16 * filled up, another chunk is allocated. ie. in vmalloc area 16 * another chunk is allocated. ie. in vmalloc area
17 * 17 *
18 * c0 c1 c2 18 * c0 c1 c2
19 * ------------------- ------------------- ------------ 19 * ------------------- ------------------- ------------
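The layout sketched above means each per-cpu copy of an allocation sits at a fixed stride from the chunk base: one unit per possible cpu id, back to back in vmalloc space. A standalone illustration with made-up numbers (the real stride is pcpu_unit_size):

#include <stdio.h>

#define NR_CPU_IDS	4		/* assumed values for the example */
#define UNIT_SIZE	(64UL * 1024)

/* cpu N's copy of an object lives one unit-stride per cpu past the base */
static unsigned long per_cpu_copy(unsigned long chunk_base,
				  unsigned int cpu, unsigned long offset)
{
	return chunk_base + cpu * UNIT_SIZE + offset;
}

int main(void)
{
	unsigned long base = 0xf0000000UL;	/* fake chunk base address */
	unsigned int cpu;

	for (cpu = 0; cpu < NR_CPU_IDS; cpu++)
		printf("cpu%u copy of offset 0x40: %#lx\n",
		       cpu, per_cpu_copy(base, cpu, 0x40));
	return 0;
}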
@@ -197,7 +197,12 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
198 int page_idx) 198 int page_idx)
199{ 199{
200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 200 /*
201 * Any possible cpu id can be used here, so there's no need to
202 * worry about preemption or cpu hotplug.
203 */
204 return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(),
205 page_idx) != NULL;
201} 206}
202 207
203/* set the pointer to a chunk in a page struct */ 208/* set the pointer to a chunk in a page struct */
@@ -297,6 +302,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
297 return pcpu_first_chunk; 302 return pcpu_first_chunk;
298 } 303 }
299 304
305 /*
306 * The address is relative to unit0 which might be unused and
307 * thus unmapped. Offset the address to the unit space of the
308 * current processor before looking it up in the vmalloc
309 * space. Note that any possible cpu id can be used here, so
310 * there's no need to worry about preemption or cpu hotplug.
311 */
312 addr += raw_smp_processor_id() * pcpu_unit_size;
300 return pcpu_get_page_chunk(vmalloc_to_page(addr)); 313 return pcpu_get_page_chunk(vmalloc_to_page(addr));
301} 314}
302 315
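The offsetting added in the hunk above is plain pointer arithmetic: a percpu address recorded relative to unit 0 is shifted into a unit that is guaranteed to be mapped before vmalloc_to_page() sees it. A sketch, with this_cpu standing in for raw_smp_processor_id() and unit_size for pcpu_unit_size:

/* Illustration only, not the kernel helper itself. */
static void *addr_for_lookup(void *unit0_addr, unsigned int this_cpu,
			     unsigned long unit_size)
{
	return (char *)unit0_addr + this_cpu * unit_size;
}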
@@ -558,7 +571,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 571static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
559 bool flush_tlb) 572 bool flush_tlb)
560{ 573{
561 unsigned int last = num_possible_cpus() - 1; 574 unsigned int last = nr_cpu_ids - 1;
562 unsigned int cpu; 575 unsigned int cpu;
563 576
564 /* unmap must not be done on immutable chunk */ 577 /* unmap must not be done on immutable chunk */
@@ -643,7 +656,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
643 */ 656 */
644static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 657static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
645{ 658{
646 unsigned int last = num_possible_cpus() - 1; 659 unsigned int last = nr_cpu_ids - 1;
647 unsigned int cpu; 660 unsigned int cpu;
648 int err; 661 int err;
649 662
@@ -749,7 +762,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
749 chunk->map[chunk->map_used++] = pcpu_unit_size; 762 chunk->map[chunk->map_used++] = pcpu_unit_size;
750 chunk->page = chunk->page_ar; 763 chunk->page = chunk->page_ar;
751 764
752 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); 765 chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
753 if (!chunk->vm) { 766 if (!chunk->vm) {
754 free_pcpu_chunk(chunk); 767 free_pcpu_chunk(chunk);
755 return NULL; 768 return NULL;
@@ -1067,9 +1080,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1067 PFN_UP(size_sum)); 1080 PFN_UP(size_sum));
1068 1081
1069 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1082 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1070 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; 1083 pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
1071 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) 1084 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
1072 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); 1085 + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
1073 1086
1074 if (dyn_size < 0) 1087 if (dyn_size < 0)
1075 dyn_size = pcpu_unit_size - static_size - reserved_size; 1088 dyn_size = pcpu_unit_size - static_size - reserved_size;
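Since nr_cpu_ids is the highest possible cpu id plus one, the chunk now reserves one unit per id, including holes in a sparse id space. A worked example with assumed numbers (4 cpu ids, 8 pages per unit, 4 KB pages):

#include <stdio.h>

#define PAGE_SHIFT	12	/* 4 KB pages; all values assumed */
#define NR_CPU_IDS	4
#define UNIT_PAGES	8

int main(void)
{
	unsigned long unit_size  = (unsigned long)UNIT_PAGES << PAGE_SHIFT;
	unsigned long chunk_size = NR_CPU_IDS * unit_size;
	unsigned long ptr_bytes  = NR_CPU_IDS * UNIT_PAGES * sizeof(void *);

	/* 32 KB per unit, 128 KB per chunk, 32 page pointers per chunk */
	printf("unit %lu, chunk %lu, page-pointer array %lu bytes\n",
	       unit_size, chunk_size, ptr_bytes);
	return 0;
}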
@@ -1248,7 +1261,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1248 } else 1261 } else
1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1262 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
1250 1263
1251 chunk_size = pcpue_unit_size * num_possible_cpus(); 1264 chunk_size = pcpue_unit_size * nr_cpu_ids;
1252 1265
1253 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, 1266 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
1254 __pa(MAX_DMA_ADDRESS)); 1267 __pa(MAX_DMA_ADDRESS));
@@ -1259,12 +1272,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1259 } 1272 }
1260 1273
1261 /* return the leftover and copy */ 1274 /* return the leftover and copy */
1262 for_each_possible_cpu(cpu) { 1275 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
1263 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1276 void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
1264 1277
1265 free_bootmem(__pa(ptr + pcpue_size), 1278 if (cpu_possible(cpu)) {
1266 pcpue_unit_size - pcpue_size); 1279 free_bootmem(__pa(ptr + pcpue_size),
1267 memcpy(ptr, __per_cpu_load, static_size); 1280 pcpue_unit_size - pcpue_size);
1281 memcpy(ptr, __per_cpu_load, static_size);
1282 } else
1283 free_bootmem(__pa(ptr), pcpue_unit_size);
1268 } 1284 }
1269 1285
1270 /* we're ready, commit */ 1286 /* we're ready, commit */
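The rewritten loop walks every unit up to nr_cpu_ids rather than only the possible cpus, so a hole in the possible-cpu map returns its whole unit to bootmem, while real units only give back the tail beyond pcpue_size. A toy walk under an assumed possible map of {0, 1, 3}:

#include <stdio.h>

static int cpu_possible_stub(unsigned int cpu)
{
	return cpu == 0 || cpu == 1 || cpu == 3;	/* assumed map */
}

int main(void)
{
	unsigned int cpu, nr_cpu_ids = 4;

	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (cpu_possible_stub(cpu))
			printf("unit %u: copy static data, free the tail\n", cpu);
		else
			printf("unit %u: hole, free the whole unit\n", cpu);
	}
	return 0;
}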
diff --git a/mm/rmap.c b/mm/rmap.c
index 836c6c63e1f2..0895b5c7cbff 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -358,6 +358,7 @@ static int page_referenced_one(struct page *page,
358 */ 358 */
359 if (vma->vm_flags & VM_LOCKED) { 359 if (vma->vm_flags & VM_LOCKED) {
360 *mapcount = 1; /* break early from loop */ 360 *mapcount = 1; /* break early from loop */
361 *vm_flags |= VM_LOCKED;
361 goto out_unmap; 362 goto out_unmap;
362 } 363 }
363 364
diff --git a/mm/shmem.c b/mm/shmem.c
index d713239ce2ce..5a0b3d4055f3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2446,7 +2446,7 @@ static const struct inode_operations shmem_inode_operations = {
2446 .getxattr = generic_getxattr, 2446 .getxattr = generic_getxattr,
2447 .listxattr = generic_listxattr, 2447 .listxattr = generic_listxattr,
2448 .removexattr = generic_removexattr, 2448 .removexattr = generic_removexattr,
2449 .permission = shmem_permission, 2449 .check_acl = shmem_check_acl,
2450#endif 2450#endif
2451 2451
2452}; 2452};
@@ -2469,7 +2469,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2469 .getxattr = generic_getxattr, 2469 .getxattr = generic_getxattr,
2470 .listxattr = generic_listxattr, 2470 .listxattr = generic_listxattr,
2471 .removexattr = generic_removexattr, 2471 .removexattr = generic_removexattr,
2472 .permission = shmem_permission, 2472 .check_acl = shmem_check_acl,
2473#endif 2473#endif
2474}; 2474};
2475 2475
@@ -2480,7 +2480,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2480 .getxattr = generic_getxattr, 2480 .getxattr = generic_getxattr,
2481 .listxattr = generic_listxattr, 2481 .listxattr = generic_listxattr,
2482 .removexattr = generic_removexattr, 2482 .removexattr = generic_removexattr,
2483 .permission = shmem_permission, 2483 .check_acl = shmem_check_acl,
2484#endif 2484#endif
2485}; 2485};
2486 2486
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index 606a8e757a42..df2c87fdae50 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir)
157/** 157/**
158 * shmem_check_acl - check_acl() callback for generic_permission() 158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */ 159 */
160static int 160int
161shmem_check_acl(struct inode *inode, int mask) 161shmem_check_acl(struct inode *inode, int mask)
162{ 162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); 163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
@@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask)
169 } 169 }
170 return -EAGAIN; 170 return -EAGAIN;
171} 171}
172
173/**
174 * shmem_permission - permission() inode operation
175 */
176int
177shmem_permission(struct inode *inode, int mask)
178{
179 return generic_permission(inode, mask, shmem_check_acl);
180}
diff --git a/mm/slob.c b/mm/slob.c
index 9641da3d5e58..837ebd64cc34 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -692,3 +692,8 @@ void __init kmem_cache_init(void)
692{ 692{
693 slob_ready = 1; 693 slob_ready = 1;
694} 694}
695
696void __init kmem_cache_init_late(void)
697{
698 /* Nothing to do */
699}
diff --git a/mm/slub.c b/mm/slub.c
index b1cb2dfa109a..417ed843b251 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -21,7 +21,6 @@
21#include <linux/kmemcheck.h> 21#include <linux/kmemcheck.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/kmemleak.h>
25#include <linux/mempolicy.h> 24#include <linux/mempolicy.h>
26#include <linux/ctype.h> 25#include <linux/ctype.h>
27#include <linux/debugobjects.h> 26#include <linux/debugobjects.h>
@@ -1127,8 +1126,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1127 } 1126 }
1128 1127
1129 if (kmemcheck_enabled 1128 if (kmemcheck_enabled
1130 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) 1129 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1131 {
1132 int pages = 1 << oo_order(oo); 1130 int pages = 1 << oo_order(oo);
1133 1131
1134 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1132 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
@@ -2023,7 +2021,7 @@ static inline int calculate_order(int size)
2023 return order; 2021 return order;
2024 fraction /= 2; 2022 fraction /= 2;
2025 } 2023 }
2026 min_objects --; 2024 min_objects--;
2027 } 2025 }
2028 2026
2029 /* 2027 /*
@@ -2629,8 +2627,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2629 */ 2627 */
2630void kmem_cache_destroy(struct kmem_cache *s) 2628void kmem_cache_destroy(struct kmem_cache *s)
2631{ 2629{
2632 if (s->flags & SLAB_DESTROY_BY_RCU)
2633 rcu_barrier();
2634 down_write(&slub_lock); 2630 down_write(&slub_lock);
2635 s->refcount--; 2631 s->refcount--;
2636 if (!s->refcount) { 2632 if (!s->refcount) {
@@ -2641,6 +2637,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2641 "still has objects.\n", s->name, __func__); 2637 "still has objects.\n", s->name, __func__);
2642 dump_stack(); 2638 dump_stack();
2643 } 2639 }
2640 if (s->flags & SLAB_DESTROY_BY_RCU)
2641 rcu_barrier();
2644 sysfs_slab_remove(s); 2642 sysfs_slab_remove(s);
2645 } else 2643 } else
2646 up_write(&slub_lock); 2644 up_write(&slub_lock);
@@ -2874,13 +2872,15 @@ EXPORT_SYMBOL(__kmalloc);
2874static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2872static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2875{ 2873{
2876 struct page *page; 2874 struct page *page;
2875 void *ptr = NULL;
2877 2876
2878 flags |= __GFP_COMP | __GFP_NOTRACK; 2877 flags |= __GFP_COMP | __GFP_NOTRACK;
2879 page = alloc_pages_node(node, flags, get_order(size)); 2878 page = alloc_pages_node(node, flags, get_order(size));
2880 if (page) 2879 if (page)
2881 return page_address(page); 2880 ptr = page_address(page);
2882 else 2881
2883 return NULL; 2882 kmemleak_alloc(ptr, size, 1, flags);
2883 return ptr;
2884} 2884}
2885 2885
2886#ifdef CONFIG_NUMA 2886#ifdef CONFIG_NUMA
@@ -2965,6 +2965,7 @@ void kfree(const void *x)
2965 page = virt_to_head_page(x); 2965 page = virt_to_head_page(x);
2966 if (unlikely(!PageSlab(page))) { 2966 if (unlikely(!PageSlab(page))) {
2967 BUG_ON(!PageCompound(page)); 2967 BUG_ON(!PageCompound(page));
2968 kmemleak_free(x);
2968 put_page(page); 2969 put_page(page);
2969 return; 2970 return;
2970 } 2971 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..5ae6b8b78c80 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
34}; 34};
35 35
36static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
37 .name = "swap",
37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 38 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
38 .unplug_io_fn = swap_unplug_io_fn, 39 .unplug_io_fn = swap_unplug_io_fn,
39}; 40};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d1ade1a48ee7..8ffdc0d23c53 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
753 753
754 if (!bdev) { 754 if (!bdev) {
755 if (bdev_p) 755 if (bdev_p)
756 *bdev_p = bdget(sis->bdev->bd_dev); 756 *bdev_p = bdgrab(sis->bdev);
757 757
758 spin_unlock(&swap_lock); 758 spin_unlock(&swap_lock);
759 return i; 759 return i;
@@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
765 struct swap_extent, list); 765 struct swap_extent, list);
766 if (se->start_block == offset) { 766 if (se->start_block == offset) {
767 if (bdev_p) 767 if (bdev_p)
768 *bdev_p = bdget(sis->bdev->bd_dev); 768 *bdev_p = bdgrab(sis->bdev);
769 769
770 spin_unlock(&swap_lock); 770 spin_unlock(&swap_lock);
771 bdput(bdev); 771 bdput(bdev);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 54155268dfca..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -630,9 +630,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
630 630
631 referenced = page_referenced(page, 1, 631 referenced = page_referenced(page, 1,
632 sc->mem_cgroup, &vm_flags); 632 sc->mem_cgroup, &vm_flags);
633 /* In active use or really unfreeable? Activate it. */ 633 /*
634 * In active use or really unfreeable? Activate it.
                                                               635        * If a page with PG_mlocked set lost the isolation race,
                                                               636        * try_to_unmap() moves it to the unevictable list
637 */
634 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 638 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
635 referenced && page_mapping_inuse(page)) 639 referenced && page_mapping_inuse(page)
640 && !(vm_flags & VM_LOCKED))
636 goto activate_locked; 641 goto activate_locked;
637 642
638 /* 643 /*
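Read together with the rmap.c hunk above, which now reports VM_LOCKED back through vm_flags, the extra test keeps a locked page from being re-activated here, so try_to_unmap() can move it to the unevictable list instead. A condensed, purely illustrative restatement of the decision (the helper name is invented):

static int example_should_activate(int referenced, int mapping_inuse,
				   unsigned long vm_flags, int order)
{
	if (vm_flags & VM_LOCKED)
		return 0;	/* leave it for try_to_unmap() to park */
	return order <= PAGE_ALLOC_COSTLY_ORDER && referenced && mapping_inuse;
}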
@@ -1104,7 +1109,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1104 */ 1109 */
1105 if (nr_freed < nr_taken && !current_is_kswapd() && 1110 if (nr_freed < nr_taken && !current_is_kswapd() &&
1106 lumpy_reclaim) { 1111 lumpy_reclaim) {
1107 congestion_wait(WRITE, HZ/10); 1112 congestion_wait(BLK_RW_ASYNC, HZ/10);
1108 1113
1109 /* 1114 /*
1110 * The attempt at page out may have made some 1115 * The attempt at page out may have made some
@@ -1715,13 +1720,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1715 */ 1720 */
1716 if (total_scanned > sc->swap_cluster_max + 1721 if (total_scanned > sc->swap_cluster_max +
1717 sc->swap_cluster_max / 2) { 1722 sc->swap_cluster_max / 2) {
1718 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1723 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1719 sc->may_writepage = 1; 1724 sc->may_writepage = 1;
1720 } 1725 }
1721 1726
1722 /* Take a nap, wait for some writeback to complete */ 1727 /* Take a nap, wait for some writeback to complete */
1723 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1728 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1724 congestion_wait(WRITE, HZ/10); 1729 congestion_wait(BLK_RW_ASYNC, HZ/10);
1725 } 1730 }
1726 /* top priority shrink_zones still had more to do? don't OOM, then */ 1731 /* top priority shrink_zones still had more to do? don't OOM, then */
1727 if (!sc->all_unreclaimable && scanning_global_lru(sc)) 1732 if (!sc->all_unreclaimable && scanning_global_lru(sc))
@@ -1960,7 +1965,7 @@ loop_again:
1960 * another pass across the zones. 1965 * another pass across the zones.
1961 */ 1966 */
1962 if (total_scanned && priority < DEF_PRIORITY - 2) 1967 if (total_scanned && priority < DEF_PRIORITY - 2)
1963 congestion_wait(WRITE, HZ/10); 1968 congestion_wait(BLK_RW_ASYNC, HZ/10);
1964 1969
1965 /* 1970 /*
1966 * We do this so kswapd doesn't build up large priorities for 1971 * We do this so kswapd doesn't build up large priorities for
@@ -2233,7 +2238,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2233 goto out; 2238 goto out;
2234 2239
2235 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 2240 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2236 congestion_wait(WRITE, HZ / 10); 2241 congestion_wait(BLK_RW_ASYNC, HZ / 10);
2237 } 2242 }
2238 } 2243 }
2239 2244