author    David S. Miller <davem@davemloft.net>  2009-09-11 23:35:13 -0400
committer David S. Miller <davem@davemloft.net>  2009-09-11 23:35:13 -0400
commit    cabc5c0f7fa1342049042d6e147db5a73773955b (patch)
tree      2be09ae1777d580c7dfe05d6d5b76e57281ec447 /mm
parent    b73d884756303316ead4cd7dad51236b2a515a1a (diff)
parent    86d710146fb9975f04c505ec78caa43d227c1018 (diff)
Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
Conflicts:
	arch/sparc/Kconfig
Diffstat (limited to 'mm')
 -rw-r--r--  mm/Makefile         |    2
 -rw-r--r--  mm/backing-dev.c    |  381
 -rw-r--r--  mm/bootmem.c        |    6
 -rw-r--r--  mm/kmemleak.c       |  336
 -rw-r--r--  mm/nommu.c          |    3
 -rw-r--r--  mm/page-writeback.c |  182
 -rw-r--r--  mm/page_alloc.c     |    6
 -rw-r--r--  mm/pdflush.c        |  269
 -rw-r--r--  mm/percpu.c         |   15
 -rw-r--r--  mm/shmem.c          |    6
 -rw-r--r--  mm/shmem_acl.c      |   11
 -rw-r--r--  mm/slub.c           |    4
 -rw-r--r--  mm/swap_state.c     |    1
 -rw-r--r--  mm/vmscan.c         |    2
 14 files changed, 680 insertions(+), 544 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..147a7a7873c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
 			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   maccess.o page_alloc.o page-writeback.o pdflush.o \
+			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..d3ca0dac1111 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
1 1
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/kthread.h>
5#include <linux/freezer.h>
4#include <linux/fs.h> 6#include <linux/fs.h>
5#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/mm.h>
6#include <linux/sched.h> 9#include <linux/sched.h>
7#include <linux/module.h> 10#include <linux/module.h>
8#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
14EXPORT_SYMBOL(default_unplug_io_fn); 17EXPORT_SYMBOL(default_unplug_io_fn);
15 18
16struct backing_dev_info default_backing_dev_info = { 19struct backing_dev_info default_backing_dev_info = {
20 .name = "default",
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 21 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0, 22 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY, 23 .capabilities = BDI_CAP_MAP_COPY,
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = {
22EXPORT_SYMBOL_GPL(default_backing_dev_info); 26EXPORT_SYMBOL_GPL(default_backing_dev_info);
23 27
24static struct class *bdi_class; 28static struct class *bdi_class;
29DEFINE_SPINLOCK(bdi_lock);
30LIST_HEAD(bdi_list);
31LIST_HEAD(bdi_pending_list);
32
33static struct task_struct *sync_supers_tsk;
34static struct timer_list sync_supers_timer;
35
36static int bdi_sync_supers(void *);
37static void sync_supers_timer_fn(unsigned long);
38static void arm_supers_timer(void);
39
40static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
25 41
26#ifdef CONFIG_DEBUG_FS 42#ifdef CONFIG_DEBUG_FS
27#include <linux/debugfs.h> 43#include <linux/debugfs.h>
@@ -37,9 +53,29 @@ static void bdi_debug_init(void)
37static int bdi_debug_stats_show(struct seq_file *m, void *v) 53static int bdi_debug_stats_show(struct seq_file *m, void *v)
38{ 54{
39 struct backing_dev_info *bdi = m->private; 55 struct backing_dev_info *bdi = m->private;
56 struct bdi_writeback *wb;
40 unsigned long background_thresh; 57 unsigned long background_thresh;
41 unsigned long dirty_thresh; 58 unsigned long dirty_thresh;
42 unsigned long bdi_thresh; 59 unsigned long bdi_thresh;
60 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
61 struct inode *inode;
62
63 /*
64 * inode lock is enough here, the bdi->wb_list is protected by
65 * RCU on the reader side
66 */
67 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
68 spin_lock(&inode_lock);
69 list_for_each_entry(wb, &bdi->wb_list, list) {
70 nr_wb++;
71 list_for_each_entry(inode, &wb->b_dirty, i_list)
72 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_list)
74 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_list)
76 nr_more_io++;
77 }
78 spin_unlock(&inode_lock);
43 79
44 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 80 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
45 81
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
49 "BdiReclaimable: %8lu kB\n" 85 "BdiReclaimable: %8lu kB\n"
50 "BdiDirtyThresh: %8lu kB\n" 86 "BdiDirtyThresh: %8lu kB\n"
51 "DirtyThresh: %8lu kB\n" 87 "DirtyThresh: %8lu kB\n"
52 "BackgroundThresh: %8lu kB\n", 88 "BackgroundThresh: %8lu kB\n"
89 "WriteBack threads:%8lu\n"
90 "b_dirty: %8lu\n"
91 "b_io: %8lu\n"
92 "b_more_io: %8lu\n"
93 "bdi_list: %8u\n"
94 "state: %8lx\n"
95 "wb_mask: %8lx\n"
96 "wb_list: %8u\n"
97 "wb_cnt: %8u\n",
53 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 98 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
54 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 99 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
55 K(bdi_thresh), 100 K(bdi_thresh), K(dirty_thresh),
56 K(dirty_thresh), 101 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
57 K(background_thresh)); 102 !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
103 !list_empty(&bdi->wb_list), bdi->wb_cnt);
58#undef K 104#undef K
59 105
60 return 0; 106 return 0;
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void)
185{ 231{
186 int err; 232 int err;
187 233
234 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
235 BUG_ON(IS_ERR(sync_supers_tsk));
236
237 init_timer(&sync_supers_timer);
238 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
239 arm_supers_timer();
240
188 err = bdi_init(&default_backing_dev_info); 241 err = bdi_init(&default_backing_dev_info);
189 if (!err) 242 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default"); 243 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +246,248 @@ static int __init default_bdi_init(void)
193} 246}
194subsys_initcall(default_bdi_init); 247subsys_initcall(default_bdi_init);
195 248
249static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
250{
251 memset(wb, 0, sizeof(*wb));
252
253 wb->bdi = bdi;
254 wb->last_old_flush = jiffies;
255 INIT_LIST_HEAD(&wb->b_dirty);
256 INIT_LIST_HEAD(&wb->b_io);
257 INIT_LIST_HEAD(&wb->b_more_io);
258}
259
260static void bdi_task_init(struct backing_dev_info *bdi,
261 struct bdi_writeback *wb)
262{
263 struct task_struct *tsk = current;
264
265 spin_lock(&bdi->wb_lock);
266 list_add_tail_rcu(&wb->list, &bdi->wb_list);
267 spin_unlock(&bdi->wb_lock);
268
269 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
270 set_freezable();
271
272 /*
273 * Our parent may run at a different priority, just set us to normal
274 */
275 set_user_nice(tsk, 0);
276}
277
278static int bdi_start_fn(void *ptr)
279{
280 struct bdi_writeback *wb = ptr;
281 struct backing_dev_info *bdi = wb->bdi;
282 int ret;
283
284 /*
285 * Add us to the active bdi_list
286 */
287 spin_lock(&bdi_lock);
288 list_add(&bdi->bdi_list, &bdi_list);
289 spin_unlock(&bdi_lock);
290
291 bdi_task_init(bdi, wb);
292
293 /*
294 * Clear pending bit and wakeup anybody waiting to tear us down
295 */
296 clear_bit(BDI_pending, &bdi->state);
297 smp_mb__after_clear_bit();
298 wake_up_bit(&bdi->state, BDI_pending);
299
300 ret = bdi_writeback_task(wb);
301
302 /*
303 * Remove us from the list
304 */
305 spin_lock(&bdi->wb_lock);
306 list_del_rcu(&wb->list);
307 spin_unlock(&bdi->wb_lock);
308
309 /*
310 * Flush any work that raced with us exiting. No new work
311 * will be added, since this bdi isn't discoverable anymore.
312 */
313 if (!list_empty(&bdi->work_list))
314 wb_do_writeback(wb, 1);
315
316 wb->task = NULL;
317 return ret;
318}
319
320int bdi_has_dirty_io(struct backing_dev_info *bdi)
321{
322 return wb_has_dirty_io(&bdi->wb);
323}
324
325static void bdi_flush_io(struct backing_dev_info *bdi)
326{
327 struct writeback_control wbc = {
328 .bdi = bdi,
329 .sync_mode = WB_SYNC_NONE,
330 .older_than_this = NULL,
331 .range_cyclic = 1,
332 .nr_to_write = 1024,
333 };
334
335 writeback_inodes_wbc(&wbc);
336}
337
338/*
339 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
340 * or we risk deadlocking on ->s_umount. The longer term solution would be
341 * to implement sync_supers_bdi() or similar and simply do it from the
342 * bdi writeback tasks individually.
343 */
344static int bdi_sync_supers(void *unused)
345{
346 set_user_nice(current, 0);
347
348 while (!kthread_should_stop()) {
349 set_current_state(TASK_INTERRUPTIBLE);
350 schedule();
351
352 /*
353 * Do this periodically, like kupdated() did before.
354 */
355 sync_supers();
356 }
357
358 return 0;
359}
360
361static void arm_supers_timer(void)
362{
363 unsigned long next;
364
365 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
366 mod_timer(&sync_supers_timer, round_jiffies_up(next));
367}
368
369static void sync_supers_timer_fn(unsigned long unused)
370{
371 wake_up_process(sync_supers_tsk);
372 arm_supers_timer();
373}
374
375static int bdi_forker_task(void *ptr)
376{
377 struct bdi_writeback *me = ptr;
378
379 bdi_task_init(me->bdi, me);
380
381 for (;;) {
382 struct backing_dev_info *bdi, *tmp;
383 struct bdi_writeback *wb;
384
385 /*
386 * Temporary measure, we want to make sure we don't see
387 * dirty data on the default backing_dev_info
388 */
389 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
390 wb_do_writeback(me, 0);
391
392 spin_lock(&bdi_lock);
393
394 /*
395 * Check if any existing bdi's have dirty data without
396 * a thread registered. If so, set that up.
397 */
398 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
399 if (bdi->wb.task)
400 continue;
401 if (list_empty(&bdi->work_list) &&
402 !bdi_has_dirty_io(bdi))
403 continue;
404
405 bdi_add_default_flusher_task(bdi);
406 }
407
408 set_current_state(TASK_INTERRUPTIBLE);
409
410 if (list_empty(&bdi_pending_list)) {
411 unsigned long wait;
412
413 spin_unlock(&bdi_lock);
414 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
415 schedule_timeout(wait);
416 try_to_freeze();
417 continue;
418 }
419
420 __set_current_state(TASK_RUNNING);
421
422 /*
423 * This is our real job - check for pending entries in
424 * bdi_pending_list, and create the tasks that got added
425 */
426 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
427 bdi_list);
428 list_del_init(&bdi->bdi_list);
429 spin_unlock(&bdi_lock);
430
431 wb = &bdi->wb;
432 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
433 dev_name(bdi->dev));
434 /*
435 * If task creation fails, then readd the bdi to
436 * the pending list and force writeout of the bdi
437 * from this forker thread. That will free some memory
438 * and we can try again.
439 */
440 if (IS_ERR(wb->task)) {
441 wb->task = NULL;
442
443 /*
444 * Add this 'bdi' to the back, so we get
445 * a chance to flush other bdi's to free
446 * memory.
447 */
448 spin_lock(&bdi_lock);
449 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
450 spin_unlock(&bdi_lock);
451
452 bdi_flush_io(bdi);
453 }
454 }
455
456 return 0;
457}
458
459/*
460 * Add the default flusher task that gets created for any bdi
461 * that has dirty data pending writeout
462 */
 463static void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
464{
465 if (!bdi_cap_writeback_dirty(bdi))
466 return;
467
468 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
469 printk(KERN_ERR "bdi %p/%s is not registered!\n",
470 bdi, bdi->name);
471 return;
472 }
473
474 /*
475 * Check with the helper whether to proceed adding a task. Will only
476 * abort if we two or more simultanous calls to
477 * bdi_add_default_flusher_task() occured, further additions will block
478 * waiting for previous additions to finish.
479 */
480 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
481 list_move_tail(&bdi->bdi_list, &bdi_pending_list);
482
483 /*
484 * We are now on the pending list, wake up bdi_forker_task()
485 * to finish the job and add us back to the active bdi_list
486 */
487 wake_up_process(default_backing_dev_info.wb.task);
488 }
489}
490
196int bdi_register(struct backing_dev_info *bdi, struct device *parent, 491int bdi_register(struct backing_dev_info *bdi, struct device *parent,
197 const char *fmt, ...) 492 const char *fmt, ...)
198{ 493{
@@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
211 goto exit; 506 goto exit;
212 } 507 }
213 508
509 spin_lock(&bdi_lock);
510 list_add_tail(&bdi->bdi_list, &bdi_list);
511 spin_unlock(&bdi_lock);
512
214 bdi->dev = dev; 513 bdi->dev = dev;
215 bdi_debug_register(bdi, dev_name(dev));
216 514
515 /*
516 * Just start the forker thread for our default backing_dev_info,
517 * and add other bdi's to the list. They will get a thread created
518 * on-demand when they need it.
519 */
520 if (bdi_cap_flush_forker(bdi)) {
521 struct bdi_writeback *wb = &bdi->wb;
522
523 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
524 dev_name(dev));
525 if (IS_ERR(wb->task)) {
526 wb->task = NULL;
527 ret = -ENOMEM;
528
529 spin_lock(&bdi_lock);
530 list_del(&bdi->bdi_list);
531 spin_unlock(&bdi_lock);
532 goto exit;
533 }
534 }
535
536 bdi_debug_register(bdi, dev_name(dev));
537 set_bit(BDI_registered, &bdi->state);
217exit: 538exit:
218 return ret; 539 return ret;
219} 540}
@@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
225} 546}
226EXPORT_SYMBOL(bdi_register_dev); 547EXPORT_SYMBOL(bdi_register_dev);
227 548
549/*
550 * Remove bdi from the global list and shutdown any threads we have running
551 */
552static void bdi_wb_shutdown(struct backing_dev_info *bdi)
553{
554 struct bdi_writeback *wb;
555
556 if (!bdi_cap_writeback_dirty(bdi))
557 return;
558
559 /*
560 * If setup is pending, wait for that to complete first
561 */
562 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
563 TASK_UNINTERRUPTIBLE);
564
565 /*
566 * Make sure nobody finds us on the bdi_list anymore
567 */
568 spin_lock(&bdi_lock);
569 list_del(&bdi->bdi_list);
570 spin_unlock(&bdi_lock);
571
572 /*
573 * Finally, kill the kernel threads. We don't need to be RCU
574 * safe anymore, since the bdi is gone from visibility.
575 */
576 list_for_each_entry(wb, &bdi->wb_list, list)
577 kthread_stop(wb->task);
578}
579
228void bdi_unregister(struct backing_dev_info *bdi) 580void bdi_unregister(struct backing_dev_info *bdi)
229{ 581{
230 if (bdi->dev) { 582 if (bdi->dev) {
583 if (!bdi_cap_flush_forker(bdi))
584 bdi_wb_shutdown(bdi);
231 bdi_debug_unregister(bdi); 585 bdi_debug_unregister(bdi);
232 device_unregister(bdi->dev); 586 device_unregister(bdi->dev);
233 bdi->dev = NULL; 587 bdi->dev = NULL;
@@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister);
237 591
238int bdi_init(struct backing_dev_info *bdi) 592int bdi_init(struct backing_dev_info *bdi)
239{ 593{
240 int i; 594 int i, err;
241 int err;
242 595
243 bdi->dev = NULL; 596 bdi->dev = NULL;
244 597
245 bdi->min_ratio = 0; 598 bdi->min_ratio = 0;
246 bdi->max_ratio = 100; 599 bdi->max_ratio = 100;
247 bdi->max_prop_frac = PROP_FRAC_BASE; 600 bdi->max_prop_frac = PROP_FRAC_BASE;
601 spin_lock_init(&bdi->wb_lock);
602 INIT_LIST_HEAD(&bdi->bdi_list);
603 INIT_LIST_HEAD(&bdi->wb_list);
604 INIT_LIST_HEAD(&bdi->work_list);
605
606 bdi_wb_init(&bdi->wb, bdi);
607
608 /*
609 * Just one thread support for now, hard code mask and count
610 */
611 bdi->wb_mask = 1;
612 bdi->wb_cnt = 1;
248 613
249 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 614 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
250 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 615 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
269{ 634{
270 int i; 635 int i;
271 636
637 WARN_ON(bdi_has_dirty_io(bdi));
638
272 bdi_unregister(bdi); 639 bdi_unregister(bdi);
273 640
274 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 641 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
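
The backing-dev.c changes above replace the global pdflush pool with a per-device scheme: a single forker thread (bdi_forker_task) watches bdi_pending_list and spawns one flusher thread per backing device on demand, while the BDI_pending bit keeps concurrent bdi_add_default_flusher_task() calls from queueing the same device twice. The userspace pthread sketch below models only that pattern; the names, the pthread plumbing and the simplified list handling are illustrative assumptions, not the kernel implementation.

/* forker.c - userspace analogue of the on-demand flusher-thread pattern */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct fake_bdi {
	const char *name;
	atomic_flag pending;		/* plays the role of BDI_pending */
	pthread_t worker;
	struct fake_bdi *next;		/* singly linked "bdi_pending_list" */
};

static struct fake_bdi *pending_list;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t list_cond = PTHREAD_COND_INITIALIZER;

static void *worker_fn(void *arg)
{
	struct fake_bdi *bdi = arg;

	/* like bdi_start_fn(): clear the pending flag once we are running */
	atomic_flag_clear(&bdi->pending);
	printf("flusher thread started for %s\n", bdi->name);
	/* ... would now write back dirty data until told to stop ... */
	return NULL;
}

/* analogue of bdi_add_default_flusher_task(): queue once, wake the forker */
static void add_flusher(struct fake_bdi *bdi)
{
	if (atomic_flag_test_and_set(&bdi->pending))
		return;			/* already queued or being set up */

	pthread_mutex_lock(&list_lock);
	bdi->next = pending_list;
	pending_list = bdi;
	pthread_cond_signal(&list_cond);
	pthread_mutex_unlock(&list_lock);
}

/* analogue of bdi_forker_task(): pop pending devices, spawn their workers */
static void *forker_fn(void *arg)
{
	(void)arg;
	for (;;) {
		struct fake_bdi *bdi;

		pthread_mutex_lock(&list_lock);
		while (pending_list == NULL)
			pthread_cond_wait(&list_cond, &list_lock);
		bdi = pending_list;
		pending_list = bdi->next;
		pthread_mutex_unlock(&list_lock);

		pthread_create(&bdi->worker, NULL, worker_fn, bdi);
	}
	return NULL;
}

int main(void)
{
	static struct fake_bdi sda = { .name = "sda", .pending = ATOMIC_FLAG_INIT };
	pthread_t forker;

	pthread_create(&forker, NULL, forker_fn, NULL);
	add_flusher(&sda);
	add_flusher(&sda);	/* typically a no-op while still pending */
	sleep(1);		/* crude: give the threads time to run */
	return 0;
}

As in bdi_start_fn() above, the worker clears the pending flag once it is running, so a later request for the same device can queue it again.
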
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 701740c9e81b..555d5d2731c6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -521,7 +521,11 @@ find_block:
 		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
 				start_off);
 		memset(region, 0, size);
-		kmemleak_alloc(region, size, 1, 0);
+		/*
+		 * The min_count is set to 0 so that bootmem allocated blocks
+		 * are never reported as leaks.
+		 */
+		kmemleak_alloc(region, size, 0, 0);
 		return region;
 	}
 
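
The bootmem hunk above passes min_count == 0 so that kmemleak colours bootmem blocks grey and never reports them, whereas min_count == 1 leaves a block white (a leak candidate) until a scan finds at least one reference, and min_count == -1 (KMEMLEAK_BLACK) excludes it entirely. The userspace sketch below models just that colouring rule; the struct and function names are assumptions, and it ignores the count == -1 "not yet scanned" case handled by color_white() in the kmemleak diff that follows.

/* kmemleak_color.c - simplified model of white/grey/black object colouring */
#include <stdbool.h>
#include <stdio.h>

#define KMEMLEAK_GREY	0
#define KMEMLEAK_BLACK	-1

struct object {
	int count;	/* references to this block found by the last scan */
	int min_count;	/* references required before it stops being a leak */
};

/* white objects (and only white objects) end up in the leak report */
static bool reported_as_leak(const struct object *o)
{
	return o->min_count != KMEMLEAK_BLACK && o->count < o->min_count;
}

int main(void)
{
	struct object bootmem_block = { .count = 0, .min_count = KMEMLEAK_GREY };
	struct object lost_kmalloc  = { .count = 0, .min_count = 1 };
	struct object ignored_block = { .count = 0, .min_count = KMEMLEAK_BLACK };

	printf("bootmem block (min_count 0) reported: %d\n",
	       reported_as_leak(&bootmem_block));	/* 0: grey, never a leak */
	printf("unreferenced kmalloc block reported: %d\n",
	       reported_as_leak(&lost_kmalloc));	/* 1: white, reported */
	printf("ignored block (min_count -1) reported: %d\n",
	       reported_as_leak(&ignored_block));	/* 0: black, never reported */
	return 0;
}
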
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 487267310a84..4ea4510e2996 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,11 +92,13 @@
92#include <linux/string.h> 92#include <linux/string.h>
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h>
95 96
96#include <asm/sections.h> 97#include <asm/sections.h>
97#include <asm/processor.h> 98#include <asm/processor.h>
98#include <asm/atomic.h> 99#include <asm/atomic.h>
99 100
101#include <linux/kmemcheck.h>
100#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
101 103
102/* 104/*
@@ -107,6 +109,7 @@
107#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 109#define SECS_FIRST_SCAN 60 /* delay before the first scan */
108#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
109#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ 111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
110 113
111#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
112 115
@@ -120,6 +123,9 @@ struct kmemleak_scan_area {
120 size_t length; 123 size_t length;
121}; 124};
122 125
126#define KMEMLEAK_GREY 0
127#define KMEMLEAK_BLACK -1
128
123/* 129/*
124 * Structure holding the metadata for each allocated memory block. 130 * Structure holding the metadata for each allocated memory block.
125 * Modifications to such objects should be made while holding the 131 * Modifications to such objects should be made while holding the
@@ -161,6 +167,15 @@ struct kmemleak_object {
161/* flag set on newly allocated objects */ 167/* flag set on newly allocated objects */
162#define OBJECT_NEW (1 << 3) 168#define OBJECT_NEW (1 << 3)
163 169
170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16
172/* number of bytes to print at a time (1, 2, 4, 8) */
173#define HEX_GROUP_SIZE 1
174/* include ASCII after the hex output */
175#define HEX_ASCII 1
176/* max number of lines to be printed */
177#define HEX_MAX_LINES 2
178
164/* the list of all allocated objects */ 179/* the list of all allocated objects */
165static LIST_HEAD(object_list); 180static LIST_HEAD(object_list);
166/* the list of gray-colored objects (see color_gray comment below) */ 181/* the list of gray-colored objects (see color_gray comment below) */
@@ -228,11 +243,14 @@ struct early_log {
228 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
229 unsigned long offset; /* scan area offset */ 244 unsigned long offset; /* scan area offset */
230 size_t length; /* scan area length */ 245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */
231}; 248};
232 249
233/* early logging buffer and current position */ 250/* early logging buffer and current position */
234static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; 251static struct early_log
235static int crt_early_log; 252 early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
253static int crt_early_log __initdata;
236 254
237static void kmemleak_disable(void); 255static void kmemleak_disable(void);
238 256
@@ -255,6 +273,35 @@ static void kmemleak_disable(void);
255} while (0) 273} while (0)
256 274
257/* 275/*
276 * Printing of the objects hex dump to the seq file. The number of lines to be
277 * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
278 * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
279 * with the object->lock held.
280 */
281static void hex_dump_object(struct seq_file *seq,
282 struct kmemleak_object *object)
283{
284 const u8 *ptr = (const u8 *)object->pointer;
285 int i, len, remaining;
286 unsigned char linebuf[HEX_ROW_SIZE * 5];
287
288 /* limit the number of lines to HEX_MAX_LINES */
289 remaining = len =
290 min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
291
292 seq_printf(seq, " hex dump (first %d bytes):\n", len);
293 for (i = 0; i < len; i += HEX_ROW_SIZE) {
294 int linelen = min(remaining, HEX_ROW_SIZE);
295
296 remaining -= HEX_ROW_SIZE;
297 hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
298 HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
299 HEX_ASCII);
300 seq_printf(seq, " %s\n", linebuf);
301 }
302}
303
304/*
258 * Object colors, encoded with count and min_count: 305 * Object colors, encoded with count and min_count:
259 * - white - orphan object, not enough references to it (count < min_count) 306 * - white - orphan object, not enough references to it (count < min_count)
260 * - gray - not orphan, not marked as false positive (min_count == 0) or 307 * - gray - not orphan, not marked as false positive (min_count == 0) or
@@ -264,19 +311,21 @@ static void kmemleak_disable(void);
264 * Newly created objects don't have any color assigned (object->count == -1) 311 * Newly created objects don't have any color assigned (object->count == -1)
265 * before the next memory scan when they become white. 312 * before the next memory scan when they become white.
266 */ 313 */
267static int color_white(const struct kmemleak_object *object) 314static bool color_white(const struct kmemleak_object *object)
268{ 315{
269 return object->count != -1 && object->count < object->min_count; 316 return object->count != KMEMLEAK_BLACK &&
317 object->count < object->min_count;
270} 318}
271 319
272static int color_gray(const struct kmemleak_object *object) 320static bool color_gray(const struct kmemleak_object *object)
273{ 321{
274 return object->min_count != -1 && object->count >= object->min_count; 322 return object->min_count != KMEMLEAK_BLACK &&
323 object->count >= object->min_count;
275} 324}
276 325
277static int color_black(const struct kmemleak_object *object) 326static bool color_black(const struct kmemleak_object *object)
278{ 327{
279 return object->min_count == -1; 328 return object->min_count == KMEMLEAK_BLACK;
280} 329}
281 330
282/* 331/*
@@ -284,7 +333,7 @@ static int color_black(const struct kmemleak_object *object)
284 * not be deleted and have a minimum age to avoid false positives caused by 333 * not be deleted and have a minimum age to avoid false positives caused by
285 * pointers temporarily stored in CPU registers. 334 * pointers temporarily stored in CPU registers.
286 */ 335 */
287static int unreferenced_object(struct kmemleak_object *object) 336static bool unreferenced_object(struct kmemleak_object *object)
288{ 337{
289 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
290 time_before_eq(object->jiffies + jiffies_min_age, 339 time_before_eq(object->jiffies + jiffies_min_age,
@@ -304,6 +353,7 @@ static void print_unreferenced(struct seq_file *seq,
304 object->pointer, object->size); 353 object->pointer, object->size);
305 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
306 object->comm, object->pid, object->jiffies); 355 object->comm, object->pid, object->jiffies);
356 hex_dump_object(seq, object);
307 seq_printf(seq, " backtrace:\n"); 357 seq_printf(seq, " backtrace:\n");
308 358
309 for (i = 0; i < object->trace_len; i++) { 359 for (i = 0; i < object->trace_len; i++) {
@@ -330,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
330 object->comm, object->pid, object->jiffies); 380 object->comm, object->pid, object->jiffies);
331 pr_notice(" min_count = %d\n", object->min_count); 381 pr_notice(" min_count = %d\n", object->min_count);
332 pr_notice(" count = %d\n", object->count); 382 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags);
333 pr_notice(" backtrace:\n"); 384 pr_notice(" backtrace:\n");
334 print_stack_trace(&trace, 4); 385 print_stack_trace(&trace, 4);
335} 386}
@@ -434,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
434} 485}
435 486
436/* 487/*
488 * Save stack trace to the given array of MAX_TRACE size.
489 */
490static int __save_stack_trace(unsigned long *trace)
491{
492 struct stack_trace stack_trace;
493
494 stack_trace.max_entries = MAX_TRACE;
495 stack_trace.nr_entries = 0;
496 stack_trace.entries = trace;
497 stack_trace.skip = 2;
498 save_stack_trace(&stack_trace);
499
500 return stack_trace.nr_entries;
501}
502
503/*
437 * Create the metadata (struct kmemleak_object) corresponding to an allocated 504 * Create the metadata (struct kmemleak_object) corresponding to an allocated
438 * memory block and add it to the object_list and object_tree_root. 505 * memory block and add it to the object_list and object_tree_root.
439 */ 506 */
440static void create_object(unsigned long ptr, size_t size, int min_count, 507static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
441 gfp_t gfp) 508 int min_count, gfp_t gfp)
442{ 509{
443 unsigned long flags; 510 unsigned long flags;
444 struct kmemleak_object *object; 511 struct kmemleak_object *object;
445 struct prio_tree_node *node; 512 struct prio_tree_node *node;
446 struct stack_trace trace;
447 513
448 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
449 if (!object) { 515 if (!object) {
450 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
451 return; 517 return NULL;
452 } 518 }
453 519
454 INIT_LIST_HEAD(&object->object_list); 520 INIT_LIST_HEAD(&object->object_list);
@@ -482,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
482 } 548 }
483 549
484 /* kernel backtrace */ 550 /* kernel backtrace */
485 trace.max_entries = MAX_TRACE; 551 object->trace_len = __save_stack_trace(object->trace);
486 trace.nr_entries = 0;
487 trace.entries = object->trace;
488 trace.skip = 1;
489 save_stack_trace(&trace);
490 object->trace_len = trace.nr_entries;
491 552
492 INIT_PRIO_TREE_NODE(&object->tree_node); 553 INIT_PRIO_TREE_NODE(&object->tree_node);
493 object->tree_node.start = ptr; 554 object->tree_node.start = ptr;
494 object->tree_node.last = ptr + size - 1; 555 object->tree_node.last = ptr + size - 1;
495 556
496 write_lock_irqsave(&kmemleak_lock, flags); 557 write_lock_irqsave(&kmemleak_lock, flags);
558
497 min_addr = min(min_addr, ptr); 559 min_addr = min(min_addr, ptr);
498 max_addr = max(max_addr, ptr + size); 560 max_addr = max(max_addr, ptr + size);
499 node = prio_tree_insert(&object_tree_root, &object->tree_node); 561 node = prio_tree_insert(&object_tree_root, &object->tree_node);
@@ -504,20 +566,19 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
504 * random memory blocks. 566 * random memory blocks.
505 */ 567 */
506 if (node != &object->tree_node) { 568 if (node != &object->tree_node) {
507 unsigned long flags;
508
509 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 569 kmemleak_stop("Cannot insert 0x%lx into the object search tree "
510 "(already existing)\n", ptr); 570 "(already existing)\n", ptr);
511 object = lookup_object(ptr, 1); 571 object = lookup_object(ptr, 1);
512 spin_lock_irqsave(&object->lock, flags); 572 spin_lock(&object->lock);
513 dump_object_info(object); 573 dump_object_info(object);
514 spin_unlock_irqrestore(&object->lock, flags); 574 spin_unlock(&object->lock);
515 575
516 goto out; 576 goto out;
517 } 577 }
518 list_add_tail_rcu(&object->object_list, &object_list); 578 list_add_tail_rcu(&object->object_list, &object_list);
519out: 579out:
520 write_unlock_irqrestore(&kmemleak_lock, flags); 580 write_unlock_irqrestore(&kmemleak_lock, flags);
581 return object;
521} 582}
522 583
523/* 584/*
@@ -604,46 +665,55 @@ static void delete_object_part(unsigned long ptr, size_t size)
604 665
605 put_object(object); 666 put_object(object);
606} 667}
607/* 668
608 * Make a object permanently as gray-colored so that it can no longer be 669static void __paint_it(struct kmemleak_object *object, int color)
609 * reported as a leak. This is used in general to mark a false positive. 670{
610 */ 671 object->min_count = color;
611static void make_gray_object(unsigned long ptr) 672 if (color == KMEMLEAK_BLACK)
673 object->flags |= OBJECT_NO_SCAN;
674}
675
676static void paint_it(struct kmemleak_object *object, int color)
612{ 677{
613 unsigned long flags; 678 unsigned long flags;
679
680 spin_lock_irqsave(&object->lock, flags);
681 __paint_it(object, color);
682 spin_unlock_irqrestore(&object->lock, flags);
683}
684
685static void paint_ptr(unsigned long ptr, int color)
686{
614 struct kmemleak_object *object; 687 struct kmemleak_object *object;
615 688
616 object = find_and_get_object(ptr, 0); 689 object = find_and_get_object(ptr, 0);
617 if (!object) { 690 if (!object) {
618 kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); 691 kmemleak_warn("Trying to color unknown object "
692 "at 0x%08lx as %s\n", ptr,
693 (color == KMEMLEAK_GREY) ? "Grey" :
694 (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
619 return; 695 return;
620 } 696 }
621 697 paint_it(object, color);
622 spin_lock_irqsave(&object->lock, flags);
623 object->min_count = 0;
624 spin_unlock_irqrestore(&object->lock, flags);
625 put_object(object); 698 put_object(object);
626} 699}
627 700
628/* 701/*
 702 * Mark an object permanently as gray-colored so that it can no longer be
703 * reported as a leak. This is used in general to mark a false positive.
704 */
705static void make_gray_object(unsigned long ptr)
706{
707 paint_ptr(ptr, KMEMLEAK_GREY);
708}
709
710/*
629 * Mark the object as black-colored so that it is ignored from scans and 711 * Mark the object as black-colored so that it is ignored from scans and
630 * reporting. 712 * reporting.
631 */ 713 */
632static void make_black_object(unsigned long ptr) 714static void make_black_object(unsigned long ptr)
633{ 715{
634 unsigned long flags; 716 paint_ptr(ptr, KMEMLEAK_BLACK);
635 struct kmemleak_object *object;
636
637 object = find_and_get_object(ptr, 0);
638 if (!object) {
639 kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr);
640 return;
641 }
642
643 spin_lock_irqsave(&object->lock, flags);
644 object->min_count = -1;
645 spin_unlock_irqrestore(&object->lock, flags);
646 put_object(object);
647} 717}
648 718
649/* 719/*
@@ -715,14 +785,15 @@ static void object_no_scan(unsigned long ptr)
715 * Log an early kmemleak_* call to the early_log buffer. These calls will be 785 * Log an early kmemleak_* call to the early_log buffer. These calls will be
716 * processed later once kmemleak is fully initialized. 786 * processed later once kmemleak is fully initialized.
717 */ 787 */
718static void log_early(int op_type, const void *ptr, size_t size, 788static void __init log_early(int op_type, const void *ptr, size_t size,
719 int min_count, unsigned long offset, size_t length) 789 int min_count, unsigned long offset, size_t length)
720{ 790{
721 unsigned long flags; 791 unsigned long flags;
722 struct early_log *log; 792 struct early_log *log;
723 793
724 if (crt_early_log >= ARRAY_SIZE(early_log)) { 794 if (crt_early_log >= ARRAY_SIZE(early_log)) {
725 pr_warning("Early log buffer exceeded\n"); 795 pr_warning("Early log buffer exceeded, "
796 "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n");
726 kmemleak_disable(); 797 kmemleak_disable();
727 return; 798 return;
728 } 799 }
@@ -739,16 +810,45 @@ static void log_early(int op_type, const void *ptr, size_t size,
739 log->min_count = min_count; 810 log->min_count = min_count;
740 log->offset = offset; 811 log->offset = offset;
741 log->length = length; 812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace);
742 crt_early_log++; 815 crt_early_log++;
743 local_irq_restore(flags); 816 local_irq_restore(flags);
744} 817}
745 818
746/* 819/*
820 * Log an early allocated block and populate the stack trace.
821 */
822static void early_alloc(struct early_log *log)
823{
824 struct kmemleak_object *object;
825 unsigned long flags;
826 int i;
827
828 if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
829 return;
830
831 /*
832 * RCU locking needed to ensure object is not freed via put_object().
833 */
834 rcu_read_lock();
835 object = create_object((unsigned long)log->ptr, log->size,
836 log->min_count, GFP_KERNEL);
837 spin_lock_irqsave(&object->lock, flags);
838 for (i = 0; i < log->trace_len; i++)
839 object->trace[i] = log->trace[i];
840 object->trace_len = log->trace_len;
841 spin_unlock_irqrestore(&object->lock, flags);
842 rcu_read_unlock();
843}
844
845/*
747 * Memory allocation function callback. This function is called from the 846 * Memory allocation function callback. This function is called from the
748 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, 847 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
749 * vmalloc etc.). 848 * vmalloc etc.).
750 */ 849 */
751void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) 850void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
851 gfp_t gfp)
752{ 852{
753 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); 853 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
754 854
@@ -763,7 +863,7 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
763 * Memory freeing function callback. This function is called from the kernel 863 * Memory freeing function callback. This function is called from the kernel
764 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). 864 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
765 */ 865 */
766void kmemleak_free(const void *ptr) 866void __ref kmemleak_free(const void *ptr)
767{ 867{
768 pr_debug("%s(0x%p)\n", __func__, ptr); 868 pr_debug("%s(0x%p)\n", __func__, ptr);
769 869
@@ -778,7 +878,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free);
778 * Partial memory freeing function callback. This function is usually called 878 * Partial memory freeing function callback. This function is usually called
779 * from bootmem allocator when (part of) a memory block is freed. 879 * from bootmem allocator when (part of) a memory block is freed.
780 */ 880 */
781void kmemleak_free_part(const void *ptr, size_t size) 881void __ref kmemleak_free_part(const void *ptr, size_t size)
782{ 882{
783 pr_debug("%s(0x%p)\n", __func__, ptr); 883 pr_debug("%s(0x%p)\n", __func__, ptr);
784 884
@@ -793,7 +893,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free_part);
793 * Mark an already allocated memory block as a false positive. This will cause 893 * Mark an already allocated memory block as a false positive. This will cause
794 * the block to no longer be reported as leak and always be scanned. 894 * the block to no longer be reported as leak and always be scanned.
795 */ 895 */
796void kmemleak_not_leak(const void *ptr) 896void __ref kmemleak_not_leak(const void *ptr)
797{ 897{
798 pr_debug("%s(0x%p)\n", __func__, ptr); 898 pr_debug("%s(0x%p)\n", __func__, ptr);
799 899
@@ -809,7 +909,7 @@ EXPORT_SYMBOL(kmemleak_not_leak);
809 * corresponding block is not a leak and does not contain any references to 909 * corresponding block is not a leak and does not contain any references to
810 * other allocated memory blocks. 910 * other allocated memory blocks.
811 */ 911 */
812void kmemleak_ignore(const void *ptr) 912void __ref kmemleak_ignore(const void *ptr)
813{ 913{
814 pr_debug("%s(0x%p)\n", __func__, ptr); 914 pr_debug("%s(0x%p)\n", __func__, ptr);
815 915
@@ -823,8 +923,8 @@ EXPORT_SYMBOL(kmemleak_ignore);
823/* 923/*
824 * Limit the range to be scanned in an allocated memory block. 924 * Limit the range to be scanned in an allocated memory block.
825 */ 925 */
826void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, 926void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
827 gfp_t gfp) 927 size_t length, gfp_t gfp)
828{ 928{
829 pr_debug("%s(0x%p)\n", __func__, ptr); 929 pr_debug("%s(0x%p)\n", __func__, ptr);
830 930
@@ -838,7 +938,7 @@ EXPORT_SYMBOL(kmemleak_scan_area);
838/* 938/*
839 * Inform kmemleak not to scan the given memory block. 939 * Inform kmemleak not to scan the given memory block.
840 */ 940 */
841void kmemleak_no_scan(const void *ptr) 941void __ref kmemleak_no_scan(const void *ptr)
842{ 942{
843 pr_debug("%s(0x%p)\n", __func__, ptr); 943 pr_debug("%s(0x%p)\n", __func__, ptr);
844 944
@@ -882,15 +982,22 @@ static void scan_block(void *_start, void *_end,
882 unsigned long *end = _end - (BYTES_PER_POINTER - 1); 982 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
883 983
884 for (ptr = start; ptr < end; ptr++) { 984 for (ptr = start; ptr < end; ptr++) {
885 unsigned long flags;
886 unsigned long pointer = *ptr;
887 struct kmemleak_object *object; 985 struct kmemleak_object *object;
986 unsigned long flags;
987 unsigned long pointer;
888 988
889 if (allow_resched) 989 if (allow_resched)
890 cond_resched(); 990 cond_resched();
891 if (scan_should_stop()) 991 if (scan_should_stop())
892 break; 992 break;
893 993
994 /* don't scan uninitialized memory */
995 if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
996 BYTES_PER_POINTER))
997 continue;
998
999 pointer = *ptr;
1000
894 object = find_and_get_object(pointer, 1); 1001 object = find_and_get_object(pointer, 1);
895 if (!object) 1002 if (!object)
896 continue; 1003 continue;
@@ -949,10 +1056,21 @@ static void scan_object(struct kmemleak_object *object)
949 if (!(object->flags & OBJECT_ALLOCATED)) 1056 if (!(object->flags & OBJECT_ALLOCATED))
950 /* already freed object */ 1057 /* already freed object */
951 goto out; 1058 goto out;
952 if (hlist_empty(&object->area_list)) 1059 if (hlist_empty(&object->area_list)) {
953 scan_block((void *)object->pointer, 1060 void *start = (void *)object->pointer;
954 (void *)(object->pointer + object->size), object, 0); 1061 void *end = (void *)(object->pointer + object->size);
955 else 1062
1063 while (start < end && (object->flags & OBJECT_ALLOCATED) &&
1064 !(object->flags & OBJECT_NO_SCAN)) {
1065 scan_block(start, min(start + MAX_SCAN_SIZE, end),
1066 object, 0);
1067 start += MAX_SCAN_SIZE;
1068
1069 spin_unlock_irqrestore(&object->lock, flags);
1070 cond_resched();
1071 spin_lock_irqsave(&object->lock, flags);
1072 }
1073 } else
956 hlist_for_each_entry(area, elem, &object->area_list, node) 1074 hlist_for_each_entry(area, elem, &object->area_list, node)
957 scan_block((void *)(object->pointer + area->offset), 1075 scan_block((void *)(object->pointer + area->offset),
958 (void *)(object->pointer + area->offset 1076 (void *)(object->pointer + area->offset
@@ -970,7 +1088,6 @@ static void kmemleak_scan(void)
970{ 1088{
971 unsigned long flags; 1089 unsigned long flags;
972 struct kmemleak_object *object, *tmp; 1090 struct kmemleak_object *object, *tmp;
973 struct task_struct *task;
974 int i; 1091 int i;
975 int new_leaks = 0; 1092 int new_leaks = 0;
976 int gray_list_pass = 0; 1093 int gray_list_pass = 0;
@@ -1037,15 +1154,16 @@ static void kmemleak_scan(void)
1037 } 1154 }
1038 1155
1039 /* 1156 /*
1040 * Scanning the task stacks may introduce false negatives and it is 1157 * Scanning the task stacks (may introduce false negatives).
1041 * not enabled by default.
1042 */ 1158 */
1043 if (kmemleak_stack_scan) { 1159 if (kmemleak_stack_scan) {
1160 struct task_struct *p, *g;
1161
1044 read_lock(&tasklist_lock); 1162 read_lock(&tasklist_lock);
1045 for_each_process(task) 1163 do_each_thread(g, p) {
1046 scan_block(task_stack_page(task), 1164 scan_block(task_stack_page(p), task_stack_page(p) +
1047 task_stack_page(task) + THREAD_SIZE, 1165 THREAD_SIZE, NULL, 0);
1048 NULL, 0); 1166 } while_each_thread(g, p);
1049 read_unlock(&tasklist_lock); 1167 read_unlock(&tasklist_lock);
1050 } 1168 }
1051 1169
@@ -1170,7 +1288,7 @@ static int kmemleak_scan_thread(void *arg)
1170 * Start the automatic memory scanning thread. This function must be called 1288 * Start the automatic memory scanning thread. This function must be called
1171 * with the scan_mutex held. 1289 * with the scan_mutex held.
1172 */ 1290 */
1173void start_scan_thread(void) 1291static void start_scan_thread(void)
1174{ 1292{
1175 if (scan_thread) 1293 if (scan_thread)
1176 return; 1294 return;
@@ -1185,7 +1303,7 @@ void start_scan_thread(void)
1185 * Stop the automatic memory scanning thread. This function must be called 1303 * Stop the automatic memory scanning thread. This function must be called
1186 * with the scan_mutex held. 1304 * with the scan_mutex held.
1187 */ 1305 */
1188void stop_scan_thread(void) 1306static void stop_scan_thread(void)
1189{ 1307{
1190 if (scan_thread) { 1308 if (scan_thread) {
1191 kthread_stop(scan_thread); 1309 kthread_stop(scan_thread);
@@ -1294,6 +1412,49 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1294 return seq_release(inode, file); 1412 return seq_release(inode, file);
1295} 1413}
1296 1414
1415static int dump_str_object_info(const char *str)
1416{
1417 unsigned long flags;
1418 struct kmemleak_object *object;
1419 unsigned long addr;
1420
 1421 addr = simple_strtoul(str, NULL, 0);
1422 object = find_and_get_object(addr, 0);
1423 if (!object) {
1424 pr_info("Unknown object at 0x%08lx\n", addr);
1425 return -EINVAL;
1426 }
1427
1428 spin_lock_irqsave(&object->lock, flags);
1429 dump_object_info(object);
1430 spin_unlock_irqrestore(&object->lock, flags);
1431
1432 put_object(object);
1433 return 0;
1434}
1435
1436/*
1437 * We use grey instead of black to ensure we can do future scans on the same
1438 * objects. If we did not do future scans these black objects could
1439 * potentially contain references to newly allocated objects in the future and
1440 * we'd end up with false positives.
1441 */
1442static void kmemleak_clear(void)
1443{
1444 struct kmemleak_object *object;
1445 unsigned long flags;
1446
1447 rcu_read_lock();
1448 list_for_each_entry_rcu(object, &object_list, object_list) {
1449 spin_lock_irqsave(&object->lock, flags);
1450 if ((object->flags & OBJECT_REPORTED) &&
1451 unreferenced_object(object))
1452 __paint_it(object, KMEMLEAK_GREY);
1453 spin_unlock_irqrestore(&object->lock, flags);
1454 }
1455 rcu_read_unlock();
1456}
1457
1297/* 1458/*
1298 * File write operation to configure kmemleak at run-time. The following 1459 * File write operation to configure kmemleak at run-time. The following
1299 * commands can be written to the /sys/kernel/debug/kmemleak file: 1460 * commands can be written to the /sys/kernel/debug/kmemleak file:
@@ -1305,6 +1466,9 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1305 * scan=... - set the automatic memory scanning period in seconds (0 to 1466 * scan=... - set the automatic memory scanning period in seconds (0 to
1306 * disable it) 1467 * disable it)
1307 * scan - trigger a memory scan 1468 * scan - trigger a memory scan
1469 * clear - mark all current reported unreferenced kmemleak objects as
1470 * grey to ignore printing them
1471 * dump=... - dump information about the object found at the given address
1308 */ 1472 */
1309static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, 1473static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1310 size_t size, loff_t *ppos) 1474 size_t size, loff_t *ppos)
@@ -1345,6 +1509,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1345 } 1509 }
1346 } else if (strncmp(buf, "scan", 4) == 0) 1510 } else if (strncmp(buf, "scan", 4) == 0)
1347 kmemleak_scan(); 1511 kmemleak_scan();
1512 else if (strncmp(buf, "clear", 5) == 0)
1513 kmemleak_clear();
1514 else if (strncmp(buf, "dump=", 5) == 0)
1515 ret = dump_str_object_info(buf + 5);
1348 else 1516 else
1349 ret = -EINVAL; 1517 ret = -EINVAL;
1350 1518
@@ -1371,7 +1539,7 @@ static const struct file_operations kmemleak_fops = {
1371 * Perform the freeing of the kmemleak internal objects after waiting for any 1539 * Perform the freeing of the kmemleak internal objects after waiting for any
1372 * current memory scan to complete. 1540 * current memory scan to complete.
1373 */ 1541 */
1374static int kmemleak_cleanup_thread(void *arg) 1542static void kmemleak_do_cleanup(struct work_struct *work)
1375{ 1543{
1376 struct kmemleak_object *object; 1544 struct kmemleak_object *object;
1377 1545
@@ -1383,22 +1551,9 @@ static int kmemleak_cleanup_thread(void *arg)
1383 delete_object_full(object->pointer); 1551 delete_object_full(object->pointer);
1384 rcu_read_unlock(); 1552 rcu_read_unlock();
1385 mutex_unlock(&scan_mutex); 1553 mutex_unlock(&scan_mutex);
1386
1387 return 0;
1388} 1554}
1389 1555
1390/* 1556static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
1391 * Start the clean-up thread.
1392 */
1393static void kmemleak_cleanup(void)
1394{
1395 struct task_struct *cleanup_thread;
1396
1397 cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
1398 "kmemleak-clean");
1399 if (IS_ERR(cleanup_thread))
1400 pr_warning("Failed to create the clean-up thread\n");
1401}
1402 1557
1403/* 1558/*
1404 * Disable kmemleak. No memory allocation/freeing will be traced once this 1559 * Disable kmemleak. No memory allocation/freeing will be traced once this
@@ -1416,7 +1571,7 @@ static void kmemleak_disable(void)
1416 1571
1417 /* check whether it is too early for a kernel thread */ 1572 /* check whether it is too early for a kernel thread */
1418 if (atomic_read(&kmemleak_initialized)) 1573 if (atomic_read(&kmemleak_initialized))
1419 kmemleak_cleanup(); 1574 schedule_work(&cleanup_work);
1420 1575
1421 pr_info("Kernel memory leak detector disabled\n"); 1576 pr_info("Kernel memory leak detector disabled\n");
1422} 1577}
@@ -1469,8 +1624,7 @@ void __init kmemleak_init(void)
1469 1624
1470 switch (log->op_type) { 1625 switch (log->op_type) {
1471 case KMEMLEAK_ALLOC: 1626 case KMEMLEAK_ALLOC:
1472 kmemleak_alloc(log->ptr, log->size, log->min_count, 1627 early_alloc(log);
1473 GFP_KERNEL);
1474 break; 1628 break;
1475 case KMEMLEAK_FREE: 1629 case KMEMLEAK_FREE:
1476 kmemleak_free(log->ptr); 1630 kmemleak_free(log->ptr);
@@ -1513,7 +1667,7 @@ static int __init kmemleak_late_init(void)
1513 * after setting kmemleak_initialized and we may end up with 1667 * after setting kmemleak_initialized and we may end up with
1514 * two clean-up threads but serialized by scan_mutex. 1668 * two clean-up threads but serialized by scan_mutex.
1515 */ 1669 */
1516 kmemleak_cleanup(); 1670 schedule_work(&cleanup_work);
1517 return -ENOMEM; 1671 return -ENOMEM;
1518 } 1672 }
1519 1673
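
Among the kmemleak changes above, hex_dump_object() prints at most HEX_MAX_LINES rows of HEX_ROW_SIZE bytes for each unreferenced object, using the kernel's hex_dump_to_buffer() helper. The sketch below is a rough userspace stand-in for that bounded dump, with hand-rolled formatting and an assumed output layout rather than the kernel routine:

/* hexdump.c - bounded hex+ASCII dump, modelled on hex_dump_object() */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

#define HEX_ROW_SIZE	16	/* bytes per printed line */
#define HEX_MAX_LINES	2	/* cap the output per object */

static void hex_dump_first_bytes(const void *buf, size_t size)
{
	const unsigned char *p = buf;
	size_t len = size < HEX_MAX_LINES * HEX_ROW_SIZE ?
		     size : HEX_MAX_LINES * HEX_ROW_SIZE;

	printf("  hex dump (first %zu bytes):\n", len);
	for (size_t i = 0; i < len; i += HEX_ROW_SIZE) {
		size_t linelen = len - i < HEX_ROW_SIZE ? len - i : HEX_ROW_SIZE;

		printf("    ");
		for (size_t j = 0; j < HEX_ROW_SIZE; j++) {
			if (j < linelen)
				printf("%02x ", p[i + j]);
			else
				printf("   ");	/* pad so the ASCII column lines up */
		}
		printf(" ");
		for (size_t j = 0; j < linelen; j++)
			putchar(isprint(p[i + j]) ? p[i + j] : '.');
		putchar('\n');
	}
}

int main(void)
{
	char obj[40];

	memset(obj, 0xa5, sizeof(obj));
	strcpy(obj, "leaked?");			/* some printable bytes up front */
	hex_dump_first_bytes(obj, sizeof(obj));	/* prints at most 2 rows */
	return 0;
}
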
diff --git a/mm/nommu.c b/mm/nommu.c
index 4bde489ec431..66e81e7e9fe9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1352,6 +1352,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 	}
 
 	vma->vm_region = region;
+	add_nommu_region(region);
 
 	/* set up the mapping */
 	if (file && vma->vm_flags & VM_SHARED)
@@ -1361,8 +1362,6 @@
 	if (ret < 0)
 		goto error_put_region;
 
-	add_nommu_region(region);
-
 	/* okay... we have a mapping; now we have to register it */
 	result = vma->vm_start;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..25e7770309b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37 37
38/* 38/*
39 * The maximum number of pages to writeout in a single bdflush/kupdate
40 * operation. We do this so we don't hold I_SYNC against an inode for
41 * enormous amounts of time, which would block a userspace task which has
42 * been forced to throttle against that inode. Also, the code reevaluates
43 * the dirty each time it has written this many pages.
44 */
45#define MAX_WRITEBACK_PAGES 1024
46
47/*
48 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
49 * will look to see if it needs to force writeback or throttling. 40 * will look to see if it needs to force writeback or throttling.
50 */ 41 */
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
117/* End of sysctl-exported parameters */ 108/* End of sysctl-exported parameters */
118 109
119 110
120static void background_writeout(unsigned long _min_pages);
121
122/* 111/*
123 * Scale the writeback cache size proportional to the relative writeout speeds. 112 * Scale the writeback cache size proportional to the relative writeout speeds.
124 * 113 *
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
320/* 309/*
321 * 310 *
322 */ 311 */
323static DEFINE_SPINLOCK(bdi_lock);
324static unsigned int bdi_min_ratio; 312static unsigned int bdi_min_ratio;
325 313
326int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 314int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
327{ 315{
328 int ret = 0; 316 int ret = 0;
329 unsigned long flags;
330 317
331 spin_lock_irqsave(&bdi_lock, flags); 318 spin_lock(&bdi_lock);
332 if (min_ratio > bdi->max_ratio) { 319 if (min_ratio > bdi->max_ratio) {
333 ret = -EINVAL; 320 ret = -EINVAL;
334 } else { 321 } else {
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
340 ret = -EINVAL; 327 ret = -EINVAL;
341 } 328 }
342 } 329 }
343 spin_unlock_irqrestore(&bdi_lock, flags); 330 spin_unlock(&bdi_lock);
344 331
345 return ret; 332 return ret;
346} 333}
347 334
348int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 335int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
349{ 336{
350 unsigned long flags;
351 int ret = 0; 337 int ret = 0;
352 338
353 if (max_ratio > 100) 339 if (max_ratio > 100)
354 return -EINVAL; 340 return -EINVAL;
355 341
356 spin_lock_irqsave(&bdi_lock, flags); 342 spin_lock(&bdi_lock);
357 if (bdi->min_ratio > max_ratio) { 343 if (bdi->min_ratio > max_ratio) {
358 ret = -EINVAL; 344 ret = -EINVAL;
359 } else { 345 } else {
360 bdi->max_ratio = max_ratio; 346 bdi->max_ratio = max_ratio;
361 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 347 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
362 } 348 }
363 spin_unlock_irqrestore(&bdi_lock, flags); 349 spin_unlock(&bdi_lock);
364 350
365 return ret; 351 return ret;
366} 352}
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
546 * up. 532 * up.
547 */ 533 */
548 if (bdi_nr_reclaimable > bdi_thresh) { 534 if (bdi_nr_reclaimable > bdi_thresh) {
549 writeback_inodes(&wbc); 535 writeback_inodes_wbc(&wbc);
550 pages_written += write_chunk - wbc.nr_to_write; 536 pages_written += write_chunk - wbc.nr_to_write;
551 get_dirty_limits(&background_thresh, &dirty_thresh, 537 get_dirty_limits(&background_thresh, &dirty_thresh,
552 &bdi_thresh, bdi); 538 &bdi_thresh, bdi);
@@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
575 if (pages_written >= write_chunk) 561 if (pages_written >= write_chunk)
576 break; /* We've done our duty */ 562 break; /* We've done our duty */
577 563
578 congestion_wait(BLK_RW_ASYNC, HZ/10); 564 schedule_timeout(1);
579 } 565 }
580 566
581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 567 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
594 * background_thresh, to keep the amount of dirty memory low. 580 * background_thresh, to keep the amount of dirty memory low.
595 */ 581 */
596 if ((laptop_mode && pages_written) || 582 if ((laptop_mode && pages_written) ||
597 (!laptop_mode && (global_page_state(NR_FILE_DIRTY) 583 (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS) 584 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 585 > background_thresh))) {
600 pdflush_operation(background_writeout, 0); 586 struct writeback_control wbc = {
587 .bdi = bdi,
588 .sync_mode = WB_SYNC_NONE,
589 .nr_to_write = nr_writeback,
590 };
591
592
593 bdi_start_writeback(&wbc);
594 }
601} 595}
602 596
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 597void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -681,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
681 } 675 }
682} 676}
683 677
684/*
685 * writeback at least _min_pages, and keep writing until the amount of dirty
686 * memory is less than the background threshold, or until we're all clean.
687 */
688static void background_writeout(unsigned long _min_pages)
689{
690 long min_pages = _min_pages;
691 struct writeback_control wbc = {
692 .bdi = NULL,
693 .sync_mode = WB_SYNC_NONE,
694 .older_than_this = NULL,
695 .nr_to_write = 0,
696 .nonblocking = 1,
697 .range_cyclic = 1,
698 };
699
700 for ( ; ; ) {
701 unsigned long background_thresh;
702 unsigned long dirty_thresh;
703
704 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
705 if (global_page_state(NR_FILE_DIRTY) +
706 global_page_state(NR_UNSTABLE_NFS) < background_thresh
707 && min_pages <= 0)
708 break;
709 wbc.more_io = 0;
710 wbc.encountered_congestion = 0;
711 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
712 wbc.pages_skipped = 0;
713 writeback_inodes(&wbc);
714 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
715 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
716 /* Wrote less than expected */
717 if (wbc.encountered_congestion || wbc.more_io)
718 congestion_wait(BLK_RW_ASYNC, HZ/10);
719 else
720 break;
721 }
722 }
723}
724
725/*
726 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
727 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
728 * -1 if all pdflush threads were busy.
729 */
730int wakeup_pdflush(long nr_pages)
731{
732 if (nr_pages == 0)
733 nr_pages = global_page_state(NR_FILE_DIRTY) +
734 global_page_state(NR_UNSTABLE_NFS);
735 return pdflush_operation(background_writeout, nr_pages);
736}
737
738static void wb_timer_fn(unsigned long unused);
739static void laptop_timer_fn(unsigned long unused); 678static void laptop_timer_fn(unsigned long unused);
740 679
741static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
742static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); 680static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
743 681
744/* 682/*
745 * Periodic writeback of "old" data.
746 *
747 * Define "old": the first time one of an inode's pages is dirtied, we mark the
748 * dirtying-time in the inode's address_space. So this periodic writeback code
749 * just walks the superblock inode list, writing back any inodes which are
750 * older than a specific point in time.
751 *
752 * Try to run once per dirty_writeback_interval. But if a writeback event
753 * takes longer than a dirty_writeback_interval interval, then leave a
754 * one-second gap.
755 *
756 * older_than_this takes precedence over nr_to_write. So we'll only write back
757 * all dirty pages if they are all attached to "old" mappings.
758 */
759static void wb_kupdate(unsigned long arg)
760{
761 unsigned long oldest_jif;
762 unsigned long start_jif;
763 unsigned long next_jif;
764 long nr_to_write;
765 struct writeback_control wbc = {
766 .bdi = NULL,
767 .sync_mode = WB_SYNC_NONE,
768 .older_than_this = &oldest_jif,
769 .nr_to_write = 0,
770 .nonblocking = 1,
771 .for_kupdate = 1,
772 .range_cyclic = 1,
773 };
774
775 sync_supers();
776
777 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
778 start_jif = jiffies;
779 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
780 nr_to_write = global_page_state(NR_FILE_DIRTY) +
781 global_page_state(NR_UNSTABLE_NFS) +
782 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
783 while (nr_to_write > 0) {
784 wbc.more_io = 0;
785 wbc.encountered_congestion = 0;
786 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
787 writeback_inodes(&wbc);
788 if (wbc.nr_to_write > 0) {
789 if (wbc.encountered_congestion || wbc.more_io)
790 congestion_wait(BLK_RW_ASYNC, HZ/10);
791 else
792 break; /* All the old data is written */
793 }
794 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
795 }
796 if (time_before(next_jif, jiffies + HZ))
797 next_jif = jiffies + HZ;
798 if (dirty_writeback_interval)
799 mod_timer(&wb_timer, next_jif);
800}
801
802/*
803 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 683 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
804 */ 684 */
805int dirty_writeback_centisecs_handler(ctl_table *table, int write, 685int dirty_writeback_centisecs_handler(ctl_table *table, int write,
806 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 686 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
807{ 687{
808 proc_dointvec(table, write, file, buffer, length, ppos); 688 proc_dointvec(table, write, file, buffer, length, ppos);
809 if (dirty_writeback_interval)
810 mod_timer(&wb_timer, jiffies +
811 msecs_to_jiffies(dirty_writeback_interval * 10));
812 else
813 del_timer(&wb_timer);
814 return 0; 689 return 0;
815} 690}
816 691
817static void wb_timer_fn(unsigned long unused) 692static void do_laptop_sync(struct work_struct *work)
818{
819 if (pdflush_operation(wb_kupdate, 0) < 0)
820 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
821}
822
823static void laptop_flush(unsigned long unused)
824{ 693{
825 sys_sync(); 694 wakeup_flusher_threads(0);
695 kfree(work);
826} 696}
827 697
828static void laptop_timer_fn(unsigned long unused) 698static void laptop_timer_fn(unsigned long unused)
829{ 699{
830 pdflush_operation(laptop_flush, 0); 700 struct work_struct *work;
701
702 work = kmalloc(sizeof(*work), GFP_ATOMIC);
703 if (work) {
704 INIT_WORK(work, do_laptop_sync);
705 schedule_work(work);
706 }
831} 707}
832 708
833/* 709/*
@@ -910,8 +786,6 @@ void __init page_writeback_init(void)
910{ 786{
911 int shift; 787 int shift;
912 788
913 mod_timer(&wb_timer,
914 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
915 writeback_set_ratelimit(); 789 writeback_set_ratelimit();
916 register_cpu_notifier(&ratelimit_nb); 790 register_cpu_notifier(&ratelimit_nb);
917 791
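
The page-writeback.c hunks above retire the pdflush plumbing: balance_dirty_pages() now queues work for the dirty inode's own backing device instead of calling pdflush_operation(background_writeout, 0); background_writeout(), wb_kupdate() and the wb_timer go away (periodic "kupdate"-style writeback moves into the per-bdi flusher threads); and laptop mode defers its sync through a one-shot work item rather than a pdflush callback. A minimal sketch of the new calling convention, using only the names visible in the hunks above -- the helper itself is hypothetical and assumes the <linux/writeback.h> and <linux/backing-dev.h> declarations from this merge:

#include <linux/writeback.h>
#include <linux/backing-dev.h>

/* Hypothetical caller: ask the flusher thread that owns @bdi to write
 * back up to @nr_pages pages without waiting for the I/O to finish. */
static void kick_bdi_writeback(struct backing_dev_info *bdi, long nr_pages)
{
	struct writeback_control wbc = {
		.bdi		= bdi,		/* target one backing device */
		.sync_mode	= WB_SYNC_NONE,	/* asynchronous, non-integrity writeback */
		.nr_to_write	= nr_pages,	/* budget, as balance_dirty_pages() passes nr_writeback */
	};

	bdi_start_writeback(&wbc);		/* hand the request to the bdi's flusher task */
}
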
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5cc986eb9f6f..a0de15f46987 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -817,13 +817,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
817 * agressive about taking ownership of free pages 817 * agressive about taking ownership of free pages
818 */ 818 */
819 if (unlikely(current_order >= (pageblock_order >> 1)) || 819 if (unlikely(current_order >= (pageblock_order >> 1)) ||
820 start_migratetype == MIGRATE_RECLAIMABLE) { 820 start_migratetype == MIGRATE_RECLAIMABLE ||
821 page_group_by_mobility_disabled) {
821 unsigned long pages; 822 unsigned long pages;
822 pages = move_freepages_block(zone, page, 823 pages = move_freepages_block(zone, page,
823 start_migratetype); 824 start_migratetype);
824 825
825 /* Claim the whole block if over half of it is free */ 826 /* Claim the whole block if over half of it is free */
826 if (pages >= (1 << (pageblock_order-1))) 827 if (pages >= (1 << (pageblock_order-1)) ||
828 page_group_by_mobility_disabled)
827 set_pageblock_migratetype(page, 829 set_pageblock_migratetype(page,
828 start_migratetype); 830 start_migratetype);
829 831
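
In __rmqueue_fallback() above, the fallback path now also claims the whole pageblock when page_group_by_mobility_disabled is set (the flag used when the system has too little memory for mobility grouping to pay off), both when deciding to move the block's free pages and when flipping the block's migratetype. A hypothetical helper, only restating the condition from the hunk for readability:

#include <linux/mmzone.h>

/* Not in the kernel: the steal-the-whole-pageblock policy from the hunk above. */
static bool should_claim_whole_block(unsigned int current_order, int start_migratetype)
{
	return current_order >= (pageblock_order >> 1) ||	/* a large chunk is being stolen anyway */
	       start_migratetype == MIGRATE_RECLAIMABLE ||	/* keep reclaimable allocations together */
	       page_group_by_mobility_disabled;			/* grouping is off, no point being precise */
}
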
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * mm/pdflush.c - worker threads for writing back filesystem data
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * 09Apr2002 Andrew Morton
7 * Initial version
8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread.
11 */
12
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/signal.h>
16#include <linux/spinlock.h>
17#include <linux/gfp.h>
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/fs.h> /* Needed by writeback.h */
21#include <linux/writeback.h> /* Prototypes pdflush_operation() */
22#include <linux/kthread.h>
23#include <linux/cpuset.h>
24#include <linux/freezer.h>
25
26
27/*
28 * Minimum and maximum number of pdflush instances
29 */
30#define MIN_PDFLUSH_THREADS 2
31#define MAX_PDFLUSH_THREADS 8
32
33static void start_one_pdflush_thread(void);
34
35
36/*
37 * The pdflush threads are worker threads for writing back dirty data.
38 * Ideally, we'd like one thread per active disk spindle. But the disk
39 * topology is very hard to divine at this level. Instead, we take
40 * care in various places to prevent more than one pdflush thread from
41 * performing writeback against a single filesystem. pdflush threads
42 * have the PF_FLUSHER flag set in current->flags to aid in this.
43 */
44
45/*
46 * All the pdflush threads. Protected by pdflush_lock
47 */
48static LIST_HEAD(pdflush_list);
49static DEFINE_SPINLOCK(pdflush_lock);
50
51/*
52 * The count of currently-running pdflush threads. Protected
53 * by pdflush_lock.
54 *
55 * Readable by sysctl, but not writable. Published to userspace at
56 * /proc/sys/vm/nr_pdflush_threads.
57 */
58int nr_pdflush_threads = 0;
59
60/*
61 * The time at which the pdflush thread pool last went empty
62 */
63static unsigned long last_empty_jifs;
64
65/*
66 * The pdflush thread.
67 *
68 * Thread pool management algorithm:
69 *
70 * - The minimum and maximum number of pdflush instances are bound
71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
72 *
73 * - If there have been no idle pdflush instances for 1 second, create
74 * a new one.
75 *
76 * - If the least-recently-went-to-sleep pdflush thread has been asleep
77 * for more than one second, terminate a thread.
78 */
79
80/*
81 * A structure for passing work to a pdflush thread. Also for passing
82 * state information between pdflush threads. Protected by pdflush_lock.
83 */
84struct pdflush_work {
85 struct task_struct *who; /* The thread */
86 void (*fn)(unsigned long); /* A callback function */
87 unsigned long arg0; /* An argument to the callback */
88 struct list_head list; /* On pdflush_list, when idle */
89 unsigned long when_i_went_to_sleep;
90};
91
92static int __pdflush(struct pdflush_work *my_work)
93{
94 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
95 set_freezable();
96 my_work->fn = NULL;
97 my_work->who = current;
98 INIT_LIST_HEAD(&my_work->list);
99
100 spin_lock_irq(&pdflush_lock);
101 for ( ; ; ) {
102 struct pdflush_work *pdf;
103
104 set_current_state(TASK_INTERRUPTIBLE);
105 list_move(&my_work->list, &pdflush_list);
106 my_work->when_i_went_to_sleep = jiffies;
107 spin_unlock_irq(&pdflush_lock);
108 schedule();
109 try_to_freeze();
110 spin_lock_irq(&pdflush_lock);
111 if (!list_empty(&my_work->list)) {
112 /*
113 * Someone woke us up, but without removing our control
114 * structure from the global list. swsusp will do this
115 * in try_to_freeze()->refrigerator(). Handle it.
116 */
117 my_work->fn = NULL;
118 continue;
119 }
120 if (my_work->fn == NULL) {
121 printk("pdflush: bogus wakeup\n");
122 continue;
123 }
124 spin_unlock_irq(&pdflush_lock);
125
126 (*my_work->fn)(my_work->arg0);
127
128 spin_lock_irq(&pdflush_lock);
129
130 /*
131 * Thread creation: For how long have there been zero
132 * available threads?
133 *
134 * To throttle creation, we reset last_empty_jifs.
135 */
136 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
137 if (list_empty(&pdflush_list)) {
138 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
139 last_empty_jifs = jiffies;
140 nr_pdflush_threads++;
141 spin_unlock_irq(&pdflush_lock);
142 start_one_pdflush_thread();
143 spin_lock_irq(&pdflush_lock);
144 }
145 }
146 }
147
148 my_work->fn = NULL;
149
150 /*
151 * Thread destruction: For how long has the sleepiest
152 * thread slept?
153 */
154 if (list_empty(&pdflush_list))
155 continue;
156 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
157 continue;
158 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
159 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
160 /* Limit exit rate */
161 pdf->when_i_went_to_sleep = jiffies;
162 break; /* exeunt */
163 }
164 }
165 nr_pdflush_threads--;
166 spin_unlock_irq(&pdflush_lock);
167 return 0;
168}
169
170/*
171 * Of course, my_work wants to be just a local in __pdflush(). It is
172 * separated out in this manner to hopefully prevent the compiler from
173 * performing unfortunate optimisations against the auto variables. Because
174 * these are visible to other tasks and CPUs. (No problem has actually
175 * been observed. This is just paranoia).
176 */
177static int pdflush(void *dummy)
178{
179 struct pdflush_work my_work;
180 cpumask_var_t cpus_allowed;
181
182 /*
183 * Since the caller doesn't even check kthread_run() worked, let's not
184 * freak out too much if this fails.
185 */
186 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
187 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
188 return 0;
189 }
190
191 /*
192 * pdflush can spend a lot of time doing encryption via dm-crypt. We
193 * don't want to do that at keventd's priority.
194 */
195 set_user_nice(current, 0);
196
197 /*
198 * Some configs put our parent kthread in a limited cpuset,
199 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
200 * Our needs are more modest - cut back to our cpusets cpus_allowed.
201 * This is needed as pdflush's are dynamically created and destroyed.
202 * The boottime pdflush's are easily placed w/o these 2 lines.
203 */
204 cpuset_cpus_allowed(current, cpus_allowed);
205 set_cpus_allowed_ptr(current, cpus_allowed);
206 free_cpumask_var(cpus_allowed);
207
208 return __pdflush(&my_work);
209}
210
211/*
212 * Attempt to wake up a pdflush thread, and get it to do some work for you.
213 * Returns zero if it indeed managed to find a worker thread, and passed your
214 * payload to it.
215 */
216int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
217{
218 unsigned long flags;
219 int ret = 0;
220
221 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
222
223 spin_lock_irqsave(&pdflush_lock, flags);
224 if (list_empty(&pdflush_list)) {
225 ret = -1;
226 } else {
227 struct pdflush_work *pdf;
228
229 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
230 list_del_init(&pdf->list);
231 if (list_empty(&pdflush_list))
232 last_empty_jifs = jiffies;
233 pdf->fn = fn;
234 pdf->arg0 = arg0;
235 wake_up_process(pdf->who);
236 }
237 spin_unlock_irqrestore(&pdflush_lock, flags);
238
239 return ret;
240}
241
242static void start_one_pdflush_thread(void)
243{
244 struct task_struct *k;
245
246 k = kthread_run(pdflush, NULL, "pdflush");
247 if (unlikely(IS_ERR(k))) {
248 spin_lock_irq(&pdflush_lock);
249 nr_pdflush_threads--;
250 spin_unlock_irq(&pdflush_lock);
251 }
252}
253
254static int __init pdflush_init(void)
255{
256 int i;
257
258 /*
259 * Pre-set nr_pdflush_threads... If we fail to create,
260 * the count will be decremented.
261 */
262 nr_pdflush_threads = MIN_PDFLUSH_THREADS;
263
264 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
265 start_one_pdflush_thread();
266 return 0;
267}
268
269module_init(pdflush_init);
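
The deleted mm/pdflush.c kept a shared pool of two to eight anonymous writeback threads and handed them work through pdflush_operation(); with this merge, writeback is driven by per-bdi flusher threads instead, so the pool and its dispatch interface go away entirely. For reference, the pool-sizing policy the deleted __pdflush() loop implemented, restated as two hypothetical predicates (identifiers are the ones from the deleted file; this is a readability aid, not new kernel code):

/* Grow: the idle list is empty, it last went empty more than a second
 * ago, and we are still under the cap. */
static bool pool_should_grow(void)
{
	return time_after(jiffies, last_empty_jifs + 1 * HZ) &&
	       list_empty(&pdflush_list) &&
	       nr_pdflush_threads < MAX_PDFLUSH_THREADS;
}

/* Shrink: more than the minimum number of threads exist, and the
 * sleepiest one has been idle for over a second. */
static bool pool_should_shrink(const struct pdflush_work *sleepiest)
{
	return nr_pdflush_threads > MIN_PDFLUSH_THREADS &&
	       time_after(jiffies, sleepiest->when_i_went_to_sleep + 1 * HZ);
}
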
diff --git a/mm/percpu.c b/mm/percpu.c
index 5fe37842e0ea..3311c8919f37 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -197,7 +197,12 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
198 int page_idx) 198 int page_idx)
199{ 199{
200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 200 /*
201 * Any possible cpu id can be used here, so there's no need to
202 * worry about preemption or cpu hotplug.
203 */
204 return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(),
205 page_idx) != NULL;
201} 206}
202 207
203/* set the pointer to a chunk in a page struct */ 208/* set the pointer to a chunk in a page struct */
@@ -297,6 +302,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
297 return pcpu_first_chunk; 302 return pcpu_first_chunk;
298 } 303 }
299 304
305 /*
306 * The address is relative to unit0 which might be unused and
307 * thus unmapped. Offset the address to the unit space of the
308 * current processor before looking it up in the vmalloc
309 * space. Note that any possible cpu id can be used here, so
310 * there's no need to worry about preemption or cpu hotplug.
311 */
312 addr += raw_smp_processor_id() * pcpu_unit_size;
300 return pcpu_get_page_chunk(vmalloc_to_page(addr)); 313 return pcpu_get_page_chunk(vmalloc_to_page(addr));
301} 314}
302 315
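
Both percpu.c hunks rely on the observation spelled out in the new comments: a chunk's vmalloc address is kept relative to unit 0, which may be unused and unmapped, so the lookup first slides the address into the unit of whichever CPU the code happens to be running on; since every unit maps the same chunk, any CPU id works and raw_smp_processor_id() is safe without disabling preemption. As a sketch (only meaningful inside mm/percpu.c, where the file-static pcpu_unit_size is visible; the helper name is hypothetical):

/* Translate a unit-0-relative chunk address into the current CPU's
 * unit before feeding it to vmalloc_to_page(). */
static void *pcpu_addr_in_this_cpus_unit(void *addr)
{
	return addr + raw_smp_processor_id() * pcpu_unit_size;
}
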
diff --git a/mm/shmem.c b/mm/shmem.c
index d713239ce2ce..5a0b3d4055f3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2446,7 +2446,7 @@ static const struct inode_operations shmem_inode_operations = {
2446 .getxattr = generic_getxattr, 2446 .getxattr = generic_getxattr,
2447 .listxattr = generic_listxattr, 2447 .listxattr = generic_listxattr,
2448 .removexattr = generic_removexattr, 2448 .removexattr = generic_removexattr,
2449 .permission = shmem_permission, 2449 .check_acl = shmem_check_acl,
2450#endif 2450#endif
2451 2451
2452}; 2452};
@@ -2469,7 +2469,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2469 .getxattr = generic_getxattr, 2469 .getxattr = generic_getxattr,
2470 .listxattr = generic_listxattr, 2470 .listxattr = generic_listxattr,
2471 .removexattr = generic_removexattr, 2471 .removexattr = generic_removexattr,
2472 .permission = shmem_permission, 2472 .check_acl = shmem_check_acl,
2473#endif 2473#endif
2474}; 2474};
2475 2475
@@ -2480,7 +2480,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2480 .getxattr = generic_getxattr, 2480 .getxattr = generic_getxattr,
2481 .listxattr = generic_listxattr, 2481 .listxattr = generic_listxattr,
2482 .removexattr = generic_removexattr, 2482 .removexattr = generic_removexattr,
2483 .permission = shmem_permission, 2483 .check_acl = shmem_check_acl,
2484#endif 2484#endif
2485}; 2485};
2486 2486
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index 606a8e757a42..df2c87fdae50 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir)
157/** 157/**
158 * shmem_check_acl - check_acl() callback for generic_permission() 158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */ 159 */
160static int 160int
161shmem_check_acl(struct inode *inode, int mask) 161shmem_check_acl(struct inode *inode, int mask)
162{ 162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); 163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
@@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask)
169 } 169 }
170 return -EAGAIN; 170 return -EAGAIN;
171} 171}
172
173/**
174 * shmem_permission - permission() inode operation
175 */
176int
177shmem_permission(struct inode *inode, int mask)
178{
179 return generic_permission(inode, mask, shmem_check_acl);
180}
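
The shmem and shmem_acl hunks drop the shmem_permission() wrapper: the inode operations now publish shmem_check_acl() through .check_acl, and the generic VFS permission code invokes that callback itself, so a per-filesystem wrapper around generic_permission() becomes redundant. The callback shape, sketched for a hypothetical filesystem (get_acl_for() stands in for whatever per-fs lookup applies, such as shmem_get_acl() above; returning -EAGAIN falls back to the ordinary mode-bit check):

#include <linux/fs.h>
#include <linux/posix_acl.h>

static int example_check_acl(struct inode *inode, int mask)
{
	struct posix_acl *acl = get_acl_for(inode, ACL_TYPE_ACCESS);

	if (acl) {
		int error = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
	return -EAGAIN;		/* no ACL: let the mode bits decide */
}
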
diff --git a/mm/slub.c b/mm/slub.c
index b9f1491a58a1..b6276753626e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2594,8 +2594,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2594 */ 2594 */
2595void kmem_cache_destroy(struct kmem_cache *s) 2595void kmem_cache_destroy(struct kmem_cache *s)
2596{ 2596{
2597 if (s->flags & SLAB_DESTROY_BY_RCU)
2598 rcu_barrier();
2599 down_write(&slub_lock); 2597 down_write(&slub_lock);
2600 s->refcount--; 2598 s->refcount--;
2601 if (!s->refcount) { 2599 if (!s->refcount) {
@@ -2606,6 +2604,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2606 "still has objects.\n", s->name, __func__); 2604 "still has objects.\n", s->name, __func__);
2607 dump_stack(); 2605 dump_stack();
2608 } 2606 }
2607 if (s->flags & SLAB_DESTROY_BY_RCU)
2608 rcu_barrier();
2609 sysfs_slab_remove(s); 2609 sysfs_slab_remove(s);
2610 } else 2610 } else
2611 up_write(&slub_lock); 2611 up_write(&slub_lock);
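
The slub.c change moves rcu_barrier() so it runs only when the reference count really drops to zero, after kmem_cache_close() and just before the cache's sysfs entry is removed: for SLAB_DESTROY_BY_RCU caches the slab pages are handed back through RCU callbacks, so the destroy path must wait for those deferred frees before the cache itself is torn down, and there is no reason to pay for the barrier when a shared cache merely loses one reference. A hypothetical usage sketch of a cache relying on that guarantee:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

struct example_obj {
	int id;
};

static struct kmem_cache *example_cache;

static int __init example_init(void)
{
	/* slab pages are only returned to the page allocator after an RCU grace period */
	example_cache = kmem_cache_create("example_obj", sizeof(struct example_obj),
					  0, SLAB_DESTROY_BY_RCU, NULL);
	return example_cache ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	/* safe: kmem_cache_destroy() above now waits for pending RCU frees */
	kmem_cache_destroy(example_cache);
}

module_init(example_init);
module_exit(example_exit);
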
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..5ae6b8b78c80 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
34}; 34};
35 35
36static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
37 .name = "swap",
37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 38 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
38 .unplug_io_fn = swap_unplug_io_fn, 39 .unplug_io_fn = swap_unplug_io_fn,
39}; 40};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94e86dd6954c..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1720 */ 1720 */
1721 if (total_scanned > sc->swap_cluster_max + 1721 if (total_scanned > sc->swap_cluster_max +
1722 sc->swap_cluster_max / 2) { 1722 sc->swap_cluster_max / 2) {
1723 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1723 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1724 sc->may_writepage = 1; 1724 sc->may_writepage = 1;
1725 } 1725 }
1726 1726
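
Finally, direct reclaim's nudge to writeback changes in name only at the call site: wakeup_pdflush() disappears with the pool, and wakeup_flusher_threads() keeps the "0 means write back everything" convention while fanning the request out to the per-bdi flusher threads. A hypothetical wrapper showing the call as used in the vmscan.c hunk above:

#include <linux/writeback.h>

/* Kick background writeback from reclaim: in laptop mode ask for
 * everything at once so the work is batched, otherwise bound the
 * request by the number of pages scanned. */
static void reclaim_kick_writeback(unsigned long total_scanned, int laptop)
{
	wakeup_flusher_threads(laptop ? 0 : total_scanned);
}
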