aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig8
-rw-r--r--mm/Makefile2
-rw-r--r--mm/backing-dev.c381
-rw-r--r--mm/bootmem.c6
-rw-r--r--mm/filemap.c170
-rw-r--r--mm/hugetlb.c1
-rw-r--r--mm/kmemleak.c336
-rw-r--r--mm/mmap.c3
-rw-r--r--mm/nommu.c10
-rw-r--r--mm/oom_kill.c64
-rw-r--r--mm/page-writeback.c182
-rw-r--r--mm/page_alloc.c10
-rw-r--r--mm/pdflush.c269
-rw-r--r--mm/rmap.c1
-rw-r--r--mm/shmem.c6
-rw-r--r--mm/shmem_acl.c11
-rw-r--r--mm/slob.c5
-rw-r--r--mm/slub.c86
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/swapfile.c6
-rw-r--r--mm/vmscan.c11
21 files changed, 854 insertions, 715 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c948d4ca8bde..3aa519f52e18 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MEMORY_HOTREMOVE
153# 153#
154config PAGEFLAGS_EXTENDED 154config PAGEFLAGS_EXTENDED
155 def_bool y 155 def_bool y
156 depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM 156 depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
157 157
158# Heavily threaded applications may benefit from splitting the mm-wide 158# Heavily threaded applications may benefit from splitting the mm-wide
159# page_table_lock, so that faults on different parts of the user address 159# page_table_lock, so that faults on different parts of the user address
@@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR
225 For most ia64, ppc64 and x86 users with lots of address space 225 For most ia64, ppc64 and x86 users with lots of address space
226 a value of 65536 is reasonable and should cause no problems. 226 a value of 65536 is reasonable and should cause no problems.
227 On arm and other archs it should not be higher than 32768. 227 On arm and other archs it should not be higher than 32768.
228 Programs which use vm86 functionality would either need additional 228 Programs which use vm86 functionality or have some need to map
229 permissions from either the LSM or the capabilities module or have 229 this low address space will need CAP_SYS_RAWIO or disable this
230 this protection disabled. 230 protection by setting the value to 0.
231 231
232 This value can be changed after boot using the 232 This value can be changed after boot using the
233 /proc/sys/vm/mmap_min_addr tunable. 233 /proc/sys/vm/mmap_min_addr tunable.
diff --git a/mm/Makefile b/mm/Makefile
index c77c6487552f..ea4b18bd3960 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
8 vmalloc.o 8 vmalloc.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..d3ca0dac1111 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
1 1
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/kthread.h>
5#include <linux/freezer.h>
4#include <linux/fs.h> 6#include <linux/fs.h>
5#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/mm.h>
6#include <linux/sched.h> 9#include <linux/sched.h>
7#include <linux/module.h> 10#include <linux/module.h>
8#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
14EXPORT_SYMBOL(default_unplug_io_fn); 17EXPORT_SYMBOL(default_unplug_io_fn);
15 18
16struct backing_dev_info default_backing_dev_info = { 19struct backing_dev_info default_backing_dev_info = {
20 .name = "default",
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 21 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0, 22 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY, 23 .capabilities = BDI_CAP_MAP_COPY,
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = {
22EXPORT_SYMBOL_GPL(default_backing_dev_info); 26EXPORT_SYMBOL_GPL(default_backing_dev_info);
23 27
24static struct class *bdi_class; 28static struct class *bdi_class;
29DEFINE_SPINLOCK(bdi_lock);
30LIST_HEAD(bdi_list);
31LIST_HEAD(bdi_pending_list);
32
33static struct task_struct *sync_supers_tsk;
34static struct timer_list sync_supers_timer;
35
36static int bdi_sync_supers(void *);
37static void sync_supers_timer_fn(unsigned long);
38static void arm_supers_timer(void);
39
40static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
25 41
26#ifdef CONFIG_DEBUG_FS 42#ifdef CONFIG_DEBUG_FS
27#include <linux/debugfs.h> 43#include <linux/debugfs.h>
@@ -37,9 +53,29 @@ static void bdi_debug_init(void)
37static int bdi_debug_stats_show(struct seq_file *m, void *v) 53static int bdi_debug_stats_show(struct seq_file *m, void *v)
38{ 54{
39 struct backing_dev_info *bdi = m->private; 55 struct backing_dev_info *bdi = m->private;
56 struct bdi_writeback *wb;
40 unsigned long background_thresh; 57 unsigned long background_thresh;
41 unsigned long dirty_thresh; 58 unsigned long dirty_thresh;
42 unsigned long bdi_thresh; 59 unsigned long bdi_thresh;
60 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
61 struct inode *inode;
62
63 /*
64 * inode lock is enough here, the bdi->wb_list is protected by
65 * RCU on the reader side
66 */
67 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
68 spin_lock(&inode_lock);
69 list_for_each_entry(wb, &bdi->wb_list, list) {
70 nr_wb++;
71 list_for_each_entry(inode, &wb->b_dirty, i_list)
72 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_list)
74 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_list)
76 nr_more_io++;
77 }
78 spin_unlock(&inode_lock);
43 79
44 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 80 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
45 81
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
49 "BdiReclaimable: %8lu kB\n" 85 "BdiReclaimable: %8lu kB\n"
50 "BdiDirtyThresh: %8lu kB\n" 86 "BdiDirtyThresh: %8lu kB\n"
51 "DirtyThresh: %8lu kB\n" 87 "DirtyThresh: %8lu kB\n"
52 "BackgroundThresh: %8lu kB\n", 88 "BackgroundThresh: %8lu kB\n"
89 "WriteBack threads:%8lu\n"
90 "b_dirty: %8lu\n"
91 "b_io: %8lu\n"
92 "b_more_io: %8lu\n"
93 "bdi_list: %8u\n"
94 "state: %8lx\n"
95 "wb_mask: %8lx\n"
96 "wb_list: %8u\n"
97 "wb_cnt: %8u\n",
53 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 98 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
54 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 99 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
55 K(bdi_thresh), 100 K(bdi_thresh), K(dirty_thresh),
56 K(dirty_thresh), 101 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
57 K(background_thresh)); 102 !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
103 !list_empty(&bdi->wb_list), bdi->wb_cnt);
58#undef K 104#undef K
59 105
60 return 0; 106 return 0;
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void)
185{ 231{
186 int err; 232 int err;
187 233
234 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
235 BUG_ON(IS_ERR(sync_supers_tsk));
236
237 init_timer(&sync_supers_timer);
238 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
239 arm_supers_timer();
240
188 err = bdi_init(&default_backing_dev_info); 241 err = bdi_init(&default_backing_dev_info);
189 if (!err) 242 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default"); 243 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +246,248 @@ static int __init default_bdi_init(void)
193} 246}
194subsys_initcall(default_bdi_init); 247subsys_initcall(default_bdi_init);
195 248
249static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
250{
251 memset(wb, 0, sizeof(*wb));
252
253 wb->bdi = bdi;
254 wb->last_old_flush = jiffies;
255 INIT_LIST_HEAD(&wb->b_dirty);
256 INIT_LIST_HEAD(&wb->b_io);
257 INIT_LIST_HEAD(&wb->b_more_io);
258}
259
260static void bdi_task_init(struct backing_dev_info *bdi,
261 struct bdi_writeback *wb)
262{
263 struct task_struct *tsk = current;
264
265 spin_lock(&bdi->wb_lock);
266 list_add_tail_rcu(&wb->list, &bdi->wb_list);
267 spin_unlock(&bdi->wb_lock);
268
269 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
270 set_freezable();
271
272 /*
273 * Our parent may run at a different priority, just set us to normal
274 */
275 set_user_nice(tsk, 0);
276}
277
278static int bdi_start_fn(void *ptr)
279{
280 struct bdi_writeback *wb = ptr;
281 struct backing_dev_info *bdi = wb->bdi;
282 int ret;
283
284 /*
285 * Add us to the active bdi_list
286 */
287 spin_lock(&bdi_lock);
288 list_add(&bdi->bdi_list, &bdi_list);
289 spin_unlock(&bdi_lock);
290
291 bdi_task_init(bdi, wb);
292
293 /*
294 * Clear pending bit and wakeup anybody waiting to tear us down
295 */
296 clear_bit(BDI_pending, &bdi->state);
297 smp_mb__after_clear_bit();
298 wake_up_bit(&bdi->state, BDI_pending);
299
300 ret = bdi_writeback_task(wb);
301
302 /*
303 * Remove us from the list
304 */
305 spin_lock(&bdi->wb_lock);
306 list_del_rcu(&wb->list);
307 spin_unlock(&bdi->wb_lock);
308
309 /*
310 * Flush any work that raced with us exiting. No new work
311 * will be added, since this bdi isn't discoverable anymore.
312 */
313 if (!list_empty(&bdi->work_list))
314 wb_do_writeback(wb, 1);
315
316 wb->task = NULL;
317 return ret;
318}
319
320int bdi_has_dirty_io(struct backing_dev_info *bdi)
321{
322 return wb_has_dirty_io(&bdi->wb);
323}
324
325static void bdi_flush_io(struct backing_dev_info *bdi)
326{
327 struct writeback_control wbc = {
328 .bdi = bdi,
329 .sync_mode = WB_SYNC_NONE,
330 .older_than_this = NULL,
331 .range_cyclic = 1,
332 .nr_to_write = 1024,
333 };
334
335 writeback_inodes_wbc(&wbc);
336}
337
338/*
339 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
340 * or we risk deadlocking on ->s_umount. The longer term solution would be
341 * to implement sync_supers_bdi() or similar and simply do it from the
342 * bdi writeback tasks individually.
343 */
344static int bdi_sync_supers(void *unused)
345{
346 set_user_nice(current, 0);
347
348 while (!kthread_should_stop()) {
349 set_current_state(TASK_INTERRUPTIBLE);
350 schedule();
351
352 /*
353 * Do this periodically, like kupdated() did before.
354 */
355 sync_supers();
356 }
357
358 return 0;
359}
360
361static void arm_supers_timer(void)
362{
363 unsigned long next;
364
365 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
366 mod_timer(&sync_supers_timer, round_jiffies_up(next));
367}
368
369static void sync_supers_timer_fn(unsigned long unused)
370{
371 wake_up_process(sync_supers_tsk);
372 arm_supers_timer();
373}
374
375static int bdi_forker_task(void *ptr)
376{
377 struct bdi_writeback *me = ptr;
378
379 bdi_task_init(me->bdi, me);
380
381 for (;;) {
382 struct backing_dev_info *bdi, *tmp;
383 struct bdi_writeback *wb;
384
385 /*
386 * Temporary measure, we want to make sure we don't see
387 * dirty data on the default backing_dev_info
388 */
389 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
390 wb_do_writeback(me, 0);
391
392 spin_lock(&bdi_lock);
393
394 /*
395 * Check if any existing bdi's have dirty data without
396 * a thread registered. If so, set that up.
397 */
398 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
399 if (bdi->wb.task)
400 continue;
401 if (list_empty(&bdi->work_list) &&
402 !bdi_has_dirty_io(bdi))
403 continue;
404
405 bdi_add_default_flusher_task(bdi);
406 }
407
408 set_current_state(TASK_INTERRUPTIBLE);
409
410 if (list_empty(&bdi_pending_list)) {
411 unsigned long wait;
412
413 spin_unlock(&bdi_lock);
414 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
415 schedule_timeout(wait);
416 try_to_freeze();
417 continue;
418 }
419
420 __set_current_state(TASK_RUNNING);
421
422 /*
423 * This is our real job - check for pending entries in
424 * bdi_pending_list, and create the tasks that got added
425 */
426 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
427 bdi_list);
428 list_del_init(&bdi->bdi_list);
429 spin_unlock(&bdi_lock);
430
431 wb = &bdi->wb;
432 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
433 dev_name(bdi->dev));
434 /*
435 * If task creation fails, then re-add the bdi to
436 * the pending list and force writeout of the bdi
437 * from this forker thread. That will free some memory
438 * and we can try again.
439 */
440 if (IS_ERR(wb->task)) {
441 wb->task = NULL;
442
443 /*
444 * Add this 'bdi' to the back, so we get
445 * a chance to flush other bdi's to free
446 * memory.
447 */
448 spin_lock(&bdi_lock);
449 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
450 spin_unlock(&bdi_lock);
451
452 bdi_flush_io(bdi);
453 }
454 }
455
456 return 0;
457}
458
459/*
460 * Add the default flusher task that gets created for any bdi
461 * that has dirty data pending writeout
462 */
463void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
464{
465 if (!bdi_cap_writeback_dirty(bdi))
466 return;
467
468 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
469 printk(KERN_ERR "bdi %p/%s is not registered!\n",
470 bdi, bdi->name);
471 return;
472 }
473
474 /*
475 * Check with the helper whether to proceed adding a task. Will only
476 * abort if two or more simultaneous calls to
477 * bdi_add_default_flusher_task() occurred, further additions will block
478 * waiting for previous additions to finish.
479 */
480 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
481 list_move_tail(&bdi->bdi_list, &bdi_pending_list);
482
483 /*
484 * We are now on the pending list, wake up bdi_forker_task()
485 * to finish the job and add us back to the active bdi_list
486 */
487 wake_up_process(default_backing_dev_info.wb.task);
488 }
489}
490
196int bdi_register(struct backing_dev_info *bdi, struct device *parent, 491int bdi_register(struct backing_dev_info *bdi, struct device *parent,
197 const char *fmt, ...) 492 const char *fmt, ...)
198{ 493{
@@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
211 goto exit; 506 goto exit;
212 } 507 }
213 508
509 spin_lock(&bdi_lock);
510 list_add_tail(&bdi->bdi_list, &bdi_list);
511 spin_unlock(&bdi_lock);
512
214 bdi->dev = dev; 513 bdi->dev = dev;
215 bdi_debug_register(bdi, dev_name(dev));
216 514
515 /*
516 * Just start the forker thread for our default backing_dev_info,
517 * and add other bdi's to the list. They will get a thread created
518 * on-demand when they need it.
519 */
520 if (bdi_cap_flush_forker(bdi)) {
521 struct bdi_writeback *wb = &bdi->wb;
522
523 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
524 dev_name(dev));
525 if (IS_ERR(wb->task)) {
526 wb->task = NULL;
527 ret = -ENOMEM;
528
529 spin_lock(&bdi_lock);
530 list_del(&bdi->bdi_list);
531 spin_unlock(&bdi_lock);
532 goto exit;
533 }
534 }
535
536 bdi_debug_register(bdi, dev_name(dev));
537 set_bit(BDI_registered, &bdi->state);
217exit: 538exit:
218 return ret; 539 return ret;
219} 540}
@@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
225} 546}
226EXPORT_SYMBOL(bdi_register_dev); 547EXPORT_SYMBOL(bdi_register_dev);
227 548
549/*
550 * Remove bdi from the global list and shut down any threads we have running
551 */
552static void bdi_wb_shutdown(struct backing_dev_info *bdi)
553{
554 struct bdi_writeback *wb;
555
556 if (!bdi_cap_writeback_dirty(bdi))
557 return;
558
559 /*
560 * If setup is pending, wait for that to complete first
561 */
562 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
563 TASK_UNINTERRUPTIBLE);
564
565 /*
566 * Make sure nobody finds us on the bdi_list anymore
567 */
568 spin_lock(&bdi_lock);
569 list_del(&bdi->bdi_list);
570 spin_unlock(&bdi_lock);
571
572 /*
573 * Finally, kill the kernel threads. We don't need to be RCU
574 * safe anymore, since the bdi is gone from visibility.
575 */
576 list_for_each_entry(wb, &bdi->wb_list, list)
577 kthread_stop(wb->task);
578}
579
228void bdi_unregister(struct backing_dev_info *bdi) 580void bdi_unregister(struct backing_dev_info *bdi)
229{ 581{
230 if (bdi->dev) { 582 if (bdi->dev) {
583 if (!bdi_cap_flush_forker(bdi))
584 bdi_wb_shutdown(bdi);
231 bdi_debug_unregister(bdi); 585 bdi_debug_unregister(bdi);
232 device_unregister(bdi->dev); 586 device_unregister(bdi->dev);
233 bdi->dev = NULL; 587 bdi->dev = NULL;
@@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister);
237 591
238int bdi_init(struct backing_dev_info *bdi) 592int bdi_init(struct backing_dev_info *bdi)
239{ 593{
240 int i; 594 int i, err;
241 int err;
242 595
243 bdi->dev = NULL; 596 bdi->dev = NULL;
244 597
245 bdi->min_ratio = 0; 598 bdi->min_ratio = 0;
246 bdi->max_ratio = 100; 599 bdi->max_ratio = 100;
247 bdi->max_prop_frac = PROP_FRAC_BASE; 600 bdi->max_prop_frac = PROP_FRAC_BASE;
601 spin_lock_init(&bdi->wb_lock);
602 INIT_LIST_HEAD(&bdi->bdi_list);
603 INIT_LIST_HEAD(&bdi->wb_list);
604 INIT_LIST_HEAD(&bdi->work_list);
605
606 bdi_wb_init(&bdi->wb, bdi);
607
608 /*
609 * Just one thread support for now, hard code mask and count
610 */
611 bdi->wb_mask = 1;
612 bdi->wb_cnt = 1;
248 613
249 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 614 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
250 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 615 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
269{ 634{
270 int i; 635 int i;
271 636
637 WARN_ON(bdi_has_dirty_io(bdi));
638
272 bdi_unregister(bdi); 639 bdi_unregister(bdi);
273 640
274 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 641 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 701740c9e81b..555d5d2731c6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -521,7 +521,11 @@ find_block:
521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + 521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
522 start_off); 522 start_off);
523 memset(region, 0, size); 523 memset(region, 0, size);
524 kmemleak_alloc(region, size, 1, 0); 524 /*
525 * The min_count is set to 0 so that bootmem allocated blocks
526 * are never reported as leaks.
527 */
528 kmemleak_alloc(region, size, 0, 0);
525 return region; 529 return region;
526 } 530 }
527 531
diff --git a/mm/filemap.c b/mm/filemap.c
index ccea3b665c12..dd51c68e2b86 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
39/* 39/*
40 * FIXME: remove all knowledge of the buffer layer from the core VM 40 * FIXME: remove all knowledge of the buffer layer from the core VM
41 */ 41 */
42#include <linux/buffer_head.h> /* for generic_osync_inode */ 42#include <linux/buffer_head.h> /* for try_to_free_buffers */
43 43
44#include <asm/mman.h> 44#include <asm/mman.h>
45 45
46
47/* 46/*
48 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
49 * though. 48 * though.
@@ -307,68 +306,24 @@ int wait_on_page_writeback_range(struct address_space *mapping,
307} 306}
308 307
309/** 308/**
310 * sync_page_range - write and wait on all pages in the passed range 309 * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
311 * @inode: target inode 310 * @mapping: address space structure to wait for
312 * @mapping: target address_space 311 * @start: offset in bytes where the range starts
313 * @pos: beginning offset in pages to write 312 * @end: offset in bytes where the range ends (inclusive)
314 * @count: number of bytes to write
315 *
316 * Write and wait upon all the pages in the passed range. This is a "data
317 * integrity" operation. It waits upon in-flight writeout before starting and
318 * waiting upon new writeout. If there was an IO error, return it.
319 * 313 *
320 * We need to re-take i_mutex during the generic_osync_inode list walk because 314 * Walk the list of under-writeback pages of the given address space
321 * it is otherwise livelockable. 315 * in the given range and wait for all of them.
322 */
323int sync_page_range(struct inode *inode, struct address_space *mapping,
324 loff_t pos, loff_t count)
325{
326 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
327 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
328 int ret;
329
330 if (!mapping_cap_writeback_dirty(mapping) || !count)
331 return 0;
332 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
333 if (ret == 0) {
334 mutex_lock(&inode->i_mutex);
335 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
336 mutex_unlock(&inode->i_mutex);
337 }
338 if (ret == 0)
339 ret = wait_on_page_writeback_range(mapping, start, end);
340 return ret;
341}
342EXPORT_SYMBOL(sync_page_range);
343
344/**
345 * sync_page_range_nolock - write & wait on all pages in the passed range without locking
346 * @inode: target inode
347 * @mapping: target address_space
348 * @pos: beginning offset in pages to write
349 * @count: number of bytes to write
350 * 316 *
351 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea 317 * This is just a simple wrapper so that callers don't have to convert offsets
352 * as it forces O_SYNC writers to different parts of the same file 318 * to page indexes themselves
353 * to be serialised right until io completion.
354 */ 319 */
355int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, 320int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
356 loff_t pos, loff_t count) 321 loff_t end)
357{ 322{
358 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 323 return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
359 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 324 end >> PAGE_CACHE_SHIFT);
360 int ret;
361
362 if (!mapping_cap_writeback_dirty(mapping) || !count)
363 return 0;
364 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
365 if (ret == 0)
366 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
367 if (ret == 0)
368 ret = wait_on_page_writeback_range(mapping, start, end);
369 return ret;
370} 325}
371EXPORT_SYMBOL(sync_page_range_nolock); 326EXPORT_SYMBOL(filemap_fdatawait_range);
372 327
373/** 328/**
374 * filemap_fdatawait - wait for all under-writeback pages to complete 329 * filemap_fdatawait - wait for all under-writeback pages to complete
@@ -2167,20 +2122,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2167 } 2122 }
2168 *ppos = end; 2123 *ppos = end;
2169 } 2124 }
2170
2171 /*
2172 * Sync the fs metadata but not the minor inode changes and
2173 * of course not the data as we did direct DMA for the IO.
2174 * i_mutex is held, which protects generic_osync_inode() from
2175 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2176 */
2177out: 2125out:
2178 if ((written >= 0 || written == -EIOCBQUEUED) &&
2179 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2180 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2181 if (err < 0)
2182 written = err;
2183 }
2184 return written; 2126 return written;
2185} 2127}
2186EXPORT_SYMBOL(generic_file_direct_write); 2128EXPORT_SYMBOL(generic_file_direct_write);
@@ -2312,8 +2254,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2312{ 2254{
2313 struct file *file = iocb->ki_filp; 2255 struct file *file = iocb->ki_filp;
2314 struct address_space *mapping = file->f_mapping; 2256 struct address_space *mapping = file->f_mapping;
2315 const struct address_space_operations *a_ops = mapping->a_ops;
2316 struct inode *inode = mapping->host;
2317 ssize_t status; 2257 ssize_t status;
2318 struct iov_iter i; 2258 struct iov_iter i;
2319 2259
@@ -2323,16 +2263,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2323 if (likely(status >= 0)) { 2263 if (likely(status >= 0)) {
2324 written += status; 2264 written += status;
2325 *ppos = pos + status; 2265 *ppos = pos + status;
2326
2327 /*
2328 * For now, when the user asks for O_SYNC, we'll actually give
2329 * O_DSYNC
2330 */
2331 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2332 if (!a_ops->writepage || !is_sync_kiocb(iocb))
2333 status = generic_osync_inode(inode, mapping,
2334 OSYNC_METADATA|OSYNC_DATA);
2335 }
2336 } 2266 }
2337 2267
2338 /* 2268 /*
@@ -2348,9 +2278,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2348} 2278}
2349EXPORT_SYMBOL(generic_file_buffered_write); 2279EXPORT_SYMBOL(generic_file_buffered_write);
2350 2280
2351static ssize_t 2281/**
2352__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2282 * __generic_file_aio_write - write data to a file
2353 unsigned long nr_segs, loff_t *ppos) 2283 * @iocb: IO state structure (file, offset, etc.)
2284 * @iov: vector with data to write
2285 * @nr_segs: number of segments in the vector
2286 * @ppos: position where to write
2287 *
2288 * This function does all the work needed for actually writing data to a
2289 * file. It does all basic checks, removes SUID from the file, updates
2290 * modification times and calls proper subroutines depending on whether we
2291 * do direct IO or a standard buffered write.
2292 *
2293 * It expects i_mutex to be grabbed unless we work on a block device or similar
2294 * object which does not need locking at all.
2295 *
2296 * This function does *not* take care of syncing data in case of O_SYNC write.
2297 * A caller has to handle it. This is mainly due to the fact that we want to
2298 * avoid syncing under i_mutex.
2299 */
2300ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2301 unsigned long nr_segs, loff_t *ppos)
2354{ 2302{
2355 struct file *file = iocb->ki_filp; 2303 struct file *file = iocb->ki_filp;
2356 struct address_space * mapping = file->f_mapping; 2304 struct address_space * mapping = file->f_mapping;
@@ -2447,51 +2395,37 @@ out:
2447 current->backing_dev_info = NULL; 2395 current->backing_dev_info = NULL;
2448 return written ? written : err; 2396 return written ? written : err;
2449} 2397}
2398EXPORT_SYMBOL(__generic_file_aio_write);
2450 2399
2451ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, 2400/**
2452 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 2401 * generic_file_aio_write - write data to a file
2453{ 2402 * @iocb: IO state structure
2454 struct file *file = iocb->ki_filp; 2403 * @iov: vector with data to write
2455 struct address_space *mapping = file->f_mapping; 2404 * @nr_segs: number of segments in the vector
2456 struct inode *inode = mapping->host; 2405 * @pos: position in file where to write
2457 ssize_t ret; 2406 *
2458 2407 * This is a wrapper around __generic_file_aio_write() to be used by most
2459 BUG_ON(iocb->ki_pos != pos); 2408 * filesystems. It takes care of syncing the file in case of O_SYNC file
2460 2409 * and acquires i_mutex as needed.
2461 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2410 */
2462 &iocb->ki_pos);
2463
2464 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2465 ssize_t err;
2466
2467 err = sync_page_range_nolock(inode, mapping, pos, ret);
2468 if (err < 0)
2469 ret = err;
2470 }
2471 return ret;
2472}
2473EXPORT_SYMBOL(generic_file_aio_write_nolock);
2474
2475ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2411ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2476 unsigned long nr_segs, loff_t pos) 2412 unsigned long nr_segs, loff_t pos)
2477{ 2413{
2478 struct file *file = iocb->ki_filp; 2414 struct file *file = iocb->ki_filp;
2479 struct address_space *mapping = file->f_mapping; 2415 struct inode *inode = file->f_mapping->host;
2480 struct inode *inode = mapping->host;
2481 ssize_t ret; 2416 ssize_t ret;
2482 2417
2483 BUG_ON(iocb->ki_pos != pos); 2418 BUG_ON(iocb->ki_pos != pos);
2484 2419
2485 mutex_lock(&inode->i_mutex); 2420 mutex_lock(&inode->i_mutex);
2486 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2421 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2487 &iocb->ki_pos);
2488 mutex_unlock(&inode->i_mutex); 2422 mutex_unlock(&inode->i_mutex);
2489 2423
2490 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2424 if (ret > 0 || ret == -EIOCBQUEUED) {
2491 ssize_t err; 2425 ssize_t err;
2492 2426
2493 err = sync_page_range(inode, mapping, pos, ret); 2427 err = generic_write_sync(file, pos, ret);
2494 if (err < 0) 2428 if (err < 0 && ret > 0)
2495 ret = err; 2429 ret = err;
2496 } 2430 }
2497 return ret; 2431 return ret;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cafdcee154e8..b16d63634777 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
234 234
235 return 1UL << (hstate->order + PAGE_SHIFT); 235 return 1UL << (hstate->order + PAGE_SHIFT);
236} 236}
237EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
237 238
238/* 239/*
239 * Return the page size being used by the MMU to back a VMA. In the majority 240 * Return the page size being used by the MMU to back a VMA. In the majority
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 487267310a84..4ea4510e2996 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,11 +92,13 @@
92#include <linux/string.h> 92#include <linux/string.h>
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h>
95 96
96#include <asm/sections.h> 97#include <asm/sections.h>
97#include <asm/processor.h> 98#include <asm/processor.h>
98#include <asm/atomic.h> 99#include <asm/atomic.h>
99 100
101#include <linux/kmemcheck.h>
100#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
101 103
102/* 104/*
@@ -107,6 +109,7 @@
107#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 109#define SECS_FIRST_SCAN 60 /* delay before the first scan */
108#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
109#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ 111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
110 113
111#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
112 115
@@ -120,6 +123,9 @@ struct kmemleak_scan_area {
120 size_t length; 123 size_t length;
121}; 124};
122 125
126#define KMEMLEAK_GREY 0
127#define KMEMLEAK_BLACK -1
128
123/* 129/*
124 * Structure holding the metadata for each allocated memory block. 130 * Structure holding the metadata for each allocated memory block.
125 * Modifications to such objects should be made while holding the 131 * Modifications to such objects should be made while holding the
@@ -161,6 +167,15 @@ struct kmemleak_object {
161/* flag set on newly allocated objects */ 167/* flag set on newly allocated objects */
162#define OBJECT_NEW (1 << 3) 168#define OBJECT_NEW (1 << 3)
163 169
170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16
172/* number of bytes to print at a time (1, 2, 4, 8) */
173#define HEX_GROUP_SIZE 1
174/* include ASCII after the hex output */
175#define HEX_ASCII 1
176/* max number of lines to be printed */
177#define HEX_MAX_LINES 2
178
164/* the list of all allocated objects */ 179/* the list of all allocated objects */
165static LIST_HEAD(object_list); 180static LIST_HEAD(object_list);
166/* the list of gray-colored objects (see color_gray comment below) */ 181/* the list of gray-colored objects (see color_gray comment below) */
@@ -228,11 +243,14 @@ struct early_log {
228 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
229 unsigned long offset; /* scan area offset */ 244 unsigned long offset; /* scan area offset */
230 size_t length; /* scan area length */ 245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */
231}; 248};
232 249
233/* early logging buffer and current position */ 250/* early logging buffer and current position */
234static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; 251static struct early_log
235static int crt_early_log; 252 early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
253static int crt_early_log __initdata;
236 254
237static void kmemleak_disable(void); 255static void kmemleak_disable(void);
238 256
@@ -255,6 +273,35 @@ static void kmemleak_disable(void);
255} while (0) 273} while (0)
256 274
257/* 275/*
276 * Printing of the objects hex dump to the seq file. The number of lines to be
277 * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
278 * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
279 * with the object->lock held.
280 */
281static void hex_dump_object(struct seq_file *seq,
282 struct kmemleak_object *object)
283{
284 const u8 *ptr = (const u8 *)object->pointer;
285 int i, len, remaining;
286 unsigned char linebuf[HEX_ROW_SIZE * 5];
287
288 /* limit the number of lines to HEX_MAX_LINES */
289 remaining = len =
290 min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
291
292 seq_printf(seq, " hex dump (first %d bytes):\n", len);
293 for (i = 0; i < len; i += HEX_ROW_SIZE) {
294 int linelen = min(remaining, HEX_ROW_SIZE);
295
296 remaining -= HEX_ROW_SIZE;
297 hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
298 HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
299 HEX_ASCII);
300 seq_printf(seq, " %s\n", linebuf);
301 }
302}
303
304/*
258 * Object colors, encoded with count and min_count: 305 * Object colors, encoded with count and min_count:
259 * - white - orphan object, not enough references to it (count < min_count) 306 * - white - orphan object, not enough references to it (count < min_count)
260 * - gray - not orphan, not marked as false positive (min_count == 0) or 307 * - gray - not orphan, not marked as false positive (min_count == 0) or
@@ -264,19 +311,21 @@ static void kmemleak_disable(void);
264 * Newly created objects don't have any color assigned (object->count == -1) 311 * Newly created objects don't have any color assigned (object->count == -1)
265 * before the next memory scan when they become white. 312 * before the next memory scan when they become white.
266 */ 313 */
267static int color_white(const struct kmemleak_object *object) 314static bool color_white(const struct kmemleak_object *object)
268{ 315{
269 return object->count != -1 && object->count < object->min_count; 316 return object->count != KMEMLEAK_BLACK &&
317 object->count < object->min_count;
270} 318}
271 319
272static int color_gray(const struct kmemleak_object *object) 320static bool color_gray(const struct kmemleak_object *object)
273{ 321{
274 return object->min_count != -1 && object->count >= object->min_count; 322 return object->min_count != KMEMLEAK_BLACK &&
323 object->count >= object->min_count;
275} 324}
276 325
277static int color_black(const struct kmemleak_object *object) 326static bool color_black(const struct kmemleak_object *object)
278{ 327{
279 return object->min_count == -1; 328 return object->min_count == KMEMLEAK_BLACK;
280} 329}
281 330
282/* 331/*
@@ -284,7 +333,7 @@ static int color_black(const struct kmemleak_object *object)
284 * not be deleted and have a minimum age to avoid false positives caused by 333 * not be deleted and have a minimum age to avoid false positives caused by
285 * pointers temporarily stored in CPU registers. 334 * pointers temporarily stored in CPU registers.
286 */ 335 */
287static int unreferenced_object(struct kmemleak_object *object) 336static bool unreferenced_object(struct kmemleak_object *object)
288{ 337{
289 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
290 time_before_eq(object->jiffies + jiffies_min_age, 339 time_before_eq(object->jiffies + jiffies_min_age,
@@ -304,6 +353,7 @@ static void print_unreferenced(struct seq_file *seq,
304 object->pointer, object->size); 353 object->pointer, object->size);
305 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
306 object->comm, object->pid, object->jiffies); 355 object->comm, object->pid, object->jiffies);
356 hex_dump_object(seq, object);
307 seq_printf(seq, " backtrace:\n"); 357 seq_printf(seq, " backtrace:\n");
308 358
309 for (i = 0; i < object->trace_len; i++) { 359 for (i = 0; i < object->trace_len; i++) {
@@ -330,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
330 object->comm, object->pid, object->jiffies); 380 object->comm, object->pid, object->jiffies);
331 pr_notice(" min_count = %d\n", object->min_count); 381 pr_notice(" min_count = %d\n", object->min_count);
332 pr_notice(" count = %d\n", object->count); 382 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags);
333 pr_notice(" backtrace:\n"); 384 pr_notice(" backtrace:\n");
334 print_stack_trace(&trace, 4); 385 print_stack_trace(&trace, 4);
335} 386}
@@ -434,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
434} 485}
435 486
436/* 487/*
488 * Save stack trace to the given array of MAX_TRACE size.
489 */
490static int __save_stack_trace(unsigned long *trace)
491{
492 struct stack_trace stack_trace;
493
494 stack_trace.max_entries = MAX_TRACE;
495 stack_trace.nr_entries = 0;
496 stack_trace.entries = trace;
497 stack_trace.skip = 2;
498 save_stack_trace(&stack_trace);
499
500 return stack_trace.nr_entries;
501}
502
503/*
437 * Create the metadata (struct kmemleak_object) corresponding to an allocated 504 * Create the metadata (struct kmemleak_object) corresponding to an allocated
438 * memory block and add it to the object_list and object_tree_root. 505 * memory block and add it to the object_list and object_tree_root.
439 */ 506 */
440static void create_object(unsigned long ptr, size_t size, int min_count, 507static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
441 gfp_t gfp) 508 int min_count, gfp_t gfp)
442{ 509{
443 unsigned long flags; 510 unsigned long flags;
444 struct kmemleak_object *object; 511 struct kmemleak_object *object;
445 struct prio_tree_node *node; 512 struct prio_tree_node *node;
446 struct stack_trace trace;
447 513
448 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
449 if (!object) { 515 if (!object) {
450 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
451 return; 517 return NULL;
452 } 518 }
453 519
454 INIT_LIST_HEAD(&object->object_list); 520 INIT_LIST_HEAD(&object->object_list);
@@ -482,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
482 } 548 }
483 549
484 /* kernel backtrace */ 550 /* kernel backtrace */
485 trace.max_entries = MAX_TRACE; 551 object->trace_len = __save_stack_trace(object->trace);
486 trace.nr_entries = 0;
487 trace.entries = object->trace;
488 trace.skip = 1;
489 save_stack_trace(&trace);
490 object->trace_len = trace.nr_entries;
491 552
492 INIT_PRIO_TREE_NODE(&object->tree_node); 553 INIT_PRIO_TREE_NODE(&object->tree_node);
493 object->tree_node.start = ptr; 554 object->tree_node.start = ptr;
494 object->tree_node.last = ptr + size - 1; 555 object->tree_node.last = ptr + size - 1;
495 556
496 write_lock_irqsave(&kmemleak_lock, flags); 557 write_lock_irqsave(&kmemleak_lock, flags);
558
497 min_addr = min(min_addr, ptr); 559 min_addr = min(min_addr, ptr);
498 max_addr = max(max_addr, ptr + size); 560 max_addr = max(max_addr, ptr + size);
499 node = prio_tree_insert(&object_tree_root, &object->tree_node); 561 node = prio_tree_insert(&object_tree_root, &object->tree_node);
@@ -504,20 +566,19 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
504 * random memory blocks. 566 * random memory blocks.
505 */ 567 */
506 if (node != &object->tree_node) { 568 if (node != &object->tree_node) {
507 unsigned long flags;
508
509 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 569 kmemleak_stop("Cannot insert 0x%lx into the object search tree "
510 "(already existing)\n", ptr); 570 "(already existing)\n", ptr);
511 object = lookup_object(ptr, 1); 571 object = lookup_object(ptr, 1);
512 spin_lock_irqsave(&object->lock, flags); 572 spin_lock(&object->lock);
513 dump_object_info(object); 573 dump_object_info(object);
514 spin_unlock_irqrestore(&object->lock, flags); 574 spin_unlock(&object->lock);
515 575
516 goto out; 576 goto out;
517 } 577 }
518 list_add_tail_rcu(&object->object_list, &object_list); 578 list_add_tail_rcu(&object->object_list, &object_list);
519out: 579out:
520 write_unlock_irqrestore(&kmemleak_lock, flags); 580 write_unlock_irqrestore(&kmemleak_lock, flags);
581 return object;
521} 582}
522 583
523/* 584/*
@@ -604,46 +665,55 @@ static void delete_object_part(unsigned long ptr, size_t size)
604 665
605 put_object(object); 666 put_object(object);
606} 667}
607/* 668
608 * Make a object permanently as gray-colored so that it can no longer be 669static void __paint_it(struct kmemleak_object *object, int color)
609 * reported as a leak. This is used in general to mark a false positive. 670{
610 */ 671 object->min_count = color;
611static void make_gray_object(unsigned long ptr) 672 if (color == KMEMLEAK_BLACK)
673 object->flags |= OBJECT_NO_SCAN;
674}
675
676static void paint_it(struct kmemleak_object *object, int color)
612{ 677{
613 unsigned long flags; 678 unsigned long flags;
679
680 spin_lock_irqsave(&object->lock, flags);
681 __paint_it(object, color);
682 spin_unlock_irqrestore(&object->lock, flags);
683}
684
685static void paint_ptr(unsigned long ptr, int color)
686{
614 struct kmemleak_object *object; 687 struct kmemleak_object *object;
615 688
616 object = find_and_get_object(ptr, 0); 689 object = find_and_get_object(ptr, 0);
617 if (!object) { 690 if (!object) {
618 kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); 691 kmemleak_warn("Trying to color unknown object "
692 "at 0x%08lx as %s\n", ptr,
693 (color == KMEMLEAK_GREY) ? "Grey" :
694 (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
619 return; 695 return;
620 } 696 }
621 697 paint_it(object, color);
622 spin_lock_irqsave(&object->lock, flags);
623 object->min_count = 0;
624 spin_unlock_irqrestore(&object->lock, flags);
625 put_object(object); 698 put_object(object);
626} 699}
627 700
628/* 701/*
702 * Make a object permanently as gray-colored so that it can no longer be
703 * reported as a leak. This is used in general to mark a false positive.
704 */
705static void make_gray_object(unsigned long ptr)
706{
707 paint_ptr(ptr, KMEMLEAK_GREY);
708}
709
710/*
629 * Mark the object as black-colored so that it is ignored from scans and 711 * Mark the object as black-colored so that it is ignored from scans and
630 * reporting. 712 * reporting.
631 */ 713 */
632static void make_black_object(unsigned long ptr) 714static void make_black_object(unsigned long ptr)
633{ 715{
634 unsigned long flags; 716 paint_ptr(ptr, KMEMLEAK_BLACK);
635 struct kmemleak_object *object;
636
637 object = find_and_get_object(ptr, 0);
638 if (!object) {
639 kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr);
640 return;
641 }
642
643 spin_lock_irqsave(&object->lock, flags);
644 object->min_count = -1;
645 spin_unlock_irqrestore(&object->lock, flags);
646 put_object(object);
647} 717}
648 718
649/* 719/*
@@ -715,14 +785,15 @@ static void object_no_scan(unsigned long ptr)
715 * Log an early kmemleak_* call to the early_log buffer. These calls will be 785 * Log an early kmemleak_* call to the early_log buffer. These calls will be
716 * processed later once kmemleak is fully initialized. 786 * processed later once kmemleak is fully initialized.
717 */ 787 */
718static void log_early(int op_type, const void *ptr, size_t size, 788static void __init log_early(int op_type, const void *ptr, size_t size,
719 int min_count, unsigned long offset, size_t length) 789 int min_count, unsigned long offset, size_t length)
720{ 790{
721 unsigned long flags; 791 unsigned long flags;
722 struct early_log *log; 792 struct early_log *log;
723 793
724 if (crt_early_log >= ARRAY_SIZE(early_log)) { 794 if (crt_early_log >= ARRAY_SIZE(early_log)) {
725 pr_warning("Early log buffer exceeded\n"); 795 pr_warning("Early log buffer exceeded, "
796 "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n");
726 kmemleak_disable(); 797 kmemleak_disable();
727 return; 798 return;
728 } 799 }
@@ -739,16 +810,45 @@ static void log_early(int op_type, const void *ptr, size_t size,
739 log->min_count = min_count; 810 log->min_count = min_count;
740 log->offset = offset; 811 log->offset = offset;
741 log->length = length; 812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace);
742 crt_early_log++; 815 crt_early_log++;
743 local_irq_restore(flags); 816 local_irq_restore(flags);
744} 817}
745 818
746/* 819/*
820 * Log an early allocated block and populate the stack trace.
821 */
822static void early_alloc(struct early_log *log)
823{
824 struct kmemleak_object *object;
825 unsigned long flags;
826 int i;
827
828 if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
829 return;
830
831 /*
832 * RCU locking needed to ensure object is not freed via put_object().
833 */
834 rcu_read_lock();
835 object = create_object((unsigned long)log->ptr, log->size,
836 log->min_count, GFP_KERNEL);
837 spin_lock_irqsave(&object->lock, flags);
838 for (i = 0; i < log->trace_len; i++)
839 object->trace[i] = log->trace[i];
840 object->trace_len = log->trace_len;
841 spin_unlock_irqrestore(&object->lock, flags);
842 rcu_read_unlock();
843}
844
845/*
747 * Memory allocation function callback. This function is called from the 846 * Memory allocation function callback. This function is called from the
748 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, 847 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
749 * vmalloc etc.). 848 * vmalloc etc.).
750 */ 849 */
751void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) 850void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
851 gfp_t gfp)
752{ 852{
753 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); 853 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
754 854
@@ -763,7 +863,7 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
763 * Memory freeing function callback. This function is called from the kernel 863 * Memory freeing function callback. This function is called from the kernel
764 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). 864 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
765 */ 865 */
766void kmemleak_free(const void *ptr) 866void __ref kmemleak_free(const void *ptr)
767{ 867{
768 pr_debug("%s(0x%p)\n", __func__, ptr); 868 pr_debug("%s(0x%p)\n", __func__, ptr);
769 869
@@ -778,7 +878,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free);
778 * Partial memory freeing function callback. This function is usually called 878 * Partial memory freeing function callback. This function is usually called
779 * from bootmem allocator when (part of) a memory block is freed. 879 * from bootmem allocator when (part of) a memory block is freed.
780 */ 880 */
781void kmemleak_free_part(const void *ptr, size_t size) 881void __ref kmemleak_free_part(const void *ptr, size_t size)
782{ 882{
783 pr_debug("%s(0x%p)\n", __func__, ptr); 883 pr_debug("%s(0x%p)\n", __func__, ptr);
784 884
@@ -793,7 +893,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free_part);
793 * Mark an already allocated memory block as a false positive. This will cause 893 * Mark an already allocated memory block as a false positive. This will cause
794 * the block to no longer be reported as leak and always be scanned. 894 * the block to no longer be reported as leak and always be scanned.
795 */ 895 */
796void kmemleak_not_leak(const void *ptr) 896void __ref kmemleak_not_leak(const void *ptr)
797{ 897{
798 pr_debug("%s(0x%p)\n", __func__, ptr); 898 pr_debug("%s(0x%p)\n", __func__, ptr);
799 899
@@ -809,7 +909,7 @@ EXPORT_SYMBOL(kmemleak_not_leak);
809 * corresponding block is not a leak and does not contain any references to 909 * corresponding block is not a leak and does not contain any references to
810 * other allocated memory blocks. 910 * other allocated memory blocks.
811 */ 911 */
812void kmemleak_ignore(const void *ptr) 912void __ref kmemleak_ignore(const void *ptr)
813{ 913{
814 pr_debug("%s(0x%p)\n", __func__, ptr); 914 pr_debug("%s(0x%p)\n", __func__, ptr);
815 915
@@ -823,8 +923,8 @@ EXPORT_SYMBOL(kmemleak_ignore);
823/* 923/*
824 * Limit the range to be scanned in an allocated memory block. 924 * Limit the range to be scanned in an allocated memory block.
825 */ 925 */
826void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, 926void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
827 gfp_t gfp) 927 size_t length, gfp_t gfp)
828{ 928{
829 pr_debug("%s(0x%p)\n", __func__, ptr); 929 pr_debug("%s(0x%p)\n", __func__, ptr);
830 930
@@ -838,7 +938,7 @@ EXPORT_SYMBOL(kmemleak_scan_area);
838/* 938/*
839 * Inform kmemleak not to scan the given memory block. 939 * Inform kmemleak not to scan the given memory block.
840 */ 940 */
841void kmemleak_no_scan(const void *ptr) 941void __ref kmemleak_no_scan(const void *ptr)
842{ 942{
843 pr_debug("%s(0x%p)\n", __func__, ptr); 943 pr_debug("%s(0x%p)\n", __func__, ptr);
844 944
@@ -882,15 +982,22 @@ static void scan_block(void *_start, void *_end,
882 unsigned long *end = _end - (BYTES_PER_POINTER - 1); 982 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
883 983
884 for (ptr = start; ptr < end; ptr++) { 984 for (ptr = start; ptr < end; ptr++) {
885 unsigned long flags;
886 unsigned long pointer = *ptr;
887 struct kmemleak_object *object; 985 struct kmemleak_object *object;
986 unsigned long flags;
987 unsigned long pointer;
888 988
889 if (allow_resched) 989 if (allow_resched)
890 cond_resched(); 990 cond_resched();
891 if (scan_should_stop()) 991 if (scan_should_stop())
892 break; 992 break;
893 993
994 /* don't scan uninitialized memory */
995 if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
996 BYTES_PER_POINTER))
997 continue;
998
999 pointer = *ptr;
1000
894 object = find_and_get_object(pointer, 1); 1001 object = find_and_get_object(pointer, 1);
895 if (!object) 1002 if (!object)
896 continue; 1003 continue;
@@ -949,10 +1056,21 @@ static void scan_object(struct kmemleak_object *object)
949 if (!(object->flags & OBJECT_ALLOCATED)) 1056 if (!(object->flags & OBJECT_ALLOCATED))
950 /* already freed object */ 1057 /* already freed object */
951 goto out; 1058 goto out;
952 if (hlist_empty(&object->area_list)) 1059 if (hlist_empty(&object->area_list)) {
953 scan_block((void *)object->pointer, 1060 void *start = (void *)object->pointer;
954 (void *)(object->pointer + object->size), object, 0); 1061 void *end = (void *)(object->pointer + object->size);
955 else 1062
1063 while (start < end && (object->flags & OBJECT_ALLOCATED) &&
1064 !(object->flags & OBJECT_NO_SCAN)) {
1065 scan_block(start, min(start + MAX_SCAN_SIZE, end),
1066 object, 0);
1067 start += MAX_SCAN_SIZE;
1068
1069 spin_unlock_irqrestore(&object->lock, flags);
1070 cond_resched();
1071 spin_lock_irqsave(&object->lock, flags);
1072 }
1073 } else
956 hlist_for_each_entry(area, elem, &object->area_list, node) 1074 hlist_for_each_entry(area, elem, &object->area_list, node)
957 scan_block((void *)(object->pointer + area->offset), 1075 scan_block((void *)(object->pointer + area->offset),
958 (void *)(object->pointer + area->offset 1076 (void *)(object->pointer + area->offset
@@ -970,7 +1088,6 @@ static void kmemleak_scan(void)
970{ 1088{
971 unsigned long flags; 1089 unsigned long flags;
972 struct kmemleak_object *object, *tmp; 1090 struct kmemleak_object *object, *tmp;
973 struct task_struct *task;
974 int i; 1091 int i;
975 int new_leaks = 0; 1092 int new_leaks = 0;
976 int gray_list_pass = 0; 1093 int gray_list_pass = 0;
@@ -1037,15 +1154,16 @@ static void kmemleak_scan(void)
1037 } 1154 }
1038 1155
1039 /* 1156 /*
1040 * Scanning the task stacks may introduce false negatives and it is 1157 * Scanning the task stacks (may introduce false negatives).
1041 * not enabled by default.
1042 */ 1158 */
1043 if (kmemleak_stack_scan) { 1159 if (kmemleak_stack_scan) {
1160 struct task_struct *p, *g;
1161
1044 read_lock(&tasklist_lock); 1162 read_lock(&tasklist_lock);
1045 for_each_process(task) 1163 do_each_thread(g, p) {
1046 scan_block(task_stack_page(task), 1164 scan_block(task_stack_page(p), task_stack_page(p) +
1047 task_stack_page(task) + THREAD_SIZE, 1165 THREAD_SIZE, NULL, 0);
1048 NULL, 0); 1166 } while_each_thread(g, p);
1049 read_unlock(&tasklist_lock); 1167 read_unlock(&tasklist_lock);
1050 } 1168 }
1051 1169
@@ -1170,7 +1288,7 @@ static int kmemleak_scan_thread(void *arg)
1170 * Start the automatic memory scanning thread. This function must be called 1288 * Start the automatic memory scanning thread. This function must be called
1171 * with the scan_mutex held. 1289 * with the scan_mutex held.
1172 */ 1290 */
1173void start_scan_thread(void) 1291static void start_scan_thread(void)
1174{ 1292{
1175 if (scan_thread) 1293 if (scan_thread)
1176 return; 1294 return;
@@ -1185,7 +1303,7 @@ void start_scan_thread(void)
1185 * Stop the automatic memory scanning thread. This function must be called 1303 * Stop the automatic memory scanning thread. This function must be called
1186 * with the scan_mutex held. 1304 * with the scan_mutex held.
1187 */ 1305 */
1188void stop_scan_thread(void) 1306static void stop_scan_thread(void)
1189{ 1307{
1190 if (scan_thread) { 1308 if (scan_thread) {
1191 kthread_stop(scan_thread); 1309 kthread_stop(scan_thread);
@@ -1294,6 +1412,49 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1294 return seq_release(inode, file); 1412 return seq_release(inode, file);
1295} 1413}
1296 1414
1415static int dump_str_object_info(const char *str)
1416{
1417 unsigned long flags;
1418 struct kmemleak_object *object;
1419 unsigned long addr;
1420
1421 addr= simple_strtoul(str, NULL, 0);
1422 object = find_and_get_object(addr, 0);
1423 if (!object) {
1424 pr_info("Unknown object at 0x%08lx\n", addr);
1425 return -EINVAL;
1426 }
1427
1428 spin_lock_irqsave(&object->lock, flags);
1429 dump_object_info(object);
1430 spin_unlock_irqrestore(&object->lock, flags);
1431
1432 put_object(object);
1433 return 0;
1434}
1435
1436/*
1437 * We use grey instead of black to ensure we can do future scans on the same
1438 * objects. If we did not do future scans these black objects could
1439 * potentially contain references to newly allocated objects in the future and
1440 * we'd end up with false positives.
1441 */
1442static void kmemleak_clear(void)
1443{
1444 struct kmemleak_object *object;
1445 unsigned long flags;
1446
1447 rcu_read_lock();
1448 list_for_each_entry_rcu(object, &object_list, object_list) {
1449 spin_lock_irqsave(&object->lock, flags);
1450 if ((object->flags & OBJECT_REPORTED) &&
1451 unreferenced_object(object))
1452 __paint_it(object, KMEMLEAK_GREY);
1453 spin_unlock_irqrestore(&object->lock, flags);
1454 }
1455 rcu_read_unlock();
1456}
1457
1297/* 1458/*
1298 * File write operation to configure kmemleak at run-time. The following 1459 * File write operation to configure kmemleak at run-time. The following
1299 * commands can be written to the /sys/kernel/debug/kmemleak file: 1460 * commands can be written to the /sys/kernel/debug/kmemleak file:
@@ -1305,6 +1466,9 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1305 * scan=... - set the automatic memory scanning period in seconds (0 to 1466 * scan=... - set the automatic memory scanning period in seconds (0 to
1306 * disable it) 1467 * disable it)
1307 * scan - trigger a memory scan 1468 * scan - trigger a memory scan
1469 * clear - mark all current reported unreferenced kmemleak objects as
1470 * grey to ignore printing them
1471 * dump=... - dump information about the object found at the given address
1308 */ 1472 */
1309static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, 1473static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1310 size_t size, loff_t *ppos) 1474 size_t size, loff_t *ppos)
@@ -1345,6 +1509,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1345 } 1509 }
1346 } else if (strncmp(buf, "scan", 4) == 0) 1510 } else if (strncmp(buf, "scan", 4) == 0)
1347 kmemleak_scan(); 1511 kmemleak_scan();
1512 else if (strncmp(buf, "clear", 5) == 0)
1513 kmemleak_clear();
1514 else if (strncmp(buf, "dump=", 5) == 0)
1515 ret = dump_str_object_info(buf + 5);
1348 else 1516 else
1349 ret = -EINVAL; 1517 ret = -EINVAL;
1350 1518
@@ -1371,7 +1539,7 @@ static const struct file_operations kmemleak_fops = {
1371 * Perform the freeing of the kmemleak internal objects after waiting for any 1539 * Perform the freeing of the kmemleak internal objects after waiting for any
1372 * current memory scan to complete. 1540 * current memory scan to complete.
1373 */ 1541 */
1374static int kmemleak_cleanup_thread(void *arg) 1542static void kmemleak_do_cleanup(struct work_struct *work)
1375{ 1543{
1376 struct kmemleak_object *object; 1544 struct kmemleak_object *object;
1377 1545
@@ -1383,22 +1551,9 @@ static int kmemleak_cleanup_thread(void *arg)
1383 delete_object_full(object->pointer); 1551 delete_object_full(object->pointer);
1384 rcu_read_unlock(); 1552 rcu_read_unlock();
1385 mutex_unlock(&scan_mutex); 1553 mutex_unlock(&scan_mutex);
1386
1387 return 0;
1388} 1554}
1389 1555
1390/* 1556static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
1391 * Start the clean-up thread.
1392 */
1393static void kmemleak_cleanup(void)
1394{
1395 struct task_struct *cleanup_thread;
1396
1397 cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
1398 "kmemleak-clean");
1399 if (IS_ERR(cleanup_thread))
1400 pr_warning("Failed to create the clean-up thread\n");
1401}
1402 1557
1403/* 1558/*
1404 * Disable kmemleak. No memory allocation/freeing will be traced once this 1559 * Disable kmemleak. No memory allocation/freeing will be traced once this
@@ -1416,7 +1571,7 @@ static void kmemleak_disable(void)
1416 1571
1417 /* check whether it is too early for a kernel thread */ 1572 /* check whether it is too early for a kernel thread */
1418 if (atomic_read(&kmemleak_initialized)) 1573 if (atomic_read(&kmemleak_initialized))
1419 kmemleak_cleanup(); 1574 schedule_work(&cleanup_work);
1420 1575
1421 pr_info("Kernel memory leak detector disabled\n"); 1576 pr_info("Kernel memory leak detector disabled\n");
1422} 1577}
@@ -1469,8 +1624,7 @@ void __init kmemleak_init(void)
1469 1624
1470 switch (log->op_type) { 1625 switch (log->op_type) {
1471 case KMEMLEAK_ALLOC: 1626 case KMEMLEAK_ALLOC:
1472 kmemleak_alloc(log->ptr, log->size, log->min_count, 1627 early_alloc(log);
1473 GFP_KERNEL);
1474 break; 1628 break;
1475 case KMEMLEAK_FREE: 1629 case KMEMLEAK_FREE:
1476 kmemleak_free(log->ptr); 1630 kmemleak_free(log->ptr);
@@ -1513,7 +1667,7 @@ static int __init kmemleak_late_init(void)
1513 * after setting kmemleak_initialized and we may end up with 1667 * after setting kmemleak_initialized and we may end up with
1514 * two clean-up threads but serialized by scan_mutex. 1668 * two clean-up threads but serialized by scan_mutex.
1515 */ 1669 */
1516 kmemleak_cleanup(); 1670 schedule_work(&cleanup_work);
1517 return -ENOMEM; 1671 return -ENOMEM;
1518 } 1672 }
1519 1673
diff --git a/mm/mmap.c b/mm/mmap.c
index 34579b23ebd5..8101de490c73 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
88int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 88int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
89struct percpu_counter vm_committed_as; 89struct percpu_counter vm_committed_as;
90 90
91/* amount of vm to protect from userspace access */
92unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
93
94/* 91/*
95 * Check that a process has enough memory to allocate a new virtual 92 * Check that a process has enough memory to allocate a new virtual
96 * mapping. 0 means there is enough memory for the allocation to 93 * mapping. 0 means there is enough memory for the allocation to
diff --git a/mm/nommu.c b/mm/nommu.c
index 53cab10fece4..66e81e7e9fe9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
70int heap_stack_gap = 0; 70int heap_stack_gap = 0;
71 71
72/* amount of vm to protect from userspace access */
73unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
74
75atomic_long_t mmap_pages_allocated; 72atomic_long_t mmap_pages_allocated;
76 73
77EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
@@ -922,6 +919,10 @@ static int validate_mmap_request(struct file *file,
922 if (!file->f_op->read) 919 if (!file->f_op->read)
923 capabilities &= ~BDI_CAP_MAP_COPY; 920 capabilities &= ~BDI_CAP_MAP_COPY;
924 921
922 /* The file shall have been opened with read permission. */
923 if (!(file->f_mode & FMODE_READ))
924 return -EACCES;
925
925 if (flags & MAP_SHARED) { 926 if (flags & MAP_SHARED) {
926 /* do checks for writing, appending and locking */ 927 /* do checks for writing, appending and locking */
927 if ((prot & PROT_WRITE) && 928 if ((prot & PROT_WRITE) &&
@@ -1351,6 +1352,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1351 } 1352 }
1352 1353
1353 vma->vm_region = region; 1354 vma->vm_region = region;
1355 add_nommu_region(region);
1354 1356
1355 /* set up the mapping */ 1357 /* set up the mapping */
1356 if (file && vma->vm_flags & VM_SHARED) 1358 if (file && vma->vm_flags & VM_SHARED)
@@ -1360,8 +1362,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1360 if (ret < 0) 1362 if (ret < 0)
1361 goto error_put_region; 1363 goto error_put_region;
1362 1364
1363 add_nommu_region(region);
1364
1365 /* okay... we have a mapping; now we have to register it */ 1365 /* okay... we have a mapping; now we have to register it */
1366 result = vma->vm_start; 1366 result = vma->vm_start;
1367 1367
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 175a67a78a99..a7b2460e922b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 int oom_adj;
62 61
63 task_lock(p); 62 task_lock(p);
64 mm = p->mm; 63 mm = p->mm;
@@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
66 task_unlock(p); 65 task_unlock(p);
67 return 0; 66 return 0;
68 } 67 }
69 oom_adj = mm->oom_adj;
70 if (oom_adj == OOM_DISABLE) {
71 task_unlock(p);
72 return 0;
73 }
74 68
75 /* 69 /*
76 * The memory size of the process is the basis for the badness. 70 * The memory size of the process is the basis for the badness.
@@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
154 points /= 8; 148 points /= 8;
155 149
156 /* 150 /*
157 * Adjust the score by oom_adj. 151 * Adjust the score by oomkilladj.
158 */ 152 */
159 if (oom_adj) { 153 if (p->oomkilladj) {
160 if (oom_adj > 0) { 154 if (p->oomkilladj > 0) {
161 if (!points) 155 if (!points)
162 points = 1; 156 points = 1;
163 points <<= oom_adj; 157 points <<= p->oomkilladj;
164 } else 158 } else
165 points >>= -(oom_adj); 159 points >>= -(p->oomkilladj);
166 } 160 }
167 161
168#ifdef DEBUG 162#ifdef DEBUG
@@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
257 *ppoints = ULONG_MAX; 251 *ppoints = ULONG_MAX;
258 } 252 }
259 253
254 if (p->oomkilladj == OOM_DISABLE)
255 continue;
256
260 points = badness(p, uptime.tv_sec); 257 points = badness(p, uptime.tv_sec);
261 if (points > *ppoints) { 258 if (points > *ppoints || !chosen) {
262 chosen = p; 259 chosen = p;
263 *ppoints = points; 260 *ppoints = points;
264 } 261 }
@@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
307 } 304 }
308 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
309 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
310 get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); 307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
308 p->comm);
311 task_unlock(p); 309 task_unlock(p);
312 } while_each_thread(g, p); 310 } while_each_thread(g, p);
313} 311}
@@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
325 return; 323 return;
326 } 324 }
327 325
328 if (!p->mm) 326 if (!p->mm) {
327 WARN_ON(1);
328 printk(KERN_WARNING "tried to kill an mm-less task!\n");
329 return; 329 return;
330 }
330 331
331 if (verbose) 332 if (verbose)
332 printk(KERN_ERR "Killed process %d (%s)\n", 333 printk(KERN_ERR "Killed process %d (%s)\n",
@@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p)
348 struct mm_struct *mm; 349 struct mm_struct *mm;
349 struct task_struct *g, *q; 350 struct task_struct *g, *q;
350 351
351 task_lock(p);
352 mm = p->mm; 352 mm = p->mm;
353 if (!mm || mm->oom_adj == OOM_DISABLE) { 353
354 task_unlock(p); 354 /* WARNING: mm may not be dereferenced since we did not obtain its
355 * value from get_task_mm(p). This is OK since all we need to do is
356 * compare mm to q->mm below.
357 *
358 * Furthermore, even if mm contains a non-NULL value, p->mm may
359 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us.
361 */
362
363 if (mm == NULL)
355 return 1; 364 return 1;
356 } 365
357 task_unlock(p); 366 /*
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
358 __oom_kill_task(p, 1); 374 __oom_kill_task(p, 1);
359 375
360 /* 376 /*
@@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
377 struct task_struct *c; 393 struct task_struct *c;
378 394
379 if (printk_ratelimit()) { 395 if (printk_ratelimit()) {
380 task_lock(current);
381 printk(KERN_WARNING "%s invoked oom-killer: " 396 printk(KERN_WARNING "%s invoked oom-killer: "
382 "gfp_mask=0x%x, order=%d, oom_adj=%d\n", 397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
383 current->comm, gfp_mask, order, 398 current->comm, gfp_mask, order, current->oomkilladj);
384 current->mm ? current->mm->oom_adj : OOM_DISABLE); 399 task_lock(current);
385 cpuset_print_task_mems_allowed(current); 400 cpuset_print_task_mems_allowed(current);
386 task_unlock(current); 401 task_unlock(current);
387 dump_stack(); 402 dump_stack();
@@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
394 /* 409 /*
395 * If the task is already exiting, don't alarm the sysadmin or kill 410 * If the task is already exiting, don't alarm the sysadmin or kill
396 * its children or threads, just set TIF_MEMDIE so it can die quickly 411 * its children or threads, just set TIF_MEMDIE so it can die quickly
397 * if its mm is still attached.
398 */ 412 */
399 if (p->mm && (p->flags & PF_EXITING)) { 413 if (p->flags & PF_EXITING) {
400 __oom_kill_task(p, 0); 414 __oom_kill_task(p, 0);
401 return 0; 415 return 0;
402 } 416 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 997186c0b519..dd73d29c15a8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37 37
38/* 38/*
39 * The maximum number of pages to writeout in a single bdflush/kupdate
40 * operation. We do this so we don't hold I_SYNC against an inode for
41 * enormous amounts of time, which would block a userspace task which has
42 * been forced to throttle against that inode. Also, the code reevaluates
43 * the dirty each time it has written this many pages.
44 */
45#define MAX_WRITEBACK_PAGES 1024
46
47/*
48 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
49 * will look to see if it needs to force writeback or throttling. 40 * will look to see if it needs to force writeback or throttling.
50 */ 41 */
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
117/* End of sysctl-exported parameters */ 108/* End of sysctl-exported parameters */
118 109
119 110
120static void background_writeout(unsigned long _min_pages);
121
122/* 111/*
123 * Scale the writeback cache size proportional to the relative writeout speeds. 112 * Scale the writeback cache size proportional to the relative writeout speeds.
124 * 113 *
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
320/* 309/*
321 * 310 *
322 */ 311 */
323static DEFINE_SPINLOCK(bdi_lock);
324static unsigned int bdi_min_ratio; 312static unsigned int bdi_min_ratio;
325 313
326int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 314int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
327{ 315{
328 int ret = 0; 316 int ret = 0;
329 unsigned long flags;
330 317
331 spin_lock_irqsave(&bdi_lock, flags); 318 spin_lock(&bdi_lock);
332 if (min_ratio > bdi->max_ratio) { 319 if (min_ratio > bdi->max_ratio) {
333 ret = -EINVAL; 320 ret = -EINVAL;
334 } else { 321 } else {
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
340 ret = -EINVAL; 327 ret = -EINVAL;
341 } 328 }
342 } 329 }
343 spin_unlock_irqrestore(&bdi_lock, flags); 330 spin_unlock(&bdi_lock);
344 331
345 return ret; 332 return ret;
346} 333}
347 334
348int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 335int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
349{ 336{
350 unsigned long flags;
351 int ret = 0; 337 int ret = 0;
352 338
353 if (max_ratio > 100) 339 if (max_ratio > 100)
354 return -EINVAL; 340 return -EINVAL;
355 341
356 spin_lock_irqsave(&bdi_lock, flags); 342 spin_lock(&bdi_lock);
357 if (bdi->min_ratio > max_ratio) { 343 if (bdi->min_ratio > max_ratio) {
358 ret = -EINVAL; 344 ret = -EINVAL;
359 } else { 345 } else {
360 bdi->max_ratio = max_ratio; 346 bdi->max_ratio = max_ratio;
361 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 347 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
362 } 348 }
363 spin_unlock_irqrestore(&bdi_lock, flags); 349 spin_unlock(&bdi_lock);
364 350
365 return ret; 351 return ret;
366} 352}
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
546 * up. 532 * up.
547 */ 533 */
548 if (bdi_nr_reclaimable > bdi_thresh) { 534 if (bdi_nr_reclaimable > bdi_thresh) {
549 writeback_inodes(&wbc); 535 writeback_inodes_wbc(&wbc);
550 pages_written += write_chunk - wbc.nr_to_write; 536 pages_written += write_chunk - wbc.nr_to_write;
551 get_dirty_limits(&background_thresh, &dirty_thresh, 537 get_dirty_limits(&background_thresh, &dirty_thresh,
552 &bdi_thresh, bdi); 538 &bdi_thresh, bdi);
@@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
575 if (pages_written >= write_chunk) 561 if (pages_written >= write_chunk)
576 break; /* We've done our duty */ 562 break; /* We've done our duty */
577 563
578 congestion_wait(BLK_RW_ASYNC, HZ/10); 564 schedule_timeout(1);
579 } 565 }
580 566
581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 567 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
594 * background_thresh, to keep the amount of dirty memory low. 580 * background_thresh, to keep the amount of dirty memory low.
595 */ 581 */
596 if ((laptop_mode && pages_written) || 582 if ((laptop_mode && pages_written) ||
597 (!laptop_mode && (global_page_state(NR_FILE_DIRTY) 583 (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS) 584 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 585 > background_thresh))) {
600 pdflush_operation(background_writeout, 0); 586 struct writeback_control wbc = {
587 .bdi = bdi,
588 .sync_mode = WB_SYNC_NONE,
589 .nr_to_write = nr_writeback,
590 };
591
592
593 bdi_start_writeback(&wbc);
594 }
601} 595}
602 596
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 597void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -682,153 +676,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
682 } 676 }
683} 677}
684 678
685/*
686 * writeback at least _min_pages, and keep writing until the amount of dirty
687 * memory is less than the background threshold, or until we're all clean.
688 */
689static void background_writeout(unsigned long _min_pages)
690{
691 long min_pages = _min_pages;
692 struct writeback_control wbc = {
693 .bdi = NULL,
694 .sync_mode = WB_SYNC_NONE,
695 .older_than_this = NULL,
696 .nr_to_write = 0,
697 .nonblocking = 1,
698 .range_cyclic = 1,
699 };
700
701 for ( ; ; ) {
702 unsigned long background_thresh;
703 unsigned long dirty_thresh;
704
705 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
706 if (global_page_state(NR_FILE_DIRTY) +
707 global_page_state(NR_UNSTABLE_NFS) < background_thresh
708 && min_pages <= 0)
709 break;
710 wbc.more_io = 0;
711 wbc.encountered_congestion = 0;
712 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
713 wbc.pages_skipped = 0;
714 writeback_inodes(&wbc);
715 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
716 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
717 /* Wrote less than expected */
718 if (wbc.encountered_congestion || wbc.more_io)
719 congestion_wait(BLK_RW_ASYNC, HZ/10);
720 else
721 break;
722 }
723 }
724}
725
726/*
727 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
728 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
729 * -1 if all pdflush threads were busy.
730 */
731int wakeup_pdflush(long nr_pages)
732{
733 if (nr_pages == 0)
734 nr_pages = global_page_state(NR_FILE_DIRTY) +
735 global_page_state(NR_UNSTABLE_NFS);
736 return pdflush_operation(background_writeout, nr_pages);
737}
738
739static void wb_timer_fn(unsigned long unused);
740static void laptop_timer_fn(unsigned long unused); 679static void laptop_timer_fn(unsigned long unused);
741 680
742static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
743static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); 681static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
744 682
745/* 683/*
746 * Periodic writeback of "old" data.
747 *
748 * Define "old": the first time one of an inode's pages is dirtied, we mark the
749 * dirtying-time in the inode's address_space. So this periodic writeback code
750 * just walks the superblock inode list, writing back any inodes which are
751 * older than a specific point in time.
752 *
753 * Try to run once per dirty_writeback_interval. But if a writeback event
754 * takes longer than a dirty_writeback_interval interval, then leave a
755 * one-second gap.
756 *
757 * older_than_this takes precedence over nr_to_write. So we'll only write back
758 * all dirty pages if they are all attached to "old" mappings.
759 */
760static void wb_kupdate(unsigned long arg)
761{
762 unsigned long oldest_jif;
763 unsigned long start_jif;
764 unsigned long next_jif;
765 long nr_to_write;
766 struct writeback_control wbc = {
767 .bdi = NULL,
768 .sync_mode = WB_SYNC_NONE,
769 .older_than_this = &oldest_jif,
770 .nr_to_write = 0,
771 .nonblocking = 1,
772 .for_kupdate = 1,
773 .range_cyclic = 1,
774 };
775
776 sync_supers();
777
778 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
779 start_jif = jiffies;
780 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
781 nr_to_write = global_page_state(NR_FILE_DIRTY) +
782 global_page_state(NR_UNSTABLE_NFS) +
783 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
784 while (nr_to_write > 0) {
785 wbc.more_io = 0;
786 wbc.encountered_congestion = 0;
787 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
788 writeback_inodes(&wbc);
789 if (wbc.nr_to_write > 0) {
790 if (wbc.encountered_congestion || wbc.more_io)
791 congestion_wait(BLK_RW_ASYNC, HZ/10);
792 else
793 break; /* All the old data is written */
794 }
795 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
796 }
797 if (time_before(next_jif, jiffies + HZ))
798 next_jif = jiffies + HZ;
799 if (dirty_writeback_interval)
800 mod_timer(&wb_timer, next_jif);
801}
802
803/*
804 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 684 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
805 */ 685 */
806int dirty_writeback_centisecs_handler(ctl_table *table, int write, 686int dirty_writeback_centisecs_handler(ctl_table *table, int write,
807 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 687 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
808{ 688{
809 proc_dointvec(table, write, file, buffer, length, ppos); 689 proc_dointvec(table, write, file, buffer, length, ppos);
810 if (dirty_writeback_interval)
811 mod_timer(&wb_timer, jiffies +
812 msecs_to_jiffies(dirty_writeback_interval * 10));
813 else
814 del_timer(&wb_timer);
815 return 0; 690 return 0;
816} 691}
817 692
818static void wb_timer_fn(unsigned long unused) 693static void do_laptop_sync(struct work_struct *work)
819{
820 if (pdflush_operation(wb_kupdate, 0) < 0)
821 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
822}
823
824static void laptop_flush(unsigned long unused)
825{ 694{
826 sys_sync(); 695 wakeup_flusher_threads(0);
696 kfree(work);
827} 697}
828 698
829static void laptop_timer_fn(unsigned long unused) 699static void laptop_timer_fn(unsigned long unused)
830{ 700{
831 pdflush_operation(laptop_flush, 0); 701 struct work_struct *work;
702
703 work = kmalloc(sizeof(*work), GFP_ATOMIC);
704 if (work) {
705 INIT_WORK(work, do_laptop_sync);
706 schedule_work(work);
707 }
832} 708}
833 709
834/* 710/*
@@ -911,8 +787,6 @@ void __init page_writeback_init(void)
911{ 787{
912 int shift; 788 int shift;
913 789
914 mod_timer(&wb_timer,
915 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
916 writeback_set_ratelimit(); 790 writeback_set_ratelimit();
917 register_cpu_notifier(&ratelimit_nb); 791 register_cpu_notifier(&ratelimit_nb);
918 792
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d052abbe3063..a0de15f46987 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -817,13 +817,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
817 * agressive about taking ownership of free pages 817 * agressive about taking ownership of free pages
818 */ 818 */
819 if (unlikely(current_order >= (pageblock_order >> 1)) || 819 if (unlikely(current_order >= (pageblock_order >> 1)) ||
820 start_migratetype == MIGRATE_RECLAIMABLE) { 820 start_migratetype == MIGRATE_RECLAIMABLE ||
821 page_group_by_mobility_disabled) {
821 unsigned long pages; 822 unsigned long pages;
822 pages = move_freepages_block(zone, page, 823 pages = move_freepages_block(zone, page,
823 start_migratetype); 824 start_migratetype);
824 825
825 /* Claim the whole block if over half of it is free */ 826 /* Claim the whole block if over half of it is free */
826 if (pages >= (1 << (pageblock_order-1))) 827 if (pages >= (1 << (pageblock_order-1)) ||
828 page_group_by_mobility_disabled)
827 set_pageblock_migratetype(page, 829 set_pageblock_migratetype(page,
828 start_migratetype); 830 start_migratetype);
829 831
@@ -2544,7 +2546,6 @@ static void build_zonelists(pg_data_t *pgdat)
2544 prev_node = local_node; 2546 prev_node = local_node;
2545 nodes_clear(used_mask); 2547 nodes_clear(used_mask);
2546 2548
2547 memset(node_load, 0, sizeof(node_load));
2548 memset(node_order, 0, sizeof(node_order)); 2549 memset(node_order, 0, sizeof(node_order));
2549 j = 0; 2550 j = 0;
2550 2551
@@ -2653,6 +2654,9 @@ static int __build_all_zonelists(void *dummy)
2653{ 2654{
2654 int nid; 2655 int nid;
2655 2656
2657#ifdef CONFIG_NUMA
2658 memset(node_load, 0, sizeof(node_load));
2659#endif
2656 for_each_online_node(nid) { 2660 for_each_online_node(nid) {
2657 pg_data_t *pgdat = NODE_DATA(nid); 2661 pg_data_t *pgdat = NODE_DATA(nid);
2658 2662
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * mm/pdflush.c - worker threads for writing back filesystem data
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * 09Apr2002 Andrew Morton
7 * Initial version
8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread.
11 */
12
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/signal.h>
16#include <linux/spinlock.h>
17#include <linux/gfp.h>
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/fs.h> /* Needed by writeback.h */
21#include <linux/writeback.h> /* Prototypes pdflush_operation() */
22#include <linux/kthread.h>
23#include <linux/cpuset.h>
24#include <linux/freezer.h>
25
26
27/*
28 * Minimum and maximum number of pdflush instances
29 */
30#define MIN_PDFLUSH_THREADS 2
31#define MAX_PDFLUSH_THREADS 8
32
33static void start_one_pdflush_thread(void);
34
35
36/*
37 * The pdflush threads are worker threads for writing back dirty data.
38 * Ideally, we'd like one thread per active disk spindle. But the disk
39 * topology is very hard to divine at this level. Instead, we take
40 * care in various places to prevent more than one pdflush thread from
41 * performing writeback against a single filesystem. pdflush threads
42 * have the PF_FLUSHER flag set in current->flags to aid in this.
43 */
44
45/*
46 * All the pdflush threads. Protected by pdflush_lock
47 */
48static LIST_HEAD(pdflush_list);
49static DEFINE_SPINLOCK(pdflush_lock);
50
51/*
52 * The count of currently-running pdflush threads. Protected
53 * by pdflush_lock.
54 *
55 * Readable by sysctl, but not writable. Published to userspace at
56 * /proc/sys/vm/nr_pdflush_threads.
57 */
58int nr_pdflush_threads = 0;
59
60/*
61 * The time at which the pdflush thread pool last went empty
62 */
63static unsigned long last_empty_jifs;
64
65/*
66 * The pdflush thread.
67 *
68 * Thread pool management algorithm:
69 *
70 * - The minimum and maximum number of pdflush instances are bound
71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
72 *
73 * - If there have been no idle pdflush instances for 1 second, create
74 * a new one.
75 *
76 * - If the least-recently-went-to-sleep pdflush thread has been asleep
77 * for more than one second, terminate a thread.
78 */
79
80/*
81 * A structure for passing work to a pdflush thread. Also for passing
82 * state information between pdflush threads. Protected by pdflush_lock.
83 */
84struct pdflush_work {
85 struct task_struct *who; /* The thread */
86 void (*fn)(unsigned long); /* A callback function */
87 unsigned long arg0; /* An argument to the callback */
88 struct list_head list; /* On pdflush_list, when idle */
89 unsigned long when_i_went_to_sleep;
90};
91
92static int __pdflush(struct pdflush_work *my_work)
93{
94 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
95 set_freezable();
96 my_work->fn = NULL;
97 my_work->who = current;
98 INIT_LIST_HEAD(&my_work->list);
99
100 spin_lock_irq(&pdflush_lock);
101 for ( ; ; ) {
102 struct pdflush_work *pdf;
103
104 set_current_state(TASK_INTERRUPTIBLE);
105 list_move(&my_work->list, &pdflush_list);
106 my_work->when_i_went_to_sleep = jiffies;
107 spin_unlock_irq(&pdflush_lock);
108 schedule();
109 try_to_freeze();
110 spin_lock_irq(&pdflush_lock);
111 if (!list_empty(&my_work->list)) {
112 /*
113 * Someone woke us up, but without removing our control
114 * structure from the global list. swsusp will do this
115 * in try_to_freeze()->refrigerator(). Handle it.
116 */
117 my_work->fn = NULL;
118 continue;
119 }
120 if (my_work->fn == NULL) {
121 printk("pdflush: bogus wakeup\n");
122 continue;
123 }
124 spin_unlock_irq(&pdflush_lock);
125
126 (*my_work->fn)(my_work->arg0);
127
128 spin_lock_irq(&pdflush_lock);
129
130 /*
131 * Thread creation: For how long have there been zero
132 * available threads?
133 *
134 * To throttle creation, we reset last_empty_jifs.
135 */
136 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
137 if (list_empty(&pdflush_list)) {
138 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
139 last_empty_jifs = jiffies;
140 nr_pdflush_threads++;
141 spin_unlock_irq(&pdflush_lock);
142 start_one_pdflush_thread();
143 spin_lock_irq(&pdflush_lock);
144 }
145 }
146 }
147
148 my_work->fn = NULL;
149
150 /*
151 * Thread destruction: For how long has the sleepiest
152 * thread slept?
153 */
154 if (list_empty(&pdflush_list))
155 continue;
156 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
157 continue;
158 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
159 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
160 /* Limit exit rate */
161 pdf->when_i_went_to_sleep = jiffies;
162 break; /* exeunt */
163 }
164 }
165 nr_pdflush_threads--;
166 spin_unlock_irq(&pdflush_lock);
167 return 0;
168}
169
170/*
171 * Of course, my_work wants to be just a local in __pdflush(). It is
172 * separated out in this manner to hopefully prevent the compiler from
173 * performing unfortunate optimisations against the auto variables. Because
174 * these are visible to other tasks and CPUs. (No problem has actually
175 * been observed. This is just paranoia).
176 */
177static int pdflush(void *dummy)
178{
179 struct pdflush_work my_work;
180 cpumask_var_t cpus_allowed;
181
182 /*
183 * Since the caller doesn't even check kthread_run() worked, let's not
184 * freak out too much if this fails.
185 */
186 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
187 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
188 return 0;
189 }
190
191 /*
192 * pdflush can spend a lot of time doing encryption via dm-crypt. We
193 * don't want to do that at keventd's priority.
194 */
195 set_user_nice(current, 0);
196
197 /*
198 * Some configs put our parent kthread in a limited cpuset,
199 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
200 * Our needs are more modest - cut back to our cpusets cpus_allowed.
201 * This is needed as pdflush's are dynamically created and destroyed.
202 * The boottime pdflush's are easily placed w/o these 2 lines.
203 */
204 cpuset_cpus_allowed(current, cpus_allowed);
205 set_cpus_allowed_ptr(current, cpus_allowed);
206 free_cpumask_var(cpus_allowed);
207
208 return __pdflush(&my_work);
209}
210
211/*
212 * Attempt to wake up a pdflush thread, and get it to do some work for you.
213 * Returns zero if it indeed managed to find a worker thread, and passed your
214 * payload to it.
215 */
216int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
217{
218 unsigned long flags;
219 int ret = 0;
220
221 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
222
223 spin_lock_irqsave(&pdflush_lock, flags);
224 if (list_empty(&pdflush_list)) {
225 ret = -1;
226 } else {
227 struct pdflush_work *pdf;
228
229 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
230 list_del_init(&pdf->list);
231 if (list_empty(&pdflush_list))
232 last_empty_jifs = jiffies;
233 pdf->fn = fn;
234 pdf->arg0 = arg0;
235 wake_up_process(pdf->who);
236 }
237 spin_unlock_irqrestore(&pdflush_lock, flags);
238
239 return ret;
240}
241
242static void start_one_pdflush_thread(void)
243{
244 struct task_struct *k;
245
246 k = kthread_run(pdflush, NULL, "pdflush");
247 if (unlikely(IS_ERR(k))) {
248 spin_lock_irq(&pdflush_lock);
249 nr_pdflush_threads--;
250 spin_unlock_irq(&pdflush_lock);
251 }
252}
253
254static int __init pdflush_init(void)
255{
256 int i;
257
258 /*
259 * Pre-set nr_pdflush_threads... If we fail to create,
260 * the count will be decremented.
261 */
262 nr_pdflush_threads = MIN_PDFLUSH_THREADS;
263
264 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
265 start_one_pdflush_thread();
266 return 0;
267}
268
269module_init(pdflush_init);
diff --git a/mm/rmap.c b/mm/rmap.c
index 836c6c63e1f2..0895b5c7cbff 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -358,6 +358,7 @@ static int page_referenced_one(struct page *page,
358 */ 358 */
359 if (vma->vm_flags & VM_LOCKED) { 359 if (vma->vm_flags & VM_LOCKED) {
360 *mapcount = 1; /* break early from loop */ 360 *mapcount = 1; /* break early from loop */
361 *vm_flags |= VM_LOCKED;
361 goto out_unmap; 362 goto out_unmap;
362 } 363 }
363 364
diff --git a/mm/shmem.c b/mm/shmem.c
index d713239ce2ce..5a0b3d4055f3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2446,7 +2446,7 @@ static const struct inode_operations shmem_inode_operations = {
2446 .getxattr = generic_getxattr, 2446 .getxattr = generic_getxattr,
2447 .listxattr = generic_listxattr, 2447 .listxattr = generic_listxattr,
2448 .removexattr = generic_removexattr, 2448 .removexattr = generic_removexattr,
2449 .permission = shmem_permission, 2449 .check_acl = shmem_check_acl,
2450#endif 2450#endif
2451 2451
2452}; 2452};
@@ -2469,7 +2469,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2469 .getxattr = generic_getxattr, 2469 .getxattr = generic_getxattr,
2470 .listxattr = generic_listxattr, 2470 .listxattr = generic_listxattr,
2471 .removexattr = generic_removexattr, 2471 .removexattr = generic_removexattr,
2472 .permission = shmem_permission, 2472 .check_acl = shmem_check_acl,
2473#endif 2473#endif
2474}; 2474};
2475 2475
@@ -2480,7 +2480,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2480 .getxattr = generic_getxattr, 2480 .getxattr = generic_getxattr,
2481 .listxattr = generic_listxattr, 2481 .listxattr = generic_listxattr,
2482 .removexattr = generic_removexattr, 2482 .removexattr = generic_removexattr,
2483 .permission = shmem_permission, 2483 .check_acl = shmem_check_acl,
2484#endif 2484#endif
2485}; 2485};
2486 2486
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index 606a8e757a42..df2c87fdae50 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir)
157/** 157/**
158 * shmem_check_acl - check_acl() callback for generic_permission() 158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */ 159 */
160static int 160int
161shmem_check_acl(struct inode *inode, int mask) 161shmem_check_acl(struct inode *inode, int mask)
162{ 162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); 163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
@@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask)
169 } 169 }
170 return -EAGAIN; 170 return -EAGAIN;
171} 171}
172
173/**
174 * shmem_permission - permission() inode operation
175 */
176int
177shmem_permission(struct inode *inode, int mask)
178{
179 return generic_permission(inode, mask, shmem_check_acl);
180}
diff --git a/mm/slob.c b/mm/slob.c
index 9641da3d5e58..837ebd64cc34 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -692,3 +692,8 @@ void __init kmem_cache_init(void)
692{ 692{
693 slob_ready = 1; 693 slob_ready = 1;
694} 694}
695
696void __init kmem_cache_init_late(void)
697{
698 /* Nothing to do */
699}
diff --git a/mm/slub.c b/mm/slub.c
index dc9765bb49dc..a5789b91d179 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -141,6 +141,13 @@
141 SLAB_POISON | SLAB_STORE_USER) 141 SLAB_POISON | SLAB_STORE_USER)
142 142
143/* 143/*
144 * Debugging flags that require metadata to be stored in the slab. These get
145 * disabled when slub_debug=O is used and a cache's min order increases with
146 * metadata.
147 */
148#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
149
150/*
144 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
145 */ 152 */
146#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
@@ -325,6 +332,7 @@ static int slub_debug;
325#endif 332#endif
326 333
327static char *slub_debug_slabs; 334static char *slub_debug_slabs;
335static int disable_higher_order_debug;
328 336
329/* 337/*
330 * Object debugging 338 * Object debugging
@@ -646,7 +654,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
646 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 654 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
647 print_section("Padding", end - remainder, remainder); 655 print_section("Padding", end - remainder, remainder);
648 656
649 restore_bytes(s, "slab padding", POISON_INUSE, start, end); 657 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
650 return 0; 658 return 0;
651} 659}
652 660
@@ -976,6 +984,15 @@ static int __init setup_slub_debug(char *str)
976 */ 984 */
977 goto check_slabs; 985 goto check_slabs;
978 986
987 if (tolower(*str) == 'o') {
988 /*
989 * Avoid enabling debugging on caches if its minimum order
990 * would increase as a result.
991 */
992 disable_higher_order_debug = 1;
993 goto out;
994 }
995
979 slub_debug = 0; 996 slub_debug = 0;
980 if (*str == '-') 997 if (*str == '-')
981 /* 998 /*
@@ -1026,8 +1043,8 @@ static unsigned long kmem_cache_flags(unsigned long objsize,
1026 * Enable debugging if selected on the kernel commandline. 1043 * Enable debugging if selected on the kernel commandline.
1027 */ 1044 */
1028 if (slub_debug && (!slub_debug_slabs || 1045 if (slub_debug && (!slub_debug_slabs ||
1029 strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) 1046 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1030 flags |= slub_debug; 1047 flags |= slub_debug;
1031 1048
1032 return flags; 1049 return flags;
1033} 1050}
@@ -1109,8 +1126,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1109 } 1126 }
1110 1127
1111 if (kmemcheck_enabled 1128 if (kmemcheck_enabled
1112 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) 1129 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1113 {
1114 int pages = 1 << oo_order(oo); 1130 int pages = 1 << oo_order(oo);
1115 1131
1116 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1132 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
@@ -1560,6 +1576,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1560 "default order: %d, min order: %d\n", s->name, s->objsize, 1576 "default order: %d, min order: %d\n", s->name, s->objsize,
1561 s->size, oo_order(s->oo), oo_order(s->min)); 1577 s->size, oo_order(s->oo), oo_order(s->min));
1562 1578
1579 if (oo_order(s->min) > get_order(s->objsize))
1580 printk(KERN_WARNING " %s debugging increased min order, use "
1581 "slub_debug=O to disable.\n", s->name);
1582
1563 for_each_online_node(node) { 1583 for_each_online_node(node) {
1564 struct kmem_cache_node *n = get_node(s, node); 1584 struct kmem_cache_node *n = get_node(s, node);
1565 unsigned long nr_slabs; 1585 unsigned long nr_slabs;
@@ -2001,7 +2021,7 @@ static inline int calculate_order(int size)
2001 return order; 2021 return order;
2002 fraction /= 2; 2022 fraction /= 2;
2003 } 2023 }
2004 min_objects --; 2024 min_objects--;
2005 } 2025 }
2006 2026
2007 /* 2027 /*
@@ -2400,6 +2420,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2400 * on bootup. 2420 * on bootup.
2401 */ 2421 */
2402 align = calculate_alignment(flags, align, s->objsize); 2422 align = calculate_alignment(flags, align, s->objsize);
2423 s->align = align;
2403 2424
2404 /* 2425 /*
2405 * SLUB stores one object immediately after another beginning from 2426 * SLUB stores one object immediately after another beginning from
@@ -2452,6 +2473,18 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2452 2473
2453 if (!calculate_sizes(s, -1)) 2474 if (!calculate_sizes(s, -1))
2454 goto error; 2475 goto error;
2476 if (disable_higher_order_debug) {
2477 /*
2478 * Disable debugging flags that store metadata if the min slab
2479 * order increased.
2480 */
2481 if (get_order(s->size) > get_order(s->objsize)) {
2482 s->flags &= ~DEBUG_METADATA_FLAGS;
2483 s->offset = 0;
2484 if (!calculate_sizes(s, -1))
2485 goto error;
2486 }
2487 }
2455 2488
2456 /* 2489 /*
2457 * The larger the object size is, the more pages we want on the partial 2490 * The larger the object size is, the more pages we want on the partial
@@ -2594,8 +2627,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2594 */ 2627 */
2595void kmem_cache_destroy(struct kmem_cache *s) 2628void kmem_cache_destroy(struct kmem_cache *s)
2596{ 2629{
2597 if (s->flags & SLAB_DESTROY_BY_RCU)
2598 rcu_barrier();
2599 down_write(&slub_lock); 2630 down_write(&slub_lock);
2600 s->refcount--; 2631 s->refcount--;
2601 if (!s->refcount) { 2632 if (!s->refcount) {
@@ -2606,6 +2637,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2606 "still has objects.\n", s->name, __func__); 2637 "still has objects.\n", s->name, __func__);
2607 dump_stack(); 2638 dump_stack();
2608 } 2639 }
2640 if (s->flags & SLAB_DESTROY_BY_RCU)
2641 rcu_barrier();
2609 sysfs_slab_remove(s); 2642 sysfs_slab_remove(s);
2610 } else 2643 } else
2611 up_write(&slub_lock); 2644 up_write(&slub_lock);
@@ -2790,6 +2823,11 @@ static s8 size_index[24] = {
2790 2 /* 192 */ 2823 2 /* 192 */
2791}; 2824};
2792 2825
2826static inline int size_index_elem(size_t bytes)
2827{
2828 return (bytes - 1) / 8;
2829}
2830
2793static struct kmem_cache *get_slab(size_t size, gfp_t flags) 2831static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2794{ 2832{
2795 int index; 2833 int index;
@@ -2798,7 +2836,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2798 if (!size) 2836 if (!size)
2799 return ZERO_SIZE_PTR; 2837 return ZERO_SIZE_PTR;
2800 2838
2801 index = size_index[(size - 1) / 8]; 2839 index = size_index[size_index_elem(size)];
2802 } else 2840 } else
2803 index = fls(size - 1); 2841 index = fls(size - 1);
2804 2842
@@ -3156,10 +3194,12 @@ void __init kmem_cache_init(void)
3156 slab_state = PARTIAL; 3194 slab_state = PARTIAL;
3157 3195
3158 /* Caches that are not of the two-to-the-power-of size */ 3196 /* Caches that are not of the two-to-the-power-of size */
3159 if (KMALLOC_MIN_SIZE <= 64) { 3197 if (KMALLOC_MIN_SIZE <= 32) {
3160 create_kmalloc_cache(&kmalloc_caches[1], 3198 create_kmalloc_cache(&kmalloc_caches[1],
3161 "kmalloc-96", 96, GFP_NOWAIT); 3199 "kmalloc-96", 96, GFP_NOWAIT);
3162 caches++; 3200 caches++;
3201 }
3202 if (KMALLOC_MIN_SIZE <= 64) {
3163 create_kmalloc_cache(&kmalloc_caches[2], 3203 create_kmalloc_cache(&kmalloc_caches[2],
3164 "kmalloc-192", 192, GFP_NOWAIT); 3204 "kmalloc-192", 192, GFP_NOWAIT);
3165 caches++; 3205 caches++;
@@ -3186,17 +3226,28 @@ void __init kmem_cache_init(void)
3186 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3226 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
3187 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3227 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
3188 3228
3189 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) 3229 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
3190 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; 3230 int elem = size_index_elem(i);
3231 if (elem >= ARRAY_SIZE(size_index))
3232 break;
3233 size_index[elem] = KMALLOC_SHIFT_LOW;
3234 }
3191 3235
3192 if (KMALLOC_MIN_SIZE == 128) { 3236 if (KMALLOC_MIN_SIZE == 64) {
3237 /*
3238 * The 96 byte size cache is not used if the alignment
3239 * is 64 byte.
3240 */
3241 for (i = 64 + 8; i <= 96; i += 8)
3242 size_index[size_index_elem(i)] = 7;
3243 } else if (KMALLOC_MIN_SIZE == 128) {
3193 /* 3244 /*
3194 * The 192 byte sized cache is not used if the alignment 3245 * The 192 byte sized cache is not used if the alignment
3195 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3246 * is 128 byte. Redirect kmalloc to use the 256 byte cache
3196 * instead. 3247 * instead.
3197 */ 3248 */
3198 for (i = 128 + 8; i <= 192; i += 8) 3249 for (i = 128 + 8; i <= 192; i += 8)
3199 size_index[(i - 1) / 8] = 8; 3250 size_index[size_index_elem(i)] = 8;
3200 } 3251 }
3201 3252
3202 slab_state = UP; 3253 slab_state = UP;
@@ -4543,8 +4594,11 @@ static int sysfs_slab_add(struct kmem_cache *s)
4543 } 4594 }
4544 4595
4545 err = sysfs_create_group(&s->kobj, &slab_attr_group); 4596 err = sysfs_create_group(&s->kobj, &slab_attr_group);
4546 if (err) 4597 if (err) {
4598 kobject_del(&s->kobj);
4599 kobject_put(&s->kobj);
4547 return err; 4600 return err;
4601 }
4548 kobject_uevent(&s->kobj, KOBJ_ADD); 4602 kobject_uevent(&s->kobj, KOBJ_ADD);
4549 if (!unmergeable) { 4603 if (!unmergeable) {
4550 /* Setup first alias */ 4604 /* Setup first alias */
@@ -4726,7 +4780,7 @@ static const struct file_operations proc_slabinfo_operations = {
4726 4780
4727static int __init slab_proc_init(void) 4781static int __init slab_proc_init(void)
4728{ 4782{
4729 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4783 proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
4730 return 0; 4784 return 0;
4731} 4785}
4732module_init(slab_proc_init); 4786module_init(slab_proc_init);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..5ae6b8b78c80 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
34}; 34};
35 35
36static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
37 .name = "swap",
37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 38 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
38 .unplug_io_fn = swap_unplug_io_fn, 39 .unplug_io_fn = swap_unplug_io_fn,
39}; 40};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ffdc0d23c53..74f1102e8749 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -161,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si)
161 } 161 }
162 162
163 err = blkdev_issue_discard(si->bdev, start_block, 163 err = blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_KERNEL); 164 nr_blocks, GFP_KERNEL,
165 DISCARD_FL_BARRIER);
165 if (err) 166 if (err)
166 break; 167 break;
167 168
@@ -200,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
200 start_block <<= PAGE_SHIFT - 9; 201 start_block <<= PAGE_SHIFT - 9;
201 nr_blocks <<= PAGE_SHIFT - 9; 202 nr_blocks <<= PAGE_SHIFT - 9;
202 if (blkdev_issue_discard(si->bdev, start_block, 203 if (blkdev_issue_discard(si->bdev, start_block,
203 nr_blocks, GFP_NOIO)) 204 nr_blocks, GFP_NOIO,
205 DISCARD_FL_BARRIER))
204 break; 206 break;
205 } 207 }
206 208
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dea7abd31098..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -630,9 +630,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
630 630
631 referenced = page_referenced(page, 1, 631 referenced = page_referenced(page, 1,
632 sc->mem_cgroup, &vm_flags); 632 sc->mem_cgroup, &vm_flags);
633 /* In active use or really unfreeable? Activate it. */ 633 /*
634 * In active use or really unfreeable? Activate it.
635 * If a page which has PG_mlocked set lost the isolation race,
636 * try_to_unmap moves it to the unevictable list
637 */
634 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 638 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
635 referenced && page_mapping_inuse(page)) 639 referenced && page_mapping_inuse(page)
640 && !(vm_flags & VM_LOCKED))
636 goto activate_locked; 641 goto activate_locked;
637 642
638 /* 643 /*
@@ -1715,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1715 */ 1720 */
1716 if (total_scanned > sc->swap_cluster_max + 1721 if (total_scanned > sc->swap_cluster_max +
1717 sc->swap_cluster_max / 2) { 1722 sc->swap_cluster_max / 2) {
1718 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1723 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1719 sc->may_writepage = 1; 1724 sc->may_writepage = 1;
1720 } 1725 }
1721 1726