Diffstat (limited to 'mm')
63 files changed, 5101 insertions, 3349 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 7c5697116fcf..e338407f1225 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP | |||
137 | config ARCH_DISCARD_MEMBLOCK | 137 | config ARCH_DISCARD_MEMBLOCK |
138 | boolean | 138 | boolean |
139 | 139 | ||
140 | config NO_BOOTMEM | ||
141 | boolean | ||
142 | |||
140 | # eventually, we can have this option just 'select SPARSEMEM' | 143 | # eventually, we can have this option just 'select SPARSEMEM' |
141 | config MEMORY_HOTPLUG | 144 | config MEMORY_HOTPLUG |
142 | bool "Allow for memory hot-add" | 145 | bool "Allow for memory hot-add" |
@@ -362,7 +365,7 @@ config CLEANCACHE | |||
362 | for clean pages that the kernel's pageframe replacement algorithm | 365 | for clean pages that the kernel's pageframe replacement algorithm |
363 | (PFRA) would like to keep around, but can't since there isn't enough | 366 | (PFRA) would like to keep around, but can't since there isn't enough |
364 | memory. So when the PFRA "evicts" a page, it first attempts to use | 367 | memory. So when the PFRA "evicts" a page, it first attempts to use |
365 | cleancacne code to put the data contained in that page into | 368 | cleancache code to put the data contained in that page into |
366 | "transcendent memory", memory that is not directly accessible or | 369 | "transcendent memory", memory that is not directly accessible or |
367 | addressable by the kernel and is of unknown and possibly | 370 | addressable by the kernel and is of unknown and possibly |
368 | time-varying size. And when a cleancache-enabled | 371 | time-varying size. And when a cleancache-enabled |
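The help text above describes the cleancache hooks only in prose. As a rough sketch (not part of this commit; the helper names below are invented for illustration), the hooks it refers to are the cleancache_put_page()/cleancache_get_page() wrappers from <linux/cleancache.h>: the VM offers an evicted clean page to transcendent memory, and a later read miss may be satisfied from there before any real I/O is issued.

	#include <linux/cleancache.h>
	#include <linux/mm_types.h>

	/*
	 * Illustrative sketch only.  offer_to_cleancache() and
	 * try_cleancache_first() are hypothetical helpers; the wrappers
	 * they call are assumed to behave as the help text describes,
	 * with cleancache_get_page() returning 0 when the data was found.
	 */
	static void offer_to_cleancache(struct page *page)
	{
		/* page is clean and about to be dropped from the page cache */
		cleancache_put_page(page);
	}

	static int try_cleancache_first(struct page *page)
	{
		if (cleancache_get_page(page) == 0)
			return 0;	/* filled from transcendent memory */
		return -1;		/* caller falls back to a real read */
	}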
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1bf..50ec00ef2a0e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,8 @@ | |||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o \ |
9 | process_vm_access.o | ||
9 | 10 | ||
10 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | 11 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | maccess.o page_alloc.o page-writeback.o \ | 12 | maccess.o page_alloc.o page-writeback.o \ |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09a..71034f41a2ba 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; | |||
45 | static int bdi_sync_supers(void *); | 45 | static int bdi_sync_supers(void *); |
46 | static void sync_supers_timer_fn(unsigned long); | 46 | static void sync_supers_timer_fn(unsigned long); |
47 | 47 | ||
48 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
49 | { | ||
50 | if (wb1 < wb2) { | ||
51 | spin_lock(&wb1->list_lock); | ||
52 | spin_lock_nested(&wb2->list_lock, 1); | ||
53 | } else { | ||
54 | spin_lock(&wb2->list_lock); | ||
55 | spin_lock_nested(&wb1->list_lock, 1); | ||
56 | } | ||
57 | } | ||
58 | |||
48 | #ifdef CONFIG_DEBUG_FS | 59 | #ifdef CONFIG_DEBUG_FS |
49 | #include <linux/debugfs.h> | 60 | #include <linux/debugfs.h> |
50 | #include <linux/seq_file.h> | 61 | #include <linux/seq_file.h> |
@@ -67,34 +78,44 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
67 | struct inode *inode; | 78 | struct inode *inode; |
68 | 79 | ||
69 | nr_dirty = nr_io = nr_more_io = 0; | 80 | nr_dirty = nr_io = nr_more_io = 0; |
70 | spin_lock(&inode_wb_list_lock); | 81 | spin_lock(&wb->list_lock); |
71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 82 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | nr_dirty++; | 83 | nr_dirty++; |
73 | list_for_each_entry(inode, &wb->b_io, i_wb_list) | 84 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
74 | nr_io++; | 85 | nr_io++; |
75 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 86 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
76 | nr_more_io++; | 87 | nr_more_io++; |
77 | spin_unlock(&inode_wb_list_lock); | 88 | spin_unlock(&wb->list_lock); |
78 | 89 | ||
79 | global_dirty_limits(&background_thresh, &dirty_thresh); | 90 | global_dirty_limits(&background_thresh, &dirty_thresh); |
80 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 91 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
81 | 92 | ||
82 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 93 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
83 | seq_printf(m, | 94 | seq_printf(m, |
84 | "BdiWriteback: %8lu kB\n" | 95 | "BdiWriteback: %10lu kB\n" |
85 | "BdiReclaimable: %8lu kB\n" | 96 | "BdiReclaimable: %10lu kB\n" |
86 | "BdiDirtyThresh: %8lu kB\n" | 97 | "BdiDirtyThresh: %10lu kB\n" |
87 | "DirtyThresh: %8lu kB\n" | 98 | "DirtyThresh: %10lu kB\n" |
88 | "BackgroundThresh: %8lu kB\n" | 99 | "BackgroundThresh: %10lu kB\n" |
89 | "b_dirty: %8lu\n" | 100 | "BdiDirtied: %10lu kB\n" |
90 | "b_io: %8lu\n" | 101 | "BdiWritten: %10lu kB\n" |
91 | "b_more_io: %8lu\n" | 102 | "BdiWriteBandwidth: %10lu kBps\n" |
92 | "bdi_list: %8u\n" | 103 | "b_dirty: %10lu\n" |
93 | "state: %8lx\n", | 104 | "b_io: %10lu\n" |
105 | "b_more_io: %10lu\n" | ||
106 | "bdi_list: %10u\n" | ||
107 | "state: %10lx\n", | ||
94 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 108 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
95 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), | 109 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), |
96 | K(bdi_thresh), K(dirty_thresh), | 110 | K(bdi_thresh), |
97 | K(background_thresh), nr_dirty, nr_io, nr_more_io, | 111 | K(dirty_thresh), |
112 | K(background_thresh), | ||
113 | (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), | ||
114 | (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), | ||
115 | (unsigned long) K(bdi->write_bandwidth), | ||
116 | nr_dirty, | ||
117 | nr_io, | ||
118 | nr_more_io, | ||
98 | !list_empty(&bdi->bdi_list), bdi->state); | 119 | !list_empty(&bdi->bdi_list), bdi->state); |
99 | #undef K | 120 | #undef K |
100 | 121 | ||
@@ -249,18 +270,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) | |||
249 | return wb_has_dirty_io(&bdi->wb); | 270 | return wb_has_dirty_io(&bdi->wb); |
250 | } | 271 | } |
251 | 272 | ||
252 | static void bdi_flush_io(struct backing_dev_info *bdi) | ||
253 | { | ||
254 | struct writeback_control wbc = { | ||
255 | .sync_mode = WB_SYNC_NONE, | ||
256 | .older_than_this = NULL, | ||
257 | .range_cyclic = 1, | ||
258 | .nr_to_write = 1024, | ||
259 | }; | ||
260 | |||
261 | writeback_inodes_wb(&bdi->wb, &wbc); | ||
262 | } | ||
263 | |||
264 | /* | 273 | /* |
265 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() | 274 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() |
266 | * or we risk deadlocking on ->s_umount. The longer term solution would be | 275 | * or we risk deadlocking on ->s_umount. The longer term solution would be |
@@ -352,6 +361,17 @@ static unsigned long bdi_longest_inactive(void) | |||
352 | return max(5UL * 60 * HZ, interval); | 361 | return max(5UL * 60 * HZ, interval); |
353 | } | 362 | } |
354 | 363 | ||
364 | /* | ||
365 | * Clear pending bit and wakeup anybody waiting for flusher thread creation or | ||
366 | * shutdown | ||
367 | */ | ||
368 | static void bdi_clear_pending(struct backing_dev_info *bdi) | ||
369 | { | ||
370 | clear_bit(BDI_pending, &bdi->state); | ||
371 | smp_mb__after_clear_bit(); | ||
372 | wake_up_bit(&bdi->state, BDI_pending); | ||
373 | } | ||
374 | |||
355 | static int bdi_forker_thread(void *ptr) | 375 | static int bdi_forker_thread(void *ptr) |
356 | { | 376 | { |
357 | struct bdi_writeback *me = ptr; | 377 | struct bdi_writeback *me = ptr; |
@@ -383,6 +403,12 @@ static int bdi_forker_thread(void *ptr) | |||
383 | } | 403 | } |
384 | 404 | ||
385 | spin_lock_bh(&bdi_lock); | 405 | spin_lock_bh(&bdi_lock); |
406 | /* | ||
407 | * In the following loop we are going to check whether we have | ||
408 | * some work to do without any synchronization with tasks | ||
409 | * waking us up to do work for them. Set the task state here | ||
410 | * so that we don't miss wakeups after verifying conditions. | ||
411 | */ | ||
386 | set_current_state(TASK_INTERRUPTIBLE); | 412 | set_current_state(TASK_INTERRUPTIBLE); |
387 | 413 | ||
388 | list_for_each_entry(bdi, &bdi_list, bdi_list) { | 414 | list_for_each_entry(bdi, &bdi_list, bdi_list) { |
@@ -446,9 +472,11 @@ static int bdi_forker_thread(void *ptr) | |||
446 | if (IS_ERR(task)) { | 472 | if (IS_ERR(task)) { |
447 | /* | 473 | /* |
448 | * If thread creation fails, force writeout of | 474 | * If thread creation fails, force writeout of |
449 | * the bdi from the thread. | 475 | * the bdi from the thread. Hopefully 1024 is |
476 | * large enough for efficient IO. | ||
450 | */ | 477 | */ |
451 | bdi_flush_io(bdi); | 478 | writeback_inodes_wb(&bdi->wb, 1024, |
479 | WB_REASON_FORKER_THREAD); | ||
452 | } else { | 480 | } else { |
453 | /* | 481 | /* |
454 | * The spinlock makes sure we do not lose | 482 | * The spinlock makes sure we do not lose |
@@ -461,11 +489,13 @@ static int bdi_forker_thread(void *ptr) | |||
461 | spin_unlock_bh(&bdi->wb_lock); | 489 | spin_unlock_bh(&bdi->wb_lock); |
462 | wake_up_process(task); | 490 | wake_up_process(task); |
463 | } | 491 | } |
492 | bdi_clear_pending(bdi); | ||
464 | break; | 493 | break; |
465 | 494 | ||
466 | case KILL_THREAD: | 495 | case KILL_THREAD: |
467 | __set_current_state(TASK_RUNNING); | 496 | __set_current_state(TASK_RUNNING); |
468 | kthread_stop(task); | 497 | kthread_stop(task); |
498 | bdi_clear_pending(bdi); | ||
469 | break; | 499 | break; |
470 | 500 | ||
471 | case NO_ACTION: | 501 | case NO_ACTION: |
@@ -481,16 +511,8 @@ static int bdi_forker_thread(void *ptr) | |||
481 | else | 511 | else |
482 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); | 512 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); |
483 | try_to_freeze(); | 513 | try_to_freeze(); |
484 | /* Back to the main loop */ | 514 | break; |
485 | continue; | ||
486 | } | 515 | } |
487 | |||
488 | /* | ||
489 | * Clear pending bit and wakeup anybody waiting to tear us down. | ||
490 | */ | ||
491 | clear_bit(BDI_pending, &bdi->state); | ||
492 | smp_mb__after_clear_bit(); | ||
493 | wake_up_bit(&bdi->state, BDI_pending); | ||
494 | } | 516 | } |
495 | 517 | ||
496 | return 0; | 518 | return 0; |
@@ -505,7 +527,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) | |||
505 | list_del_rcu(&bdi->bdi_list); | 527 | list_del_rcu(&bdi->bdi_list); |
506 | spin_unlock_bh(&bdi_lock); | 528 | spin_unlock_bh(&bdi_lock); |
507 | 529 | ||
508 | synchronize_rcu(); | 530 | synchronize_rcu_expedited(); |
509 | } | 531 | } |
510 | 532 | ||
511 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 533 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, |
@@ -606,6 +628,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) | |||
606 | void bdi_unregister(struct backing_dev_info *bdi) | 628 | void bdi_unregister(struct backing_dev_info *bdi) |
607 | { | 629 | { |
608 | if (bdi->dev) { | 630 | if (bdi->dev) { |
631 | bdi_set_min_ratio(bdi, 0); | ||
609 | trace_writeback_bdi_unregister(bdi); | 632 | trace_writeback_bdi_unregister(bdi); |
610 | bdi_prune_sb(bdi); | 633 | bdi_prune_sb(bdi); |
611 | del_timer_sync(&bdi->wb.wakeup_timer); | 634 | del_timer_sync(&bdi->wb.wakeup_timer); |
@@ -628,9 +651,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | |||
628 | INIT_LIST_HEAD(&wb->b_dirty); | 651 | INIT_LIST_HEAD(&wb->b_dirty); |
629 | INIT_LIST_HEAD(&wb->b_io); | 652 | INIT_LIST_HEAD(&wb->b_io); |
630 | INIT_LIST_HEAD(&wb->b_more_io); | 653 | INIT_LIST_HEAD(&wb->b_more_io); |
654 | spin_lock_init(&wb->list_lock); | ||
631 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); | 655 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); |
632 | } | 656 | } |
633 | 657 | ||
658 | /* | ||
659 | * Initial write bandwidth: 100 MB/s | ||
660 | */ | ||
661 | #define INIT_BW (100 << (20 - PAGE_SHIFT)) | ||
662 | |||
634 | int bdi_init(struct backing_dev_info *bdi) | 663 | int bdi_init(struct backing_dev_info *bdi) |
635 | { | 664 | { |
636 | int i, err; | 665 | int i, err; |
@@ -653,6 +682,15 @@ int bdi_init(struct backing_dev_info *bdi) | |||
653 | } | 682 | } |
654 | 683 | ||
655 | bdi->dirty_exceeded = 0; | 684 | bdi->dirty_exceeded = 0; |
685 | |||
686 | bdi->bw_time_stamp = jiffies; | ||
687 | bdi->written_stamp = 0; | ||
688 | |||
689 | bdi->balanced_dirty_ratelimit = INIT_BW; | ||
690 | bdi->dirty_ratelimit = INIT_BW; | ||
691 | bdi->write_bandwidth = INIT_BW; | ||
692 | bdi->avg_write_bandwidth = INIT_BW; | ||
693 | |||
656 | err = prop_local_init_percpu(&bdi->completions); | 694 | err = prop_local_init_percpu(&bdi->completions); |
657 | 695 | ||
658 | if (err) { | 696 | if (err) { |
@@ -676,15 +714,24 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
676 | if (bdi_has_dirty_io(bdi)) { | 714 | if (bdi_has_dirty_io(bdi)) { |
677 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | 715 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
678 | 716 | ||
679 | spin_lock(&inode_wb_list_lock); | 717 | bdi_lock_two(&bdi->wb, dst); |
680 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | 718 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
681 | list_splice(&bdi->wb.b_io, &dst->b_io); | 719 | list_splice(&bdi->wb.b_io, &dst->b_io); |
682 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | 720 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
683 | spin_unlock(&inode_wb_list_lock); | 721 | spin_unlock(&bdi->wb.list_lock); |
722 | spin_unlock(&dst->list_lock); | ||
684 | } | 723 | } |
685 | 724 | ||
686 | bdi_unregister(bdi); | 725 | bdi_unregister(bdi); |
687 | 726 | ||
727 | /* | ||
728 | * If bdi_unregister() had already been called earlier, the | ||
729 | * wakeup_timer could still be armed because bdi_prune_sb() | ||
730 | * can race with the bdi_wakeup_thread_delayed() calls from | ||
731 | * __mark_inode_dirty(). | ||
732 | */ | ||
733 | del_timer_sync(&bdi->wb.wakeup_timer); | ||
734 | |||
688 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 735 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
689 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 736 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
690 | 737 | ||
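bdi_lock_two(), added in the hunks above, always takes the two per-writeback list_locks lowest address first, so two callers passing the same pair in opposite order cannot deadlock; spin_lock_nested() tells lockdep that the second acquisition of the same lock class is deliberate. A minimal sketch of that idiom for an arbitrary pair of objects (struct obj and both helpers are invented for illustration, not taken from this commit):

	#include <linux/spinlock.h>

	struct obj {
		spinlock_t lock;
		/* ... */
	};

	/* Hypothetical helpers showing the address-ordered double-lock idiom. */
	static void obj_lock_two(struct obj *a, struct obj *b)
	{
		if (a < b) {
			spin_lock(&a->lock);
			spin_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
		} else {
			spin_lock(&b->lock);
			spin_lock_nested(&a->lock, SINGLE_DEPTH_NESTING);
		}
	}

	static void obj_unlock_two(struct obj *a, struct obj *b)
	{
		/* release order is irrelevant for deadlock avoidance */
		spin_unlock(&a->lock);
		spin_unlock(&b->lock);
	}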
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 01d5a4b3dd0c..1a77012ecdb3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/pfn.h> | 12 | #include <linux/pfn.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/bootmem.h> | 14 | #include <linux/bootmem.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
17 | #include <linux/range.h> | 17 | #include <linux/range.h> |
18 | #include <linux/memblock.h> | 18 | #include <linux/memblock.h> |
diff --git a/mm/bounce.c b/mm/bounce.c
index 1481de68184b..4e9ae722af83 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -4,7 +4,7 @@ | |||
4 | */ | 4 | */ |
5 | 5 | ||
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/module.h> | 7 | #include <linux/export.h> |
8 | #include <linux/swap.h> | 8 | #include <linux/swap.h> |
9 | #include <linux/gfp.h> | 9 | #include <linux/gfp.h> |
10 | #include <linux/bio.h> | 10 | #include <linux/bio.h> |
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/hash.h> | 15 | #include <linux/hash.h> |
16 | #include <linux/highmem.h> | 16 | #include <linux/highmem.h> |
17 | #include <linux/bootmem.h> | ||
17 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
18 | 19 | ||
19 | #include <trace/events/block.h> | 20 | #include <trace/events/block.h> |
@@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool; | |||
26 | #ifdef CONFIG_HIGHMEM | 27 | #ifdef CONFIG_HIGHMEM |
27 | static __init int init_emergency_pool(void) | 28 | static __init int init_emergency_pool(void) |
28 | { | 29 | { |
29 | struct sysinfo i; | 30 | #ifndef CONFIG_MEMORY_HOTPLUG |
30 | si_meminfo(&i); | 31 | if (max_pfn <= max_low_pfn) |
31 | si_swapinfo(&i); | ||
32 | |||
33 | if (!i.totalhigh) | ||
34 | return 0; | 32 | return 0; |
33 | #endif | ||
35 | 34 | ||
36 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); | 35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); |
37 | BUG_ON(!page_pool); | 36 | BUG_ON(!page_pool); |
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc604bd5649..899d95638586 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control { | |||
35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
36 | bool sync; /* Synchronous migration */ | 36 | bool sync; /* Synchronous migration */ |
37 | 37 | ||
38 | /* Account for isolated anon and file pages */ | ||
39 | unsigned long nr_anon; | ||
40 | unsigned long nr_file; | ||
41 | |||
42 | unsigned int order; /* order a direct compactor needs */ | 38 | unsigned int order; /* order a direct compactor needs */ |
43 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
44 | struct zone *zone; | 40 | struct zone *zone; |
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone, | |||
223 | static void acct_isolated(struct zone *zone, struct compact_control *cc) | 219 | static void acct_isolated(struct zone *zone, struct compact_control *cc) |
224 | { | 220 | { |
225 | struct page *page; | 221 | struct page *page; |
226 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 222 | unsigned int count[2] = { 0, }; |
227 | 223 | ||
228 | list_for_each_entry(page, &cc->migratepages, lru) { | 224 | list_for_each_entry(page, &cc->migratepages, lru) |
229 | int lru = page_lru_base_type(page); | 225 | count[!!page_is_file_cache(page)]++; |
230 | count[lru]++; | ||
231 | } | ||
232 | 226 | ||
233 | cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | 227 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); |
234 | cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | 228 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); |
235 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); | ||
236 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); | ||
237 | } | 229 | } |
238 | 230 | ||
239 | /* Similar to reclaim, but different enough that they don't share logic */ | 231 | /* Similar to reclaim, but different enough that they don't share logic */ |
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
269 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 261 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
270 | unsigned long nr_scanned = 0, nr_isolated = 0; | 262 | unsigned long nr_scanned = 0, nr_isolated = 0; |
271 | struct list_head *migratelist = &cc->migratepages; | 263 | struct list_head *migratelist = &cc->migratepages; |
264 | isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; | ||
272 | 265 | ||
273 | /* Do not scan outside zone boundaries */ | 266 | /* Do not scan outside zone boundaries */ |
274 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 267 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); |
@@ -356,8 +349,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
356 | continue; | 349 | continue; |
357 | } | 350 | } |
358 | 351 | ||
352 | if (!cc->sync) | ||
353 | mode |= ISOLATE_CLEAN; | ||
354 | |||
359 | /* Try isolate the page */ | 355 | /* Try isolate the page */ |
360 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) | 356 | if (__isolate_lru_page(page, mode, 0) != 0) |
361 | continue; | 357 | continue; |
362 | 358 | ||
363 | VM_BUG_ON(PageTransCompound(page)); | 359 | VM_BUG_ON(PageTransCompound(page)); |
@@ -586,7 +582,7 @@ out: | |||
586 | return ret; | 582 | return ret; |
587 | } | 583 | } |
588 | 584 | ||
589 | unsigned long compact_zone_order(struct zone *zone, | 585 | static unsigned long compact_zone_order(struct zone *zone, |
590 | int order, gfp_t gfp_mask, | 586 | int order, gfp_t gfp_mask, |
591 | bool sync) | 587 | bool sync) |
592 | { | 588 | { |
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index a1e3324de2b5..7cea557407f4 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -1,7 +1,10 @@ | |||
1 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
2 | #include <linux/string.h> | ||
2 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
4 | #include <linux/highmem.h> | ||
3 | #include <linux/page-debug-flags.h> | 5 | #include <linux/page-debug-flags.h> |
4 | #include <linux/poison.h> | 6 | #include <linux/poison.h> |
7 | #include <linux/ratelimit.h> | ||
5 | 8 | ||
6 | static inline void set_page_poison(struct page *page) | 9 | static inline void set_page_poison(struct page *page) |
7 | { | 10 | { |
@@ -18,28 +21,13 @@ static inline bool page_poison(struct page *page) | |||
18 | return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); | 21 | return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); |
19 | } | 22 | } |
20 | 23 | ||
21 | static void poison_highpage(struct page *page) | ||
22 | { | ||
23 | /* | ||
24 | * Page poisoning for highmem pages is not implemented. | ||
25 | * | ||
26 | * This can be called from interrupt contexts. | ||
27 | * So we need to create a new kmap_atomic slot for this | ||
28 | * application and it will need interrupt protection. | ||
29 | */ | ||
30 | } | ||
31 | |||
32 | static void poison_page(struct page *page) | 24 | static void poison_page(struct page *page) |
33 | { | 25 | { |
34 | void *addr; | 26 | void *addr = kmap_atomic(page); |
35 | 27 | ||
36 | if (PageHighMem(page)) { | ||
37 | poison_highpage(page); | ||
38 | return; | ||
39 | } | ||
40 | set_page_poison(page); | 28 | set_page_poison(page); |
41 | addr = page_address(page); | ||
42 | memset(addr, PAGE_POISON, PAGE_SIZE); | 29 | memset(addr, PAGE_POISON, PAGE_SIZE); |
30 | kunmap_atomic(addr); | ||
43 | } | 31 | } |
44 | 32 | ||
45 | static void poison_pages(struct page *page, int n) | 33 | static void poison_pages(struct page *page, int n) |
@@ -59,14 +47,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b) | |||
59 | 47 | ||
60 | static void check_poison_mem(unsigned char *mem, size_t bytes) | 48 | static void check_poison_mem(unsigned char *mem, size_t bytes) |
61 | { | 49 | { |
50 | static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); | ||
62 | unsigned char *start; | 51 | unsigned char *start; |
63 | unsigned char *end; | 52 | unsigned char *end; |
64 | 53 | ||
65 | for (start = mem; start < mem + bytes; start++) { | 54 | start = memchr_inv(mem, PAGE_POISON, bytes); |
66 | if (*start != PAGE_POISON) | 55 | if (!start) |
67 | break; | ||
68 | } | ||
69 | if (start == mem + bytes) | ||
70 | return; | 56 | return; |
71 | 57 | ||
72 | for (end = mem + bytes - 1; end > start; end--) { | 58 | for (end = mem + bytes - 1; end > start; end--) { |
@@ -74,7 +60,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) | |||
74 | break; | 60 | break; |
75 | } | 61 | } |
76 | 62 | ||
77 | if (!printk_ratelimit()) | 63 | if (!__ratelimit(&ratelimit)) |
78 | return; | 64 | return; |
79 | else if (start == end && single_bit_flip(*start, PAGE_POISON)) | 65 | else if (start == end && single_bit_flip(*start, PAGE_POISON)) |
80 | printk(KERN_ERR "pagealloc: single bit error\n"); | 66 | printk(KERN_ERR "pagealloc: single bit error\n"); |
@@ -86,27 +72,17 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) | |||
86 | dump_stack(); | 72 | dump_stack(); |
87 | } | 73 | } |
88 | 74 | ||
89 | static void unpoison_highpage(struct page *page) | ||
90 | { | ||
91 | /* | ||
92 | * See comment in poison_highpage(). | ||
93 | * Highmem pages should not be poisoned for now | ||
94 | */ | ||
95 | BUG_ON(page_poison(page)); | ||
96 | } | ||
97 | |||
98 | static void unpoison_page(struct page *page) | 75 | static void unpoison_page(struct page *page) |
99 | { | 76 | { |
100 | if (PageHighMem(page)) { | 77 | void *addr; |
101 | unpoison_highpage(page); | 78 | |
79 | if (!page_poison(page)) | ||
102 | return; | 80 | return; |
103 | } | ||
104 | if (page_poison(page)) { | ||
105 | void *addr = page_address(page); | ||
106 | 81 | ||
107 | check_poison_mem(addr, PAGE_SIZE); | 82 | addr = kmap_atomic(page); |
108 | clear_page_poison(page); | 83 | check_poison_mem(addr, PAGE_SIZE); |
109 | } | 84 | clear_page_poison(page); |
85 | kunmap_atomic(addr); | ||
110 | } | 86 | } |
111 | 87 | ||
112 | static void unpoison_pages(struct page *page, int n) | 88 | static void unpoison_pages(struct page *page, int n) |
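The rewritten check_poison_mem() above leans on memchr_inv() rather than an open-coded byte loop: memchr_inv(start, c, bytes) returns a pointer to the first byte in the range that does not equal c, or NULL if every byte matches. A small hedged usage sketch (the helper name is invented for illustration):

	#include <linux/string.h>
	#include <linux/types.h>

	/* Hypothetical helper: true if the whole buffer is filled with 'pattern'. */
	static bool buf_is_uniform(const void *buf, unsigned char pattern, size_t len)
	{
		return memchr_inv(buf, pattern, len) == NULL;
	}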
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519a..c5ab33bca0a8 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -27,11 +27,12 @@ | |||
27 | #include <linux/dmapool.h> | 27 | #include <linux/dmapool.h> |
28 | #include <linux/kernel.h> | 28 | #include <linux/kernel.h> |
29 | #include <linux/list.h> | 29 | #include <linux/list.h> |
30 | #include <linux/module.h> | 30 | #include <linux/export.h> |
31 | #include <linux/mutex.h> | 31 | #include <linux/mutex.h> |
32 | #include <linux/poison.h> | 32 | #include <linux/poison.h> |
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
35 | #include <linux/stat.h> | ||
35 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
36 | #include <linux/string.h> | 37 | #include <linux/string.h> |
37 | #include <linux/types.h> | 38 | #include <linux/types.h> |
@@ -500,7 +501,7 @@ void dmam_pool_destroy(struct dma_pool *pool) | |||
500 | { | 501 | { |
501 | struct device *dev = pool->dev; | 502 | struct device *dev = pool->dev; |
502 | 503 | ||
503 | dma_pool_destroy(pool); | ||
504 | WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); | 504 | WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); |
505 | dma_pool_destroy(pool); | ||
505 | } | 506 | } |
506 | EXPORT_SYMBOL(dmam_pool_destroy); | 507 | EXPORT_SYMBOL(dmam_pool_destroy); |
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240ddc..0dd7b8fec71c 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct { | |||
5 | struct fault_attr attr; | 5 | struct fault_attr attr; |
6 | u32 ignore_gfp_wait; | 6 | u32 ignore_gfp_wait; |
7 | int cache_filter; | 7 | int cache_filter; |
8 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
9 | struct dentry *ignore_gfp_wait_file; | ||
10 | struct dentry *cache_filter_file; | ||
11 | #endif | ||
12 | } failslab = { | 8 | } failslab = { |
13 | .attr = FAULT_ATTR_INITIALIZER, | 9 | .attr = FAULT_ATTR_INITIALIZER, |
14 | .ignore_gfp_wait = 1, | 10 | .ignore_gfp_wait = 1, |
@@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab); | |||
38 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 34 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
39 | static int __init failslab_debugfs_init(void) | 35 | static int __init failslab_debugfs_init(void) |
40 | { | 36 | { |
41 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
42 | struct dentry *dir; | 37 | struct dentry *dir; |
43 | int err; | 38 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
44 | |||
45 | err = init_fault_attr_dentries(&failslab.attr, "failslab"); | ||
46 | if (err) | ||
47 | return err; | ||
48 | dir = failslab.attr.dentries.dir; | ||
49 | 39 | ||
50 | failslab.ignore_gfp_wait_file = | 40 | dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); |
51 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | 41 | if (IS_ERR(dir)) |
52 | &failslab.ignore_gfp_wait); | 42 | return PTR_ERR(dir); |
53 | 43 | ||
54 | failslab.cache_filter_file = | 44 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, |
55 | debugfs_create_bool("cache-filter", mode, dir, | 45 | &failslab.ignore_gfp_wait)) |
56 | &failslab.cache_filter); | 46 | goto fail; |
47 | if (!debugfs_create_bool("cache-filter", mode, dir, | ||
48 | &failslab.cache_filter)) | ||
49 | goto fail; | ||
57 | 50 | ||
58 | if (!failslab.ignore_gfp_wait_file || | 51 | return 0; |
59 | !failslab.cache_filter_file) { | 52 | fail: |
60 | err = -ENOMEM; | 53 | debugfs_remove_recursive(dir); |
61 | debugfs_remove(failslab.cache_filter_file); | ||
62 | debugfs_remove(failslab.ignore_gfp_wait_file); | ||
63 | cleanup_fault_attr_dentries(&failslab.attr); | ||
64 | } | ||
65 | 54 | ||
66 | return err; | 55 | return -ENOMEM; |
67 | } | 56 | } |
68 | 57 | ||
69 | late_initcall(failslab_debugfs_init); | 58 | late_initcall(failslab_debugfs_init); |
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d3457..c0018f2d50e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,7 @@ | |||
9 | * most "normal" filesystems (but you don't /have/ to use this: | 9 | * most "normal" filesystems (but you don't /have/ to use this: |
10 | * the NFS filesystem used to do this differently, for example) | 10 | * the NFS filesystem used to do this differently, for example) |
11 | */ | 11 | */ |
12 | #include <linux/module.h> | 12 | #include <linux/export.h> |
13 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> |
14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
15 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
@@ -33,7 +33,6 @@ | |||
33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 35 | #include <linux/memcontrol.h> |
36 | #include <linux/mm_inline.h> /* for page_is_file_cache() */ | ||
37 | #include <linux/cleancache.h> | 36 | #include <linux/cleancache.h> |
38 | #include "internal.h" | 37 | #include "internal.h" |
39 | 38 | ||
@@ -78,10 +77,7 @@ | |||
78 | * ->i_mutex (generic_file_buffered_write) | 77 | * ->i_mutex (generic_file_buffered_write) |
79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) | 78 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
80 | * | 79 | * |
81 | * ->i_mutex | 80 | * bdi->wb.list_lock |
82 | * ->i_alloc_sem (various) | ||
83 | * | ||
84 | * inode_wb_list_lock | ||
85 | * sb_lock (fs/fs-writeback.c) | 81 | * sb_lock (fs/fs-writeback.c) |
86 | * ->mapping->tree_lock (__sync_single_inode) | 82 | * ->mapping->tree_lock (__sync_single_inode) |
87 | * | 83 | * |
@@ -99,9 +95,9 @@ | |||
99 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) | 95 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) |
100 | * ->private_lock (page_remove_rmap->set_page_dirty) | 96 | * ->private_lock (page_remove_rmap->set_page_dirty) |
101 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 97 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
102 | * inode_wb_list_lock (page_remove_rmap->set_page_dirty) | 98 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
103 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) | 99 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
104 | * inode_wb_list_lock (zap_pte_range->set_page_dirty) | 100 | * bdi.wb->list_lock (zap_pte_range->set_page_dirty) |
105 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 101 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
106 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 102 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
107 | * | 103 | * |
@@ -131,6 +127,7 @@ void __delete_from_page_cache(struct page *page) | |||
131 | 127 | ||
132 | radix_tree_delete(&mapping->page_tree, page->index); | 128 | radix_tree_delete(&mapping->page_tree, page->index); |
133 | page->mapping = NULL; | 129 | page->mapping = NULL; |
130 | /* Leave page->index set: truncation lookup relies upon it */ | ||
134 | mapping->nrpages--; | 131 | mapping->nrpages--; |
135 | __dec_zone_page_state(page, NR_FILE_PAGES); | 132 | __dec_zone_page_state(page, NR_FILE_PAGES); |
136 | if (PageSwapBacked(page)) | 133 | if (PageSwapBacked(page)) |
@@ -464,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
464 | int error; | 461 | int error; |
465 | 462 | ||
466 | VM_BUG_ON(!PageLocked(page)); | 463 | VM_BUG_ON(!PageLocked(page)); |
464 | VM_BUG_ON(PageSwapBacked(page)); | ||
467 | 465 | ||
468 | error = mem_cgroup_cache_charge(page, current->mm, | 466 | error = mem_cgroup_cache_charge(page, current->mm, |
469 | gfp_mask & GFP_RECLAIM_MASK); | 467 | gfp_mask & GFP_RECLAIM_MASK); |
@@ -481,11 +479,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
481 | if (likely(!error)) { | 479 | if (likely(!error)) { |
482 | mapping->nrpages++; | 480 | mapping->nrpages++; |
483 | __inc_zone_page_state(page, NR_FILE_PAGES); | 481 | __inc_zone_page_state(page, NR_FILE_PAGES); |
484 | if (PageSwapBacked(page)) | ||
485 | __inc_zone_page_state(page, NR_SHMEM); | ||
486 | spin_unlock_irq(&mapping->tree_lock); | 482 | spin_unlock_irq(&mapping->tree_lock); |
487 | } else { | 483 | } else { |
488 | page->mapping = NULL; | 484 | page->mapping = NULL; |
485 | /* Leave page->index set: truncation relies upon it */ | ||
489 | spin_unlock_irq(&mapping->tree_lock); | 486 | spin_unlock_irq(&mapping->tree_lock); |
490 | mem_cgroup_uncharge_cache_page(page); | 487 | mem_cgroup_uncharge_cache_page(page); |
491 | page_cache_release(page); | 488 | page_cache_release(page); |
@@ -503,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
503 | { | 500 | { |
504 | int ret; | 501 | int ret; |
505 | 502 | ||
506 | /* | ||
507 | * Splice_read and readahead add shmem/tmpfs pages into the page cache | ||
508 | * before shmem_readpage has a chance to mark them as SwapBacked: they | ||
509 | * need to go on the anon lru below, and mem_cgroup_cache_charge | ||
510 | * (called in add_to_page_cache) needs to know where they're going too. | ||
511 | */ | ||
512 | if (mapping_cap_swap_backed(mapping)) | ||
513 | SetPageSwapBacked(page); | ||
514 | |||
515 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); | 503 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); |
516 | if (ret == 0) { | 504 | if (ret == 0) |
517 | if (page_is_file_cache(page)) | 505 | lru_cache_add_file(page); |
518 | lru_cache_add_file(page); | ||
519 | else | ||
520 | lru_cache_add_anon(page); | ||
521 | } | ||
522 | return ret; | 506 | return ret; |
523 | } | 507 | } |
524 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); | 508 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); |
@@ -715,9 +699,16 @@ repeat: | |||
715 | page = radix_tree_deref_slot(pagep); | 699 | page = radix_tree_deref_slot(pagep); |
716 | if (unlikely(!page)) | 700 | if (unlikely(!page)) |
717 | goto out; | 701 | goto out; |
718 | if (radix_tree_deref_retry(page)) | 702 | if (radix_tree_exception(page)) { |
719 | goto repeat; | 703 | if (radix_tree_deref_retry(page)) |
720 | 704 | goto repeat; | |
705 | /* | ||
706 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
707 | * here as an exceptional entry: so return it without | ||
708 | * attempting to raise page count. | ||
709 | */ | ||
710 | goto out; | ||
711 | } | ||
721 | if (!page_cache_get_speculative(page)) | 712 | if (!page_cache_get_speculative(page)) |
722 | goto repeat; | 713 | goto repeat; |
723 | 714 | ||
@@ -754,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) | |||
754 | 745 | ||
755 | repeat: | 746 | repeat: |
756 | page = find_get_page(mapping, offset); | 747 | page = find_get_page(mapping, offset); |
757 | if (page) { | 748 | if (page && !radix_tree_exception(page)) { |
758 | lock_page(page); | 749 | lock_page(page); |
759 | /* Has the page been truncated? */ | 750 | /* Has the page been truncated? */ |
760 | if (unlikely(page->mapping != mapping)) { | 751 | if (unlikely(page->mapping != mapping)) { |
@@ -836,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
836 | { | 827 | { |
837 | unsigned int i; | 828 | unsigned int i; |
838 | unsigned int ret; | 829 | unsigned int ret; |
839 | unsigned int nr_found; | 830 | unsigned int nr_found, nr_skip; |
840 | 831 | ||
841 | rcu_read_lock(); | 832 | rcu_read_lock(); |
842 | restart: | 833 | restart: |
843 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 834 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
844 | (void ***)pages, start, nr_pages); | 835 | (void ***)pages, NULL, start, nr_pages); |
845 | ret = 0; | 836 | ret = 0; |
837 | nr_skip = 0; | ||
846 | for (i = 0; i < nr_found; i++) { | 838 | for (i = 0; i < nr_found; i++) { |
847 | struct page *page; | 839 | struct page *page; |
848 | repeat: | 840 | repeat: |
@@ -850,13 +842,23 @@ repeat: | |||
850 | if (unlikely(!page)) | 842 | if (unlikely(!page)) |
851 | continue; | 843 | continue; |
852 | 844 | ||
853 | /* | 845 | if (radix_tree_exception(page)) { |
854 | * This can only trigger when the entry at index 0 moves out | 846 | if (radix_tree_deref_retry(page)) { |
855 | * of or back to the root: none yet gotten, safe to restart. | 847 | /* |
856 | */ | 848 | * Transient condition which can only trigger |
857 | if (radix_tree_deref_retry(page)) { | 849 | * when entry at index 0 moves out of or back |
858 | WARN_ON(start | i); | 850 | * to root: none yet gotten, safe to restart. |
859 | goto restart; | 851 | */ |
852 | WARN_ON(start | i); | ||
853 | goto restart; | ||
854 | } | ||
855 | /* | ||
856 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
857 | * here as an exceptional entry: so skip over it - | ||
858 | * we only reach this from invalidate_mapping_pages(). | ||
859 | */ | ||
860 | nr_skip++; | ||
861 | continue; | ||
860 | } | 862 | } |
861 | 863 | ||
862 | if (!page_cache_get_speculative(page)) | 864 | if (!page_cache_get_speculative(page)) |
@@ -876,7 +878,7 @@ repeat: | |||
876 | * If all entries were removed before we could secure them, | 878 | * If all entries were removed before we could secure them, |
877 | * try again, because callers stop trying once 0 is returned. | 879 | * try again, because callers stop trying once 0 is returned. |
878 | */ | 880 | */ |
879 | if (unlikely(!ret && nr_found)) | 881 | if (unlikely(!ret && nr_found > nr_skip)) |
880 | goto restart; | 882 | goto restart; |
881 | rcu_read_unlock(); | 883 | rcu_read_unlock(); |
882 | return ret; | 884 | return ret; |
@@ -904,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
904 | rcu_read_lock(); | 906 | rcu_read_lock(); |
905 | restart: | 907 | restart: |
906 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 908 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
907 | (void ***)pages, index, nr_pages); | 909 | (void ***)pages, NULL, index, nr_pages); |
908 | ret = 0; | 910 | ret = 0; |
909 | for (i = 0; i < nr_found; i++) { | 911 | for (i = 0; i < nr_found; i++) { |
910 | struct page *page; | 912 | struct page *page; |
@@ -913,12 +915,22 @@ repeat: | |||
913 | if (unlikely(!page)) | 915 | if (unlikely(!page)) |
914 | continue; | 916 | continue; |
915 | 917 | ||
916 | /* | 918 | if (radix_tree_exception(page)) { |
917 | * This can only trigger when the entry at index 0 moves out | 919 | if (radix_tree_deref_retry(page)) { |
918 | * of or back to the root: none yet gotten, safe to restart. | 920 | /* |
919 | */ | 921 | * Transient condition which can only trigger |
920 | if (radix_tree_deref_retry(page)) | 922 | * when entry at index 0 moves out of or back |
921 | goto restart; | 923 | * to root: none yet gotten, safe to restart. |
924 | */ | ||
925 | goto restart; | ||
926 | } | ||
927 | /* | ||
928 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
929 | * here as an exceptional entry: so stop looking for | ||
930 | * contiguous pages. | ||
931 | */ | ||
932 | break; | ||
933 | } | ||
922 | 934 | ||
923 | if (!page_cache_get_speculative(page)) | 935 | if (!page_cache_get_speculative(page)) |
924 | goto repeat; | 936 | goto repeat; |
@@ -978,12 +990,21 @@ repeat: | |||
978 | if (unlikely(!page)) | 990 | if (unlikely(!page)) |
979 | continue; | 991 | continue; |
980 | 992 | ||
981 | /* | 993 | if (radix_tree_exception(page)) { |
982 | * This can only trigger when the entry at index 0 moves out | 994 | if (radix_tree_deref_retry(page)) { |
983 | * of or back to the root: none yet gotten, safe to restart. | 995 | /* |
984 | */ | 996 | * Transient condition which can only trigger |
985 | if (radix_tree_deref_retry(page)) | 997 | * when entry at index 0 moves out of or back |
986 | goto restart; | 998 | * to root: none yet gotten, safe to restart. |
999 | */ | ||
1000 | goto restart; | ||
1001 | } | ||
1002 | /* | ||
1003 | * This function is never used on a shmem/tmpfs | ||
1004 | * mapping, so a swap entry won't be found here. | ||
1005 | */ | ||
1006 | BUG(); | ||
1007 | } | ||
987 | 1008 | ||
988 | if (!page_cache_get_speculative(page)) | 1009 | if (!page_cache_get_speculative(page)) |
989 | goto repeat; | 1010 | goto repeat; |
@@ -1795,7 +1816,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); | |||
1795 | 1816 | ||
1796 | static struct page *__read_cache_page(struct address_space *mapping, | 1817 | static struct page *__read_cache_page(struct address_space *mapping, |
1797 | pgoff_t index, | 1818 | pgoff_t index, |
1798 | int (*filler)(void *,struct page*), | 1819 | int (*filler)(void *, struct page *), |
1799 | void *data, | 1820 | void *data, |
1800 | gfp_t gfp) | 1821 | gfp_t gfp) |
1801 | { | 1822 | { |
@@ -1826,7 +1847,7 @@ repeat: | |||
1826 | 1847 | ||
1827 | static struct page *do_read_cache_page(struct address_space *mapping, | 1848 | static struct page *do_read_cache_page(struct address_space *mapping, |
1828 | pgoff_t index, | 1849 | pgoff_t index, |
1829 | int (*filler)(void *,struct page*), | 1850 | int (*filler)(void *, struct page *), |
1830 | void *data, | 1851 | void *data, |
1831 | gfp_t gfp) | 1852 | gfp_t gfp) |
1832 | 1853 | ||
@@ -1866,7 +1887,7 @@ out: | |||
1866 | * @mapping: the page's address_space | 1887 | * @mapping: the page's address_space |
1867 | * @index: the page index | 1888 | * @index: the page index |
1868 | * @filler: function to perform the read | 1889 | * @filler: function to perform the read |
1869 | * @data: destination for read data | 1890 | * @data: first arg to filler(data, page) function, often left as NULL |
1870 | * | 1891 | * |
1871 | * Same as read_cache_page, but don't wait for page to become unlocked | 1892 | * Same as read_cache_page, but don't wait for page to become unlocked |
1872 | * after submitting it to the filler. | 1893 | * after submitting it to the filler. |
@@ -1878,7 +1899,7 @@ out: | |||
1878 | */ | 1899 | */ |
1879 | struct page *read_cache_page_async(struct address_space *mapping, | 1900 | struct page *read_cache_page_async(struct address_space *mapping, |
1880 | pgoff_t index, | 1901 | pgoff_t index, |
1881 | int (*filler)(void *,struct page*), | 1902 | int (*filler)(void *, struct page *), |
1882 | void *data) | 1903 | void *data) |
1883 | { | 1904 | { |
1884 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); | 1905 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); |
@@ -1926,7 +1947,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); | |||
1926 | * @mapping: the page's address_space | 1947 | * @mapping: the page's address_space |
1927 | * @index: the page index | 1948 | * @index: the page index |
1928 | * @filler: function to perform the read | 1949 | * @filler: function to perform the read |
1929 | * @data: destination for read data | 1950 | * @data: first arg to filler(data, page) function, often left as NULL |
1930 | * | 1951 | * |
1931 | * Read into the page cache. If a page already exists, and PageUptodate() is | 1952 | * Read into the page cache. If a page already exists, and PageUptodate() is |
1932 | * not set, try to fill the page then wait for it to become unlocked. | 1953 | * not set, try to fill the page then wait for it to become unlocked. |
@@ -1935,7 +1956,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); | |||
1935 | */ | 1956 | */ |
1936 | struct page *read_cache_page(struct address_space *mapping, | 1957 | struct page *read_cache_page(struct address_space *mapping, |
1937 | pgoff_t index, | 1958 | pgoff_t index, |
1938 | int (*filler)(void *,struct page*), | 1959 | int (*filler)(void *, struct page *), |
1939 | void *data) | 1960 | void *data) |
1940 | { | 1961 | { |
1941 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); | 1962 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); |
@@ -2094,6 +2115,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
2094 | } else { | 2115 | } else { |
2095 | const struct iovec *iov = i->iov; | 2116 | const struct iovec *iov = i->iov; |
2096 | size_t base = i->iov_offset; | 2117 | size_t base = i->iov_offset; |
2118 | unsigned long nr_segs = i->nr_segs; | ||
2097 | 2119 | ||
2098 | /* | 2120 | /* |
2099 | * The !iov->iov_len check ensures we skip over unlikely | 2121 | * The !iov->iov_len check ensures we skip over unlikely |
@@ -2109,11 +2131,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
2109 | base += copy; | 2131 | base += copy; |
2110 | if (iov->iov_len == base) { | 2132 | if (iov->iov_len == base) { |
2111 | iov++; | 2133 | iov++; |
2134 | nr_segs--; | ||
2112 | base = 0; | 2135 | base = 0; |
2113 | } | 2136 | } |
2114 | } | 2137 | } |
2115 | i->iov = iov; | 2138 | i->iov = iov; |
2116 | i->iov_offset = base; | 2139 | i->iov_offset = base; |
2140 | i->nr_segs = nr_segs; | ||
2117 | } | 2141 | } |
2118 | } | 2142 | } |
2119 | EXPORT_SYMBOL(iov_iter_advance); | 2143 | EXPORT_SYMBOL(iov_iter_advance); |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 93356cd12828..f91b2f687343 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -10,7 +10,7 @@ | |||
10 | 10 | ||
11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
12 | #include <linux/pagemap.h> | 12 | #include <linux/pagemap.h> |
13 | #include <linux/module.h> | 13 | #include <linux/export.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/mmu_notifier.h> | 16 | #include <linux/mmu_notifier.h> |
diff --git a/mm/fremap.c b/mm/fremap.c
index b8e0e2d468af..9ed4fd432467 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/swapops.h> | 14 | #include <linux/swapops.h> |
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/module.h> | ||
17 | #include <linux/syscalls.h> | 16 | #include <linux/syscalls.h> |
18 | #include <linux/mmu_notifier.h> | 17 | #include <linux/mmu_notifier.h> |
19 | 18 | ||
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2ed..57d82c6250c3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -17,7 +17,7 @@ | |||
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
20 | #include <linux/module.h> | 20 | #include <linux/export.h> |
21 | #include <linux/swap.h> | 21 | #include <linux/swap.h> |
22 | #include <linux/bio.h> | 22 | #include <linux/bio.h> |
23 | #include <linux/pagemap.h> | 23 | #include <linux/pagemap.h> |
@@ -250,7 +250,7 @@ void *kmap_high_get(struct page *page) | |||
250 | #endif | 250 | #endif |
251 | 251 | ||
252 | /** | 252 | /** |
253 | * kunmap_high - map a highmem page into memory | 253 | * kunmap_high - unmap a highmem page into memory |
254 | * @page: &struct page to unmap | 254 | * @page: &struct page to unmap |
255 | * | 255 | * |
256 | * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called | 256 | * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called |
@@ -326,7 +326,7 @@ static struct page_address_slot { | |||
326 | spinlock_t lock; /* Protect this bucket's list */ | 326 | spinlock_t lock; /* Protect this bucket's list */ |
327 | } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; | 327 | } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; |
328 | 328 | ||
329 | static struct page_address_slot *page_slot(struct page *page) | 329 | static struct page_address_slot *page_slot(const struct page *page) |
330 | { | 330 | { |
331 | return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; | 331 | return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; |
332 | } | 332 | } |
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page) | |||
337 | * | 337 | * |
338 | * Returns the page's virtual address. | 338 | * Returns the page's virtual address. |
339 | */ | 339 | */ |
340 | void *page_address(struct page *page) | 340 | void *page_address(const struct page *page) |
341 | { | 341 | { |
342 | unsigned long flags; | 342 | unsigned long flags; |
343 | void *ret; | 343 | void *ret; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd2..4298abaae153 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,7 +89,8 @@ struct khugepaged_scan { | |||
89 | struct list_head mm_head; | 89 | struct list_head mm_head; |
90 | struct mm_slot *mm_slot; | 90 | struct mm_slot *mm_slot; |
91 | unsigned long address; | 91 | unsigned long address; |
92 | } khugepaged_scan = { | 92 | }; |
93 | static struct khugepaged_scan khugepaged_scan = { | ||
93 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | 94 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), |
94 | }; | 95 | }; |
95 | 96 | ||
@@ -829,7 +830,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
829 | 830 | ||
830 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 831 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
831 | copy_user_highpage(pages[i], page + i, | 832 | copy_user_highpage(pages[i], page + i, |
832 | haddr + PAGE_SHIFT*i, vma); | 833 | haddr + PAGE_SIZE * i, vma); |
833 | __SetPageUptodate(pages[i]); | 834 | __SetPageUptodate(pages[i]); |
834 | cond_resched(); | 835 | cond_resched(); |
835 | } | 836 | } |
@@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
989 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 990 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
990 | VM_BUG_ON(!PageCompound(page)); | 991 | VM_BUG_ON(!PageCompound(page)); |
991 | if (flags & FOLL_GET) | 992 | if (flags & FOLL_GET) |
992 | get_page(page); | 993 | get_page_foll(page); |
993 | 994 | ||
994 | out: | 995 | out: |
995 | return page; | 996 | return page; |
@@ -1052,6 +1053,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1052 | return ret; | 1053 | return ret; |
1053 | } | 1054 | } |
1054 | 1055 | ||
1056 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | ||
1057 | unsigned long old_addr, | ||
1058 | unsigned long new_addr, unsigned long old_end, | ||
1059 | pmd_t *old_pmd, pmd_t *new_pmd) | ||
1060 | { | ||
1061 | int ret = 0; | ||
1062 | pmd_t pmd; | ||
1063 | |||
1064 | struct mm_struct *mm = vma->vm_mm; | ||
1065 | |||
1066 | if ((old_addr & ~HPAGE_PMD_MASK) || | ||
1067 | (new_addr & ~HPAGE_PMD_MASK) || | ||
1068 | old_end - old_addr < HPAGE_PMD_SIZE || | ||
1069 | (new_vma->vm_flags & VM_NOHUGEPAGE)) | ||
1070 | goto out; | ||
1071 | |||
1072 | /* | ||
1073 | * The destination pmd shouldn't be established, free_pgtables() | ||
1074 | * should have release it. | ||
1075 | */ | ||
1076 | if (WARN_ON(!pmd_none(*new_pmd))) { | ||
1077 | VM_BUG_ON(pmd_trans_huge(*new_pmd)); | ||
1078 | goto out; | ||
1079 | } | ||
1080 | |||
1081 | spin_lock(&mm->page_table_lock); | ||
1082 | if (likely(pmd_trans_huge(*old_pmd))) { | ||
1083 | if (pmd_trans_splitting(*old_pmd)) { | ||
1084 | spin_unlock(&mm->page_table_lock); | ||
1085 | wait_split_huge_page(vma->anon_vma, old_pmd); | ||
1086 | ret = -1; | ||
1087 | } else { | ||
1088 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | ||
1089 | VM_BUG_ON(!pmd_none(*new_pmd)); | ||
1090 | set_pmd_at(mm, new_addr, new_pmd, pmd); | ||
1091 | spin_unlock(&mm->page_table_lock); | ||
1092 | ret = 1; | ||
1093 | } | ||
1094 | } else { | ||
1095 | spin_unlock(&mm->page_table_lock); | ||
1096 | } | ||
1097 | out: | ||
1098 | return ret; | ||
1099 | } | ||
1100 | |||
1055 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1101 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1056 | unsigned long addr, pgprot_t newprot) | 1102 | unsigned long addr, pgprot_t newprot) |
1057 | { | 1103 | { |
@@ -1156,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1156 | unsigned long head_index = page->index; | 1202 | unsigned long head_index = page->index; |
1157 | struct zone *zone = page_zone(page); | 1203 | struct zone *zone = page_zone(page); |
1158 | int zonestat; | 1204 | int zonestat; |
1205 | int tail_count = 0; | ||
1159 | 1206 | ||
1160 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 1207 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
1161 | spin_lock_irq(&zone->lru_lock); | 1208 | spin_lock_irq(&zone->lru_lock); |
@@ -1164,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page) | |||
1164 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 1211 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
1165 | struct page *page_tail = page + i; | 1212 | struct page *page_tail = page + i; |
1166 | 1213 | ||
1167 | /* tail_page->_count cannot change */ | 1214 | /* tail_page->_mapcount cannot change */ |
1168 | atomic_sub(atomic_read(&page_tail->_count), &page->_count); | 1215 | BUG_ON(page_mapcount(page_tail) < 0); |
1169 | BUG_ON(page_count(page) <= 0); | 1216 | tail_count += page_mapcount(page_tail); |
1170 | atomic_add(page_mapcount(page) + 1, &page_tail->_count); | 1217 | /* check for overflow */ |
1171 | BUG_ON(atomic_read(&page_tail->_count) <= 0); | 1218 | BUG_ON(tail_count < 0); |
1219 | BUG_ON(atomic_read(&page_tail->_count) != 0); | ||
1220 | /* | ||
1221 | * tail_page->_count is zero and not changing from | ||
1222 | * under us. But get_page_unless_zero() may be running | ||
1223 | * from under us on the tail_page. If we used | ||
1224 | * atomic_set() below instead of atomic_add(), we | ||
1225 | * would then run atomic_set() concurrently with | ||
1226 | * get_page_unless_zero(), and atomic_set() is | ||
1227 | * implemented in C not using locked ops. spin_unlock | ||
1228 | * on x86 sometime uses locked ops because of PPro | ||
1229 | * errata 66, 92, so unless somebody can guarantee | ||
1230 | * atomic_set() here would be safe on all archs (and | ||
1231 | * not only on x86), it's safer to use atomic_add(). | ||
1232 | */ | ||
1233 | atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, | ||
1234 | &page_tail->_count); | ||
1172 | 1235 | ||
1173 | /* after clearing PageTail the gup refcount can be released */ | 1236 | /* after clearing PageTail the gup refcount can be released */ |
1174 | smp_mb(); | 1237 | smp_mb(); |
@@ -1186,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1186 | (1L << PG_uptodate))); | 1249 | (1L << PG_uptodate))); |
1187 | page_tail->flags |= (1L << PG_dirty); | 1250 | page_tail->flags |= (1L << PG_dirty); |
1188 | 1251 | ||
1189 | /* | 1252 | /* clear PageTail before overwriting first_page */ |
1190 | * 1) clear PageTail before overwriting first_page | ||
1191 | * 2) clear PageTail before clearing PageHead for VM_BUG_ON | ||
1192 | */ | ||
1193 | smp_wmb(); | 1253 | smp_wmb(); |
1194 | 1254 | ||
1195 | /* | 1255 | /* |
@@ -1206,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page) | |||
1206 | * status is achieved setting a reserved bit in the | 1266 | * status is achieved setting a reserved bit in the |
1207 | * pmd, not by clearing the present bit. | 1267 | * pmd, not by clearing the present bit. |
1208 | */ | 1268 | */ |
1209 | BUG_ON(page_mapcount(page_tail)); | ||
1210 | page_tail->_mapcount = page->_mapcount; | 1269 | page_tail->_mapcount = page->_mapcount; |
1211 | 1270 | ||
1212 | BUG_ON(page_tail->mapping); | 1271 | BUG_ON(page_tail->mapping); |
@@ -1223,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page) | |||
1223 | 1282 | ||
1224 | lru_add_page_tail(zone, page, page_tail); | 1283 | lru_add_page_tail(zone, page, page_tail); |
1225 | } | 1284 | } |
1285 | atomic_sub(tail_count, &page->_count); | ||
1286 | BUG_ON(atomic_read(&page->_count) <= 0); | ||
1226 | 1287 | ||
1227 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1288 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1228 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | 1289 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); |
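The refcount hunk above stops transferring each tail page's references to the head one atomic_sub() at a time: the per-tail contribution is summed into tail_count inside the loop and retired from the head page's _count once, after the loop, while the quoted comment explains why the tail's own _count is seeded with atomic_add() rather than atomic_set() (a plain store could lose a concurrent get_page_unless_zero() increment). A minimal userspace sketch of the batch-then-subtract-once pattern, with invented counts standing in for the page fields and an illustrative HPAGE_PMD_NR:

#include <stdatomic.h>
#include <stdio.h>

#define HPAGE_PMD_NR 8			/* illustrative only, not the real per-arch value */

int main(void)
{
	atomic_int head_count = 100;			/* stands in for the head page->_count */
	int tail_mapcount[HPAGE_PMD_NR] = { 0, 2, 1, 3, 1, 1, 2, 1 };
	int tail_count = 0;

	/* accumulate what every tail page contributes, as the loop above does */
	for (int i = 1; i < HPAGE_PMD_NR; i++)
		tail_count += tail_mapcount[i];

	/* ...then retire it from the head with a single atomic operation */
	atomic_fetch_sub(&head_count, tail_count);

	printf("tail_count=%d head_count=%d\n", tail_count, atomic_load(&head_count));
	return 0;
}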
@@ -1596,14 +1657,13 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
1596 | list_del(&mm_slot->mm_node); | 1657 | list_del(&mm_slot->mm_node); |
1597 | free = 1; | 1658 | free = 1; |
1598 | } | 1659 | } |
1660 | spin_unlock(&khugepaged_mm_lock); | ||
1599 | 1661 | ||
1600 | if (free) { | 1662 | if (free) { |
1601 | spin_unlock(&khugepaged_mm_lock); | ||
1602 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | 1663 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); |
1603 | free_mm_slot(mm_slot); | 1664 | free_mm_slot(mm_slot); |
1604 | mmdrop(mm); | 1665 | mmdrop(mm); |
1605 | } else if (mm_slot) { | 1666 | } else if (mm_slot) { |
1606 | spin_unlock(&khugepaged_mm_lock); | ||
1607 | /* | 1667 | /* |
1608 | * This is required to serialize against | 1668 | * This is required to serialize against |
1609 | * khugepaged_test_exit() (which is guaranteed to run | 1669 | * khugepaged_test_exit() (which is guaranteed to run |
@@ -1614,8 +1674,7 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
1614 | */ | 1674 | */ |
1615 | down_write(&mm->mmap_sem); | 1675 | down_write(&mm->mmap_sem); |
1616 | up_write(&mm->mmap_sem); | 1676 | up_write(&mm->mmap_sem); |
1617 | } else | 1677 | } |
1618 | spin_unlock(&khugepaged_mm_lock); | ||
1619 | } | 1678 | } |
1620 | 1679 | ||
1621 | static void release_pte_page(struct page *page) | 1680 | static void release_pte_page(struct page *page) |
@@ -1908,7 +1967,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1908 | BUG_ON(!pmd_none(*pmd)); | 1967 | BUG_ON(!pmd_none(*pmd)); |
1909 | page_add_new_anon_rmap(new_page, vma, address); | 1968 | page_add_new_anon_rmap(new_page, vma, address); |
1910 | set_pmd_at(mm, address, pmd, _pmd); | 1969 | set_pmd_at(mm, address, pmd, _pmd); |
1911 | update_mmu_cache(vma, address, entry); | 1970 | update_mmu_cache(vma, address, _pmd); |
1912 | prepare_pmd_huge_pte(pgtable, mm); | 1971 | prepare_pmd_huge_pte(pgtable, mm); |
1913 | mm->nr_ptes--; | 1972 | mm->nr_ptes--; |
1914 | spin_unlock(&mm->page_table_lock); | 1973 | spin_unlock(&mm->page_table_lock); |
@@ -2026,6 +2085,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |||
2026 | 2085 | ||
2027 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | 2086 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, |
2028 | struct page **hpage) | 2087 | struct page **hpage) |
2088 | __releases(&khugepaged_mm_lock) | ||
2089 | __acquires(&khugepaged_mm_lock) | ||
2029 | { | 2090 | { |
2030 | struct mm_slot *mm_slot; | 2091 | struct mm_slot *mm_slot; |
2031 | struct mm_struct *mm; | 2092 | struct mm_struct *mm; |
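The two annotations added to khugepaged_scan_mm_slot() are sparse lock-context hints: the function is entered with khugepaged_mm_lock held, drops it while scanning, and re-takes it before returning. A toy stand-alone illustration of the same idiom follows; the lock, the function and the attribute definitions are stand-ins (the kernel gets __releases/__acquires from its compiler headers), and outside a sparse run the attributes compile away to nothing:

#include <pthread.h>
#include <stdio.h>

#ifdef __CHECKER__
# define __releases(x)	__attribute__((context(x, 1, 0)))
# define __acquires(x)	__attribute__((context(x, 0, 1)))
#else
# define __releases(x)
# define __acquires(x)
#endif

static pthread_mutex_t scan_lock = PTHREAD_MUTEX_INITIALIZER;

static void do_scan(void)
	__releases(scan_lock)
	__acquires(scan_lock)
{
	pthread_mutex_unlock(&scan_lock);	/* heavy work happens unlocked */
	puts("scanning without the lock");
	pthread_mutex_lock(&scan_lock);		/* caller sees the lock held again */
}

int main(void)
{
	pthread_mutex_lock(&scan_lock);
	do_scan();
	pthread_mutex_unlock(&scan_lock);
	return 0;
}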
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bfcf153bc829..bb28a5f9db8d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -24,7 +24,7 @@ | |||
24 | 24 | ||
25 | #include <asm/page.h> | 25 | #include <asm/page.h> |
26 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
27 | #include <asm/io.h> | 27 | #include <linux/io.h> |
28 | 28 | ||
29 | #include <linux/hugetlb.h> | 29 | #include <linux/hugetlb.h> |
30 | #include <linux/node.h> | 30 | #include <linux/node.h> |
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock); | |||
62 | * must either hold the mmap_sem for write, or the mmap_sem for read and | 62 | * must either hold the mmap_sem for write, or the mmap_sem for read and |
63 | * the hugetlb_instantiation mutex: | 63 | * the hugetlb_instantiation mutex: |
64 | * | 64 | * |
65 | * down_write(&mm->mmap_sem); | 65 | * down_write(&mm->mmap_sem); |
66 | * or | 66 | * or |
67 | * down_read(&mm->mmap_sem); | 67 | * down_read(&mm->mmap_sem); |
68 | * mutex_lock(&hugetlb_instantiation_mutex); | 68 | * mutex_lock(&hugetlb_instantiation_mutex); |
69 | */ | 69 | */ |
70 | struct file_region { | 70 | struct file_region { |
71 | struct list_head link; | 71 | struct list_head link; |
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page) | |||
503 | h->nr_huge_pages--; | 503 | h->nr_huge_pages--; |
504 | h->nr_huge_pages_node[page_to_nid(page)]--; | 504 | h->nr_huge_pages_node[page_to_nid(page)]--; |
505 | for (i = 0; i < pages_per_huge_page(h); i++) { | 505 | for (i = 0; i < pages_per_huge_page(h); i++) { |
506 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 506 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | |
507 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 507 | 1 << PG_referenced | 1 << PG_dirty | |
508 | 1 << PG_private | 1<< PG_writeback); | 508 | 1 << PG_active | 1 << PG_reserved | |
509 | 1 << PG_private | 1 << PG_writeback); | ||
509 | } | 510 | } |
510 | set_compound_page_dtor(page, NULL); | 511 | set_compound_page_dtor(page, NULL); |
511 | set_page_refcounted(page); | 512 | set_page_refcounted(page); |
@@ -591,7 +592,6 @@ int PageHuge(struct page *page) | |||
591 | 592 | ||
592 | return dtor == free_huge_page; | 593 | return dtor == free_huge_page; |
593 | } | 594 | } |
594 | |||
595 | EXPORT_SYMBOL_GPL(PageHuge); | 595 | EXPORT_SYMBOL_GPL(PageHuge); |
596 | 596 | ||
597 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | 597 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
@@ -1105,8 +1105,16 @@ static void __init gather_bootmem_prealloc(void) | |||
1105 | struct huge_bootmem_page *m; | 1105 | struct huge_bootmem_page *m; |
1106 | 1106 | ||
1107 | list_for_each_entry(m, &huge_boot_pages, list) { | 1107 | list_for_each_entry(m, &huge_boot_pages, list) { |
1108 | struct page *page = virt_to_page(m); | ||
1109 | struct hstate *h = m->hstate; | 1108 | struct hstate *h = m->hstate; |
1109 | struct page *page; | ||
1110 | |||
1111 | #ifdef CONFIG_HIGHMEM | ||
1112 | page = pfn_to_page(m->phys >> PAGE_SHIFT); | ||
1113 | free_bootmem_late((unsigned long)m, | ||
1114 | sizeof(struct huge_bootmem_page)); | ||
1115 | #else | ||
1116 | page = virt_to_page(m); | ||
1117 | #endif | ||
1110 | __ClearPageReserved(page); | 1118 | __ClearPageReserved(page); |
1111 | WARN_ON(page_count(page) != 1); | 1119 | WARN_ON(page_count(page) != 1); |
1112 | prep_compound_huge_page(page, h->order); | 1120 | prep_compound_huge_page(page, h->order); |
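The new CONFIG_HIGHMEM branch in gather_bootmem_prealloc() switches from virt_to_page() to pfn_to_page(), presumably because on 32-bit highmem configurations the bootmem-allocated huge_bootmem_page record need not sit in the kernel's direct mapping, so only its recorded physical address is reliable. The conversion itself is a shift by PAGE_SHIFT; a tiny illustration with an arbitrary physical address and the common 4 KiB page size:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12		/* 4 KiB pages, for illustration */

int main(void)
{
	uint64_t phys = 0x37e00000;		/* made-up physical address */
	uint64_t pfn  = phys >> PAGE_SHIFT;	/* page frame number, as pfn_to_page() needs */

	printf("phys %#llx -> pfn %llu (offset within page %#llx)\n",
	       (unsigned long long)phys,
	       (unsigned long long)pfn,
	       (unsigned long long)(phys & ((1ULL << PAGE_SHIFT) - 1)));
	return 0;
}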
@@ -2124,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
2124 | pte_t entry; | 2132 | pte_t entry; |
2125 | 2133 | ||
2126 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); | 2134 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
2127 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 2135 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) |
2128 | update_mmu_cache(vma, address, ptep); | 2136 | update_mmu_cache(vma, address, ptep); |
2129 | } | ||
2130 | } | 2137 | } |
2131 | 2138 | ||
2132 | 2139 | ||
@@ -2181,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte) | |||
2181 | if (huge_pte_none(pte) || pte_present(pte)) | 2188 | if (huge_pte_none(pte) || pte_present(pte)) |
2182 | return 0; | 2189 | return 0; |
2183 | swp = pte_to_swp_entry(pte); | 2190 | swp = pte_to_swp_entry(pte); |
2184 | if (non_swap_entry(swp) && is_migration_entry(swp)) { | 2191 | if (non_swap_entry(swp) && is_migration_entry(swp)) |
2185 | return 1; | 2192 | return 1; |
2186 | } else | 2193 | else |
2187 | return 0; | 2194 | return 0; |
2188 | } | 2195 | } |
2189 | 2196 | ||
@@ -2194,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) | |||
2194 | if (huge_pte_none(pte) || pte_present(pte)) | 2201 | if (huge_pte_none(pte) || pte_present(pte)) |
2195 | return 0; | 2202 | return 0; |
2196 | swp = pte_to_swp_entry(pte); | 2203 | swp = pte_to_swp_entry(pte); |
2197 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { | 2204 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) |
2198 | return 1; | 2205 | return 1; |
2199 | } else | 2206 | else |
2200 | return 0; | 2207 | return 0; |
2201 | } | 2208 | } |
2202 | 2209 | ||
@@ -2415,6 +2422,8 @@ retry_avoidcopy: | |||
2415 | * anon_vma prepared. | 2422 | * anon_vma prepared. |
2416 | */ | 2423 | */ |
2417 | if (unlikely(anon_vma_prepare(vma))) { | 2424 | if (unlikely(anon_vma_prepare(vma))) { |
2425 | page_cache_release(new_page); | ||
2426 | page_cache_release(old_page); | ||
2418 | /* Caller expects lock to be held */ | 2427 | /* Caller expects lock to be held */ |
2419 | spin_lock(&mm->page_table_lock); | 2428 | spin_lock(&mm->page_table_lock); |
2420 | return VM_FAULT_OOM; | 2429 | return VM_FAULT_OOM; |
@@ -2559,7 +2568,7 @@ retry: | |||
2559 | * So we need to block hugepage fault by PG_hwpoison bit check. | 2568 | * So we need to block hugepage fault by PG_hwpoison bit check. |
2560 | */ | 2569 | */ |
2561 | if (unlikely(PageHWPoison(page))) { | 2570 | if (unlikely(PageHWPoison(page))) { |
2562 | ret = VM_FAULT_HWPOISON | | 2571 | ret = VM_FAULT_HWPOISON | |
2563 | VM_FAULT_SET_HINDEX(h - hstates); | 2572 | VM_FAULT_SET_HINDEX(h - hstates); |
2564 | goto backout_unlocked; | 2573 | goto backout_unlocked; |
2565 | } | 2574 | } |
@@ -2627,7 +2636,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2627 | migration_entry_wait(mm, (pmd_t *)ptep, address); | 2636 | migration_entry_wait(mm, (pmd_t *)ptep, address); |
2628 | return 0; | 2637 | return 0; |
2629 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2638 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
2630 | return VM_FAULT_HWPOISON_LARGE | | 2639 | return VM_FAULT_HWPOISON_LARGE | |
2631 | VM_FAULT_SET_HINDEX(h - hstates); | 2640 | VM_FAULT_SET_HINDEX(h - hstates); |
2632 | } | 2641 | } |
2633 | 2642 | ||
diff --git a/mm/init-mm.c b/mm/init-mm.c index 4019979b2637..a56a851908d2 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c | |||
@@ -5,7 +5,7 @@ | |||
5 | #include <linux/list.h> | 5 | #include <linux/list.h> |
6 | #include <linux/cpumask.h> | 6 | #include <linux/cpumask.h> |
7 | 7 | ||
8 | #include <asm/atomic.h> | 8 | #include <linux/atomic.h> |
9 | #include <asm/pgtable.h> | 9 | #include <asm/pgtable.h> |
10 | #include <asm/mmu.h> | 10 | #include <asm/mmu.h> |
11 | 11 | ||
diff --git a/mm/internal.h b/mm/internal.h index d071d380fb49..2189af491783 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) | |||
37 | atomic_dec(&page->_count); | 37 | atomic_dec(&page->_count); |
38 | } | 38 | } |
39 | 39 | ||
40 | static inline void __get_page_tail_foll(struct page *page, | ||
41 | bool get_page_head) | ||
42 | { | ||
43 | /* | ||
44 | * If we're getting a tail page, the elevated page->_count is | ||
45 | * required only in the head page and we will elevate the head | ||
46 | * page->_count and tail page->_mapcount. | ||
47 | * | ||
48 | * We elevate page_tail->_mapcount for tail pages to force | ||
49 | * page_tail->_count to be zero at all times to avoid getting | ||
50 | * false positives from get_page_unless_zero() with | ||
51 | * speculative page access (like in | ||
52 | * page_cache_get_speculative()) on tail pages. | ||
53 | */ | ||
54 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); | ||
55 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
56 | VM_BUG_ON(page_mapcount(page) < 0); | ||
57 | if (get_page_head) | ||
58 | atomic_inc(&page->first_page->_count); | ||
59 | atomic_inc(&page->_mapcount); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * This is meant to be called as the FOLL_GET operation of | ||
64 | * follow_page() and it must be called while holding the proper PT | ||
65 | * lock while the pte (or pmd_trans_huge) is still mapping the page. | ||
66 | */ | ||
67 | static inline void get_page_foll(struct page *page) | ||
68 | { | ||
69 | if (unlikely(PageTail(page))) | ||
70 | /* | ||
71 | * This is safe only because | ||
72 | * __split_huge_page_refcount() can't run under | ||
73 | * get_page_foll() because we hold the proper PT lock. | ||
74 | */ | ||
75 | __get_page_tail_foll(page, true); | ||
76 | else { | ||
77 | /* | ||
78 | * Getting a normal page or the head of a compound page | ||
79 | * requires to already have an elevated page->_count. | ||
80 | */ | ||
81 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | ||
82 | atomic_inc(&page->_count); | ||
83 | } | ||
84 | } | ||
85 | |||
40 | extern unsigned long highest_memmap_pfn; | 86 | extern unsigned long highest_memmap_pfn; |
41 | 87 | ||
42 | /* | 88 | /* |
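The helpers added to mm/internal.h spell out how a FOLL_GET path pins a compound page while the page-table lock is held: a tail page is pinned by raising the head's _count plus the tail's _mapcount, so the tail's own _count stays at zero and speculative lookups such as page_cache_get_speculative() cannot mistake it for an ordinary page. A stand-alone sketch of that bookkeeping, using invented structures rather than struct page:

#include <stdatomic.h>
#include <stdio.h>

struct fake_page {
	atomic_int count;		/* like page->_count */
	atomic_int mapcount;		/* like page->_mapcount (bias omitted) */
	struct fake_page *first_page;	/* tail pages point at their head */
};

static void get_tail_foll(struct fake_page *tail, int get_head)
{
	if (get_head)
		atomic_fetch_add(&tail->first_page->count, 1);
	atomic_fetch_add(&tail->mapcount, 1);	/* tail->count deliberately stays 0 */
}

int main(void)
{
	struct fake_page head = { .count = 1, .mapcount = 0, .first_page = &head };
	struct fake_page tail = { .count = 0, .mapcount = 0, .first_page = &head };

	get_tail_foll(&tail, 1);
	printf("head.count=%d tail.count=%d tail.mapcount=%d\n",
	       atomic_load(&head.count), atomic_load(&tail.count),
	       atomic_load(&tail.mapcount));
	return 0;
}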
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index aacee45616fc..f3b2a00fe9c1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -69,7 +69,7 @@ | |||
69 | #include <linux/sched.h> | 69 | #include <linux/sched.h> |
70 | #include <linux/jiffies.h> | 70 | #include <linux/jiffies.h> |
71 | #include <linux/delay.h> | 71 | #include <linux/delay.h> |
72 | #include <linux/module.h> | 72 | #include <linux/export.h> |
73 | #include <linux/kthread.h> | 73 | #include <linux/kthread.h> |
74 | #include <linux/prio_tree.h> | 74 | #include <linux/prio_tree.h> |
75 | #include <linux/fs.h> | 75 | #include <linux/fs.h> |
@@ -96,7 +96,7 @@ | |||
96 | 96 | ||
97 | #include <asm/sections.h> | 97 | #include <asm/sections.h> |
98 | #include <asm/processor.h> | 98 | #include <asm/processor.h> |
99 | #include <asm/atomic.h> | 99 | #include <linux/atomic.h> |
100 | 100 | ||
101 | #include <linux/kmemcheck.h> | 101 | #include <linux/kmemcheck.h> |
102 | #include <linux/kmemleak.h> | 102 | #include <linux/kmemleak.h> |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -1905,7 +1905,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1905 | 1905 | ||
1906 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1906 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
1907 | err = unmerge_and_remove_all_rmap_items(); | 1907 | err = unmerge_and_remove_all_rmap_items(); |
1908 | test_set_oom_score_adj(oom_score_adj); | 1908 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, |
1909 | oom_score_adj); | ||
1909 | if (err) { | 1910 | if (err) { |
1910 | ksm_run = KSM_RUN_STOP; | 1911 | ksm_run = KSM_RUN_STOP; |
1911 | count = err; | 1912 | count = err; |
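The run_store() hunk above (mm/ksm.c's sysfs handler for the run attribute) replaces the unconditional restore of the saved oom_score_adj with compare_swap_oom_score_adj(), so the old value is written back only if the score still holds what run_store() put there. A userspace sketch of that restore-if-unchanged idiom; the helpers and values below are illustrative, not the kernel interface:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int score = 0;

/* put @new in place and hand back whatever was there before */
static int test_set(int new)
{
	return atomic_exchange(&score, new);
}

/* restore @old only if @expect is still in place */
static void compare_swap(int expect, int old)
{
	atomic_compare_exchange_strong(&score, &expect, old);
}

int main(void)
{
	int saved = test_set(1000);	/* raise the score for the long operation */
	/* ... long operation; another writer may legitimately change score ... */
	compare_swap(1000, saved);	/* back off only if it is still our value */
	printf("score=%d\n", atomic_load(&score));
	return 0;
}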
diff --git a/mm/maccess.c b/mm/maccess.c index 4cee182ab5f3..d53adf9ba84b 100644 --- a/mm/maccess.c +++ b/mm/maccess.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Access kernel memory without faulting. | 2 | * Access kernel memory without faulting. |
3 | */ | 3 | */ |
4 | #include <linux/module.h> | 4 | #include <linux/export.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/uaccess.h> | 6 | #include <linux/uaccess.h> |
7 | 7 | ||
diff --git a/mm/madvise.c b/mm/madvise.c index 2221491ed503..74bf193eff04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
218 | endoff = (loff_t)(end - vma->vm_start - 1) | 218 | endoff = (loff_t)(end - vma->vm_start - 1) |
219 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 219 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
220 | 220 | ||
221 | /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ | 221 | /* vmtruncate_range needs to take i_mutex */ |
222 | up_read(¤t->mm->mmap_sem); | 222 | up_read(¤t->mm->mmap_sem); |
223 | error = vmtruncate_range(mapping->host, offset, endoff); | 223 | error = vmtruncate_range(mapping->host, offset, endoff); |
224 | down_read(¤t->mm->mmap_sem); | 224 | down_read(¤t->mm->mmap_sem); |
diff --git a/mm/memblock.c b/mm/memblock.c index a75723d62631..a57092f63a86 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -47,7 +47,8 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p | |||
47 | return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); | 47 | return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); |
48 | } | 48 | } |
49 | 49 | ||
50 | long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) | 50 | static long __init_memblock memblock_overlaps_region(struct memblock_type *type, |
51 | phys_addr_t base, phys_addr_t size) | ||
51 | { | 52 | { |
52 | unsigned long i; | 53 | unsigned long i; |
53 | 54 | ||
@@ -773,6 +774,12 @@ phys_addr_t __init memblock_phys_mem_size(void) | |||
773 | return memblock.memory_size; | 774 | return memblock.memory_size; |
774 | } | 775 | } |
775 | 776 | ||
777 | /* lowest address */ | ||
778 | phys_addr_t __init_memblock memblock_start_of_DRAM(void) | ||
779 | { | ||
780 | return memblock.memory.regions[0].base; | ||
781 | } | ||
782 | |||
776 | phys_addr_t __init_memblock memblock_end_of_DRAM(void) | 783 | phys_addr_t __init_memblock memblock_end_of_DRAM(void) |
777 | { | 784 | { |
778 | int idx = memblock.memory.cnt - 1; | 785 | int idx = memblock.memory.cnt - 1; |
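The new memblock_start_of_DRAM() simply reports the base of the lowest memory region, mirroring the existing memblock_end_of_DRAM() which returns the end of the highest one. A toy region table makes the pair concrete; the addresses are invented and the region layout stands in for memblock's sorted array:

#include <stdio.h>
#include <stdint.h>

struct region { uint64_t base, size; };

int main(void)
{
	struct region memory[] = {		/* kept sorted, as memblock keeps its regions */
		{ 0x00100000,      0x3fe00000 },
		{ 0x100000000ULL,  0x40000000 },
	};
	int cnt = 2;

	uint64_t start = memory[0].base;				/* start_of_DRAM */
	uint64_t end   = memory[cnt - 1].base + memory[cnt - 1].size;	/* end_of_DRAM */

	printf("DRAM spans %#llx - %#llx\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}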
@@ -912,9 +919,9 @@ void __init memblock_analyze(void) | |||
912 | 919 | ||
913 | /* Check marker in the unused last array entry */ | 920 | /* Check marker in the unused last array entry */ |
914 | WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base | 921 | WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base |
915 | != (phys_addr_t)RED_INACTIVE); | 922 | != MEMBLOCK_INACTIVE); |
916 | WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base | 923 | WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base |
917 | != (phys_addr_t)RED_INACTIVE); | 924 | != MEMBLOCK_INACTIVE); |
918 | 925 | ||
919 | memblock.memory_size = 0; | 926 | memblock.memory_size = 0; |
920 | 927 | ||
@@ -940,8 +947,8 @@ void __init memblock_init(void) | |||
940 | memblock.reserved.max = INIT_MEMBLOCK_REGIONS; | 947 | memblock.reserved.max = INIT_MEMBLOCK_REGIONS; |
941 | 948 | ||
942 | /* Write a marker in the unused last array entry */ | 949 | /* Write a marker in the unused last array entry */ |
943 | memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; | 950 | memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; |
944 | memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; | 951 | memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; |
945 | 952 | ||
946 | /* Create a dummy zero size MEMBLOCK which will get coalesced away later. | 953 | /* Create a dummy zero size MEMBLOCK which will get coalesced away later. |
947 | * This simplifies the memblock_add() code below... | 954 | * This simplifies the memblock_add() code below... |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e013b8e57d25..6aff93c98aca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -33,9 +33,9 @@ | |||
33 | #include <linux/bit_spinlock.h> | 33 | #include <linux/bit_spinlock.h> |
34 | #include <linux/rcupdate.h> | 34 | #include <linux/rcupdate.h> |
35 | #include <linux/limits.h> | 35 | #include <linux/limits.h> |
36 | #include <linux/export.h> | ||
36 | #include <linux/mutex.h> | 37 | #include <linux/mutex.h> |
37 | #include <linux/rbtree.h> | 38 | #include <linux/rbtree.h> |
38 | #include <linux/shmem_fs.h> | ||
39 | #include <linux/slab.h> | 39 | #include <linux/slab.h> |
40 | #include <linux/swap.h> | 40 | #include <linux/swap.h> |
41 | #include <linux/swapops.h> | 41 | #include <linux/swapops.h> |
@@ -202,8 +202,8 @@ struct mem_cgroup_eventfd_list { | |||
202 | struct eventfd_ctx *eventfd; | 202 | struct eventfd_ctx *eventfd; |
203 | }; | 203 | }; |
204 | 204 | ||
205 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 205 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); |
206 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | 206 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
207 | 207 | ||
208 | /* | 208 | /* |
209 | * The memory controller data structure. The memory controller controls both | 209 | * The memory controller data structure. The memory controller controls both |
@@ -246,10 +246,13 @@ struct mem_cgroup { | |||
246 | * Should the accounting and control be hierarchical, per subtree? | 246 | * Should the accounting and control be hierarchical, per subtree? |
247 | */ | 247 | */ |
248 | bool use_hierarchy; | 248 | bool use_hierarchy; |
249 | atomic_t oom_lock; | 249 | |
250 | bool oom_lock; | ||
251 | atomic_t under_oom; | ||
252 | |||
250 | atomic_t refcnt; | 253 | atomic_t refcnt; |
251 | 254 | ||
252 | unsigned int swappiness; | 255 | int swappiness; |
253 | /* OOM-Killer disable */ | 256 | /* OOM-Killer disable */ |
254 | int oom_kill_disable; | 257 | int oom_kill_disable; |
255 | 258 | ||
@@ -360,29 +363,29 @@ enum charge_type { | |||
360 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | 363 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 |
361 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | 364 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) |
362 | 365 | ||
363 | static void mem_cgroup_get(struct mem_cgroup *mem); | 366 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
364 | static void mem_cgroup_put(struct mem_cgroup *mem); | 367 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
365 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 368 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); |
366 | static void drain_all_stock_async(struct mem_cgroup *mem); | 369 | static void drain_all_stock_async(struct mem_cgroup *memcg); |
367 | 370 | ||
368 | static struct mem_cgroup_per_zone * | 371 | static struct mem_cgroup_per_zone * |
369 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 372 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) |
370 | { | 373 | { |
371 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 374 | return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; |
372 | } | 375 | } |
373 | 376 | ||
374 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | 377 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) |
375 | { | 378 | { |
376 | return &mem->css; | 379 | return &memcg->css; |
377 | } | 380 | } |
378 | 381 | ||
379 | static struct mem_cgroup_per_zone * | 382 | static struct mem_cgroup_per_zone * |
380 | page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) | 383 | page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) |
381 | { | 384 | { |
382 | int nid = page_to_nid(page); | 385 | int nid = page_to_nid(page); |
383 | int zid = page_zonenum(page); | 386 | int zid = page_zonenum(page); |
384 | 387 | ||
385 | return mem_cgroup_zoneinfo(mem, nid, zid); | 388 | return mem_cgroup_zoneinfo(memcg, nid, zid); |
386 | } | 389 | } |
387 | 390 | ||
388 | static struct mem_cgroup_tree_per_zone * | 391 | static struct mem_cgroup_tree_per_zone * |
@@ -401,7 +404,7 @@ soft_limit_tree_from_page(struct page *page) | |||
401 | } | 404 | } |
402 | 405 | ||
403 | static void | 406 | static void |
404 | __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | 407 | __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, |
405 | struct mem_cgroup_per_zone *mz, | 408 | struct mem_cgroup_per_zone *mz, |
406 | struct mem_cgroup_tree_per_zone *mctz, | 409 | struct mem_cgroup_tree_per_zone *mctz, |
407 | unsigned long long new_usage_in_excess) | 410 | unsigned long long new_usage_in_excess) |
@@ -435,7 +438,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | |||
435 | } | 438 | } |
436 | 439 | ||
437 | static void | 440 | static void |
438 | __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | 441 | __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, |
439 | struct mem_cgroup_per_zone *mz, | 442 | struct mem_cgroup_per_zone *mz, |
440 | struct mem_cgroup_tree_per_zone *mctz) | 443 | struct mem_cgroup_tree_per_zone *mctz) |
441 | { | 444 | { |
@@ -446,17 +449,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | |||
446 | } | 449 | } |
447 | 450 | ||
448 | static void | 451 | static void |
449 | mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | 452 | mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, |
450 | struct mem_cgroup_per_zone *mz, | 453 | struct mem_cgroup_per_zone *mz, |
451 | struct mem_cgroup_tree_per_zone *mctz) | 454 | struct mem_cgroup_tree_per_zone *mctz) |
452 | { | 455 | { |
453 | spin_lock(&mctz->lock); | 456 | spin_lock(&mctz->lock); |
454 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | 457 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); |
455 | spin_unlock(&mctz->lock); | 458 | spin_unlock(&mctz->lock); |
456 | } | 459 | } |
457 | 460 | ||
458 | 461 | ||
459 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 462 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) |
460 | { | 463 | { |
461 | unsigned long long excess; | 464 | unsigned long long excess; |
462 | struct mem_cgroup_per_zone *mz; | 465 | struct mem_cgroup_per_zone *mz; |
@@ -469,9 +472,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | |||
469 | * Necessary to update all ancestors when hierarchy is used. | 472 | * Necessary to update all ancestors when hierarchy is used. |
470 | * because their event counter is not touched. | 473 | * because their event counter is not touched. |
471 | */ | 474 | */ |
472 | for (; mem; mem = parent_mem_cgroup(mem)) { | 475 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { |
473 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 476 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
474 | excess = res_counter_soft_limit_excess(&mem->res); | 477 | excess = res_counter_soft_limit_excess(&memcg->res); |
475 | /* | 478 | /* |
476 | * We have to update the tree if mz is on RB-tree or | 479 | * We have to update the tree if mz is on RB-tree or |
477 | * mem is over its softlimit. | 480 | * mem is over its softlimit. |
@@ -480,18 +483,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | |||
480 | spin_lock(&mctz->lock); | 483 | spin_lock(&mctz->lock); |
481 | /* if on-tree, remove it */ | 484 | /* if on-tree, remove it */ |
482 | if (mz->on_tree) | 485 | if (mz->on_tree) |
483 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | 486 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); |
484 | /* | 487 | /* |
485 | * Insert again. mz->usage_in_excess will be updated. | 488 | * Insert again. mz->usage_in_excess will be updated. |
486 | * If excess is 0, no tree ops. | 489 | * If excess is 0, no tree ops. |
487 | */ | 490 | */ |
488 | __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); | 491 | __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); |
489 | spin_unlock(&mctz->lock); | 492 | spin_unlock(&mctz->lock); |
490 | } | 493 | } |
491 | } | 494 | } |
492 | } | 495 | } |
493 | 496 | ||
494 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | 497 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) |
495 | { | 498 | { |
496 | int node, zone; | 499 | int node, zone; |
497 | struct mem_cgroup_per_zone *mz; | 500 | struct mem_cgroup_per_zone *mz; |
@@ -499,9 +502,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | |||
499 | 502 | ||
500 | for_each_node_state(node, N_POSSIBLE) { | 503 | for_each_node_state(node, N_POSSIBLE) { |
501 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 504 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
502 | mz = mem_cgroup_zoneinfo(mem, node, zone); | 505 | mz = mem_cgroup_zoneinfo(memcg, node, zone); |
503 | mctz = soft_limit_tree_node_zone(node, zone); | 506 | mctz = soft_limit_tree_node_zone(node, zone); |
504 | mem_cgroup_remove_exceeded(mem, mz, mctz); | 507 | mem_cgroup_remove_exceeded(memcg, mz, mctz); |
505 | } | 508 | } |
506 | } | 509 | } |
507 | } | 510 | } |
@@ -562,7 +565,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
562 | * common workload, threashold and synchonization as vmstat[] should be | 565 | * common workload, threashold and synchonization as vmstat[] should be |
563 | * implemented. | 566 | * implemented. |
564 | */ | 567 | */ |
565 | static long mem_cgroup_read_stat(struct mem_cgroup *mem, | 568 | static long mem_cgroup_read_stat(struct mem_cgroup *memcg, |
566 | enum mem_cgroup_stat_index idx) | 569 | enum mem_cgroup_stat_index idx) |
567 | { | 570 | { |
568 | long val = 0; | 571 | long val = 0; |
@@ -570,111 +573,131 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem, | |||
570 | 573 | ||
571 | get_online_cpus(); | 574 | get_online_cpus(); |
572 | for_each_online_cpu(cpu) | 575 | for_each_online_cpu(cpu) |
573 | val += per_cpu(mem->stat->count[idx], cpu); | 576 | val += per_cpu(memcg->stat->count[idx], cpu); |
574 | #ifdef CONFIG_HOTPLUG_CPU | 577 | #ifdef CONFIG_HOTPLUG_CPU |
575 | spin_lock(&mem->pcp_counter_lock); | 578 | spin_lock(&memcg->pcp_counter_lock); |
576 | val += mem->nocpu_base.count[idx]; | 579 | val += memcg->nocpu_base.count[idx]; |
577 | spin_unlock(&mem->pcp_counter_lock); | 580 | spin_unlock(&memcg->pcp_counter_lock); |
578 | #endif | 581 | #endif |
579 | put_online_cpus(); | 582 | put_online_cpus(); |
580 | return val; | 583 | return val; |
581 | } | 584 | } |
582 | 585 | ||
583 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 586 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, |
584 | bool charge) | 587 | bool charge) |
585 | { | 588 | { |
586 | int val = (charge) ? 1 : -1; | 589 | int val = (charge) ? 1 : -1; |
587 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 590 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
588 | } | 591 | } |
589 | 592 | ||
590 | void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) | 593 | void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) |
591 | { | 594 | { |
592 | this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); | 595 | this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); |
593 | } | 596 | } |
594 | 597 | ||
595 | void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) | 598 | void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) |
596 | { | 599 | { |
597 | this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); | 600 | this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); |
598 | } | 601 | } |
599 | 602 | ||
600 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, | 603 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
601 | enum mem_cgroup_events_index idx) | 604 | enum mem_cgroup_events_index idx) |
602 | { | 605 | { |
603 | unsigned long val = 0; | 606 | unsigned long val = 0; |
604 | int cpu; | 607 | int cpu; |
605 | 608 | ||
606 | for_each_online_cpu(cpu) | 609 | for_each_online_cpu(cpu) |
607 | val += per_cpu(mem->stat->events[idx], cpu); | 610 | val += per_cpu(memcg->stat->events[idx], cpu); |
608 | #ifdef CONFIG_HOTPLUG_CPU | 611 | #ifdef CONFIG_HOTPLUG_CPU |
609 | spin_lock(&mem->pcp_counter_lock); | 612 | spin_lock(&memcg->pcp_counter_lock); |
610 | val += mem->nocpu_base.events[idx]; | 613 | val += memcg->nocpu_base.events[idx]; |
611 | spin_unlock(&mem->pcp_counter_lock); | 614 | spin_unlock(&memcg->pcp_counter_lock); |
612 | #endif | 615 | #endif |
613 | return val; | 616 | return val; |
614 | } | 617 | } |
615 | 618 | ||
616 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 619 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
617 | bool file, int nr_pages) | 620 | bool file, int nr_pages) |
618 | { | 621 | { |
619 | preempt_disable(); | 622 | preempt_disable(); |
620 | 623 | ||
621 | if (file) | 624 | if (file) |
622 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); | 625 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], |
626 | nr_pages); | ||
623 | else | 627 | else |
624 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); | 628 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], |
629 | nr_pages); | ||
625 | 630 | ||
626 | /* pagein of a big page is an event. So, ignore page size */ | 631 | /* pagein of a big page is an event. So, ignore page size */ |
627 | if (nr_pages > 0) | 632 | if (nr_pages > 0) |
628 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); | 633 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); |
629 | else { | 634 | else { |
630 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); | 635 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); |
631 | nr_pages = -nr_pages; /* for event */ | 636 | nr_pages = -nr_pages; /* for event */ |
632 | } | 637 | } |
633 | 638 | ||
634 | __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); | 639 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); |
635 | 640 | ||
636 | preempt_enable(); | 641 | preempt_enable(); |
637 | } | 642 | } |
638 | 643 | ||
639 | static unsigned long | 644 | unsigned long |
640 | mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) | 645 | mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, |
646 | unsigned int lru_mask) | ||
641 | { | 647 | { |
642 | struct mem_cgroup_per_zone *mz; | 648 | struct mem_cgroup_per_zone *mz; |
649 | enum lru_list l; | ||
650 | unsigned long ret = 0; | ||
651 | |||
652 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
653 | |||
654 | for_each_lru(l) { | ||
655 | if (BIT(l) & lru_mask) | ||
656 | ret += MEM_CGROUP_ZSTAT(mz, l); | ||
657 | } | ||
658 | return ret; | ||
659 | } | ||
660 | |||
661 | static unsigned long | ||
662 | mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, | ||
663 | int nid, unsigned int lru_mask) | ||
664 | { | ||
643 | u64 total = 0; | 665 | u64 total = 0; |
644 | int zid; | 666 | int zid; |
645 | 667 | ||
646 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 668 | for (zid = 0; zid < MAX_NR_ZONES; zid++) |
647 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 669 | total += mem_cgroup_zone_nr_lru_pages(memcg, |
648 | total += MEM_CGROUP_ZSTAT(mz, idx); | 670 | nid, zid, lru_mask); |
649 | } | 671 | |
650 | return total; | 672 | return total; |
651 | } | 673 | } |
652 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 674 | |
653 | enum lru_list idx) | 675 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, |
676 | unsigned int lru_mask) | ||
654 | { | 677 | { |
655 | int nid; | 678 | int nid; |
656 | u64 total = 0; | 679 | u64 total = 0; |
657 | 680 | ||
658 | for_each_online_node(nid) | 681 | for_each_node_state(nid, N_HIGH_MEMORY) |
659 | total += mem_cgroup_get_zonestat_node(mem, nid, idx); | 682 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); |
660 | return total; | 683 | return total; |
661 | } | 684 | } |
662 | 685 | ||
663 | static bool __memcg_event_check(struct mem_cgroup *mem, int target) | 686 | static bool __memcg_event_check(struct mem_cgroup *memcg, int target) |
664 | { | 687 | { |
665 | unsigned long val, next; | 688 | unsigned long val, next; |
666 | 689 | ||
667 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 690 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
668 | next = this_cpu_read(mem->stat->targets[target]); | 691 | next = __this_cpu_read(memcg->stat->targets[target]); |
669 | /* from time_after() in jiffies.h */ | 692 | /* from time_after() in jiffies.h */ |
670 | return ((long)next - (long)val < 0); | 693 | return ((long)next - (long)val < 0); |
671 | } | 694 | } |
672 | 695 | ||
673 | static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) | 696 | static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) |
674 | { | 697 | { |
675 | unsigned long val, next; | 698 | unsigned long val, next; |
676 | 699 | ||
677 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 700 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
678 | 701 | ||
679 | switch (target) { | 702 | switch (target) { |
680 | case MEM_CGROUP_TARGET_THRESH: | 703 | case MEM_CGROUP_TARGET_THRESH: |
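Several single-purpose per-LRU counters are folded here into one helper taking an lru_mask: callers pass a bitmask of the LRU lists they care about (BIT(LRU_INACTIVE_ANON) and friends, as the later hunks do) and get back the sum over just those lists for one memcg zone. A compact stand-alone model of that interface, with invented per-list counts:

#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, LRU_INACTIVE_FILE,
		LRU_ACTIVE_FILE, LRU_UNEVICTABLE, NR_LRU_LISTS };
#define BIT(n) (1U << (n))

static unsigned long zone_nr_lru_pages(const unsigned long zstat[NR_LRU_LISTS],
				       unsigned int lru_mask)
{
	unsigned long ret = 0;

	for (int l = 0; l < NR_LRU_LISTS; l++)
		if (BIT(l) & lru_mask)
			ret += zstat[l];
	return ret;
}

int main(void)
{
	unsigned long zstat[NR_LRU_LISTS] = { 10, 20, 30, 40, 5 };

	/* all file pages, as the removed *_nr_file_lru_pages() helpers used to count */
	printf("file pages: %lu\n",
	       zone_nr_lru_pages(zstat, BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)));
	return 0;
}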
@@ -690,34 +713,36 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) | |||
690 | return; | 713 | return; |
691 | } | 714 | } |
692 | 715 | ||
693 | this_cpu_write(mem->stat->targets[target], next); | 716 | __this_cpu_write(memcg->stat->targets[target], next); |
694 | } | 717 | } |
695 | 718 | ||
696 | /* | 719 | /* |
697 | * Check events in order. | 720 | * Check events in order. |
698 | * | 721 | * |
699 | */ | 722 | */ |
700 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | 723 | static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) |
701 | { | 724 | { |
725 | preempt_disable(); | ||
702 | /* threshold event is triggered in finer grain than soft limit */ | 726 | /* threshold event is triggered in finer grain than soft limit */ |
703 | if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { | 727 | if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { |
704 | mem_cgroup_threshold(mem); | 728 | mem_cgroup_threshold(memcg); |
705 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); | 729 | __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); |
706 | if (unlikely(__memcg_event_check(mem, | 730 | if (unlikely(__memcg_event_check(memcg, |
707 | MEM_CGROUP_TARGET_SOFTLIMIT))) { | 731 | MEM_CGROUP_TARGET_SOFTLIMIT))) { |
708 | mem_cgroup_update_tree(mem, page); | 732 | mem_cgroup_update_tree(memcg, page); |
709 | __mem_cgroup_target_update(mem, | 733 | __mem_cgroup_target_update(memcg, |
710 | MEM_CGROUP_TARGET_SOFTLIMIT); | 734 | MEM_CGROUP_TARGET_SOFTLIMIT); |
711 | } | 735 | } |
712 | #if MAX_NUMNODES > 1 | 736 | #if MAX_NUMNODES > 1 |
713 | if (unlikely(__memcg_event_check(mem, | 737 | if (unlikely(__memcg_event_check(memcg, |
714 | MEM_CGROUP_TARGET_NUMAINFO))) { | 738 | MEM_CGROUP_TARGET_NUMAINFO))) { |
715 | atomic_inc(&mem->numainfo_events); | 739 | atomic_inc(&memcg->numainfo_events); |
716 | __mem_cgroup_target_update(mem, | 740 | __mem_cgroup_target_update(memcg, |
717 | MEM_CGROUP_TARGET_NUMAINFO); | 741 | MEM_CGROUP_TARGET_NUMAINFO); |
718 | } | 742 | } |
719 | #endif | 743 | #endif |
720 | } | 744 | } |
745 | preempt_enable(); | ||
721 | } | 746 | } |
722 | 747 | ||
723 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 748 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
@@ -743,7 +768,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
743 | 768 | ||
744 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 769 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
745 | { | 770 | { |
746 | struct mem_cgroup *mem = NULL; | 771 | struct mem_cgroup *memcg = NULL; |
747 | 772 | ||
748 | if (!mm) | 773 | if (!mm) |
749 | return NULL; | 774 | return NULL; |
@@ -754,25 +779,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
754 | */ | 779 | */ |
755 | rcu_read_lock(); | 780 | rcu_read_lock(); |
756 | do { | 781 | do { |
757 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 782 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
758 | if (unlikely(!mem)) | 783 | if (unlikely(!memcg)) |
759 | break; | 784 | break; |
760 | } while (!css_tryget(&mem->css)); | 785 | } while (!css_tryget(&memcg->css)); |
761 | rcu_read_unlock(); | 786 | rcu_read_unlock(); |
762 | return mem; | 787 | return memcg; |
763 | } | 788 | } |
764 | 789 | ||
765 | /* The caller has to guarantee "mem" exists before calling this */ | 790 | /* The caller has to guarantee "mem" exists before calling this */ |
766 | static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) | 791 | static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) |
767 | { | 792 | { |
768 | struct cgroup_subsys_state *css; | 793 | struct cgroup_subsys_state *css; |
769 | int found; | 794 | int found; |
770 | 795 | ||
771 | if (!mem) /* ROOT cgroup has the smallest ID */ | 796 | if (!memcg) /* ROOT cgroup has the smallest ID */ |
772 | return root_mem_cgroup; /*css_put/get against root is ignored*/ | 797 | return root_mem_cgroup; /*css_put/get against root is ignored*/ |
773 | if (!mem->use_hierarchy) { | 798 | if (!memcg->use_hierarchy) { |
774 | if (css_tryget(&mem->css)) | 799 | if (css_tryget(&memcg->css)) |
775 | return mem; | 800 | return memcg; |
776 | return NULL; | 801 | return NULL; |
777 | } | 802 | } |
778 | rcu_read_lock(); | 803 | rcu_read_lock(); |
@@ -780,13 +805,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) | |||
780 | * searching a memory cgroup which has the smallest ID under given | 805 | * searching a memory cgroup which has the smallest ID under given |
781 | * ROOT cgroup. (ID >= 1) | 806 | * ROOT cgroup. (ID >= 1) |
782 | */ | 807 | */ |
783 | css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); | 808 | css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); |
784 | if (css && css_tryget(css)) | 809 | if (css && css_tryget(css)) |
785 | mem = container_of(css, struct mem_cgroup, css); | 810 | memcg = container_of(css, struct mem_cgroup, css); |
786 | else | 811 | else |
787 | mem = NULL; | 812 | memcg = NULL; |
788 | rcu_read_unlock(); | 813 | rcu_read_unlock(); |
789 | return mem; | 814 | return memcg; |
790 | } | 815 | } |
791 | 816 | ||
792 | static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, | 817 | static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, |
@@ -840,29 +865,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, | |||
840 | for_each_mem_cgroup_tree_cond(iter, NULL, true) | 865 | for_each_mem_cgroup_tree_cond(iter, NULL, true) |
841 | 866 | ||
842 | 867 | ||
843 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | 868 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
844 | { | 869 | { |
845 | return (mem == root_mem_cgroup); | 870 | return (memcg == root_mem_cgroup); |
846 | } | 871 | } |
847 | 872 | ||
848 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 873 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
849 | { | 874 | { |
850 | struct mem_cgroup *mem; | 875 | struct mem_cgroup *memcg; |
851 | 876 | ||
852 | if (!mm) | 877 | if (!mm) |
853 | return; | 878 | return; |
854 | 879 | ||
855 | rcu_read_lock(); | 880 | rcu_read_lock(); |
856 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 881 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
857 | if (unlikely(!mem)) | 882 | if (unlikely(!memcg)) |
858 | goto out; | 883 | goto out; |
859 | 884 | ||
860 | switch (idx) { | 885 | switch (idx) { |
861 | case PGMAJFAULT: | 886 | case PGMAJFAULT: |
862 | mem_cgroup_pgmajfault(mem, 1); | 887 | mem_cgroup_pgmajfault(memcg, 1); |
863 | break; | 888 | break; |
864 | case PGFAULT: | 889 | case PGFAULT: |
865 | mem_cgroup_pgfault(mem, 1); | 890 | mem_cgroup_pgfault(memcg, 1); |
866 | break; | 891 | break; |
867 | default: | 892 | default: |
868 | BUG(); | 893 | BUG(); |
@@ -971,6 +996,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
971 | return; | 996 | return; |
972 | pc = lookup_page_cgroup(page); | 997 | pc = lookup_page_cgroup(page); |
973 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | 998 | VM_BUG_ON(PageCgroupAcctLRU(pc)); |
999 | /* | ||
1000 | * putback: charge: | ||
1001 | * SetPageLRU SetPageCgroupUsed | ||
1002 | * smp_mb smp_mb | ||
1003 | * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU | ||
1004 | * | ||
1005 | * Ensure that one of the two sides adds the page to the memcg | ||
1006 | * LRU during a race. | ||
1007 | */ | ||
1008 | smp_mb(); | ||
974 | if (!PageCgroupUsed(pc)) | 1009 | if (!PageCgroupUsed(pc)) |
975 | return; | 1010 | return; |
976 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 1011 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
@@ -1022,7 +1057,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page) | |||
1022 | unsigned long flags; | 1057 | unsigned long flags; |
1023 | struct zone *zone = page_zone(page); | 1058 | struct zone *zone = page_zone(page); |
1024 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1059 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1025 | 1060 | /* | |
1061 | * putback: charge: | ||
1062 | * SetPageLRU SetPageCgroupUsed | ||
1063 | * smp_mb smp_mb | ||
1064 | * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU | ||
1065 | * | ||
1066 | * Ensure that one of the two sides adds the page to the memcg | ||
1067 | * LRU during a race. | ||
1068 | */ | ||
1069 | smp_mb(); | ||
1026 | /* taking care of that the page is added to LRU while we commit it */ | 1070 | /* taking care of that the page is added to LRU while we commit it */ |
1027 | if (likely(!PageLRU(page))) | 1071 | if (likely(!PageLRU(page))) |
1028 | return; | 1072 | return; |
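Both comment blocks added above describe the same pairing: the putback path sets PageLRU, the charge path sets PageCgroupUsed, each issues a full barrier (smp_mb()), and each then checks the other side's flag, so at least one of the two racing paths is guaranteed to observe the other and link the page into the memcg LRU. The same store-buffering argument can be reproduced in userspace with C11 fences; everything below is an analogue (compile with -pthread), not kernel code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int page_lru, cgroup_used;
static atomic_int added;			/* how many of the two sides did the add */

static void *putback(void *arg)
{
	atomic_store_explicit(&page_lru, 1, memory_order_relaxed);	/* SetPageLRU */
	atomic_thread_fence(memory_order_seq_cst);			/* smp_mb() */
	if (atomic_load_explicit(&cgroup_used, memory_order_relaxed))
		atomic_fetch_add(&added, 1);	/* PageCgroupUsed -> add to memcg LRU */
	return NULL;
}

static void *charge(void *arg)
{
	atomic_store_explicit(&cgroup_used, 1, memory_order_relaxed);	/* SetPageCgroupUsed */
	atomic_thread_fence(memory_order_seq_cst);			/* smp_mb() */
	if (atomic_load_explicit(&page_lru, memory_order_relaxed))
		atomic_fetch_add(&added, 1);	/* PageLRU -> add to memcg LRU */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, putback, NULL);
	pthread_create(&b, NULL, charge, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* with the fences in place, 'added' can be 1 or 2 but never 0 */
	printf("added by %d side(s)\n", atomic_load(&added));
	return 0;
}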
@@ -1043,7 +1087,22 @@ void mem_cgroup_move_lists(struct page *page, | |||
1043 | mem_cgroup_add_lru_list(page, to); | 1087 | mem_cgroup_add_lru_list(page, to); |
1044 | } | 1088 | } |
1045 | 1089 | ||
1046 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 1090 | /* |
1091 | * Checks whether given mem is same or in the root_mem_cgroup's | ||
1092 | * hierarchy subtree | ||
1093 | */ | ||
1094 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | ||
1095 | struct mem_cgroup *memcg) | ||
1096 | { | ||
1097 | if (root_memcg != memcg) { | ||
1098 | return (root_memcg->use_hierarchy && | ||
1099 | css_is_ancestor(&memcg->css, &root_memcg->css)); | ||
1100 | } | ||
1101 | |||
1102 | return true; | ||
1103 | } | ||
1104 | |||
1105 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | ||
1047 | { | 1106 | { |
1048 | int ret; | 1107 | int ret; |
1049 | struct mem_cgroup *curr = NULL; | 1108 | struct mem_cgroup *curr = NULL; |
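mem_cgroup_same_or_subtree(), introduced above and reused by later hunks in task_in_mem_cgroup() and mem_cgroup_under_move(), answers one question: is memcg the given root, or, with hierarchical accounting enabled on the root, somewhere underneath it? A toy parent-pointer model of the same test; css_is_ancestor() is replaced by an explicit walk and every type here is invented:

#include <stdbool.h>
#include <stdio.h>

struct cg {
	const char *name;
	struct cg *parent;
	bool use_hierarchy;
};

static bool is_ancestor(const struct cg *maybe_anc, const struct cg *cg)
{
	for (; cg; cg = cg->parent)
		if (cg == maybe_anc)
			return true;
	return false;
}

static bool same_or_subtree(const struct cg *root, const struct cg *cg)
{
	if (root != cg)
		return root->use_hierarchy && is_ancestor(root, cg);
	return true;
}

int main(void)
{
	struct cg root  = { "root",  NULL,  true };
	struct cg child = { "child", &root, true };

	printf("child under root: %d, root under child: %d\n",
	       same_or_subtree(&root, &child), same_or_subtree(&child, &root));
	return 0;
}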
@@ -1057,28 +1116,29 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
1057 | if (!curr) | 1116 | if (!curr) |
1058 | return 0; | 1117 | return 0; |
1059 | /* | 1118 | /* |
1060 | * We should check use_hierarchy of "mem" not "curr". Because checking | 1119 | * We should check use_hierarchy of "memcg" not "curr". Because checking |
1061 | * use_hierarchy of "curr" here make this function true if hierarchy is | 1120 | * use_hierarchy of "curr" here make this function true if hierarchy is |
1062 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | 1121 | * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* |
1063 | * hierarchy(even if use_hierarchy is disabled in "mem"). | 1122 | * hierarchy(even if use_hierarchy is disabled in "memcg"). |
1064 | */ | 1123 | */ |
1065 | if (mem->use_hierarchy) | 1124 | ret = mem_cgroup_same_or_subtree(memcg, curr); |
1066 | ret = css_is_ancestor(&curr->css, &mem->css); | ||
1067 | else | ||
1068 | ret = (curr == mem); | ||
1069 | css_put(&curr->css); | 1125 | css_put(&curr->css); |
1070 | return ret; | 1126 | return ret; |
1071 | } | 1127 | } |
1072 | 1128 | ||
1073 | static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) | 1129 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) |
1074 | { | 1130 | { |
1075 | unsigned long active; | 1131 | unsigned long inactive_ratio; |
1132 | int nid = zone_to_nid(zone); | ||
1133 | int zid = zone_idx(zone); | ||
1076 | unsigned long inactive; | 1134 | unsigned long inactive; |
1135 | unsigned long active; | ||
1077 | unsigned long gb; | 1136 | unsigned long gb; |
1078 | unsigned long inactive_ratio; | ||
1079 | 1137 | ||
1080 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); | 1138 | inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, |
1081 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); | 1139 | BIT(LRU_INACTIVE_ANON)); |
1140 | active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | ||
1141 | BIT(LRU_ACTIVE_ANON)); | ||
1082 | 1142 | ||
1083 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 1143 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
1084 | if (gb) | 1144 | if (gb) |
@@ -1086,139 +1146,23 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ | |||
1086 | else | 1146 | else |
1087 | inactive_ratio = 1; | 1147 | inactive_ratio = 1; |
1088 | 1148 | ||
1089 | if (present_pages) { | 1149 | return inactive * inactive_ratio < active; |
1090 | present_pages[0] = inactive; | ||
1091 | present_pages[1] = active; | ||
1092 | } | ||
1093 | |||
1094 | return inactive_ratio; | ||
1095 | } | 1150 | } |
1096 | 1151 | ||
1097 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) | 1152 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) |
1098 | { | 1153 | { |
1099 | unsigned long active; | 1154 | unsigned long active; |
1100 | unsigned long inactive; | 1155 | unsigned long inactive; |
1101 | unsigned long present_pages[2]; | ||
1102 | unsigned long inactive_ratio; | ||
1103 | |||
1104 | inactive_ratio = calc_inactive_ratio(memcg, present_pages); | ||
1105 | |||
1106 | inactive = present_pages[0]; | ||
1107 | active = present_pages[1]; | ||
1108 | |||
1109 | if (inactive * inactive_ratio < active) | ||
1110 | return 1; | ||
1111 | |||
1112 | return 0; | ||
1113 | } | ||
1114 | |||
1115 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | ||
1116 | { | ||
1117 | unsigned long active; | ||
1118 | unsigned long inactive; | ||
1119 | |||
1120 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); | ||
1121 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); | ||
1122 | |||
1123 | return (active > inactive); | ||
1124 | } | ||
1125 | |||
1126 | unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, | ||
1127 | struct zone *zone, | ||
1128 | enum lru_list lru) | ||
1129 | { | ||
1130 | int nid = zone_to_nid(zone); | ||
1131 | int zid = zone_idx(zone); | 1156 | int zid = zone_idx(zone); |
1132 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 1157 | int nid = zone_to_nid(zone); |
1133 | |||
1134 | return MEM_CGROUP_ZSTAT(mz, lru); | ||
1135 | } | ||
1136 | |||
1137 | static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, | ||
1138 | int nid) | ||
1139 | { | ||
1140 | unsigned long ret; | ||
1141 | |||
1142 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + | ||
1143 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); | ||
1144 | |||
1145 | return ret; | ||
1146 | } | ||
1147 | |||
1148 | static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, | ||
1149 | int nid) | ||
1150 | { | ||
1151 | unsigned long ret; | ||
1152 | |||
1153 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + | ||
1154 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); | ||
1155 | return ret; | ||
1156 | } | ||
1157 | |||
1158 | #if MAX_NUMNODES > 1 | ||
1159 | static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) | ||
1160 | { | ||
1161 | u64 total = 0; | ||
1162 | int nid; | ||
1163 | |||
1164 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1165 | total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); | ||
1166 | |||
1167 | return total; | ||
1168 | } | ||
1169 | |||
1170 | static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) | ||
1171 | { | ||
1172 | u64 total = 0; | ||
1173 | int nid; | ||
1174 | |||
1175 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1176 | total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); | ||
1177 | |||
1178 | return total; | ||
1179 | } | ||
1180 | |||
1181 | static unsigned long | ||
1182 | mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) | ||
1183 | { | ||
1184 | return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE); | ||
1185 | } | ||
1186 | |||
1187 | static unsigned long | ||
1188 | mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) | ||
1189 | { | ||
1190 | u64 total = 0; | ||
1191 | int nid; | ||
1192 | |||
1193 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1194 | total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); | ||
1195 | |||
1196 | return total; | ||
1197 | } | ||
1198 | |||
1199 | static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, | ||
1200 | int nid) | ||
1201 | { | ||
1202 | enum lru_list l; | ||
1203 | u64 total = 0; | ||
1204 | |||
1205 | for_each_lru(l) | ||
1206 | total += mem_cgroup_get_zonestat_node(memcg, nid, l); | ||
1207 | |||
1208 | return total; | ||
1209 | } | ||
1210 | |||
1211 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) | ||
1212 | { | ||
1213 | u64 total = 0; | ||
1214 | int nid; | ||
1215 | 1158 | ||
1216 | for_each_node_state(nid, N_HIGH_MEMORY) | 1159 | inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, |
1217 | total += mem_cgroup_node_nr_lru_pages(memcg, nid); | 1160 | BIT(LRU_INACTIVE_FILE)); |
1161 | active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | ||
1162 | BIT(LRU_ACTIVE_FILE)); | ||
1218 | 1163 | ||
1219 | return total; | 1164 | return (active > inactive); |
1220 | } | 1165 | } |
1221 | #endif /* CONFIG_NUMA */ | ||
1222 | 1166 | ||
1223 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | 1167 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, |
1224 | struct zone *zone) | 1168 | struct zone *zone) |
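The removal above folds calc_inactive_ratio() and the per-node LRU counting helpers into the new lru_mask interface; mem_cgroup_inactive_anon_is_low() now gathers the two per-zone counts itself and returns the comparison directly. A reduced model of that check follows; the ratio used for memcgs larger than 1 GiB is computed on a line not shown in this listing, so it is simply a parameter here:

#include <stdio.h>

#define PAGE_SHIFT 12		/* 4 KiB pages, for illustration */

static int inactive_anon_is_low(unsigned long inactive, unsigned long active,
				unsigned long big_memcg_ratio)
{
	unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);
	unsigned long inactive_ratio = gb ? big_memcg_ratio : 1;

	return inactive * inactive_ratio < active;
}

int main(void)
{
	/* a small memcg: ratio 1, so anon is "low" only once active outgrows inactive */
	printf("%d\n", inactive_anon_is_low(1000, 800, 3));
	printf("%d\n", inactive_anon_is_low(500, 800, 3));
	return 0;
}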
@@ -1251,7 +1195,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) | |||
1251 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 1195 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
1252 | struct list_head *dst, | 1196 | struct list_head *dst, |
1253 | unsigned long *scanned, int order, | 1197 | unsigned long *scanned, int order, |
1254 | int mode, struct zone *z, | 1198 | isolate_mode_t mode, |
1199 | struct zone *z, | ||
1255 | struct mem_cgroup *mem_cont, | 1200 | struct mem_cgroup *mem_cont, |
1256 | int active, int file) | 1201 | int active, int file) |
1257 | { | 1202 | { |
@@ -1319,17 +1264,17 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
1319 | * Returns the maximum amount of memory @mem can be charged with, in | 1264 | * Returns the maximum amount of memory @mem can be charged with, in |
1320 | * pages. | 1265 | * pages. |
1321 | */ | 1266 | */ |
1322 | static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) | 1267 | static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) |
1323 | { | 1268 | { |
1324 | unsigned long long margin; | 1269 | unsigned long long margin; |
1325 | 1270 | ||
1326 | margin = res_counter_margin(&mem->res); | 1271 | margin = res_counter_margin(&memcg->res); |
1327 | if (do_swap_account) | 1272 | if (do_swap_account) |
1328 | margin = min(margin, res_counter_margin(&mem->memsw)); | 1273 | margin = min(margin, res_counter_margin(&memcg->memsw)); |
1329 | return margin >> PAGE_SHIFT; | 1274 | return margin >> PAGE_SHIFT; |
1330 | } | 1275 | } |
1331 | 1276 | ||
1332 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | 1277 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) |
1333 | { | 1278 | { |
1334 | struct cgroup *cgrp = memcg->css.cgroup; | 1279 | struct cgroup *cgrp = memcg->css.cgroup; |
1335 | 1280 | ||
@@ -1340,33 +1285,33 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) | |||
1340 | return memcg->swappiness; | 1285 | return memcg->swappiness; |
1341 | } | 1286 | } |
1342 | 1287 | ||
1343 | static void mem_cgroup_start_move(struct mem_cgroup *mem) | 1288 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) |
1344 | { | 1289 | { |
1345 | int cpu; | 1290 | int cpu; |
1346 | 1291 | ||
1347 | get_online_cpus(); | 1292 | get_online_cpus(); |
1348 | spin_lock(&mem->pcp_counter_lock); | 1293 | spin_lock(&memcg->pcp_counter_lock); |
1349 | for_each_online_cpu(cpu) | 1294 | for_each_online_cpu(cpu) |
1350 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; | 1295 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; |
1351 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; | 1296 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; |
1352 | spin_unlock(&mem->pcp_counter_lock); | 1297 | spin_unlock(&memcg->pcp_counter_lock); |
1353 | put_online_cpus(); | 1298 | put_online_cpus(); |
1354 | 1299 | ||
1355 | synchronize_rcu(); | 1300 | synchronize_rcu(); |
1356 | } | 1301 | } |
1357 | 1302 | ||
1358 | static void mem_cgroup_end_move(struct mem_cgroup *mem) | 1303 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) |
1359 | { | 1304 | { |
1360 | int cpu; | 1305 | int cpu; |
1361 | 1306 | ||
1362 | if (!mem) | 1307 | if (!memcg) |
1363 | return; | 1308 | return; |
1364 | get_online_cpus(); | 1309 | get_online_cpus(); |
1365 | spin_lock(&mem->pcp_counter_lock); | 1310 | spin_lock(&memcg->pcp_counter_lock); |
1366 | for_each_online_cpu(cpu) | 1311 | for_each_online_cpu(cpu) |
1367 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; | 1312 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; |
1368 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; | 1313 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; |
1369 | spin_unlock(&mem->pcp_counter_lock); | 1314 | spin_unlock(&memcg->pcp_counter_lock); |
1370 | put_online_cpus(); | 1315 | put_online_cpus(); |
1371 | } | 1316 | } |
1372 | /* | 1317 | /* |
@@ -1381,13 +1326,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem) | |||
1381 | * waiting at high-memory pressure caused by "move". | 1326 | * waiting at high-memory pressure caused by "move". |
1382 | */ | 1327 | */ |
1383 | 1328 | ||
1384 | static bool mem_cgroup_stealed(struct mem_cgroup *mem) | 1329 | static bool mem_cgroup_stealed(struct mem_cgroup *memcg) |
1385 | { | 1330 | { |
1386 | VM_BUG_ON(!rcu_read_lock_held()); | 1331 | VM_BUG_ON(!rcu_read_lock_held()); |
1387 | return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; | 1332 | return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; |
1388 | } | 1333 | } |
1389 | 1334 | ||
1390 | static bool mem_cgroup_under_move(struct mem_cgroup *mem) | 1335 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) |
1391 | { | 1336 | { |
1392 | struct mem_cgroup *from; | 1337 | struct mem_cgroup *from; |
1393 | struct mem_cgroup *to; | 1338 | struct mem_cgroup *to; |
@@ -1401,19 +1346,18 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) | |||
1401 | to = mc.to; | 1346 | to = mc.to; |
1402 | if (!from) | 1347 | if (!from) |
1403 | goto unlock; | 1348 | goto unlock; |
1404 | if (from == mem || to == mem | 1349 | |
1405 | || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) | 1350 | ret = mem_cgroup_same_or_subtree(memcg, from) |
1406 | || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) | 1351 | || mem_cgroup_same_or_subtree(memcg, to); |
1407 | ret = true; | ||
1408 | unlock: | 1352 | unlock: |
1409 | spin_unlock(&mc.lock); | 1353 | spin_unlock(&mc.lock); |
1410 | return ret; | 1354 | return ret; |
1411 | } | 1355 | } |
1412 | 1356 | ||
1413 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) | 1357 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) |
1414 | { | 1358 | { |
1415 | if (mc.moving_task && current != mc.moving_task) { | 1359 | if (mc.moving_task && current != mc.moving_task) { |
1416 | if (mem_cgroup_under_move(mem)) { | 1360 | if (mem_cgroup_under_move(memcg)) { |
1417 | DEFINE_WAIT(wait); | 1361 | DEFINE_WAIT(wait); |
1418 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); | 1362 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); |
1419 | /* moving charge context might have finished. */ | 1363 | /* moving charge context might have finished. */ |
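
The hunk above folds the two open-coded css_is_ancestor() checks into a single mem_cgroup_same_or_subtree(memcg, other) test when deciding whether this group is involved in an in-flight charge move. As a rough illustration of what such a helper does, here is a minimal userspace sketch of a "same group or one of its descendants" walk over a parent-pointer tree; struct group and same_or_subtree() are invented names for the sketch, not the kernel API.

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for a cgroup: each group only knows its parent. */
struct group {
    const char *name;
    struct group *parent;
};

/* Returns 1 if "child" is "root" itself or lives somewhere below it. */
static int same_or_subtree(const struct group *root, const struct group *child)
{
    for (; child; child = child->parent)
        if (child == root)
            return 1;
    return 0;
}

int main(void)
{
    struct group a = { "a", NULL };
    struct group b = { "b", &a };
    struct group c = { "c", &b };

    printf("a contains c: %d\n", same_or_subtree(&a, &c));    /* 1 */
    printf("c contains a: %d\n", same_or_subtree(&c, &a));    /* 0 */
    return 0;
}

Walking parent pointers upward keeps the check cheap and is safe as long as both endpoints are known to stay alive for the duration of the call, which is what the later wake-function comment about both memcgs being "stable under us" relies on.
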
@@ -1497,12 +1441,12 @@ done: | |||
1497 | * This function returns the number of memcg under hierarchy tree. Returns | 1441 | * This function returns the number of memcg under hierarchy tree. Returns |
1498 | * 1(self count) if no children. | 1442 | * 1(self count) if no children. |
1499 | */ | 1443 | */ |
1500 | static int mem_cgroup_count_children(struct mem_cgroup *mem) | 1444 | static int mem_cgroup_count_children(struct mem_cgroup *memcg) |
1501 | { | 1445 | { |
1502 | int num = 0; | 1446 | int num = 0; |
1503 | struct mem_cgroup *iter; | 1447 | struct mem_cgroup *iter; |
1504 | 1448 | ||
1505 | for_each_mem_cgroup_tree(iter, mem) | 1449 | for_each_mem_cgroup_tree(iter, memcg) |
1506 | num++; | 1450 | num++; |
1507 | return num; | 1451 | return num; |
1508 | } | 1452 | } |
@@ -1532,21 +1476,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1532 | * that to reclaim free pages from. | 1476 | * that to reclaim free pages from. |
1533 | */ | 1477 | */ |
1534 | static struct mem_cgroup * | 1478 | static struct mem_cgroup * |
1535 | mem_cgroup_select_victim(struct mem_cgroup *root_mem) | 1479 | mem_cgroup_select_victim(struct mem_cgroup *root_memcg) |
1536 | { | 1480 | { |
1537 | struct mem_cgroup *ret = NULL; | 1481 | struct mem_cgroup *ret = NULL; |
1538 | struct cgroup_subsys_state *css; | 1482 | struct cgroup_subsys_state *css; |
1539 | int nextid, found; | 1483 | int nextid, found; |
1540 | 1484 | ||
1541 | if (!root_mem->use_hierarchy) { | 1485 | if (!root_memcg->use_hierarchy) { |
1542 | css_get(&root_mem->css); | 1486 | css_get(&root_memcg->css); |
1543 | ret = root_mem; | 1487 | ret = root_memcg; |
1544 | } | 1488 | } |
1545 | 1489 | ||
1546 | while (!ret) { | 1490 | while (!ret) { |
1547 | rcu_read_lock(); | 1491 | rcu_read_lock(); |
1548 | nextid = root_mem->last_scanned_child + 1; | 1492 | nextid = root_memcg->last_scanned_child + 1; |
1549 | css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, | 1493 | css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, |
1550 | &found); | 1494 | &found); |
1551 | if (css && css_tryget(css)) | 1495 | if (css && css_tryget(css)) |
1552 | ret = container_of(css, struct mem_cgroup, css); | 1496 | ret = container_of(css, struct mem_cgroup, css); |
@@ -1555,9 +1499,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1555 | /* Updates scanning parameter */ | 1499 | /* Updates scanning parameter */ |
1556 | if (!css) { | 1500 | if (!css) { |
1557 | /* this means start scan from ID:1 */ | 1501 | /* this means start scan from ID:1 */ |
1558 | root_mem->last_scanned_child = 0; | 1502 | root_memcg->last_scanned_child = 0; |
1559 | } else | 1503 | } else |
1560 | root_mem->last_scanned_child = found; | 1504 | root_memcg->last_scanned_child = found; |
1561 | } | 1505 | } |
1562 | 1506 | ||
1563 | return ret; | 1507 | return ret; |
@@ -1573,14 +1517,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1573 | * reclaimable pages on a node. Returns true if there are any reclaimable | 1517 | * reclaimable pages on a node. Returns true if there are any reclaimable |
1574 | * pages in the node. | 1518 | * pages in the node. |
1575 | */ | 1519 | */ |
1576 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, | 1520 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, |
1577 | int nid, bool noswap) | 1521 | int nid, bool noswap) |
1578 | { | 1522 | { |
1579 | if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) | 1523 | if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) |
1580 | return true; | 1524 | return true; |
1581 | if (noswap || !total_swap_pages) | 1525 | if (noswap || !total_swap_pages) |
1582 | return false; | 1526 | return false; |
1583 | if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) | 1527 | if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) |
1584 | return true; | 1528 | return true; |
1585 | return false; | 1529 | return false; |
1586 | 1530 | ||
@@ -1593,29 +1537,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, | |||
1593 | * nodes based on the zonelist. So update the list loosely once per 10 secs. | 1537 | * nodes based on the zonelist. So update the list loosely once per 10 secs. |
1594 | * | 1538 | * |
1595 | */ | 1539 | */ |
1596 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) | 1540 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) |
1597 | { | 1541 | { |
1598 | int nid; | 1542 | int nid; |
1599 | /* | 1543 | /* |
1600 | * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET | 1544 | * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET |
1601 | * pagein/pageout changes since the last update. | 1545 | * pagein/pageout changes since the last update. |
1602 | */ | 1546 | */ |
1603 | if (!atomic_read(&mem->numainfo_events)) | 1547 | if (!atomic_read(&memcg->numainfo_events)) |
1604 | return; | 1548 | return; |
1605 | if (atomic_inc_return(&mem->numainfo_updating) > 1) | 1549 | if (atomic_inc_return(&memcg->numainfo_updating) > 1) |
1606 | return; | 1550 | return; |
1607 | 1551 | ||
1608 | /* make a nodemask where this memcg uses memory from */ | 1552 | /* make a nodemask where this memcg uses memory from */ |
1609 | mem->scan_nodes = node_states[N_HIGH_MEMORY]; | 1553 | memcg->scan_nodes = node_states[N_HIGH_MEMORY]; |
1610 | 1554 | ||
1611 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | 1555 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { |
1612 | 1556 | ||
1613 | if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) | 1557 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) |
1614 | node_clear(nid, mem->scan_nodes); | 1558 | node_clear(nid, memcg->scan_nodes); |
1615 | } | 1559 | } |
1616 | 1560 | ||
1617 | atomic_set(&mem->numainfo_events, 0); | 1561 | atomic_set(&memcg->numainfo_events, 0); |
1618 | atomic_set(&mem->numainfo_updating, 0); | 1562 | atomic_set(&memcg->numainfo_updating, 0); |
1619 | } | 1563 | } |
1620 | 1564 | ||
1621 | /* | 1565 | /* |
@@ -1630,16 +1574,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) | |||
1630 | * | 1574 | * |
1631 | * Now, we use round-robin. Better algorithm is welcomed. | 1575 | * Now, we use round-robin. Better algorithm is welcomed. |
1632 | */ | 1576 | */ |
1633 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | 1577 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1634 | { | 1578 | { |
1635 | int node; | 1579 | int node; |
1636 | 1580 | ||
1637 | mem_cgroup_may_update_nodemask(mem); | 1581 | mem_cgroup_may_update_nodemask(memcg); |
1638 | node = mem->last_scanned_node; | 1582 | node = memcg->last_scanned_node; |
1639 | 1583 | ||
1640 | node = next_node(node, mem->scan_nodes); | 1584 | node = next_node(node, memcg->scan_nodes); |
1641 | if (node == MAX_NUMNODES) | 1585 | if (node == MAX_NUMNODES) |
1642 | node = first_node(mem->scan_nodes); | 1586 | node = first_node(memcg->scan_nodes); |
1643 | /* | 1587 | /* |
1644 | * We call this when we hit limit, not when pages are added to LRU. | 1588 | * We call this when we hit limit, not when pages are added to LRU. |
1645 | * No LRU may hold pages because all pages are UNEVICTABLE or | 1589 | * No LRU may hold pages because all pages are UNEVICTABLE or |
@@ -1649,7 +1593,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | |||
1649 | if (unlikely(node == MAX_NUMNODES)) | 1593 | if (unlikely(node == MAX_NUMNODES)) |
1650 | node = numa_node_id(); | 1594 | node = numa_node_id(); |
1651 | 1595 | ||
1652 | mem->last_scanned_node = node; | 1596 | memcg->last_scanned_node = node; |
1653 | return node; | 1597 | return node; |
1654 | } | 1598 | } |
1655 | 1599 | ||
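
mem_cgroup_select_victim_node() above picks the next NUMA node to reclaim from by scanning memcg->scan_nodes round-robin: start just after last_scanned_node, wrap to the first set node, and fall back to the local node when the mask turns out to be empty. A small standalone sketch of that wrap-around scan over a plain bitmask, with MAX_NODES, pick_next_node() and the fallback value being illustrative assumptions only:

#include <stdio.h>

#define MAX_NODES 8

/*
 * Round-robin pick of the next set bit after "last", wrapping around.
 * Returns the fallback when no bit is set (the MAX_NUMNODES case above).
 */
static int pick_next_node(unsigned int mask, int last, int fallback)
{
    for (int i = 1; i <= MAX_NODES; i++) {
        int node = (last + i) % MAX_NODES;

        if (mask & (1u << node))
            return node;
    }
    return fallback;
}

int main(void)
{
    unsigned int scan_nodes = (1u << 1) | (1u << 4) | (1u << 6);
    int last = -1;

    for (int i = 0; i < 5; i++) {
        last = pick_next_node(scan_nodes, last, 0);
        printf("victim node: %d\n", last);    /* 1 4 6 1 4 */
    }
    return 0;
}
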
@@ -1659,7 +1603,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | |||
1659 | * unused nodes. But scan_nodes is lazily updated and may not contain | 1603 | * unused nodes. But scan_nodes is lazily updated and may not contain |
1660 | * enough new information. We need to do double check. | 1604 | * enough new information. We need to do double check. |
1661 | */ | 1605 | */ |
1662 | bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | 1606 | bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1663 | { | 1607 | { |
1664 | int nid; | 1608 | int nid; |
1665 | 1609 | ||
@@ -1667,12 +1611,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | |||
1667 | * quick check...making use of scan_node. | 1611 | * quick check...making use of scan_node. |
1668 | * We can skip unused nodes. | 1612 | * We can skip unused nodes. |
1669 | */ | 1613 | */ |
1670 | if (!nodes_empty(mem->scan_nodes)) { | 1614 | if (!nodes_empty(memcg->scan_nodes)) { |
1671 | for (nid = first_node(mem->scan_nodes); | 1615 | for (nid = first_node(memcg->scan_nodes); |
1672 | nid < MAX_NUMNODES; | 1616 | nid < MAX_NUMNODES; |
1673 | nid = next_node(nid, mem->scan_nodes)) { | 1617 | nid = next_node(nid, memcg->scan_nodes)) { |
1674 | 1618 | ||
1675 | if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) | 1619 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
1676 | return true; | 1620 | return true; |
1677 | } | 1621 | } |
1678 | } | 1622 | } |
@@ -1680,23 +1624,23 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | |||
1680 | * Check rest of nodes. | 1624 | * Check rest of nodes. |
1681 | */ | 1625 | */ |
1682 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1626 | for_each_node_state(nid, N_HIGH_MEMORY) { |
1683 | if (node_isset(nid, mem->scan_nodes)) | 1627 | if (node_isset(nid, memcg->scan_nodes)) |
1684 | continue; | 1628 | continue; |
1685 | if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) | 1629 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
1686 | return true; | 1630 | return true; |
1687 | } | 1631 | } |
1688 | return false; | 1632 | return false; |
1689 | } | 1633 | } |
1690 | 1634 | ||
1691 | #else | 1635 | #else |
1692 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | 1636 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1693 | { | 1637 | { |
1694 | return 0; | 1638 | return 0; |
1695 | } | 1639 | } |
1696 | 1640 | ||
1697 | bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | 1641 | bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1698 | { | 1642 | { |
1699 | return test_mem_cgroup_node_reclaimable(mem, 0, noswap); | 1643 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); |
1700 | } | 1644 | } |
1701 | #endif | 1645 | #endif |
1702 | 1646 | ||
@@ -1705,14 +1649,14 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | |||
1705 | * we reclaimed from, so that we don't end up penalizing one child extensively | 1649 | * we reclaimed from, so that we don't end up penalizing one child extensively |
1706 | * based on its position in the children list. | 1650 | * based on its position in the children list. |
1707 | * | 1651 | * |
1708 | * root_mem is the original ancestor that we've been reclaiming from. | 1652 | * root_memcg is the original ancestor that we've been reclaiming from. |
1709 | * | 1653 | * |
1710 | * We give up and return to the caller when we visit root_mem twice. | 1654 | * We give up and return to the caller when we visit root_memcg twice. |
1711 | * (other groups can be removed while we're walking....) | 1655 | * (other groups can be removed while we're walking....) |
1712 | * | 1656 | * |
1713 | * If shrink==true, to avoid freeing too much, this returns immediately. | 1657 | * If shrink==true, to avoid freeing too much, this returns immediately. |
1714 | */ | 1658 | */ |
1715 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | 1659 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, |
1716 | struct zone *zone, | 1660 | struct zone *zone, |
1717 | gfp_t gfp_mask, | 1661 | gfp_t gfp_mask, |
1718 | unsigned long reclaim_options, | 1662 | unsigned long reclaim_options, |
@@ -1727,15 +1671,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1727 | unsigned long excess; | 1671 | unsigned long excess; |
1728 | unsigned long nr_scanned; | 1672 | unsigned long nr_scanned; |
1729 | 1673 | ||
1730 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1674 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; |
1731 | 1675 | ||
1732 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1676 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
1733 | if (!check_soft && root_mem->memsw_is_minimum) | 1677 | if (!check_soft && !shrink && root_memcg->memsw_is_minimum) |
1734 | noswap = true; | 1678 | noswap = true; |
1735 | 1679 | ||
1736 | while (1) { | 1680 | while (1) { |
1737 | victim = mem_cgroup_select_victim(root_mem); | 1681 | victim = mem_cgroup_select_victim(root_memcg); |
1738 | if (victim == root_mem) { | 1682 | if (victim == root_memcg) { |
1739 | loop++; | 1683 | loop++; |
1740 | /* | 1684 | /* |
1741 | * We are not draining per cpu cached charges during | 1685 | * We are not draining per cpu cached charges during |
@@ -1744,7 +1688,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1744 | * charges will not give any. | 1688 | * charges will not give any. |
1745 | */ | 1689 | */ |
1746 | if (!check_soft && loop >= 1) | 1690 | if (!check_soft && loop >= 1) |
1747 | drain_all_stock_async(root_mem); | 1691 | drain_all_stock_async(root_memcg); |
1748 | if (loop >= 2) { | 1692 | if (loop >= 2) { |
1749 | /* | 1693 | /* |
1750 | * If we have not been able to reclaim | 1694 | * If we have not been able to reclaim |
@@ -1776,12 +1720,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1776 | /* we use swappiness of local cgroup */ | 1720 | /* we use swappiness of local cgroup */ |
1777 | if (check_soft) { | 1721 | if (check_soft) { |
1778 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1722 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1779 | noswap, get_swappiness(victim), zone, | 1723 | noswap, zone, &nr_scanned); |
1780 | &nr_scanned); | ||
1781 | *total_scanned += nr_scanned; | 1724 | *total_scanned += nr_scanned; |
1782 | } else | 1725 | } else |
1783 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1726 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, |
1784 | noswap, get_swappiness(victim)); | 1727 | noswap); |
1785 | css_put(&victim->css); | 1728 | css_put(&victim->css); |
1786 | /* | 1729 | /* |
1787 | * At shrinking usage, we can't check we should stop here or | 1730 | * At shrinking usage, we can't check we should stop here or |
@@ -1792,9 +1735,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1792 | return ret; | 1735 | return ret; |
1793 | total += ret; | 1736 | total += ret; |
1794 | if (check_soft) { | 1737 | if (check_soft) { |
1795 | if (!res_counter_soft_limit_excess(&root_mem->res)) | 1738 | if (!res_counter_soft_limit_excess(&root_memcg->res)) |
1796 | return total; | 1739 | return total; |
1797 | } else if (mem_cgroup_margin(root_mem)) | 1740 | } else if (mem_cgroup_margin(root_memcg)) |
1798 | return total; | 1741 | return total; |
1799 | } | 1742 | } |
1800 | return total; | 1743 | return total; |
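
mem_cgroup_hierarchical_reclaim() above keeps selecting a victim group under root_memcg, reclaiming from it, and stopping once the margin (or the soft-limit excess) is back in range or the root has been visited often enough. The toy loop below mirrors only that outer shape with fabricated page counts; select_victim(), shrink_victim() and the batch size are stand-ins, not the kernel interfaces.

#include <stdio.h>

#define NGROUPS 3

/* Toy stand-ins: each candidate victim has some reclaimable pages. */
static long reclaimable[NGROUPS] = { 10, 0, 25 };
static int last_scanned = -1;

/* Round-robin victim selection under a hypothetical root. */
static int select_victim(void)
{
    last_scanned = (last_scanned + 1) % NGROUPS;
    return last_scanned;
}

/* "Reclaim" up to batch pages from one victim, return how many we got. */
static long shrink_victim(int victim, long batch)
{
    long got = reclaimable[victim] < batch ? reclaimable[victim] : batch;

    reclaimable[victim] -= got;
    return got;
}

int main(void)
{
    long need = 20, total = 0;
    int visits = 0;

    /* Stop when the margin is restored or we have gone round twice. */
    while (total < need && visits < 2 * NGROUPS) {
        total += shrink_victim(select_victim(), 8);
        visits++;
    }
    printf("reclaimed %ld of %ld pages in %d visits\n", total, need, visits);
    return 0;
}
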
@@ -1803,23 +1746,64 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1803 | /* | 1746 | /* |
1804 | * Check OOM-Killer is already running under our hierarchy. | 1747 | * Check OOM-Killer is already running under our hierarchy. |
1805 | * If someone is running, return false. | 1748 | * If someone is running, return false. |
1749 | * Has to be called with memcg_oom_lock | ||
1806 | */ | 1750 | */ |
1807 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | 1751 | static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) |
1808 | { | 1752 | { |
1809 | int x, lock_count = 0; | 1753 | struct mem_cgroup *iter, *failed = NULL; |
1810 | struct mem_cgroup *iter; | 1754 | bool cond = true; |
1811 | 1755 | ||
1812 | for_each_mem_cgroup_tree(iter, mem) { | 1756 | for_each_mem_cgroup_tree_cond(iter, memcg, cond) { |
1813 | x = atomic_inc_return(&iter->oom_lock); | 1757 | if (iter->oom_lock) { |
1814 | lock_count = max(x, lock_count); | 1758 | /* |
1759 | * this subtree of our hierarchy is already locked | ||
1760 | * so we cannot give a lock. | ||
1761 | */ | ||
1762 | failed = iter; | ||
1763 | cond = false; | ||
1764 | } else | ||
1765 | iter->oom_lock = true; | ||
1815 | } | 1766 | } |
1816 | 1767 | ||
1817 | if (lock_count == 1) | 1768 | if (!failed) |
1818 | return true; | 1769 | return true; |
1770 | |||
1771 | /* | ||
1772 | * OK, we failed to lock the whole subtree so we have to clean up | ||
1773 | * what we set up to the failing subtree | ||
1774 | */ | ||
1775 | cond = true; | ||
1776 | for_each_mem_cgroup_tree_cond(iter, memcg, cond) { | ||
1777 | if (iter == failed) { | ||
1778 | cond = false; | ||
1779 | continue; | ||
1780 | } | ||
1781 | iter->oom_lock = false; | ||
1782 | } | ||
1819 | return false; | 1783 | return false; |
1820 | } | 1784 | } |
1821 | 1785 | ||
1822 | static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) | 1786 | /* |
1787 | * Has to be called with memcg_oom_lock | ||
1788 | */ | ||
1789 | static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) | ||
1790 | { | ||
1791 | struct mem_cgroup *iter; | ||
1792 | |||
1793 | for_each_mem_cgroup_tree(iter, memcg) | ||
1794 | iter->oom_lock = false; | ||
1795 | return 0; | ||
1796 | } | ||
1797 | |||
1798 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) | ||
1799 | { | ||
1800 | struct mem_cgroup *iter; | ||
1801 | |||
1802 | for_each_mem_cgroup_tree(iter, memcg) | ||
1803 | atomic_inc(&iter->under_oom); | ||
1804 | } | ||
1805 | |||
1806 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) | ||
1823 | { | 1807 | { |
1824 | struct mem_cgroup *iter; | 1808 | struct mem_cgroup *iter; |
1825 | 1809 | ||
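
The rewritten mem_cgroup_oom_lock() above walks the hierarchy setting a plain per-group oom_lock flag and, on meeting a subtree that is already locked, clears only the flags it managed to set before the conflict. Below is a minimal userspace sketch of that take-all-or-roll-back pattern over a flat array standing in for the tree walk; like the kernel code, which relies on memcg_oom_lock, it assumes some outer lock already serializes callers, and every name here is invented for the sketch.

#include <stdbool.h>
#include <stdio.h>

#define NGROUPS 5

static bool oom_lock[NGROUPS];

/* Try to mark the whole "hierarchy"; roll back on the first conflict. */
static bool oom_trylock_all(void)
{
    int failed = -1;

    for (int i = 0; i < NGROUPS; i++) {
        if (oom_lock[i]) {    /* someone else holds this subtree */
            failed = i;
            break;
        }
        oom_lock[i] = true;
    }
    if (failed < 0)
        return true;

    /* Undo only what we set before hitting the already-locked entry. */
    for (int i = 0; i < failed; i++)
        oom_lock[i] = false;
    return false;
}

static void oom_unlock_all(void)
{
    for (int i = 0; i < NGROUPS; i++)
        oom_lock[i] = false;
}

int main(void)
{
    printf("first try:    %s\n", oom_trylock_all() ? "locked" : "busy");
    printf("second try:   %s\n", oom_trylock_all() ? "locked" : "busy");
    oom_unlock_all();
    printf("after unlock: %s\n", oom_trylock_all() ? "locked" : "busy");
    return 0;
}

The property worth keeping is that a loser never leaves stray flags behind on groups it did not actually lock, which is exactly what the cond/failed bookkeeping in the hunk is for.
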
@@ -1828,13 +1812,11 @@ static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) | |||
1828 | * mem_cgroup_oom_lock() may not be called. We have to use | 1812 | * mem_cgroup_oom_lock() may not be called. We have to use |
1829 | * atomic_add_unless() here. | 1813 | * atomic_add_unless() here. |
1830 | */ | 1814 | */ |
1831 | for_each_mem_cgroup_tree(iter, mem) | 1815 | for_each_mem_cgroup_tree(iter, memcg) |
1832 | atomic_add_unless(&iter->oom_lock, -1, 0); | 1816 | atomic_add_unless(&iter->under_oom, -1, 0); |
1833 | return 0; | ||
1834 | } | 1817 | } |
1835 | 1818 | ||
1836 | 1819 | static DEFINE_SPINLOCK(memcg_oom_lock); | |
1837 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1838 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1820 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1839 | 1821 | ||
1840 | struct oom_wait_info { | 1822 | struct oom_wait_info { |
@@ -1845,85 +1827,85 @@ struct oom_wait_info { | |||
1845 | static int memcg_oom_wake_function(wait_queue_t *wait, | 1827 | static int memcg_oom_wake_function(wait_queue_t *wait, |
1846 | unsigned mode, int sync, void *arg) | 1828 | unsigned mode, int sync, void *arg) |
1847 | { | 1829 | { |
1848 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; | 1830 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, |
1831 | *oom_wait_memcg; | ||
1849 | struct oom_wait_info *oom_wait_info; | 1832 | struct oom_wait_info *oom_wait_info; |
1850 | 1833 | ||
1851 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1834 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
1835 | oom_wait_memcg = oom_wait_info->mem; | ||
1852 | 1836 | ||
1853 | if (oom_wait_info->mem == wake_mem) | ||
1854 | goto wakeup; | ||
1855 | /* if no hierarchy, no match */ | ||
1856 | if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) | ||
1857 | return 0; | ||
1858 | /* | 1837 | /* |
1859 | * Both of oom_wait_info->mem and wake_mem are stable under us. | 1838 | * Both of oom_wait_info->mem and wake_mem are stable under us. |
1860 | * Then we can use css_is_ancestor without taking care of RCU. | 1839 | * Then we can use css_is_ancestor without taking care of RCU. |
1861 | */ | 1840 | */ |
1862 | if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && | 1841 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) |
1863 | !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) | 1842 | && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) |
1864 | return 0; | 1843 | return 0; |
1865 | |||
1866 | wakeup: | ||
1867 | return autoremove_wake_function(wait, mode, sync, arg); | 1844 | return autoremove_wake_function(wait, mode, sync, arg); |
1868 | } | 1845 | } |
1869 | 1846 | ||
1870 | static void memcg_wakeup_oom(struct mem_cgroup *mem) | 1847 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) |
1871 | { | 1848 | { |
1872 | /* for filtering, pass "mem" as argument. */ | 1849 | /* for filtering, pass "memcg" as argument. */ |
1873 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); | 1850 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); |
1874 | } | 1851 | } |
1875 | 1852 | ||
1876 | static void memcg_oom_recover(struct mem_cgroup *mem) | 1853 | static void memcg_oom_recover(struct mem_cgroup *memcg) |
1877 | { | 1854 | { |
1878 | if (mem && atomic_read(&mem->oom_lock)) | 1855 | if (memcg && atomic_read(&memcg->under_oom)) |
1879 | memcg_wakeup_oom(mem); | 1856 | memcg_wakeup_oom(memcg); |
1880 | } | 1857 | } |
1881 | 1858 | ||
1882 | /* | 1859 | /* |
1883 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1860 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1884 | */ | 1861 | */ |
1885 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | 1862 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) |
1886 | { | 1863 | { |
1887 | struct oom_wait_info owait; | 1864 | struct oom_wait_info owait; |
1888 | bool locked, need_to_kill; | 1865 | bool locked, need_to_kill; |
1889 | 1866 | ||
1890 | owait.mem = mem; | 1867 | owait.mem = memcg; |
1891 | owait.wait.flags = 0; | 1868 | owait.wait.flags = 0; |
1892 | owait.wait.func = memcg_oom_wake_function; | 1869 | owait.wait.func = memcg_oom_wake_function; |
1893 | owait.wait.private = current; | 1870 | owait.wait.private = current; |
1894 | INIT_LIST_HEAD(&owait.wait.task_list); | 1871 | INIT_LIST_HEAD(&owait.wait.task_list); |
1895 | need_to_kill = true; | 1872 | need_to_kill = true; |
1896 | /* At first, try to OOM lock hierarchy under mem.*/ | 1873 | mem_cgroup_mark_under_oom(memcg); |
1897 | mutex_lock(&memcg_oom_mutex); | 1874 | |
1898 | locked = mem_cgroup_oom_lock(mem); | 1875 | /* At first, try to OOM lock hierarchy under memcg.*/ |
1876 | spin_lock(&memcg_oom_lock); | ||
1877 | locked = mem_cgroup_oom_lock(memcg); | ||
1899 | /* | 1878 | /* |
1900 | * Even if signal_pending(), we can't quit charge() loop without | 1879 | * Even if signal_pending(), we can't quit charge() loop without |
1901 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 1880 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL |
1902 | * under OOM is always welcomed, use TASK_KILLABLE here. | 1881 | * under OOM is always welcomed, use TASK_KILLABLE here. |
1903 | */ | 1882 | */ |
1904 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 1883 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
1905 | if (!locked || mem->oom_kill_disable) | 1884 | if (!locked || memcg->oom_kill_disable) |
1906 | need_to_kill = false; | 1885 | need_to_kill = false; |
1907 | if (locked) | 1886 | if (locked) |
1908 | mem_cgroup_oom_notify(mem); | 1887 | mem_cgroup_oom_notify(memcg); |
1909 | mutex_unlock(&memcg_oom_mutex); | 1888 | spin_unlock(&memcg_oom_lock); |
1910 | 1889 | ||
1911 | if (need_to_kill) { | 1890 | if (need_to_kill) { |
1912 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1891 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1913 | mem_cgroup_out_of_memory(mem, mask); | 1892 | mem_cgroup_out_of_memory(memcg, mask); |
1914 | } else { | 1893 | } else { |
1915 | schedule(); | 1894 | schedule(); |
1916 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1895 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1917 | } | 1896 | } |
1918 | mutex_lock(&memcg_oom_mutex); | 1897 | spin_lock(&memcg_oom_lock); |
1919 | mem_cgroup_oom_unlock(mem); | 1898 | if (locked) |
1920 | memcg_wakeup_oom(mem); | 1899 | mem_cgroup_oom_unlock(memcg); |
1921 | mutex_unlock(&memcg_oom_mutex); | 1900 | memcg_wakeup_oom(memcg); |
1901 | spin_unlock(&memcg_oom_lock); | ||
1902 | |||
1903 | mem_cgroup_unmark_under_oom(memcg); | ||
1922 | 1904 | ||
1923 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 1905 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) |
1924 | return false; | 1906 | return false; |
1925 | /* Give chance to dying process */ | 1907 | /* Give chance to dying process */ |
1926 | schedule_timeout(1); | 1908 | schedule_timeout_uninterruptible(1); |
1927 | return true; | 1909 | return true; |
1928 | } | 1910 | } |
1929 | 1911 | ||
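
memcg_oom_wake_function() above hangs every waiter on one global memcg_oom_waitq and wakes a sleeper only when the recovering group and the waiter's group are in the same hierarchy (mem_cgroup_same_or_subtree() tested in both directions). The pthread sketch below shows the same filtered-wakeup idea in userspace: recovery marks every group in the recovering subtree and broadcasts, and each waiter rechecks a flag for its own group, so waiters outside the subtree simply go back to sleep. The tiny group table, the recovered[] state and the helper names are assumptions for illustration, not the kernel mechanism.

/* build: cc -pthread oom_wait.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define NGROUPS 3

/* Tiny hierarchy: group 0 is the root, groups 1 and 2 are its children. */
static const int parent[NGROUPS] = { -1, 0, 0 };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t oom_waitq = PTHREAD_COND_INITIALIZER;
static bool recovered[NGROUPS];

static bool same_or_subtree(int root, int g)
{
    for (; g >= 0; g = parent[g])
        if (g == root)
            return true;
    return false;
}

static void *oom_waiter(void *arg)
{
    int my_group = *(int *)arg;

    pthread_mutex_lock(&lock);
    while (!recovered[my_group])    /* filtered wakeup: recheck our state */
        pthread_cond_wait(&oom_waitq, &lock);
    pthread_mutex_unlock(&lock);
    printf("waiter in group %d woken\n", my_group);
    return NULL;
}

/* "OOM recovered" in group target: wake only waiters under that subtree. */
static void oom_recover(int target)
{
    pthread_mutex_lock(&lock);
    for (int g = 0; g < NGROUPS; g++)
        if (same_or_subtree(target, g))
            recovered[g] = true;
    pthread_cond_broadcast(&oom_waitq);
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    pthread_t tid[NGROUPS];
    int ids[NGROUPS] = { 0, 1, 2 };

    for (int g = 0; g < NGROUPS; g++)
        pthread_create(&tid[g], NULL, oom_waiter, &ids[g]);
    sleep(1);
    oom_recover(1);    /* wakes only the group-1 waiter */
    oom_recover(0);    /* root: wakes everyone else */
    for (int g = 0; g < NGROUPS; g++)
        pthread_join(tid[g], NULL);
    return 0;
}
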
@@ -1954,7 +1936,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1954 | void mem_cgroup_update_page_stat(struct page *page, | 1936 | void mem_cgroup_update_page_stat(struct page *page, |
1955 | enum mem_cgroup_page_stat_item idx, int val) | 1937 | enum mem_cgroup_page_stat_item idx, int val) |
1956 | { | 1938 | { |
1957 | struct mem_cgroup *mem; | 1939 | struct mem_cgroup *memcg; |
1958 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1940 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1959 | bool need_unlock = false; | 1941 | bool need_unlock = false; |
1960 | unsigned long uninitialized_var(flags); | 1942 | unsigned long uninitialized_var(flags); |
@@ -1963,16 +1945,16 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
1963 | return; | 1945 | return; |
1964 | 1946 | ||
1965 | rcu_read_lock(); | 1947 | rcu_read_lock(); |
1966 | mem = pc->mem_cgroup; | 1948 | memcg = pc->mem_cgroup; |
1967 | if (unlikely(!mem || !PageCgroupUsed(pc))) | 1949 | if (unlikely(!memcg || !PageCgroupUsed(pc))) |
1968 | goto out; | 1950 | goto out; |
1969 | /* pc->mem_cgroup is unstable ? */ | 1951 | /* pc->mem_cgroup is unstable ? */ |
1970 | if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { | 1952 | if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) { |
1971 | /* take a lock against to access pc->mem_cgroup */ | 1953 | /* take a lock against to access pc->mem_cgroup */ |
1972 | move_lock_page_cgroup(pc, &flags); | 1954 | move_lock_page_cgroup(pc, &flags); |
1973 | need_unlock = true; | 1955 | need_unlock = true; |
1974 | mem = pc->mem_cgroup; | 1956 | memcg = pc->mem_cgroup; |
1975 | if (!mem || !PageCgroupUsed(pc)) | 1957 | if (!memcg || !PageCgroupUsed(pc)) |
1976 | goto out; | 1958 | goto out; |
1977 | } | 1959 | } |
1978 | 1960 | ||
@@ -1988,7 +1970,7 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
1988 | BUG(); | 1970 | BUG(); |
1989 | } | 1971 | } |
1990 | 1972 | ||
1991 | this_cpu_add(mem->stat->count[idx], val); | 1973 | this_cpu_add(memcg->stat->count[idx], val); |
1992 | 1974 | ||
1993 | out: | 1975 | out: |
1994 | if (unlikely(need_unlock)) | 1976 | if (unlikely(need_unlock)) |
@@ -2019,13 +2001,13 @@ static DEFINE_MUTEX(percpu_charge_mutex); | |||
2019 | * cgroup which is not current target, returns false. This stock will be | 2001 | * cgroup which is not current target, returns false. This stock will be |
2020 | * refilled. | 2002 | * refilled. |
2021 | */ | 2003 | */ |
2022 | static bool consume_stock(struct mem_cgroup *mem) | 2004 | static bool consume_stock(struct mem_cgroup *memcg) |
2023 | { | 2005 | { |
2024 | struct memcg_stock_pcp *stock; | 2006 | struct memcg_stock_pcp *stock; |
2025 | bool ret = true; | 2007 | bool ret = true; |
2026 | 2008 | ||
2027 | stock = &get_cpu_var(memcg_stock); | 2009 | stock = &get_cpu_var(memcg_stock); |
2028 | if (mem == stock->cached && stock->nr_pages) | 2010 | if (memcg == stock->cached && stock->nr_pages) |
2029 | stock->nr_pages--; | 2011 | stock->nr_pages--; |
2030 | else /* need to call res_counter_charge */ | 2012 | else /* need to call res_counter_charge */ |
2031 | ret = false; | 2013 | ret = false; |
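
consume_stock() above lets a single-page charge come out of a per-cpu cache of pages that were already charged to the res_counter in one batch, falling back to the slow global path only when the cache belongs to another memcg or has run dry. The single-threaded sketch below walks through that consume/refill bookkeeping with made-up numbers; struct stock, global_charge() and the batch size are illustrative, and the real code keeps one such stock per cpu with preemption disabled around the access.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-cpu cache of pre-charged pages. */
struct stock {
    int cached_memcg;    /* which group the cached pages belong to */
    unsigned int nr_pages;
};

static long global_usage;    /* stand-in for the shared res_counter */

/* Slow path: charge the shared counter in batches. */
static void global_charge(unsigned int nr_pages)
{
    global_usage += nr_pages;
}

/* Fast path: take one page from the cache if it matches this group. */
static bool consume_stock(struct stock *s, int memcg)
{
    if (memcg == s->cached_memcg && s->nr_pages) {
        s->nr_pages--;
        return true;
    }
    return false;
}

/* Put leftover pre-charged pages back into the cache for later. */
static void refill_stock(struct stock *s, int memcg, unsigned int nr_pages)
{
    if (s->cached_memcg != memcg) {
        global_usage -= s->nr_pages;    /* drop the stale pre-charge */
        s->cached_memcg = memcg;
        s->nr_pages = 0;
    }
    s->nr_pages += nr_pages;
}

int main(void)
{
    struct stock s = { .cached_memcg = -1 };
    int memcg = 7, hits = 0;

    for (int i = 0; i < 10; i++) {
        if (consume_stock(&s, memcg)) {
            hits++;
            continue;
        }
        global_charge(32);             /* charge a whole batch... */
        refill_stock(&s, memcg, 31);   /* ...and keep the surplus cached */
    }
    printf("fast-path hits: %d, global usage: %ld pages\n", hits, global_usage);
    return 0;
}
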
@@ -2066,72 +2048,83 @@ static void drain_local_stock(struct work_struct *dummy) | |||
2066 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 2048 | * Cache charges(val) which is from res_counter, to local per_cpu area. |
2067 | * This will be consumed by consume_stock() function, later. | 2049 | * This will be consumed by consume_stock() function, later. |
2068 | */ | 2050 | */ |
2069 | static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) | 2051 | static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
2070 | { | 2052 | { |
2071 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | 2053 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); |
2072 | 2054 | ||
2073 | if (stock->cached != mem) { /* reset if necessary */ | 2055 | if (stock->cached != memcg) { /* reset if necessary */ |
2074 | drain_stock(stock); | 2056 | drain_stock(stock); |
2075 | stock->cached = mem; | 2057 | stock->cached = memcg; |
2076 | } | 2058 | } |
2077 | stock->nr_pages += nr_pages; | 2059 | stock->nr_pages += nr_pages; |
2078 | put_cpu_var(memcg_stock); | 2060 | put_cpu_var(memcg_stock); |
2079 | } | 2061 | } |
2080 | 2062 | ||
2081 | /* | 2063 | /* |
2082 | * Tries to drain stocked charges in other cpus. This function is asynchronous | 2064 | * Drains all per-CPU charge caches for given root_memcg resp. subtree |
2083 | * and just put a work per cpu for draining locally on each cpu. Caller can | 2065 | * of the hierarchy under it. sync flag says whether we should block |
2084 | * expect some charges will be back to res_counter later but cannot wait for | 2066 | * until the work is done. |
2085 | * it. | ||
2086 | */ | 2067 | */ |
2087 | static void drain_all_stock_async(struct mem_cgroup *root_mem) | 2068 | static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) |
2088 | { | 2069 | { |
2089 | int cpu, curcpu; | 2070 | int cpu, curcpu; |
2090 | /* | 2071 | |
2091 | * If someone calls draining, avoid adding more kworker runs. | ||
2092 | */ | ||
2093 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2094 | return; | ||
2095 | /* Notify other cpus that system-wide "drain" is running */ | 2072 | /* Notify other cpus that system-wide "drain" is running */ |
2096 | get_online_cpus(); | 2073 | get_online_cpus(); |
2097 | /* | 2074 | curcpu = get_cpu(); |
2098 | * Get a hint for avoiding draining charges on the current cpu, | ||
2099 | * which must be exhausted by our charging. It is not required that | ||
2100 | * this be a precise check, so we use raw_smp_processor_id() instead of | ||
2101 | * getcpu()/putcpu(). | ||
2102 | */ | ||
2103 | curcpu = raw_smp_processor_id(); | ||
2104 | for_each_online_cpu(cpu) { | 2075 | for_each_online_cpu(cpu) { |
2105 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2076 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2106 | struct mem_cgroup *mem; | 2077 | struct mem_cgroup *memcg; |
2107 | 2078 | ||
2108 | if (cpu == curcpu) | 2079 | memcg = stock->cached; |
2080 | if (!memcg || !stock->nr_pages) | ||
2109 | continue; | 2081 | continue; |
2110 | 2082 | if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) | |
2111 | mem = stock->cached; | ||
2112 | if (!mem) | ||
2113 | continue; | 2083 | continue; |
2114 | if (mem != root_mem) { | 2084 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { |
2115 | if (!root_mem->use_hierarchy) | 2085 | if (cpu == curcpu) |
2116 | continue; | 2086 | drain_local_stock(&stock->work); |
2117 | /* check whether "mem" is under tree of "root_mem" */ | 2087 | else |
2118 | if (!css_is_ancestor(&mem->css, &root_mem->css)) | 2088 | schedule_work_on(cpu, &stock->work); |
2119 | continue; | ||
2120 | } | 2089 | } |
2121 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2122 | schedule_work_on(cpu, &stock->work); | ||
2123 | } | 2090 | } |
2091 | put_cpu(); | ||
2092 | |||
2093 | if (!sync) | ||
2094 | goto out; | ||
2095 | |||
2096 | for_each_online_cpu(cpu) { | ||
2097 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
2098 | if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2099 | flush_work(&stock->work); | ||
2100 | } | ||
2101 | out: | ||
2124 | put_online_cpus(); | 2102 | put_online_cpus(); |
2103 | } | ||
2104 | |||
2105 | /* | ||
2106 | * Tries to drain stocked charges in other cpus. This function is asynchronous | ||
2107 | * and just put a work per cpu for draining locally on each cpu. Caller can | ||
2108 | * expect some charges will be back to res_counter later but cannot wait for | ||
2109 | * it. | ||
2110 | */ | ||
2111 | static void drain_all_stock_async(struct mem_cgroup *root_memcg) | ||
2112 | { | ||
2113 | /* | ||
2114 | * If someone calls draining, avoid adding more kworker runs. | ||
2115 | */ | ||
2116 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2117 | return; | ||
2118 | drain_all_stock(root_memcg, false); | ||
2125 | mutex_unlock(&percpu_charge_mutex); | 2119 | mutex_unlock(&percpu_charge_mutex); |
2126 | /* We don't wait for flush_work */ | ||
2127 | } | 2120 | } |
2128 | 2121 | ||
2129 | /* This is a synchronous drain interface. */ | 2122 | /* This is a synchronous drain interface. */ |
2130 | static void drain_all_stock_sync(void) | 2123 | static void drain_all_stock_sync(struct mem_cgroup *root_memcg) |
2131 | { | 2124 | { |
2132 | /* called when force_empty is called */ | 2125 | /* called when force_empty is called */ |
2133 | mutex_lock(&percpu_charge_mutex); | 2126 | mutex_lock(&percpu_charge_mutex); |
2134 | schedule_on_each_cpu(drain_local_stock); | 2127 | drain_all_stock(root_memcg, true); |
2135 | mutex_unlock(&percpu_charge_mutex); | 2128 | mutex_unlock(&percpu_charge_mutex); |
2136 | } | 2129 | } |
2137 | 2130 | ||
@@ -2139,35 +2132,35 @@ static void drain_all_stock_sync(void) | |||
2139 | * This function drains percpu counter value from DEAD cpu and | 2132 | * This function drains percpu counter value from DEAD cpu and |
2140 | * move it to local cpu. Note that this function can be preempted. | 2133 | * move it to local cpu. Note that this function can be preempted. |
2141 | */ | 2134 | */ |
2142 | static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) | 2135 | static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) |
2143 | { | 2136 | { |
2144 | int i; | 2137 | int i; |
2145 | 2138 | ||
2146 | spin_lock(&mem->pcp_counter_lock); | 2139 | spin_lock(&memcg->pcp_counter_lock); |
2147 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { | 2140 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { |
2148 | long x = per_cpu(mem->stat->count[i], cpu); | 2141 | long x = per_cpu(memcg->stat->count[i], cpu); |
2149 | 2142 | ||
2150 | per_cpu(mem->stat->count[i], cpu) = 0; | 2143 | per_cpu(memcg->stat->count[i], cpu) = 0; |
2151 | mem->nocpu_base.count[i] += x; | 2144 | memcg->nocpu_base.count[i] += x; |
2152 | } | 2145 | } |
2153 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | 2146 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { |
2154 | unsigned long x = per_cpu(mem->stat->events[i], cpu); | 2147 | unsigned long x = per_cpu(memcg->stat->events[i], cpu); |
2155 | 2148 | ||
2156 | per_cpu(mem->stat->events[i], cpu) = 0; | 2149 | per_cpu(memcg->stat->events[i], cpu) = 0; |
2157 | mem->nocpu_base.events[i] += x; | 2150 | memcg->nocpu_base.events[i] += x; |
2158 | } | 2151 | } |
2159 | /* need to clear ON_MOVE value, works as a kind of lock. */ | 2152 | /* need to clear ON_MOVE value, works as a kind of lock. */ |
2160 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; | 2153 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; |
2161 | spin_unlock(&mem->pcp_counter_lock); | 2154 | spin_unlock(&memcg->pcp_counter_lock); |
2162 | } | 2155 | } |
2163 | 2156 | ||
2164 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) | 2157 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) |
2165 | { | 2158 | { |
2166 | int idx = MEM_CGROUP_ON_MOVE; | 2159 | int idx = MEM_CGROUP_ON_MOVE; |
2167 | 2160 | ||
2168 | spin_lock(&mem->pcp_counter_lock); | 2161 | spin_lock(&memcg->pcp_counter_lock); |
2169 | per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; | 2162 | per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; |
2170 | spin_unlock(&mem->pcp_counter_lock); | 2163 | spin_unlock(&memcg->pcp_counter_lock); |
2171 | } | 2164 | } |
2172 | 2165 | ||
2173 | static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | 2166 | static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, |
@@ -2205,7 +2198,7 @@ enum { | |||
2205 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ | 2198 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ |
2206 | }; | 2199 | }; |
2207 | 2200 | ||
2208 | static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | 2201 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
2209 | unsigned int nr_pages, bool oom_check) | 2202 | unsigned int nr_pages, bool oom_check) |
2210 | { | 2203 | { |
2211 | unsigned long csize = nr_pages * PAGE_SIZE; | 2204 | unsigned long csize = nr_pages * PAGE_SIZE; |
@@ -2214,16 +2207,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
2214 | unsigned long flags = 0; | 2207 | unsigned long flags = 0; |
2215 | int ret; | 2208 | int ret; |
2216 | 2209 | ||
2217 | ret = res_counter_charge(&mem->res, csize, &fail_res); | 2210 | ret = res_counter_charge(&memcg->res, csize, &fail_res); |
2218 | 2211 | ||
2219 | if (likely(!ret)) { | 2212 | if (likely(!ret)) { |
2220 | if (!do_swap_account) | 2213 | if (!do_swap_account) |
2221 | return CHARGE_OK; | 2214 | return CHARGE_OK; |
2222 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); | 2215 | ret = res_counter_charge(&memcg->memsw, csize, &fail_res); |
2223 | if (likely(!ret)) | 2216 | if (likely(!ret)) |
2224 | return CHARGE_OK; | 2217 | return CHARGE_OK; |
2225 | 2218 | ||
2226 | res_counter_uncharge(&mem->res, csize); | 2219 | res_counter_uncharge(&memcg->res, csize); |
2227 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 2220 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); |
2228 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 2221 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
2229 | } else | 2222 | } else |
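
mem_cgroup_do_charge() above charges memcg->res first and, when swap accounting is on, memcg->memsw as well; if the second charge fails it uncharges the first before reporting which limit was hit. A compact sketch of that charge-then-undo step with toy counters and limits; counter_charge()/counter_uncharge() and the numbers are assumptions, not res_counter calls.

#include <stdbool.h>
#include <stdio.h>

struct counter {
    long usage;
    long limit;
};

/* Charge "pages" against one counter unless it would exceed the limit. */
static bool counter_charge(struct counter *c, long pages)
{
    if (c->usage + pages > c->limit)
        return false;
    c->usage += pages;
    return true;
}

static void counter_uncharge(struct counter *c, long pages)
{
    c->usage -= pages;
}

/* Charge memory first, then mem+swap; undo the first on failure. */
static bool do_charge(struct counter *res, struct counter *memsw, long pages)
{
    if (!counter_charge(res, pages))
        return false;    /* hit the memory limit */
    if (!counter_charge(memsw, pages)) {
        counter_uncharge(res, pages);
        return false;    /* hit the mem+swap limit */
    }
    return true;
}

int main(void)
{
    struct counter res = { 0, 100 }, memsw = { 90, 100 };

    printf("charge 8: %s\n", do_charge(&res, &memsw, 8) ? "ok" : "failed");
    printf("charge 8: %s\n", do_charge(&res, &memsw, 8) ? "ok" : "failed");
    printf("res usage after rollback: %ld\n", res.usage);
    return 0;
}
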
@@ -2281,12 +2274,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
2281 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 2274 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
2282 | gfp_t gfp_mask, | 2275 | gfp_t gfp_mask, |
2283 | unsigned int nr_pages, | 2276 | unsigned int nr_pages, |
2284 | struct mem_cgroup **memcg, | 2277 | struct mem_cgroup **ptr, |
2285 | bool oom) | 2278 | bool oom) |
2286 | { | 2279 | { |
2287 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | 2280 | unsigned int batch = max(CHARGE_BATCH, nr_pages); |
2288 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2281 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2289 | struct mem_cgroup *mem = NULL; | 2282 | struct mem_cgroup *memcg = NULL; |
2290 | int ret; | 2283 | int ret; |
2291 | 2284 | ||
2292 | /* | 2285 | /* |
@@ -2304,17 +2297,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2304 | * thread group leader migrates. It's possible that mm is not | 2297 | * thread group leader migrates. It's possible that mm is not |
2305 | * set, if so charge the init_mm (happens for pagecache usage). | 2298 | * set, if so charge the init_mm (happens for pagecache usage). |
2306 | */ | 2299 | */ |
2307 | if (!*memcg && !mm) | 2300 | if (!*ptr && !mm) |
2308 | goto bypass; | 2301 | goto bypass; |
2309 | again: | 2302 | again: |
2310 | if (*memcg) { /* css should be a valid one */ | 2303 | if (*ptr) { /* css should be a valid one */ |
2311 | mem = *memcg; | 2304 | memcg = *ptr; |
2312 | VM_BUG_ON(css_is_removed(&mem->css)); | 2305 | VM_BUG_ON(css_is_removed(&memcg->css)); |
2313 | if (mem_cgroup_is_root(mem)) | 2306 | if (mem_cgroup_is_root(memcg)) |
2314 | goto done; | 2307 | goto done; |
2315 | if (nr_pages == 1 && consume_stock(mem)) | 2308 | if (nr_pages == 1 && consume_stock(memcg)) |
2316 | goto done; | 2309 | goto done; |
2317 | css_get(&mem->css); | 2310 | css_get(&memcg->css); |
2318 | } else { | 2311 | } else { |
2319 | struct task_struct *p; | 2312 | struct task_struct *p; |
2320 | 2313 | ||
@@ -2322,7 +2315,7 @@ again: | |||
2322 | p = rcu_dereference(mm->owner); | 2315 | p = rcu_dereference(mm->owner); |
2323 | /* | 2316 | /* |
2324 | * Because we don't have task_lock(), "p" can exit. | 2317 | * Because we don't have task_lock(), "p" can exit. |
2325 | * In that case, "mem" can point to root or p can be NULL with | 2318 | * In that case, "memcg" can point to root or p can be NULL with |
2326 | * race with swapoff. Then, we have a small risk of mis-accounting. | 2319 | * race with swapoff. Then, we have a small risk of mis-accounting. |
2327 | * But this kind of mis-accounting by race always happens because | 2320 | * But this kind of mis-accounting by race always happens because |
2328 | * we don't have cgroup_mutex(). It's overkill and we allow that | 2321 | * we don't have cgroup_mutex(). It's overkill and we allow that |
@@ -2330,12 +2323,12 @@ again: | |||
2330 | * (*) swapoff at el will charge against mm-struct not against | 2323 | * (*) swapoff at el will charge against mm-struct not against |
2331 | * task-struct. So, mm->owner can be NULL. | 2324 | * task-struct. So, mm->owner can be NULL. |
2332 | */ | 2325 | */ |
2333 | mem = mem_cgroup_from_task(p); | 2326 | memcg = mem_cgroup_from_task(p); |
2334 | if (!mem || mem_cgroup_is_root(mem)) { | 2327 | if (!memcg || mem_cgroup_is_root(memcg)) { |
2335 | rcu_read_unlock(); | 2328 | rcu_read_unlock(); |
2336 | goto done; | 2329 | goto done; |
2337 | } | 2330 | } |
2338 | if (nr_pages == 1 && consume_stock(mem)) { | 2331 | if (nr_pages == 1 && consume_stock(memcg)) { |
2339 | /* | 2332 | /* |
2340 | * It seems dangerous to access memcg without css_get(). | 2333 | * It seems dangerous to access memcg without css_get(). |
2341 | * But considering how consume_stock works, it's not | 2334 | * But considering how consume_stock works, it's not |
@@ -2348,7 +2341,7 @@ again: | |||
2348 | goto done; | 2341 | goto done; |
2349 | } | 2342 | } |
2350 | /* after here, we may be blocked. we need to get refcnt */ | 2343 | /* after here, we may be blocked. we need to get refcnt */ |
2351 | if (!css_tryget(&mem->css)) { | 2344 | if (!css_tryget(&memcg->css)) { |
2352 | rcu_read_unlock(); | 2345 | rcu_read_unlock(); |
2353 | goto again; | 2346 | goto again; |
2354 | } | 2347 | } |
@@ -2360,7 +2353,7 @@ again: | |||
2360 | 2353 | ||
2361 | /* If killed, bypass charge */ | 2354 | /* If killed, bypass charge */ |
2362 | if (fatal_signal_pending(current)) { | 2355 | if (fatal_signal_pending(current)) { |
2363 | css_put(&mem->css); | 2356 | css_put(&memcg->css); |
2364 | goto bypass; | 2357 | goto bypass; |
2365 | } | 2358 | } |
2366 | 2359 | ||
@@ -2370,43 +2363,43 @@ again: | |||
2370 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2363 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2371 | } | 2364 | } |
2372 | 2365 | ||
2373 | ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); | 2366 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); |
2374 | switch (ret) { | 2367 | switch (ret) { |
2375 | case CHARGE_OK: | 2368 | case CHARGE_OK: |
2376 | break; | 2369 | break; |
2377 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 2370 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
2378 | batch = nr_pages; | 2371 | batch = nr_pages; |
2379 | css_put(&mem->css); | 2372 | css_put(&memcg->css); |
2380 | mem = NULL; | 2373 | memcg = NULL; |
2381 | goto again; | 2374 | goto again; |
2382 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ | 2375 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ |
2383 | css_put(&mem->css); | 2376 | css_put(&memcg->css); |
2384 | goto nomem; | 2377 | goto nomem; |
2385 | case CHARGE_NOMEM: /* OOM routine works */ | 2378 | case CHARGE_NOMEM: /* OOM routine works */ |
2386 | if (!oom) { | 2379 | if (!oom) { |
2387 | css_put(&mem->css); | 2380 | css_put(&memcg->css); |
2388 | goto nomem; | 2381 | goto nomem; |
2389 | } | 2382 | } |
2390 | /* If oom, we never return -ENOMEM */ | 2383 | /* If oom, we never return -ENOMEM */ |
2391 | nr_oom_retries--; | 2384 | nr_oom_retries--; |
2392 | break; | 2385 | break; |
2393 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ | 2386 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ |
2394 | css_put(&mem->css); | 2387 | css_put(&memcg->css); |
2395 | goto bypass; | 2388 | goto bypass; |
2396 | } | 2389 | } |
2397 | } while (ret != CHARGE_OK); | 2390 | } while (ret != CHARGE_OK); |
2398 | 2391 | ||
2399 | if (batch > nr_pages) | 2392 | if (batch > nr_pages) |
2400 | refill_stock(mem, batch - nr_pages); | 2393 | refill_stock(memcg, batch - nr_pages); |
2401 | css_put(&mem->css); | 2394 | css_put(&memcg->css); |
2402 | done: | 2395 | done: |
2403 | *memcg = mem; | 2396 | *ptr = memcg; |
2404 | return 0; | 2397 | return 0; |
2405 | nomem: | 2398 | nomem: |
2406 | *memcg = NULL; | 2399 | *ptr = NULL; |
2407 | return -ENOMEM; | 2400 | return -ENOMEM; |
2408 | bypass: | 2401 | bypass: |
2409 | *memcg = NULL; | 2402 | *ptr = NULL; |
2410 | return 0; | 2403 | return 0; |
2411 | } | 2404 | } |
2412 | 2405 | ||
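
The __mem_cgroup_try_charge() loop above charges a whole batch (max(CHARGE_BATCH, nr_pages)), drops back to charging exactly nr_pages on CHARGE_RETRY, refuses to sleep without __GFP_WAIT, and bounds the number of OOM retries. The sketch below keeps only that retry skeleton around a fake do_charge(); the subset of enum values, the batch size of 32 and MAX_OOM_RETRIES are assumptions, and the bypass/OOM-kill legs are left out.

#include <stdio.h>

enum charge_result {
    CHARGE_OK,
    CHARGE_RETRY,    /* transient failure, try again */
    CHARGE_NOMEM,    /* over limit even after reclaim */
};

#define MAX_OOM_RETRIES 3

static int attempts;

/* Toy charge attempt: fail twice with RETRY, then succeed. */
static enum charge_result do_charge(unsigned int batch)
{
    (void)batch;
    return ++attempts < 3 ? CHARGE_RETRY : CHARGE_OK;
}

/* Retry loop in the spirit of the hunk above. */
static int try_charge(unsigned int nr_pages, int can_oom)
{
    unsigned int batch = nr_pages > 32 ? nr_pages : 32;    /* batch charging */
    int nr_oom_retries = MAX_OOM_RETRIES;

    for (;;) {
        switch (do_charge(batch)) {
        case CHARGE_OK:
            printf("charged %u page(s) via a batch of %u\n", nr_pages, batch);
            return 0;
        case CHARGE_RETRY:
            batch = nr_pages;    /* stop over-charging and retry */
            continue;
        case CHARGE_NOMEM:
            if (!can_oom || !nr_oom_retries--)
                return -1;
            continue;
        }
    }
}

int main(void)
{
    return try_charge(1, 1) ? 1 : 0;
}
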
@@ -2415,15 +2408,15 @@ bypass: | |||
2415 | * This function is for that and do uncharge, put css's refcnt. | 2408 | * This function is for that and do uncharge, put css's refcnt. |
2416 | * gotten by try_charge(). | 2409 | * gotten by try_charge(). |
2417 | */ | 2410 | */ |
2418 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | 2411 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, |
2419 | unsigned int nr_pages) | 2412 | unsigned int nr_pages) |
2420 | { | 2413 | { |
2421 | if (!mem_cgroup_is_root(mem)) { | 2414 | if (!mem_cgroup_is_root(memcg)) { |
2422 | unsigned long bytes = nr_pages * PAGE_SIZE; | 2415 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2423 | 2416 | ||
2424 | res_counter_uncharge(&mem->res, bytes); | 2417 | res_counter_uncharge(&memcg->res, bytes); |
2425 | if (do_swap_account) | 2418 | if (do_swap_account) |
2426 | res_counter_uncharge(&mem->memsw, bytes); | 2419 | res_counter_uncharge(&memcg->memsw, bytes); |
2427 | } | 2420 | } |
2428 | } | 2421 | } |
2429 | 2422 | ||
@@ -2448,7 +2441,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
2448 | 2441 | ||
2449 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2442 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
2450 | { | 2443 | { |
2451 | struct mem_cgroup *mem = NULL; | 2444 | struct mem_cgroup *memcg = NULL; |
2452 | struct page_cgroup *pc; | 2445 | struct page_cgroup *pc; |
2453 | unsigned short id; | 2446 | unsigned short id; |
2454 | swp_entry_t ent; | 2447 | swp_entry_t ent; |
@@ -2458,23 +2451,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2458 | pc = lookup_page_cgroup(page); | 2451 | pc = lookup_page_cgroup(page); |
2459 | lock_page_cgroup(pc); | 2452 | lock_page_cgroup(pc); |
2460 | if (PageCgroupUsed(pc)) { | 2453 | if (PageCgroupUsed(pc)) { |
2461 | mem = pc->mem_cgroup; | 2454 | memcg = pc->mem_cgroup; |
2462 | if (mem && !css_tryget(&mem->css)) | 2455 | if (memcg && !css_tryget(&memcg->css)) |
2463 | mem = NULL; | 2456 | memcg = NULL; |
2464 | } else if (PageSwapCache(page)) { | 2457 | } else if (PageSwapCache(page)) { |
2465 | ent.val = page_private(page); | 2458 | ent.val = page_private(page); |
2466 | id = lookup_swap_cgroup(ent); | 2459 | id = lookup_swap_cgroup(ent); |
2467 | rcu_read_lock(); | 2460 | rcu_read_lock(); |
2468 | mem = mem_cgroup_lookup(id); | 2461 | memcg = mem_cgroup_lookup(id); |
2469 | if (mem && !css_tryget(&mem->css)) | 2462 | if (memcg && !css_tryget(&memcg->css)) |
2470 | mem = NULL; | 2463 | memcg = NULL; |
2471 | rcu_read_unlock(); | 2464 | rcu_read_unlock(); |
2472 | } | 2465 | } |
2473 | unlock_page_cgroup(pc); | 2466 | unlock_page_cgroup(pc); |
2474 | return mem; | 2467 | return memcg; |
2475 | } | 2468 | } |
2476 | 2469 | ||
2477 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 2470 | static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, |
2478 | struct page *page, | 2471 | struct page *page, |
2479 | unsigned int nr_pages, | 2472 | unsigned int nr_pages, |
2480 | struct page_cgroup *pc, | 2473 | struct page_cgroup *pc, |
@@ -2483,14 +2476,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2483 | lock_page_cgroup(pc); | 2476 | lock_page_cgroup(pc); |
2484 | if (unlikely(PageCgroupUsed(pc))) { | 2477 | if (unlikely(PageCgroupUsed(pc))) { |
2485 | unlock_page_cgroup(pc); | 2478 | unlock_page_cgroup(pc); |
2486 | __mem_cgroup_cancel_charge(mem, nr_pages); | 2479 | __mem_cgroup_cancel_charge(memcg, nr_pages); |
2487 | return; | 2480 | return; |
2488 | } | 2481 | } |
2489 | /* | 2482 | /* |
2490 | * we don't need page_cgroup_lock about tail pages, because they are not | 2483 | * we don't need page_cgroup_lock about tail pages, because they are not |
2491 | * accessed by any other context at this point. | 2484 | * accessed by any other context at this point. |
2492 | */ | 2485 | */ |
2493 | pc->mem_cgroup = mem; | 2486 | pc->mem_cgroup = memcg; |
2494 | /* | 2487 | /* |
2495 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | 2488 | * We access a page_cgroup asynchronously without lock_page_cgroup(). |
2496 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup | 2489 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup |
@@ -2513,14 +2506,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2513 | break; | 2506 | break; |
2514 | } | 2507 | } |
2515 | 2508 | ||
2516 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); | 2509 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); |
2517 | unlock_page_cgroup(pc); | 2510 | unlock_page_cgroup(pc); |
2518 | /* | 2511 | /* |
2519 | * "charge_statistics" updated event counter. Then, check it. | 2512 | * "charge_statistics" updated event counter. Then, check it. |
2520 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 2513 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
2521 | * if they exceed the softlimit. | 2514 | * if they exceed the softlimit. |
2522 | */ | 2515 | */ |
2523 | memcg_check_events(mem, page); | 2516 | memcg_check_events(memcg, page); |
2524 | } | 2517 | } |
2525 | 2518 | ||
2526 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2519 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
@@ -2707,7 +2700,7 @@ out: | |||
2707 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 2700 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, |
2708 | gfp_t gfp_mask, enum charge_type ctype) | 2701 | gfp_t gfp_mask, enum charge_type ctype) |
2709 | { | 2702 | { |
2710 | struct mem_cgroup *mem = NULL; | 2703 | struct mem_cgroup *memcg = NULL; |
2711 | unsigned int nr_pages = 1; | 2704 | unsigned int nr_pages = 1; |
2712 | struct page_cgroup *pc; | 2705 | struct page_cgroup *pc; |
2713 | bool oom = true; | 2706 | bool oom = true; |
@@ -2726,11 +2719,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2726 | pc = lookup_page_cgroup(page); | 2719 | pc = lookup_page_cgroup(page); |
2727 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ | 2720 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ |
2728 | 2721 | ||
2729 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); | 2722 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); |
2730 | if (ret || !mem) | 2723 | if (ret || !memcg) |
2731 | return ret; | 2724 | return ret; |
2732 | 2725 | ||
2733 | __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); | 2726 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); |
2734 | return 0; | 2727 | return 0; |
2735 | } | 2728 | } |
2736 | 2729 | ||
@@ -2759,7 +2752,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
2759 | enum charge_type ctype); | 2752 | enum charge_type ctype); |
2760 | 2753 | ||
2761 | static void | 2754 | static void |
2762 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | 2755 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, |
2763 | enum charge_type ctype) | 2756 | enum charge_type ctype) |
2764 | { | 2757 | { |
2765 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2758 | struct page_cgroup *pc = lookup_page_cgroup(page); |
@@ -2769,7 +2762,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | |||
2769 | * LRU. Take care of it. | 2762 | * LRU. Take care of it. |
2770 | */ | 2763 | */ |
2771 | mem_cgroup_lru_del_before_commit(page); | 2764 | mem_cgroup_lru_del_before_commit(page); |
2772 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); | 2765 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); |
2773 | mem_cgroup_lru_add_after_commit(page); | 2766 | mem_cgroup_lru_add_after_commit(page); |
2774 | return; | 2767 | return; |
2775 | } | 2768 | } |
@@ -2777,44 +2770,20 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | |||
2777 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 2770 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2778 | gfp_t gfp_mask) | 2771 | gfp_t gfp_mask) |
2779 | { | 2772 | { |
2780 | struct mem_cgroup *mem = NULL; | 2773 | struct mem_cgroup *memcg = NULL; |
2781 | int ret; | 2774 | int ret; |
2782 | 2775 | ||
2783 | if (mem_cgroup_disabled()) | 2776 | if (mem_cgroup_disabled()) |
2784 | return 0; | 2777 | return 0; |
2785 | if (PageCompound(page)) | 2778 | if (PageCompound(page)) |
2786 | return 0; | 2779 | return 0; |
2787 | /* | ||
2788 | * Corner case handling. This is called from add_to_page_cache() | ||
2789 | * in usual. But some FS (shmem) precharges this page before calling it | ||
2790 | * and call add_to_page_cache() with GFP_NOWAIT. | ||
2791 | * | ||
2792 | * For GFP_NOWAIT case, the page may be pre-charged before calling | ||
2793 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call | ||
2794 | * charge twice. (It works but has to pay a bit larger cost.) | ||
2795 | * And when the page is SwapCache, it should take swap information | ||
2796 | * into account. This is under lock_page() now. | ||
2797 | */ | ||
2798 | if (!(gfp_mask & __GFP_WAIT)) { | ||
2799 | struct page_cgroup *pc; | ||
2800 | |||
2801 | pc = lookup_page_cgroup(page); | ||
2802 | if (!pc) | ||
2803 | return 0; | ||
2804 | lock_page_cgroup(pc); | ||
2805 | if (PageCgroupUsed(pc)) { | ||
2806 | unlock_page_cgroup(pc); | ||
2807 | return 0; | ||
2808 | } | ||
2809 | unlock_page_cgroup(pc); | ||
2810 | } | ||
2811 | 2780 | ||
2812 | if (unlikely(!mm)) | 2781 | if (unlikely(!mm)) |
2813 | mm = &init_mm; | 2782 | mm = &init_mm; |
2814 | 2783 | ||
2815 | if (page_is_file_cache(page)) { | 2784 | if (page_is_file_cache(page)) { |
2816 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); | 2785 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); |
2817 | if (ret || !mem) | 2786 | if (ret || !memcg) |
2818 | return ret; | 2787 | return ret; |
2819 | 2788 | ||
2820 | /* | 2789 | /* |
@@ -2822,15 +2791,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2822 | * put that would remove them from the LRU list, make | 2791 | * put that would remove them from the LRU list, make |
2823 | * sure that they get relinked properly. | 2792 | * sure that they get relinked properly. |
2824 | */ | 2793 | */ |
2825 | __mem_cgroup_commit_charge_lrucare(page, mem, | 2794 | __mem_cgroup_commit_charge_lrucare(page, memcg, |
2826 | MEM_CGROUP_CHARGE_TYPE_CACHE); | 2795 | MEM_CGROUP_CHARGE_TYPE_CACHE); |
2827 | return ret; | 2796 | return ret; |
2828 | } | 2797 | } |
2829 | /* shmem */ | 2798 | /* shmem */ |
2830 | if (PageSwapCache(page)) { | 2799 | if (PageSwapCache(page)) { |
2831 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | 2800 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); |
2832 | if (!ret) | 2801 | if (!ret) |
2833 | __mem_cgroup_commit_charge_swapin(page, mem, | 2802 | __mem_cgroup_commit_charge_swapin(page, memcg, |
2834 | MEM_CGROUP_CHARGE_TYPE_SHMEM); | 2803 | MEM_CGROUP_CHARGE_TYPE_SHMEM); |
2835 | } else | 2804 | } else |
2836 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | 2805 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, |
@@ -2849,7 +2818,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2849 | struct page *page, | 2818 | struct page *page, |
2850 | gfp_t mask, struct mem_cgroup **ptr) | 2819 | gfp_t mask, struct mem_cgroup **ptr) |
2851 | { | 2820 | { |
2852 | struct mem_cgroup *mem; | 2821 | struct mem_cgroup *memcg; |
2853 | int ret; | 2822 | int ret; |
2854 | 2823 | ||
2855 | *ptr = NULL; | 2824 | *ptr = NULL; |
@@ -2867,12 +2836,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2867 | */ | 2836 | */ |
2868 | if (!PageSwapCache(page)) | 2837 | if (!PageSwapCache(page)) |
2869 | goto charge_cur_mm; | 2838 | goto charge_cur_mm; |
2870 | mem = try_get_mem_cgroup_from_page(page); | 2839 | memcg = try_get_mem_cgroup_from_page(page); |
2871 | if (!mem) | 2840 | if (!memcg) |
2872 | goto charge_cur_mm; | 2841 | goto charge_cur_mm; |
2873 | *ptr = mem; | 2842 | *ptr = memcg; |
2874 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); | 2843 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); |
2875 | css_put(&mem->css); | 2844 | css_put(&memcg->css); |
2876 | return ret; | 2845 | return ret; |
2877 | charge_cur_mm: | 2846 | charge_cur_mm: |
2878 | if (unlikely(!mm)) | 2847 | if (unlikely(!mm)) |
@@ -2932,16 +2901,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | |||
2932 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2901 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
2933 | } | 2902 | } |
2934 | 2903 | ||
2935 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | 2904 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) |
2936 | { | 2905 | { |
2937 | if (mem_cgroup_disabled()) | 2906 | if (mem_cgroup_disabled()) |
2938 | return; | 2907 | return; |
2939 | if (!mem) | 2908 | if (!memcg) |
2940 | return; | 2909 | return; |
2941 | __mem_cgroup_cancel_charge(mem, 1); | 2910 | __mem_cgroup_cancel_charge(memcg, 1); |
2942 | } | 2911 | } |
2943 | 2912 | ||
2944 | static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, | 2913 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, |
2945 | unsigned int nr_pages, | 2914 | unsigned int nr_pages, |
2946 | const enum charge_type ctype) | 2915 | const enum charge_type ctype) |
2947 | { | 2916 | { |
@@ -2959,7 +2928,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, | |||
2959 | * uncharges. Then, it's ok to ignore memcg's refcnt. | 2928 | * uncharges. Then, it's ok to ignore memcg's refcnt. |
2960 | */ | 2929 | */ |
2961 | if (!batch->memcg) | 2930 | if (!batch->memcg) |
2962 | batch->memcg = mem; | 2931 | batch->memcg = memcg; |
2963 | /* | 2932 | /* |
2964 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | 2933 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. |
2965 | * In those cases, all pages freed continuously can be expected to be in | 2934 | * In those cases, all pages freed continuously can be expected to be in |
@@ -2979,7 +2948,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, | |||
2979 | * merge a series of uncharges to an uncharge of res_counter. | 2948 | * merge a series of uncharges to an uncharge of res_counter. |
2980 | * If not, we uncharge res_counter ony by one. | 2949 | * If not, we uncharge res_counter ony by one. |
2981 | */ | 2950 | */ |
2982 | if (batch->memcg != mem) | 2951 | if (batch->memcg != memcg) |
2983 | goto direct_uncharge; | 2952 | goto direct_uncharge; |
2984 | /* remember freed charge and uncharge it later */ | 2953 | /* remember freed charge and uncharge it later */ |
2985 | batch->nr_pages++; | 2954 | batch->nr_pages++; |
@@ -2987,11 +2956,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, | |||
2987 | batch->memsw_nr_pages++; | 2956 | batch->memsw_nr_pages++; |
2988 | return; | 2957 | return; |
2989 | direct_uncharge: | 2958 | direct_uncharge: |
2990 | res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); | 2959 | res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); |
2991 | if (uncharge_memsw) | 2960 | if (uncharge_memsw) |
2992 | res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); | 2961 | res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); |
2993 | if (unlikely(batch->memcg != mem)) | 2962 | if (unlikely(batch->memcg != memcg)) |
2994 | memcg_oom_recover(mem); | 2963 | memcg_oom_recover(memcg); |
2995 | return; | 2964 | return; |
2996 | } | 2965 | } |
2997 | 2966 | ||
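mem_cgroup_do_uncharge() above merges page uncharges that belong to the same memcg into the caller's batch so a burst of frees hits the res_counter once, and falls back to a direct uncharge when a different memcg shows up. A simplified sketch of that batching idea, with hypothetical stand-in types rather than the kernel structures:

#include <stddef.h>

struct uncharge_batch {
	void *memcg;			/* which counter this batch belongs to */
	unsigned long nr_pages;		/* pages accumulated so far */
};

static void counter_uncharge(void *memcg, unsigned long nr_pages)
{
	/* stand-in for res_counter_uncharge() */
	(void)memcg;
	(void)nr_pages;
}

static void batched_uncharge(struct uncharge_batch *batch,
			     void *memcg, unsigned long nr_pages)
{
	if (!batch->memcg)
		batch->memcg = memcg;		/* first uncharge claims the batch */
	if (batch->memcg != memcg) {
		/* different memcg: cannot merge, uncharge directly */
		counter_uncharge(memcg, nr_pages);
		return;
	}
	/* same memcg: remember the pages and flush once at the end */
	batch->nr_pages += nr_pages;
}

static void flush_uncharge(struct uncharge_batch *batch)
{
	if (batch->memcg && batch->nr_pages)
		counter_uncharge(batch->memcg, batch->nr_pages);
	batch->memcg = NULL;
	batch->nr_pages = 0;
}

The fallback is what keeps the batch optional for correctness: anything that cannot be merged is uncharged immediately instead of being deferred.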
@@ -3001,7 +2970,7 @@ direct_uncharge: | |||
3001 | static struct mem_cgroup * | 2970 | static struct mem_cgroup * |
3002 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2971 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
3003 | { | 2972 | { |
3004 | struct mem_cgroup *mem = NULL; | 2973 | struct mem_cgroup *memcg = NULL; |
3005 | unsigned int nr_pages = 1; | 2974 | unsigned int nr_pages = 1; |
3006 | struct page_cgroup *pc; | 2975 | struct page_cgroup *pc; |
3007 | 2976 | ||
@@ -3024,7 +2993,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
3024 | 2993 | ||
3025 | lock_page_cgroup(pc); | 2994 | lock_page_cgroup(pc); |
3026 | 2995 | ||
3027 | mem = pc->mem_cgroup; | 2996 | memcg = pc->mem_cgroup; |
3028 | 2997 | ||
3029 | if (!PageCgroupUsed(pc)) | 2998 | if (!PageCgroupUsed(pc)) |
3030 | goto unlock_out; | 2999 | goto unlock_out; |
@@ -3047,7 +3016,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
3047 | break; | 3016 | break; |
3048 | } | 3017 | } |
3049 | 3018 | ||
3050 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); | 3019 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); |
3051 | 3020 | ||
3052 | ClearPageCgroupUsed(pc); | 3021 | ClearPageCgroupUsed(pc); |
3053 | /* | 3022 | /* |
@@ -3059,18 +3028,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
3059 | 3028 | ||
3060 | unlock_page_cgroup(pc); | 3029 | unlock_page_cgroup(pc); |
3061 | /* | 3030 | /* |
3062 | * even after unlock, we have mem->res.usage here and this memcg | 3031 | * even after unlock, we have memcg->res.usage here and this memcg |
3063 | * will never be freed. | 3032 | * will never be freed. |
3064 | */ | 3033 | */ |
3065 | memcg_check_events(mem, page); | 3034 | memcg_check_events(memcg, page); |
3066 | if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { | 3035 | if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { |
3067 | mem_cgroup_swap_statistics(mem, true); | 3036 | mem_cgroup_swap_statistics(memcg, true); |
3068 | mem_cgroup_get(mem); | 3037 | mem_cgroup_get(memcg); |
3069 | } | 3038 | } |
3070 | if (!mem_cgroup_is_root(mem)) | 3039 | if (!mem_cgroup_is_root(memcg)) |
3071 | mem_cgroup_do_uncharge(mem, nr_pages, ctype); | 3040 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); |
3072 | 3041 | ||
3073 | return mem; | 3042 | return memcg; |
3074 | 3043 | ||
3075 | unlock_out: | 3044 | unlock_out: |
3076 | unlock_page_cgroup(pc); | 3045 | unlock_page_cgroup(pc); |
@@ -3260,7 +3229,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3260 | int mem_cgroup_prepare_migration(struct page *page, | 3229 | int mem_cgroup_prepare_migration(struct page *page, |
3261 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) | 3230 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) |
3262 | { | 3231 | { |
3263 | struct mem_cgroup *mem = NULL; | 3232 | struct mem_cgroup *memcg = NULL; |
3264 | struct page_cgroup *pc; | 3233 | struct page_cgroup *pc; |
3265 | enum charge_type ctype; | 3234 | enum charge_type ctype; |
3266 | int ret = 0; | 3235 | int ret = 0; |
@@ -3274,8 +3243,8 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3274 | pc = lookup_page_cgroup(page); | 3243 | pc = lookup_page_cgroup(page); |
3275 | lock_page_cgroup(pc); | 3244 | lock_page_cgroup(pc); |
3276 | if (PageCgroupUsed(pc)) { | 3245 | if (PageCgroupUsed(pc)) { |
3277 | mem = pc->mem_cgroup; | 3246 | memcg = pc->mem_cgroup; |
3278 | css_get(&mem->css); | 3247 | css_get(&memcg->css); |
3279 | /* | 3248 | /* |
3280 | * At migrating an anonymous page, its mapcount goes down | 3249 | * At migrating an anonymous page, its mapcount goes down |
3281 | * to 0 and uncharge() will be called. But, even if it's fully | 3250 | * to 0 and uncharge() will be called. But, even if it's fully |
@@ -3313,12 +3282,12 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3313 | * If the page is not charged at this point, | 3282 | * If the page is not charged at this point, |
3314 | * we return here. | 3283 | * we return here. |
3315 | */ | 3284 | */ |
3316 | if (!mem) | 3285 | if (!memcg) |
3317 | return 0; | 3286 | return 0; |
3318 | 3287 | ||
3319 | *ptr = mem; | 3288 | *ptr = memcg; |
3320 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); | 3289 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); |
3321 | css_put(&mem->css);/* drop extra refcnt */ | 3290 | css_put(&memcg->css);/* drop extra refcnt */ |
3322 | if (ret || *ptr == NULL) { | 3291 | if (ret || *ptr == NULL) { |
3323 | if (PageAnon(page)) { | 3292 | if (PageAnon(page)) { |
3324 | lock_page_cgroup(pc); | 3293 | lock_page_cgroup(pc); |
@@ -3344,21 +3313,21 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3344 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3313 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3345 | else | 3314 | else |
3346 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3315 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
3347 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); | 3316 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); |
3348 | return ret; | 3317 | return ret; |
3349 | } | 3318 | } |
3350 | 3319 | ||
3351 | /* remove redundant charge if migration failed*/ | 3320 | /* remove redundant charge if migration failed*/ |
3352 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 3321 | void mem_cgroup_end_migration(struct mem_cgroup *memcg, |
3353 | struct page *oldpage, struct page *newpage, bool migration_ok) | 3322 | struct page *oldpage, struct page *newpage, bool migration_ok) |
3354 | { | 3323 | { |
3355 | struct page *used, *unused; | 3324 | struct page *used, *unused; |
3356 | struct page_cgroup *pc; | 3325 | struct page_cgroup *pc; |
3357 | 3326 | ||
3358 | if (!mem) | 3327 | if (!memcg) |
3359 | return; | 3328 | return; |
3360 | /* blocks rmdir() */ | 3329 | /* blocks rmdir() */ |
3361 | cgroup_exclude_rmdir(&mem->css); | 3330 | cgroup_exclude_rmdir(&memcg->css); |
3362 | if (!migration_ok) { | 3331 | if (!migration_ok) { |
3363 | used = oldpage; | 3332 | used = oldpage; |
3364 | unused = newpage; | 3333 | unused = newpage; |
@@ -3394,32 +3363,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
3394 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 3363 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
3395 | * In that case, we need to call pre_destroy() again. check it here. | 3364 | * In that case, we need to call pre_destroy() again. check it here. |
3396 | */ | 3365 | */ |
3397 | cgroup_release_and_wakeup_rmdir(&mem->css); | 3366 | cgroup_release_and_wakeup_rmdir(&memcg->css); |
3398 | } | ||
3399 | |||
3400 | /* | ||
3401 | * A call to try to shrink memory usage on charge failure at shmem's swapin. | ||
3402 | * Calling hierarchical_reclaim is not enough because we should update | ||
3403 | * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. | ||
3404 | * Moreover considering hierarchy, we should reclaim from the mem_over_limit, | ||
3405 | * not from the memcg which this page would be charged to. | ||
3406 | * try_charge_swapin does all of these works properly. | ||
3407 | */ | ||
3408 | int mem_cgroup_shmem_charge_fallback(struct page *page, | ||
3409 | struct mm_struct *mm, | ||
3410 | gfp_t gfp_mask) | ||
3411 | { | ||
3412 | struct mem_cgroup *mem; | ||
3413 | int ret; | ||
3414 | |||
3415 | if (mem_cgroup_disabled()) | ||
3416 | return 0; | ||
3417 | |||
3418 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | ||
3419 | if (!ret) | ||
3420 | mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ | ||
3421 | |||
3422 | return ret; | ||
3423 | } | 3367 | } |
3424 | 3368 | ||
3425 | #ifdef CONFIG_DEBUG_VM | 3369 | #ifdef CONFIG_DEBUG_VM |
@@ -3498,7 +3442,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3498 | /* | 3442 | /* |
3499 | * Rather than hide all in some function, I do this in | 3443 | * Rather than hide all in some function, I do this in |
3500 | * open coded manner. You see what this really does. | 3444 | * open coded manner. You see what this really does. |
3501 | * We have to guarantee mem->res.limit < mem->memsw.limit. | 3445 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. |
3502 | */ | 3446 | */ |
3503 | mutex_lock(&set_limit_mutex); | 3447 | mutex_lock(&set_limit_mutex); |
3504 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3448 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
@@ -3560,7 +3504,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3560 | /* | 3504 | /* |
3561 | * Rather than hide all in some function, I do this in | 3505 | * Rather than hide all in some function, I do this in |
3562 | * open coded manner. You see what this really does. | 3506 | * open coded manner. You see what this really does. |
3563 | * We have to guarantee mem->res.limit < mem->memsw.limit. | 3507 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. |
3564 | */ | 3508 | */ |
3565 | mutex_lock(&set_limit_mutex); | 3509 | mutex_lock(&set_limit_mutex); |
3566 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3510 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
@@ -3698,7 +3642,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3698 | * This routine traverse page_cgroup in given list and drop them all. | 3642 | * This routine traverse page_cgroup in given list and drop them all. |
3699 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 3643 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
3700 | */ | 3644 | */ |
3701 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 3645 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3702 | int node, int zid, enum lru_list lru) | 3646 | int node, int zid, enum lru_list lru) |
3703 | { | 3647 | { |
3704 | struct zone *zone; | 3648 | struct zone *zone; |
@@ -3709,7 +3653,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3709 | int ret = 0; | 3653 | int ret = 0; |
3710 | 3654 | ||
3711 | zone = &NODE_DATA(node)->node_zones[zid]; | 3655 | zone = &NODE_DATA(node)->node_zones[zid]; |
3712 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 3656 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
3713 | list = &mz->lists[lru]; | 3657 | list = &mz->lists[lru]; |
3714 | 3658 | ||
3715 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 3659 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
@@ -3736,7 +3680,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3736 | 3680 | ||
3737 | page = lookup_cgroup_page(pc); | 3681 | page = lookup_cgroup_page(pc); |
3738 | 3682 | ||
3739 | ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); | 3683 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); |
3740 | if (ret == -ENOMEM) | 3684 | if (ret == -ENOMEM) |
3741 | break; | 3685 | break; |
3742 | 3686 | ||
@@ -3757,14 +3701,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3757 | * make mem_cgroup's charge to be 0 if there is no task. | 3701 | * make mem_cgroup's charge to be 0 if there is no task. |
3758 | * This enables deleting this mem_cgroup. | 3702 | * This enables deleting this mem_cgroup. |
3759 | */ | 3703 | */ |
3760 | static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) | 3704 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) |
3761 | { | 3705 | { |
3762 | int ret; | 3706 | int ret; |
3763 | int node, zid, shrink; | 3707 | int node, zid, shrink; |
3764 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 3708 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
3765 | struct cgroup *cgrp = mem->css.cgroup; | 3709 | struct cgroup *cgrp = memcg->css.cgroup; |
3766 | 3710 | ||
3767 | css_get(&mem->css); | 3711 | css_get(&memcg->css); |
3768 | 3712 | ||
3769 | shrink = 0; | 3713 | shrink = 0; |
3770 | /* should free all ? */ | 3714 | /* should free all ? */ |
@@ -3780,14 +3724,14 @@ move_account: | |||
3780 | goto out; | 3724 | goto out; |
3781 | /* This is for making all *used* pages to be on LRU. */ | 3725 | /* This is for making all *used* pages to be on LRU. */ |
3782 | lru_add_drain_all(); | 3726 | lru_add_drain_all(); |
3783 | drain_all_stock_sync(); | 3727 | drain_all_stock_sync(memcg); |
3784 | ret = 0; | 3728 | ret = 0; |
3785 | mem_cgroup_start_move(mem); | 3729 | mem_cgroup_start_move(memcg); |
3786 | for_each_node_state(node, N_HIGH_MEMORY) { | 3730 | for_each_node_state(node, N_HIGH_MEMORY) { |
3787 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 3731 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
3788 | enum lru_list l; | 3732 | enum lru_list l; |
3789 | for_each_lru(l) { | 3733 | for_each_lru(l) { |
3790 | ret = mem_cgroup_force_empty_list(mem, | 3734 | ret = mem_cgroup_force_empty_list(memcg, |
3791 | node, zid, l); | 3735 | node, zid, l); |
3792 | if (ret) | 3736 | if (ret) |
3793 | break; | 3737 | break; |
@@ -3796,16 +3740,16 @@ move_account: | |||
3796 | if (ret) | 3740 | if (ret) |
3797 | break; | 3741 | break; |
3798 | } | 3742 | } |
3799 | mem_cgroup_end_move(mem); | 3743 | mem_cgroup_end_move(memcg); |
3800 | memcg_oom_recover(mem); | 3744 | memcg_oom_recover(memcg); |
3801 | /* it seems parent cgroup doesn't have enough mem */ | 3745 | /* it seems parent cgroup doesn't have enough mem */ |
3802 | if (ret == -ENOMEM) | 3746 | if (ret == -ENOMEM) |
3803 | goto try_to_free; | 3747 | goto try_to_free; |
3804 | cond_resched(); | 3748 | cond_resched(); |
3805 | /* "ret" should also be checked to ensure all lists are empty. */ | 3749 | /* "ret" should also be checked to ensure all lists are empty. */ |
3806 | } while (mem->res.usage > 0 || ret); | 3750 | } while (memcg->res.usage > 0 || ret); |
3807 | out: | 3751 | out: |
3808 | css_put(&mem->css); | 3752 | css_put(&memcg->css); |
3809 | return ret; | 3753 | return ret; |
3810 | 3754 | ||
3811 | try_to_free: | 3755 | try_to_free: |
@@ -3818,15 +3762,15 @@ try_to_free: | |||
3818 | lru_add_drain_all(); | 3762 | lru_add_drain_all(); |
3819 | /* try to free all pages in this cgroup */ | 3763 | /* try to free all pages in this cgroup */ |
3820 | shrink = 1; | 3764 | shrink = 1; |
3821 | while (nr_retries && mem->res.usage > 0) { | 3765 | while (nr_retries && memcg->res.usage > 0) { |
3822 | int progress; | 3766 | int progress; |
3823 | 3767 | ||
3824 | if (signal_pending(current)) { | 3768 | if (signal_pending(current)) { |
3825 | ret = -EINTR; | 3769 | ret = -EINTR; |
3826 | goto out; | 3770 | goto out; |
3827 | } | 3771 | } |
3828 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, | 3772 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, |
3829 | false, get_swappiness(mem)); | 3773 | false); |
3830 | if (!progress) { | 3774 | if (!progress) { |
3831 | nr_retries--; | 3775 | nr_retries--; |
3832 | /* maybe some writeback is necessary */ | 3776 | /* maybe some writeback is necessary */ |
@@ -3854,12 +3798,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3854 | u64 val) | 3798 | u64 val) |
3855 | { | 3799 | { |
3856 | int retval = 0; | 3800 | int retval = 0; |
3857 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 3801 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3858 | struct cgroup *parent = cont->parent; | 3802 | struct cgroup *parent = cont->parent; |
3859 | struct mem_cgroup *parent_mem = NULL; | 3803 | struct mem_cgroup *parent_memcg = NULL; |
3860 | 3804 | ||
3861 | if (parent) | 3805 | if (parent) |
3862 | parent_mem = mem_cgroup_from_cont(parent); | 3806 | parent_memcg = mem_cgroup_from_cont(parent); |
3863 | 3807 | ||
3864 | cgroup_lock(); | 3808 | cgroup_lock(); |
3865 | /* | 3809 | /* |
@@ -3870,10 +3814,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3870 | * For the root cgroup, parent_mem is NULL, we allow value to be | 3814 | * For the root cgroup, parent_mem is NULL, we allow value to be |
3871 | * set if there are no children. | 3815 | * set if there are no children. |
3872 | */ | 3816 | */ |
3873 | if ((!parent_mem || !parent_mem->use_hierarchy) && | 3817 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && |
3874 | (val == 1 || val == 0)) { | 3818 | (val == 1 || val == 0)) { |
3875 | if (list_empty(&cont->children)) | 3819 | if (list_empty(&cont->children)) |
3876 | mem->use_hierarchy = val; | 3820 | memcg->use_hierarchy = val; |
3877 | else | 3821 | else |
3878 | retval = -EBUSY; | 3822 | retval = -EBUSY; |
3879 | } else | 3823 | } else |
@@ -3884,14 +3828,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3884 | } | 3828 | } |
3885 | 3829 | ||
3886 | 3830 | ||
3887 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, | 3831 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, |
3888 | enum mem_cgroup_stat_index idx) | 3832 | enum mem_cgroup_stat_index idx) |
3889 | { | 3833 | { |
3890 | struct mem_cgroup *iter; | 3834 | struct mem_cgroup *iter; |
3891 | long val = 0; | 3835 | long val = 0; |
3892 | 3836 | ||
3893 | /* Per-cpu values can be negative, use a signed accumulator */ | 3837 | /* Per-cpu values can be negative, use a signed accumulator */ |
3894 | for_each_mem_cgroup_tree(iter, mem) | 3838 | for_each_mem_cgroup_tree(iter, memcg) |
3895 | val += mem_cgroup_read_stat(iter, idx); | 3839 | val += mem_cgroup_read_stat(iter, idx); |
3896 | 3840 | ||
3897 | if (val < 0) /* race ? */ | 3841 | if (val < 0) /* race ? */ |
@@ -3899,29 +3843,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, | |||
3899 | return val; | 3843 | return val; |
3900 | } | 3844 | } |
3901 | 3845 | ||
3902 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | 3846 | static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) |
3903 | { | 3847 | { |
3904 | u64 val; | 3848 | u64 val; |
3905 | 3849 | ||
3906 | if (!mem_cgroup_is_root(mem)) { | 3850 | if (!mem_cgroup_is_root(memcg)) { |
3907 | if (!swap) | 3851 | if (!swap) |
3908 | return res_counter_read_u64(&mem->res, RES_USAGE); | 3852 | return res_counter_read_u64(&memcg->res, RES_USAGE); |
3909 | else | 3853 | else |
3910 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | 3854 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); |
3911 | } | 3855 | } |
3912 | 3856 | ||
3913 | val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); | 3857 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); |
3914 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); | 3858 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); |
3915 | 3859 | ||
3916 | if (swap) | 3860 | if (swap) |
3917 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 3861 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); |
3918 | 3862 | ||
3919 | return val << PAGE_SHIFT; | 3863 | return val << PAGE_SHIFT; |
3920 | } | 3864 | } |
3921 | 3865 | ||
3922 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 3866 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
3923 | { | 3867 | { |
3924 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 3868 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3925 | u64 val; | 3869 | u64 val; |
3926 | int type, name; | 3870 | int type, name; |
3927 | 3871 | ||
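mem_cgroup_recursive_stat() above sums statistics over the hierarchy in a signed accumulator because individual per-cpu deltas can be negative, then clamps a transiently negative total to zero; mem_cgroup_usage() turns the resulting page counts (cache + rss, plus swap when requested) into bytes with a PAGE_SHIFT shift. A small illustrative sketch of the signed-accumulator step, where the array and names are stand-ins rather than kernel code:

#include <stddef.h>

static unsigned long recursive_stat(const long *percpu_delta, size_t ncpus)
{
	long val = 0;
	size_t cpu;

	/* per-cpu deltas may individually be negative */
	for (cpu = 0; cpu < ncpus; cpu++)
		val += percpu_delta[cpu];

	if (val < 0)	/* racing updates can leave a transient negative sum */
		val = 0;
	return (unsigned long)val;
}

A caller would then combine the clamped page counts and shift by PAGE_SHIFT to report bytes, as mem_cgroup_usage() does for the root group.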
@@ -3930,15 +3874,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
3930 | switch (type) { | 3874 | switch (type) { |
3931 | case _MEM: | 3875 | case _MEM: |
3932 | if (name == RES_USAGE) | 3876 | if (name == RES_USAGE) |
3933 | val = mem_cgroup_usage(mem, false); | 3877 | val = mem_cgroup_usage(memcg, false); |
3934 | else | 3878 | else |
3935 | val = res_counter_read_u64(&mem->res, name); | 3879 | val = res_counter_read_u64(&memcg->res, name); |
3936 | break; | 3880 | break; |
3937 | case _MEMSWAP: | 3881 | case _MEMSWAP: |
3938 | if (name == RES_USAGE) | 3882 | if (name == RES_USAGE) |
3939 | val = mem_cgroup_usage(mem, true); | 3883 | val = mem_cgroup_usage(memcg, true); |
3940 | else | 3884 | else |
3941 | val = res_counter_read_u64(&mem->memsw, name); | 3885 | val = res_counter_read_u64(&memcg->memsw, name); |
3942 | break; | 3886 | break; |
3943 | default: | 3887 | default: |
3944 | BUG(); | 3888 | BUG(); |
@@ -4026,24 +3970,24 @@ out: | |||
4026 | 3970 | ||
4027 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 3971 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
4028 | { | 3972 | { |
4029 | struct mem_cgroup *mem; | 3973 | struct mem_cgroup *memcg; |
4030 | int type, name; | 3974 | int type, name; |
4031 | 3975 | ||
4032 | mem = mem_cgroup_from_cont(cont); | 3976 | memcg = mem_cgroup_from_cont(cont); |
4033 | type = MEMFILE_TYPE(event); | 3977 | type = MEMFILE_TYPE(event); |
4034 | name = MEMFILE_ATTR(event); | 3978 | name = MEMFILE_ATTR(event); |
4035 | switch (name) { | 3979 | switch (name) { |
4036 | case RES_MAX_USAGE: | 3980 | case RES_MAX_USAGE: |
4037 | if (type == _MEM) | 3981 | if (type == _MEM) |
4038 | res_counter_reset_max(&mem->res); | 3982 | res_counter_reset_max(&memcg->res); |
4039 | else | 3983 | else |
4040 | res_counter_reset_max(&mem->memsw); | 3984 | res_counter_reset_max(&memcg->memsw); |
4041 | break; | 3985 | break; |
4042 | case RES_FAILCNT: | 3986 | case RES_FAILCNT: |
4043 | if (type == _MEM) | 3987 | if (type == _MEM) |
4044 | res_counter_reset_failcnt(&mem->res); | 3988 | res_counter_reset_failcnt(&memcg->res); |
4045 | else | 3989 | else |
4046 | res_counter_reset_failcnt(&mem->memsw); | 3990 | res_counter_reset_failcnt(&memcg->memsw); |
4047 | break; | 3991 | break; |
4048 | } | 3992 | } |
4049 | 3993 | ||
@@ -4060,7 +4004,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | |||
4060 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | 4004 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, |
4061 | struct cftype *cft, u64 val) | 4005 | struct cftype *cft, u64 val) |
4062 | { | 4006 | { |
4063 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4007 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4064 | 4008 | ||
4065 | if (val >= (1 << NR_MOVE_TYPE)) | 4009 | if (val >= (1 << NR_MOVE_TYPE)) |
4066 | return -EINVAL; | 4010 | return -EINVAL; |
@@ -4070,7 +4014,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
4070 | * inconsistent. | 4014 | * inconsistent. |
4071 | */ | 4015 | */ |
4072 | cgroup_lock(); | 4016 | cgroup_lock(); |
4073 | mem->move_charge_at_immigrate = val; | 4017 | memcg->move_charge_at_immigrate = val; |
4074 | cgroup_unlock(); | 4018 | cgroup_unlock(); |
4075 | 4019 | ||
4076 | return 0; | 4020 | return 0; |
@@ -4127,49 +4071,49 @@ struct { | |||
4127 | 4071 | ||
4128 | 4072 | ||
4129 | static void | 4073 | static void |
4130 | mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | 4074 | mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) |
4131 | { | 4075 | { |
4132 | s64 val; | 4076 | s64 val; |
4133 | 4077 | ||
4134 | /* per cpu stat */ | 4078 | /* per cpu stat */ |
4135 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | 4079 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); |
4136 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 4080 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
4137 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | 4081 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); |
4138 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 4082 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
4139 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); | 4083 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); |
4140 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 4084 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
4141 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); | 4085 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); |
4142 | s->stat[MCS_PGPGIN] += val; | 4086 | s->stat[MCS_PGPGIN] += val; |
4143 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); | 4087 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); |
4144 | s->stat[MCS_PGPGOUT] += val; | 4088 | s->stat[MCS_PGPGOUT] += val; |
4145 | if (do_swap_account) { | 4089 | if (do_swap_account) { |
4146 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 4090 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); |
4147 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 4091 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
4148 | } | 4092 | } |
4149 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); | 4093 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); |
4150 | s->stat[MCS_PGFAULT] += val; | 4094 | s->stat[MCS_PGFAULT] += val; |
4151 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); | 4095 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); |
4152 | s->stat[MCS_PGMAJFAULT] += val; | 4096 | s->stat[MCS_PGMAJFAULT] += val; |
4153 | 4097 | ||
4154 | /* per zone stat */ | 4098 | /* per zone stat */ |
4155 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | 4099 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); |
4156 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; | 4100 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; |
4157 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); | 4101 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); |
4158 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; | 4102 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; |
4159 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); | 4103 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); |
4160 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; | 4104 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; |
4161 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); | 4105 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); |
4162 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; | 4106 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; |
4163 | val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); | 4107 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); |
4164 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; | 4108 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; |
4165 | } | 4109 | } |
4166 | 4110 | ||
4167 | static void | 4111 | static void |
4168 | mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | 4112 | mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) |
4169 | { | 4113 | { |
4170 | struct mem_cgroup *iter; | 4114 | struct mem_cgroup *iter; |
4171 | 4115 | ||
4172 | for_each_mem_cgroup_tree(iter, mem) | 4116 | for_each_mem_cgroup_tree(iter, memcg) |
4173 | mem_cgroup_get_local_stat(iter, s); | 4117 | mem_cgroup_get_local_stat(iter, s); |
4174 | } | 4118 | } |
4175 | 4119 | ||
@@ -4182,35 +4126,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4182 | struct cgroup *cont = m->private; | 4126 | struct cgroup *cont = m->private; |
4183 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 4127 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); |
4184 | 4128 | ||
4185 | total_nr = mem_cgroup_nr_lru_pages(mem_cont); | 4129 | total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); |
4186 | seq_printf(m, "total=%lu", total_nr); | 4130 | seq_printf(m, "total=%lu", total_nr); |
4187 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4131 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4188 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); | 4132 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); |
4189 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4133 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4190 | } | 4134 | } |
4191 | seq_putc(m, '\n'); | 4135 | seq_putc(m, '\n'); |
4192 | 4136 | ||
4193 | file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); | 4137 | file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); |
4194 | seq_printf(m, "file=%lu", file_nr); | 4138 | seq_printf(m, "file=%lu", file_nr); |
4195 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4139 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4196 | node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); | 4140 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, |
4141 | LRU_ALL_FILE); | ||
4197 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4142 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4198 | } | 4143 | } |
4199 | seq_putc(m, '\n'); | 4144 | seq_putc(m, '\n'); |
4200 | 4145 | ||
4201 | anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); | 4146 | anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); |
4202 | seq_printf(m, "anon=%lu", anon_nr); | 4147 | seq_printf(m, "anon=%lu", anon_nr); |
4203 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4148 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4204 | node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); | 4149 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, |
4150 | LRU_ALL_ANON); | ||
4205 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4151 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4206 | } | 4152 | } |
4207 | seq_putc(m, '\n'); | 4153 | seq_putc(m, '\n'); |
4208 | 4154 | ||
4209 | unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); | 4155 | unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); |
4210 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 4156 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
4211 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4157 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4212 | node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, | 4158 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, |
4213 | nid); | 4159 | BIT(LRU_UNEVICTABLE)); |
4214 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4160 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4215 | } | 4161 | } |
4216 | seq_putc(m, '\n'); | 4162 | seq_putc(m, '\n'); |
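The hunk above replaces the per-type counting helpers with a single mem_cgroup_nr_lru_pages() that selects LRU lists by bitmask (LRU_ALL, LRU_ALL_FILE, LRU_ALL_ANON, BIT(LRU_UNEVICTABLE)). A simplified sketch of that bitmask-selection idiom; the enum and mask names mirror the kernel's, but the counting function is an illustrative stand-in, not the real implementation:

enum lru_list {
	LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
	LRU_INACTIVE_FILE, LRU_ACTIVE_FILE,
	LRU_UNEVICTABLE, NR_LRU_LISTS
};

#define BIT(nr)		(1UL << (nr))
#define LRU_ALL_ANON	(BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL_FILE	(BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL		(LRU_ALL_ANON | LRU_ALL_FILE | BIT(LRU_UNEVICTABLE))

/* count pages on the LRU lists selected by lru_mask */
static unsigned long nr_lru_pages(const unsigned long per_lru[NR_LRU_LISTS],
				  unsigned long lru_mask)
{
	unsigned long total = 0;
	int l;

	for (l = 0; l < NR_LRU_LISTS; l++)
		if (lru_mask & BIT(l))
			total += per_lru[l];
	return total;
}

Collapsing the helpers this way leaves one loop over the LRU lists and lets each call site name exactly the subset it wants.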
@@ -4253,8 +4199,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4253 | } | 4199 | } |
4254 | 4200 | ||
4255 | #ifdef CONFIG_DEBUG_VM | 4201 | #ifdef CONFIG_DEBUG_VM |
4256 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | ||
4257 | |||
4258 | { | 4202 | { |
4259 | int nid, zid; | 4203 | int nid, zid; |
4260 | struct mem_cgroup_per_zone *mz; | 4204 | struct mem_cgroup_per_zone *mz; |
@@ -4288,7 +4232,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) | |||
4288 | { | 4232 | { |
4289 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 4233 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4290 | 4234 | ||
4291 | return get_swappiness(memcg); | 4235 | return mem_cgroup_swappiness(memcg); |
4292 | } | 4236 | } |
4293 | 4237 | ||
4294 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | 4238 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, |
@@ -4391,20 +4335,20 @@ static int compare_thresholds(const void *a, const void *b) | |||
4391 | return _a->threshold - _b->threshold; | 4335 | return _a->threshold - _b->threshold; |
4392 | } | 4336 | } |
4393 | 4337 | ||
4394 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) | 4338 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) |
4395 | { | 4339 | { |
4396 | struct mem_cgroup_eventfd_list *ev; | 4340 | struct mem_cgroup_eventfd_list *ev; |
4397 | 4341 | ||
4398 | list_for_each_entry(ev, &mem->oom_notify, list) | 4342 | list_for_each_entry(ev, &memcg->oom_notify, list) |
4399 | eventfd_signal(ev->eventfd, 1); | 4343 | eventfd_signal(ev->eventfd, 1); |
4400 | return 0; | 4344 | return 0; |
4401 | } | 4345 | } |
4402 | 4346 | ||
4403 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | 4347 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) |
4404 | { | 4348 | { |
4405 | struct mem_cgroup *iter; | 4349 | struct mem_cgroup *iter; |
4406 | 4350 | ||
4407 | for_each_mem_cgroup_tree(iter, mem) | 4351 | for_each_mem_cgroup_tree(iter, memcg) |
4408 | mem_cgroup_oom_notify_cb(iter); | 4352 | mem_cgroup_oom_notify_cb(iter); |
4409 | } | 4353 | } |
4410 | 4354 | ||
@@ -4578,15 +4522,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | |||
4578 | if (!event) | 4522 | if (!event) |
4579 | return -ENOMEM; | 4523 | return -ENOMEM; |
4580 | 4524 | ||
4581 | mutex_lock(&memcg_oom_mutex); | 4525 | spin_lock(&memcg_oom_lock); |
4582 | 4526 | ||
4583 | event->eventfd = eventfd; | 4527 | event->eventfd = eventfd; |
4584 | list_add(&event->list, &memcg->oom_notify); | 4528 | list_add(&event->list, &memcg->oom_notify); |
4585 | 4529 | ||
4586 | /* already in OOM ? */ | 4530 | /* already in OOM ? */ |
4587 | if (atomic_read(&memcg->oom_lock)) | 4531 | if (atomic_read(&memcg->under_oom)) |
4588 | eventfd_signal(eventfd, 1); | 4532 | eventfd_signal(eventfd, 1); |
4589 | mutex_unlock(&memcg_oom_mutex); | 4533 | spin_unlock(&memcg_oom_lock); |
4590 | 4534 | ||
4591 | return 0; | 4535 | return 0; |
4592 | } | 4536 | } |
@@ -4594,32 +4538,32 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | |||
4594 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | 4538 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, |
4595 | struct cftype *cft, struct eventfd_ctx *eventfd) | 4539 | struct cftype *cft, struct eventfd_ctx *eventfd) |
4596 | { | 4540 | { |
4597 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4541 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4598 | struct mem_cgroup_eventfd_list *ev, *tmp; | 4542 | struct mem_cgroup_eventfd_list *ev, *tmp; |
4599 | int type = MEMFILE_TYPE(cft->private); | 4543 | int type = MEMFILE_TYPE(cft->private); |
4600 | 4544 | ||
4601 | BUG_ON(type != _OOM_TYPE); | 4545 | BUG_ON(type != _OOM_TYPE); |
4602 | 4546 | ||
4603 | mutex_lock(&memcg_oom_mutex); | 4547 | spin_lock(&memcg_oom_lock); |
4604 | 4548 | ||
4605 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | 4549 | list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { |
4606 | if (ev->eventfd == eventfd) { | 4550 | if (ev->eventfd == eventfd) { |
4607 | list_del(&ev->list); | 4551 | list_del(&ev->list); |
4608 | kfree(ev); | 4552 | kfree(ev); |
4609 | } | 4553 | } |
4610 | } | 4554 | } |
4611 | 4555 | ||
4612 | mutex_unlock(&memcg_oom_mutex); | 4556 | spin_unlock(&memcg_oom_lock); |
4613 | } | 4557 | } |
4614 | 4558 | ||
4615 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | 4559 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, |
4616 | struct cftype *cft, struct cgroup_map_cb *cb) | 4560 | struct cftype *cft, struct cgroup_map_cb *cb) |
4617 | { | 4561 | { |
4618 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4562 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4619 | 4563 | ||
4620 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | 4564 | cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); |
4621 | 4565 | ||
4622 | if (atomic_read(&mem->oom_lock)) | 4566 | if (atomic_read(&memcg->under_oom)) |
4623 | cb->fill(cb, "under_oom", 1); | 4567 | cb->fill(cb, "under_oom", 1); |
4624 | else | 4568 | else |
4625 | cb->fill(cb, "under_oom", 0); | 4569 | cb->fill(cb, "under_oom", 0); |
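These hunks move the OOM-notification bookkeeping from memcg_oom_mutex to the memcg_oom_lock spinlock and key the "already in OOM" wakeup off memcg->under_oom: listeners park an eventfd on memcg->oom_notify and every registered eventfd is signalled when the group enters OOM. A hedged userspace sketch of that listener-list pattern, using a pthread spinlock and a fixed array in place of the kernel's spinlock and list_head; all names below are illustrative:

#include <pthread.h>
#include <sys/eventfd.h>

#define MAX_LISTENERS 8		/* fixed array for brevity; the kernel uses a list */

/* pthread_spin_init(&notify_lock, PTHREAD_PROCESS_PRIVATE) must run once first */
static pthread_spinlock_t notify_lock;
static int listeners[MAX_LISTENERS];
static int nr_listeners;
static int under_oom;		/* stands in for memcg->under_oom */

static int register_listener(int efd)
{
	int ret = -1;

	pthread_spin_lock(&notify_lock);
	if (nr_listeners < MAX_LISTENERS) {
		listeners[nr_listeners++] = efd;
		ret = 0;
	}
	/* already under OOM? wake the new listener right away */
	if (!ret && under_oom)
		eventfd_write(efd, 1);
	pthread_spin_unlock(&notify_lock);
	return ret;
}

static void notify_all(void)
{
	int i;

	pthread_spin_lock(&notify_lock);
	for (i = 0; i < nr_listeners; i++)
		eventfd_write(listeners[i], 1);
	pthread_spin_unlock(&notify_lock);
}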
@@ -4629,7 +4573,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | |||
4629 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | 4573 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, |
4630 | struct cftype *cft, u64 val) | 4574 | struct cftype *cft, u64 val) |
4631 | { | 4575 | { |
4632 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4576 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4633 | struct mem_cgroup *parent; | 4577 | struct mem_cgroup *parent; |
4634 | 4578 | ||
4635 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | 4579 | /* cannot set to root cgroup and only 0 and 1 are allowed */ |
@@ -4641,13 +4585,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4641 | cgroup_lock(); | 4585 | cgroup_lock(); |
4642 | /* oom-kill-disable is a flag for subhierarchy. */ | 4586 | /* oom-kill-disable is a flag for subhierarchy. */ |
4643 | if ((parent->use_hierarchy) || | 4587 | if ((parent->use_hierarchy) || |
4644 | (mem->use_hierarchy && !list_empty(&cgrp->children))) { | 4588 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { |
4645 | cgroup_unlock(); | 4589 | cgroup_unlock(); |
4646 | return -EINVAL; | 4590 | return -EINVAL; |
4647 | } | 4591 | } |
4648 | mem->oom_kill_disable = val; | 4592 | memcg->oom_kill_disable = val; |
4649 | if (!val) | 4593 | if (!val) |
4650 | memcg_oom_recover(mem); | 4594 | memcg_oom_recover(memcg); |
4651 | cgroup_unlock(); | 4595 | cgroup_unlock(); |
4652 | return 0; | 4596 | return 0; |
4653 | } | 4597 | } |
@@ -4783,7 +4727,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | |||
4783 | } | 4727 | } |
4784 | #endif | 4728 | #endif |
4785 | 4729 | ||
4786 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 4730 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
4787 | { | 4731 | { |
4788 | struct mem_cgroup_per_node *pn; | 4732 | struct mem_cgroup_per_node *pn; |
4789 | struct mem_cgroup_per_zone *mz; | 4733 | struct mem_cgroup_per_zone *mz; |
@@ -4803,21 +4747,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
4803 | if (!pn) | 4747 | if (!pn) |
4804 | return 1; | 4748 | return 1; |
4805 | 4749 | ||
4806 | mem->info.nodeinfo[node] = pn; | ||
4807 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4750 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4808 | mz = &pn->zoneinfo[zone]; | 4751 | mz = &pn->zoneinfo[zone]; |
4809 | for_each_lru(l) | 4752 | for_each_lru(l) |
4810 | INIT_LIST_HEAD(&mz->lists[l]); | 4753 | INIT_LIST_HEAD(&mz->lists[l]); |
4811 | mz->usage_in_excess = 0; | 4754 | mz->usage_in_excess = 0; |
4812 | mz->on_tree = false; | 4755 | mz->on_tree = false; |
4813 | mz->mem = mem; | 4756 | mz->mem = memcg; |
4814 | } | 4757 | } |
4758 | memcg->info.nodeinfo[node] = pn; | ||
4815 | return 0; | 4759 | return 0; |
4816 | } | 4760 | } |
4817 | 4761 | ||
4818 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 4762 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
4819 | { | 4763 | { |
4820 | kfree(mem->info.nodeinfo[node]); | 4764 | kfree(memcg->info.nodeinfo[node]); |
4821 | } | 4765 | } |
4822 | 4766 | ||
4823 | static struct mem_cgroup *mem_cgroup_alloc(void) | 4767 | static struct mem_cgroup *mem_cgroup_alloc(void) |
@@ -4859,51 +4803,51 @@ out_free: | |||
4859 | * Removal of cgroup itself succeeds regardless of refs from swap. | 4803 | * Removal of cgroup itself succeeds regardless of refs from swap. |
4860 | */ | 4804 | */ |
4861 | 4805 | ||
4862 | static void __mem_cgroup_free(struct mem_cgroup *mem) | 4806 | static void __mem_cgroup_free(struct mem_cgroup *memcg) |
4863 | { | 4807 | { |
4864 | int node; | 4808 | int node; |
4865 | 4809 | ||
4866 | mem_cgroup_remove_from_trees(mem); | 4810 | mem_cgroup_remove_from_trees(memcg); |
4867 | free_css_id(&mem_cgroup_subsys, &mem->css); | 4811 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
4868 | 4812 | ||
4869 | for_each_node_state(node, N_POSSIBLE) | 4813 | for_each_node_state(node, N_POSSIBLE) |
4870 | free_mem_cgroup_per_zone_info(mem, node); | 4814 | free_mem_cgroup_per_zone_info(memcg, node); |
4871 | 4815 | ||
4872 | free_percpu(mem->stat); | 4816 | free_percpu(memcg->stat); |
4873 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | 4817 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) |
4874 | kfree(mem); | 4818 | kfree(memcg); |
4875 | else | 4819 | else |
4876 | vfree(mem); | 4820 | vfree(memcg); |
4877 | } | 4821 | } |
4878 | 4822 | ||
4879 | static void mem_cgroup_get(struct mem_cgroup *mem) | 4823 | static void mem_cgroup_get(struct mem_cgroup *memcg) |
4880 | { | 4824 | { |
4881 | atomic_inc(&mem->refcnt); | 4825 | atomic_inc(&memcg->refcnt); |
4882 | } | 4826 | } |
4883 | 4827 | ||
4884 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) | 4828 | static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) |
4885 | { | 4829 | { |
4886 | if (atomic_sub_and_test(count, &mem->refcnt)) { | 4830 | if (atomic_sub_and_test(count, &memcg->refcnt)) { |
4887 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 4831 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
4888 | __mem_cgroup_free(mem); | 4832 | __mem_cgroup_free(memcg); |
4889 | if (parent) | 4833 | if (parent) |
4890 | mem_cgroup_put(parent); | 4834 | mem_cgroup_put(parent); |
4891 | } | 4835 | } |
4892 | } | 4836 | } |
4893 | 4837 | ||
4894 | static void mem_cgroup_put(struct mem_cgroup *mem) | 4838 | static void mem_cgroup_put(struct mem_cgroup *memcg) |
4895 | { | 4839 | { |
4896 | __mem_cgroup_put(mem, 1); | 4840 | __mem_cgroup_put(memcg, 1); |
4897 | } | 4841 | } |
4898 | 4842 | ||
4899 | /* | 4843 | /* |
4900 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 4844 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
4901 | */ | 4845 | */ |
4902 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) | 4846 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) |
4903 | { | 4847 | { |
4904 | if (!mem->res.parent) | 4848 | if (!memcg->res.parent) |
4905 | return NULL; | 4849 | return NULL; |
4906 | return mem_cgroup_from_res_counter(mem->res.parent, res); | 4850 | return mem_cgroup_from_res_counter(memcg->res.parent, res); |
4907 | } | 4851 | } |
4908 | 4852 | ||
4909 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4853 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
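__mem_cgroup_put() above drops the last reference with atomic_sub_and_test(), frees the group, and then releases the reference the child held on its parent, so teardown can cascade up the hierarchy. A minimal sketch of that cascading-refcount pattern with an illustrative struct, not the kernel's types:

#include <stdatomic.h>
#include <stdlib.h>

struct node {
	atomic_int refcnt;
	struct node *parent;	/* reference held for as long as the child exists */
};

static void node_put(struct node *n)
{
	while (n) {
		struct node *parent = n->parent;

		/* not the last reference: stop here */
		if (atomic_fetch_sub(&n->refcnt, 1) != 1)
			break;
		/* last reference: free this node, then drop its hold on the parent */
		free(n);
		n = parent;
	}
}

Iterating rather than recursing keeps the cascade bounded on deep hierarchies.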
@@ -4946,16 +4890,16 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
4946 | static struct cgroup_subsys_state * __ref | 4890 | static struct cgroup_subsys_state * __ref |
4947 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 4891 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
4948 | { | 4892 | { |
4949 | struct mem_cgroup *mem, *parent; | 4893 | struct mem_cgroup *memcg, *parent; |
4950 | long error = -ENOMEM; | 4894 | long error = -ENOMEM; |
4951 | int node; | 4895 | int node; |
4952 | 4896 | ||
4953 | mem = mem_cgroup_alloc(); | 4897 | memcg = mem_cgroup_alloc(); |
4954 | if (!mem) | 4898 | if (!memcg) |
4955 | return ERR_PTR(error); | 4899 | return ERR_PTR(error); |
4956 | 4900 | ||
4957 | for_each_node_state(node, N_POSSIBLE) | 4901 | for_each_node_state(node, N_POSSIBLE) |
4958 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 4902 | if (alloc_mem_cgroup_per_zone_info(memcg, node)) |
4959 | goto free_out; | 4903 | goto free_out; |
4960 | 4904 | ||
4961 | /* root ? */ | 4905 | /* root ? */ |
@@ -4963,7 +4907,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4963 | int cpu; | 4907 | int cpu; |
4964 | enable_swap_cgroup(); | 4908 | enable_swap_cgroup(); |
4965 | parent = NULL; | 4909 | parent = NULL; |
4966 | root_mem_cgroup = mem; | 4910 | root_mem_cgroup = memcg; |
4967 | if (mem_cgroup_soft_limit_tree_init()) | 4911 | if (mem_cgroup_soft_limit_tree_init()) |
4968 | goto free_out; | 4912 | goto free_out; |
4969 | for_each_possible_cpu(cpu) { | 4913 | for_each_possible_cpu(cpu) { |
@@ -4974,13 +4918,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4974 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 4918 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
4975 | } else { | 4919 | } else { |
4976 | parent = mem_cgroup_from_cont(cont->parent); | 4920 | parent = mem_cgroup_from_cont(cont->parent); |
4977 | mem->use_hierarchy = parent->use_hierarchy; | 4921 | memcg->use_hierarchy = parent->use_hierarchy; |
4978 | mem->oom_kill_disable = parent->oom_kill_disable; | 4922 | memcg->oom_kill_disable = parent->oom_kill_disable; |
4979 | } | 4923 | } |
4980 | 4924 | ||
4981 | if (parent && parent->use_hierarchy) { | 4925 | if (parent && parent->use_hierarchy) { |
4982 | res_counter_init(&mem->res, &parent->res); | 4926 | res_counter_init(&memcg->res, &parent->res); |
4983 | res_counter_init(&mem->memsw, &parent->memsw); | 4927 | res_counter_init(&memcg->memsw, &parent->memsw); |
4984 | /* | 4928 | /* |
4985 | * We increment refcnt of the parent to ensure that we can | 4929 | * We increment refcnt of the parent to ensure that we can |
4986 | * safely access it on res_counter_charge/uncharge. | 4930 | * safely access it on res_counter_charge/uncharge. |
@@ -4989,21 +4933,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4989 | */ | 4933 | */ |
4990 | mem_cgroup_get(parent); | 4934 | mem_cgroup_get(parent); |
4991 | } else { | 4935 | } else { |
4992 | res_counter_init(&mem->res, NULL); | 4936 | res_counter_init(&memcg->res, NULL); |
4993 | res_counter_init(&mem->memsw, NULL); | 4937 | res_counter_init(&memcg->memsw, NULL); |
4994 | } | 4938 | } |
4995 | mem->last_scanned_child = 0; | 4939 | memcg->last_scanned_child = 0; |
4996 | mem->last_scanned_node = MAX_NUMNODES; | 4940 | memcg->last_scanned_node = MAX_NUMNODES; |
4997 | INIT_LIST_HEAD(&mem->oom_notify); | 4941 | INIT_LIST_HEAD(&memcg->oom_notify); |
4998 | 4942 | ||
4999 | if (parent) | 4943 | if (parent) |
5000 | mem->swappiness = get_swappiness(parent); | 4944 | memcg->swappiness = mem_cgroup_swappiness(parent); |
5001 | atomic_set(&mem->refcnt, 1); | 4945 | atomic_set(&memcg->refcnt, 1); |
5002 | mem->move_charge_at_immigrate = 0; | 4946 | memcg->move_charge_at_immigrate = 0; |
5003 | mutex_init(&mem->thresholds_lock); | 4947 | mutex_init(&memcg->thresholds_lock); |
5004 | return &mem->css; | 4948 | return &memcg->css; |
5005 | free_out: | 4949 | free_out: |
5006 | __mem_cgroup_free(mem); | 4950 | __mem_cgroup_free(memcg); |
5007 | root_mem_cgroup = NULL; | 4951 | root_mem_cgroup = NULL; |
5008 | return ERR_PTR(error); | 4952 | return ERR_PTR(error); |
5009 | } | 4953 | } |
@@ -5011,17 +4955,17 @@ free_out: | |||
5011 | static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | 4955 | static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, |
5012 | struct cgroup *cont) | 4956 | struct cgroup *cont) |
5013 | { | 4957 | { |
5014 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 4958 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5015 | 4959 | ||
5016 | return mem_cgroup_force_empty(mem, false); | 4960 | return mem_cgroup_force_empty(memcg, false); |
5017 | } | 4961 | } |
5018 | 4962 | ||
5019 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 4963 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
5020 | struct cgroup *cont) | 4964 | struct cgroup *cont) |
5021 | { | 4965 | { |
5022 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 4966 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5023 | 4967 | ||
5024 | mem_cgroup_put(mem); | 4968 | mem_cgroup_put(memcg); |
5025 | } | 4969 | } |
5026 | 4970 | ||
5027 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 4971 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
@@ -5044,9 +4988,9 @@ static int mem_cgroup_do_precharge(unsigned long count) | |||
5044 | { | 4988 | { |
5045 | int ret = 0; | 4989 | int ret = 0; |
5046 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | 4990 | int batch_count = PRECHARGE_COUNT_AT_ONCE; |
5047 | struct mem_cgroup *mem = mc.to; | 4991 | struct mem_cgroup *memcg = mc.to; |
5048 | 4992 | ||
5049 | if (mem_cgroup_is_root(mem)) { | 4993 | if (mem_cgroup_is_root(memcg)) { |
5050 | mc.precharge += count; | 4994 | mc.precharge += count; |
5051 | /* we don't need css_get for root */ | 4995 | /* we don't need css_get for root */ |
5052 | return ret; | 4996 | return ret; |
@@ -5055,16 +4999,16 @@ static int mem_cgroup_do_precharge(unsigned long count) | |||
5055 | if (count > 1) { | 4999 | if (count > 1) { |
5056 | struct res_counter *dummy; | 5000 | struct res_counter *dummy; |
5057 | /* | 5001 | /* |
5058 | * "mem" cannot be under rmdir() because we've already checked | 5002 | * "memcg" cannot be under rmdir() because we've already checked |
5059 | * by cgroup_lock_live_cgroup() that it is not removed and we | 5003 | * by cgroup_lock_live_cgroup() that it is not removed and we |
5060 | * are still under the same cgroup_mutex. So we can postpone | 5004 | * are still under the same cgroup_mutex. So we can postpone |
5061 | * css_get(). | 5005 | * css_get(). |
5062 | */ | 5006 | */ |
5063 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | 5007 | if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) |
5064 | goto one_by_one; | 5008 | goto one_by_one; |
5065 | if (do_swap_account && res_counter_charge(&mem->memsw, | 5009 | if (do_swap_account && res_counter_charge(&memcg->memsw, |
5066 | PAGE_SIZE * count, &dummy)) { | 5010 | PAGE_SIZE * count, &dummy)) { |
5067 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | 5011 | res_counter_uncharge(&memcg->res, PAGE_SIZE * count); |
5068 | goto one_by_one; | 5012 | goto one_by_one; |
5069 | } | 5013 | } |
5070 | mc.precharge += count; | 5014 | mc.precharge += count; |
@@ -5081,8 +5025,9 @@ one_by_one: | |||
5081 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 5025 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
5082 | cond_resched(); | 5026 | cond_resched(); |
5083 | } | 5027 | } |
5084 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); | 5028 | ret = __mem_cgroup_try_charge(NULL, |
5085 | if (ret || !mem) | 5029 | GFP_KERNEL, 1, &memcg, false); |
5030 | if (ret || !memcg) | ||
5086 | /* mem_cgroup_clear_mc() will do uncharge later */ | 5031 | /* mem_cgroup_clear_mc() will do uncharge later */ |
5087 | return -ENOMEM; | 5032 | return -ENOMEM; |
5088 | mc.precharge++; | 5033 | mc.precharge++; |
@@ -5181,15 +5126,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
5181 | pgoff = pte_to_pgoff(ptent); | 5126 | pgoff = pte_to_pgoff(ptent); |
5182 | 5127 | ||
5183 | /* page is moved even if it's not RSS of this task (page-faulted). */ | 5128 | 
5184 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ | 5129 | page = find_get_page(mapping, pgoff); |
5185 | page = find_get_page(mapping, pgoff); | 5130 | |
5186 | } else { /* shmem/tmpfs file. we should take account of swap too. */ | 5131 | #ifdef CONFIG_SWAP |
5187 | swp_entry_t ent; | 5132 | /* shmem/tmpfs may report page out on swap: account for that too. */ |
5188 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); | 5133 | if (radix_tree_exceptional_entry(page)) { |
5134 | swp_entry_t swap = radix_to_swp_entry(page); | ||
5189 | if (do_swap_account) | 5135 | if (do_swap_account) |
5190 | entry->val = ent.val; | 5136 | *entry = swap; |
5137 | page = find_get_page(&swapper_space, swap.val); | ||
5191 | } | 5138 | } |
5192 | 5139 | #endif | |
5193 | return page; | 5140 | return page; |
5194 | } | 5141 | } |
5195 | 5142 | ||
@@ -5354,13 +5301,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
5354 | struct task_struct *p) | 5301 | struct task_struct *p) |
5355 | { | 5302 | { |
5356 | int ret = 0; | 5303 | int ret = 0; |
5357 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | 5304 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); |
5358 | 5305 | ||
5359 | if (mem->move_charge_at_immigrate) { | 5306 | if (memcg->move_charge_at_immigrate) { |
5360 | struct mm_struct *mm; | 5307 | struct mm_struct *mm; |
5361 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 5308 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
5362 | 5309 | ||
5363 | VM_BUG_ON(from == mem); | 5310 | VM_BUG_ON(from == memcg); |
5364 | 5311 | ||
5365 | mm = get_task_mm(p); | 5312 | mm = get_task_mm(p); |
5366 | if (!mm) | 5313 | if (!mm) |
@@ -5375,7 +5322,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
5375 | mem_cgroup_start_move(from); | 5322 | mem_cgroup_start_move(from); |
5376 | spin_lock(&mc.lock); | 5323 | spin_lock(&mc.lock); |
5377 | mc.from = from; | 5324 | mc.from = from; |
5378 | mc.to = mem; | 5325 | mc.to = memcg; |
5379 | spin_unlock(&mc.lock); | 5326 | spin_unlock(&mc.lock); |
5380 | /* We set mc.moving_task later */ | 5327 | /* We set mc.moving_task later */ |
5381 | 5328 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 740c4f52059c..06d3479513aa 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/sched.h> | 42 | #include <linux/sched.h> |
43 | #include <linux/ksm.h> | 43 | #include <linux/ksm.h> |
44 | #include <linux/rmap.h> | 44 | #include <linux/rmap.h> |
45 | #include <linux/export.h> | ||
45 | #include <linux/pagemap.h> | 46 | #include <linux/pagemap.h> |
46 | #include <linux/swap.h> | 47 | #include <linux/swap.h> |
47 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
@@ -53,6 +54,7 @@ | |||
53 | #include <linux/hugetlb.h> | 54 | #include <linux/hugetlb.h> |
54 | #include <linux/memory_hotplug.h> | 55 | #include <linux/memory_hotplug.h> |
55 | #include <linux/mm_inline.h> | 56 | #include <linux/mm_inline.h> |
57 | #include <linux/kfifo.h> | ||
56 | #include "internal.h" | 58 | #include "internal.h" |
57 | 59 | ||
58 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 60 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -1178,6 +1180,97 @@ void memory_failure(unsigned long pfn, int trapno) | |||
1178 | __memory_failure(pfn, trapno, 0); | 1180 | __memory_failure(pfn, trapno, 0); |
1179 | } | 1181 | } |
1180 | 1182 | ||
1183 | #define MEMORY_FAILURE_FIFO_ORDER 4 | ||
1184 | #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) | ||
1185 | |||
1186 | struct memory_failure_entry { | ||
1187 | unsigned long pfn; | ||
1188 | int trapno; | ||
1189 | int flags; | ||
1190 | }; | ||
1191 | |||
1192 | struct memory_failure_cpu { | ||
1193 | DECLARE_KFIFO(fifo, struct memory_failure_entry, | ||
1194 | MEMORY_FAILURE_FIFO_SIZE); | ||
1195 | spinlock_t lock; | ||
1196 | struct work_struct work; | ||
1197 | }; | ||
1198 | |||
1199 | static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); | ||
1200 | |||
1201 | /** | ||
1202 | * memory_failure_queue - Schedule handling memory failure of a page. | ||
1203 | * @pfn: Page Number of the corrupted page | ||
1204 | * @trapno: Trap number reported in the signal to user space. | ||
1205 | * @flags: Flags for memory failure handling | ||
1206 | * | ||
1207 | * This function is called by the low level hardware error handler | ||
1208 | * when it detects hardware memory corruption of a page. It schedules | ||
1209 | * recovery of the error page, including dropping pages, killing | ||
1210 | * processes, etc. | ||
1211 | * | ||
1212 | * The function is primarily of use for corruptions that | ||
1213 | * happen outside the current execution context (e.g. when | ||
1214 | * detected by a background scrubber) | ||
1215 | * | ||
1216 | * Can run in IRQ context. | ||
1217 | */ | ||
1218 | void memory_failure_queue(unsigned long pfn, int trapno, int flags) | ||
1219 | { | ||
1220 | struct memory_failure_cpu *mf_cpu; | ||
1221 | unsigned long proc_flags; | ||
1222 | struct memory_failure_entry entry = { | ||
1223 | .pfn = pfn, | ||
1224 | .trapno = trapno, | ||
1225 | .flags = flags, | ||
1226 | }; | ||
1227 | |||
1228 | mf_cpu = &get_cpu_var(memory_failure_cpu); | ||
1229 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | ||
1230 | if (kfifo_put(&mf_cpu->fifo, &entry)) | ||
1231 | schedule_work_on(smp_processor_id(), &mf_cpu->work); | ||
1232 | else | ||
1233 | pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n", | ||
1234 | pfn); | ||
1235 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | ||
1236 | put_cpu_var(memory_failure_cpu); | ||
1237 | } | ||
1238 | EXPORT_SYMBOL_GPL(memory_failure_queue); | ||
1239 | |||
1240 | static void memory_failure_work_func(struct work_struct *work) | ||
1241 | { | ||
1242 | struct memory_failure_cpu *mf_cpu; | ||
1243 | struct memory_failure_entry entry = { 0, }; | ||
1244 | unsigned long proc_flags; | ||
1245 | int gotten; | ||
1246 | |||
1247 | mf_cpu = &__get_cpu_var(memory_failure_cpu); | ||
1248 | for (;;) { | ||
1249 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | ||
1250 | gotten = kfifo_get(&mf_cpu->fifo, &entry); | ||
1251 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | ||
1252 | if (!gotten) | ||
1253 | break; | ||
1254 | __memory_failure(entry.pfn, entry.trapno, entry.flags); | ||
1255 | } | ||
1256 | } | ||
1257 | |||
1258 | static int __init memory_failure_init(void) | ||
1259 | { | ||
1260 | struct memory_failure_cpu *mf_cpu; | ||
1261 | int cpu; | ||
1262 | |||
1263 | for_each_possible_cpu(cpu) { | ||
1264 | mf_cpu = &per_cpu(memory_failure_cpu, cpu); | ||
1265 | spin_lock_init(&mf_cpu->lock); | ||
1266 | INIT_KFIFO(mf_cpu->fifo); | ||
1267 | INIT_WORK(&mf_cpu->work, memory_failure_work_func); | ||
1268 | } | ||
1269 | |||
1270 | return 0; | ||
1271 | } | ||
1272 | core_initcall(memory_failure_init); | ||
1273 | |||
1181 | /** | 1274 | /** |
1182 | * unpoison_memory - Unpoison a previously poisoned page | 1275 | * unpoison_memory - Unpoison a previously poisoned page |
1183 | * @pfn: Page number of the to be unpoisoned page | 1276 | * @pfn: Page number of the to be unpoisoned page |
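The memory_failure_queue() helper added in the hunk above gives hardware-error drivers an IRQ-safe way to defer page recovery to process context. A minimal sketch of a caller, assuming a hypothetical background-scrubber driver (only memory_failure_queue() itself comes from this patch):

#include <linux/mm.h>

/* Hypothetical scrubber hook: the surrounding driver code is illustrative only. */
static void example_scrub_report(u64 phys_addr, int trapno)
{
        unsigned long pfn = phys_addr >> PAGE_SHIFT;

        /* Safe from IRQ context: the entry is pushed onto a per-cpu kfifo
         * and __memory_failure() runs later from the scheduled work item. */
        memory_failure_queue(pfn, trapno, 0);
}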
@@ -1218,7 +1311,7 @@ int unpoison_memory(unsigned long pfn) | |||
1218 | * to the end. | 1311 | * to the end. |
1219 | */ | 1312 | */ |
1220 | if (PageHuge(page)) { | 1313 | if (PageHuge(page)) { |
1221 | pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); | 1314 | pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); |
1222 | return 0; | 1315 | return 0; |
1223 | } | 1316 | } |
1224 | if (TestClearPageHWPoison(p)) | 1317 | if (TestClearPageHWPoison(p)) |
@@ -1327,7 +1420,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1327 | 1420 | ||
1328 | if (PageHWPoison(hpage)) { | 1421 | if (PageHWPoison(hpage)) { |
1329 | put_page(hpage); | 1422 | put_page(hpage); |
1330 | pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); | 1423 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); |
1331 | return -EBUSY; | 1424 | return -EBUSY; |
1332 | } | 1425 | } |
1333 | 1426 | ||
@@ -1341,8 +1434,8 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1341 | list_for_each_entry_safe(page1, page2, &pagelist, lru) | 1434 | list_for_each_entry_safe(page1, page2, &pagelist, lru) |
1342 | put_page(page1); | 1435 | put_page(page1); |
1343 | 1436 | ||
1344 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | 1437 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1345 | pfn, ret, page->flags); | 1438 | pfn, ret, page->flags); |
1346 | if (ret > 0) | 1439 | if (ret > 0) |
1347 | ret = -EIO; | 1440 | ret = -EIO; |
1348 | return ret; | 1441 | return ret; |
@@ -1413,7 +1506,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1413 | } | 1506 | } |
1414 | if (!PageLRU(page)) { | 1507 | if (!PageLRU(page)) { |
1415 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | 1508 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", |
1416 | pfn, page->flags); | 1509 | pfn, page->flags); |
1417 | return -EIO; | 1510 | return -EIO; |
1418 | } | 1511 | } |
1419 | 1512 | ||
@@ -1474,7 +1567,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1474 | } | 1567 | } |
1475 | } else { | 1568 | } else { |
1476 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1569 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", |
1477 | pfn, ret, page_count(page), page->flags); | 1570 | pfn, ret, page_count(page), page->flags); |
1478 | } | 1571 | } |
1479 | if (ret) | 1572 | if (ret) |
1480 | return ret; | 1573 | return ret; |
diff --git a/mm/memory.c b/mm/memory.c index 9b8a01d941cb..829d43735402 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -47,7 +47,7 @@ | |||
47 | #include <linux/pagemap.h> | 47 | #include <linux/pagemap.h> |
48 | #include <linux/ksm.h> | 48 | #include <linux/ksm.h> |
49 | #include <linux/rmap.h> | 49 | #include <linux/rmap.h> |
50 | #include <linux/module.h> | 50 | #include <linux/export.h> |
51 | #include <linux/delayacct.h> | 51 | #include <linux/delayacct.h> |
52 | #include <linux/init.h> | 52 | #include <linux/init.h> |
53 | #include <linux/writeback.h> | 53 | #include <linux/writeback.h> |
@@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1290 | return addr; | 1290 | return addr; |
1291 | } | 1291 | } |
1292 | 1292 | ||
1293 | #ifdef CONFIG_PREEMPT | ||
1294 | # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) | ||
1295 | #else | ||
1296 | /* No preempt: go for improved straight-line efficiency */ | ||
1297 | # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) | ||
1298 | #endif | ||
1299 | |||
1300 | /** | 1293 | /** |
1301 | * unmap_vmas - unmap a range of memory covered by a list of vma's | 1294 | * unmap_vmas - unmap a range of memory covered by a list of vma's |
1302 | * @tlb: address of the caller's struct mmu_gather | 1295 | * @tlb: address of the caller's struct mmu_gather |
@@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1310 | * | 1303 | * |
1311 | * Unmap all pages in the vma list. | 1304 | * Unmap all pages in the vma list. |
1312 | * | 1305 | * |
1313 | * We aim to not hold locks for too long (for scheduling latency reasons). | ||
1314 | * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to | ||
1315 | * return the ending mmu_gather to the caller. | ||
1316 | * | ||
1317 | * Only addresses between `start' and `end' will be unmapped. | 1306 | * Only addresses between `start' and `end' will be unmapped. |
1318 | * | 1307 | * |
1319 | * The VMA list must be sorted in ascending virtual address order. | 1308 | * The VMA list must be sorted in ascending virtual address order. |
@@ -1514,7 +1503,7 @@ split_fallthrough: | |||
1514 | } | 1503 | } |
1515 | 1504 | ||
1516 | if (flags & FOLL_GET) | 1505 | if (flags & FOLL_GET) |
1517 | get_page(page); | 1506 | get_page_foll(page); |
1518 | if (flags & FOLL_TOUCH) { | 1507 | if (flags & FOLL_TOUCH) { |
1519 | if ((flags & FOLL_WRITE) && | 1508 | if ((flags & FOLL_WRITE) && |
1520 | !pte_dirty(pte) && !PageDirty(page)) | 1509 | !pte_dirty(pte) && !PageDirty(page)) |
@@ -1816,7 +1805,63 @@ next_page: | |||
1816 | } | 1805 | } |
1817 | EXPORT_SYMBOL(__get_user_pages); | 1806 | EXPORT_SYMBOL(__get_user_pages); |
1818 | 1807 | ||
1819 | /** | 1808 | /* |
1809 | * fixup_user_fault() - manually resolve a user page fault | ||
1810 | * @tsk: the task_struct to use for page fault accounting, or | ||
1811 | * NULL if faults are not to be recorded. | ||
1812 | * @mm: mm_struct of target mm | ||
1813 | * @address: user address | ||
1814 | * @fault_flags: flags to pass down to handle_mm_fault() | ||
1815 | * | ||
1816 | * This is meant to be called in the specific scenario where, for locking reasons, | ||
1817 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
1818 | * section); the access returns -EFAULT, and we want to resolve the user fault before | ||
1819 | * trying again. | ||
1820 | * | ||
1821 | * Typically this is meant to be used by the futex code. | ||
1822 | * | ||
1823 | * The main difference with get_user_pages() is that this function will | ||
1824 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
1825 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
1826 | * get_user_pages() only guarantees to update these in the struct page. | ||
1827 | * | ||
1828 | * This is important for some architectures where those bits also gate the | ||
1829 | * access permission to the page because they are maintained in software. On | ||
1830 | * such architectures, gup() will not be enough to make a subsequent access | ||
1831 | * succeed. | ||
1832 | * | ||
1833 | * This should be called with the mmap_sem held for read. | ||
1834 | */ | ||
1835 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
1836 | unsigned long address, unsigned int fault_flags) | ||
1837 | { | ||
1838 | struct vm_area_struct *vma; | ||
1839 | int ret; | ||
1840 | |||
1841 | vma = find_extend_vma(mm, address); | ||
1842 | if (!vma || address < vma->vm_start) | ||
1843 | return -EFAULT; | ||
1844 | |||
1845 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
1846 | if (ret & VM_FAULT_ERROR) { | ||
1847 | if (ret & VM_FAULT_OOM) | ||
1848 | return -ENOMEM; | ||
1849 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
1850 | return -EHWPOISON; | ||
1851 | if (ret & VM_FAULT_SIGBUS) | ||
1852 | return -EFAULT; | ||
1853 | BUG(); | ||
1854 | } | ||
1855 | if (tsk) { | ||
1856 | if (ret & VM_FAULT_MAJOR) | ||
1857 | tsk->maj_flt++; | ||
1858 | else | ||
1859 | tsk->min_flt++; | ||
1860 | } | ||
1861 | return 0; | ||
1862 | } | ||
1863 | |||
1864 | /* | ||
1820 | * get_user_pages() - pin user pages in memory | 1865 | * get_user_pages() - pin user pages in memory |
1821 | * @tsk: the task_struct to use for page fault accounting, or | 1866 | * @tsk: the task_struct to use for page fault accounting, or |
1822 | * NULL if faults are not to be recorded. | 1867 | * NULL if faults are not to be recorded. |
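A rough usage sketch for the fixup_user_fault() helper introduced above, modelled loosely on the futex fault path; the wrapper function and its name are assumptions, not kernel code:

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

static int example_read_user_word(u32 __user *uaddr, u32 *val)
{
        struct mm_struct *mm = current->mm;
        int ret;

        for (;;) {
                pagefault_disable();
                ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
                pagefault_enable();
                if (!ret)
                        return 0;

                /* Resolve the fault ourselves (including the software
                 * dirty/young fixup in the PTE), then retry the atomic
                 * access.  fault_flags is 0 here because this is a read. */
                down_read(&mm->mmap_sem);
                ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 0);
                up_read(&mm->mmap_sem);
                if (ret)
                        return ret;
        }
}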
@@ -3104,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3104 | pte_t *page_table; | 3149 | pte_t *page_table; |
3105 | spinlock_t *ptl; | 3150 | spinlock_t *ptl; |
3106 | struct page *page; | 3151 | struct page *page; |
3152 | struct page *cow_page; | ||
3107 | pte_t entry; | 3153 | pte_t entry; |
3108 | int anon = 0; | 3154 | int anon = 0; |
3109 | int charged = 0; | ||
3110 | struct page *dirty_page = NULL; | 3155 | struct page *dirty_page = NULL; |
3111 | struct vm_fault vmf; | 3156 | struct vm_fault vmf; |
3112 | int ret; | 3157 | int ret; |
3113 | int page_mkwrite = 0; | 3158 | int page_mkwrite = 0; |
3114 | 3159 | ||
3160 | /* | ||
3161 | * If we do COW later, allocate the page before taking lock_page() | ||
3162 | * on the file cache page. This will reduce lock holding time. | ||
3163 | */ | ||
3164 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { | ||
3165 | |||
3166 | if (unlikely(anon_vma_prepare(vma))) | ||
3167 | return VM_FAULT_OOM; | ||
3168 | |||
3169 | cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
3170 | if (!cow_page) | ||
3171 | return VM_FAULT_OOM; | ||
3172 | |||
3173 | if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { | ||
3174 | page_cache_release(cow_page); | ||
3175 | return VM_FAULT_OOM; | ||
3176 | } | ||
3177 | } else | ||
3178 | cow_page = NULL; | ||
3179 | |||
3115 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 3180 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); |
3116 | vmf.pgoff = pgoff; | 3181 | vmf.pgoff = pgoff; |
3117 | vmf.flags = flags; | 3182 | vmf.flags = flags; |
@@ -3120,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3120 | ret = vma->vm_ops->fault(vma, &vmf); | 3185 | ret = vma->vm_ops->fault(vma, &vmf); |
3121 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | | 3186 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | |
3122 | VM_FAULT_RETRY))) | 3187 | VM_FAULT_RETRY))) |
3123 | return ret; | 3188 | goto uncharge_out; |
3124 | 3189 | ||
3125 | if (unlikely(PageHWPoison(vmf.page))) { | 3190 | if (unlikely(PageHWPoison(vmf.page))) { |
3126 | if (ret & VM_FAULT_LOCKED) | 3191 | if (ret & VM_FAULT_LOCKED) |
3127 | unlock_page(vmf.page); | 3192 | unlock_page(vmf.page); |
3128 | return VM_FAULT_HWPOISON; | 3193 | ret = VM_FAULT_HWPOISON; |
3194 | goto uncharge_out; | ||
3129 | } | 3195 | } |
3130 | 3196 | ||
3131 | /* | 3197 | /* |
@@ -3143,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3143 | page = vmf.page; | 3209 | page = vmf.page; |
3144 | if (flags & FAULT_FLAG_WRITE) { | 3210 | if (flags & FAULT_FLAG_WRITE) { |
3145 | if (!(vma->vm_flags & VM_SHARED)) { | 3211 | if (!(vma->vm_flags & VM_SHARED)) { |
3212 | page = cow_page; | ||
3146 | anon = 1; | 3213 | anon = 1; |
3147 | if (unlikely(anon_vma_prepare(vma))) { | ||
3148 | ret = VM_FAULT_OOM; | ||
3149 | goto out; | ||
3150 | } | ||
3151 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, | ||
3152 | vma, address); | ||
3153 | if (!page) { | ||
3154 | ret = VM_FAULT_OOM; | ||
3155 | goto out; | ||
3156 | } | ||
3157 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { | ||
3158 | ret = VM_FAULT_OOM; | ||
3159 | page_cache_release(page); | ||
3160 | goto out; | ||
3161 | } | ||
3162 | charged = 1; | ||
3163 | copy_user_highpage(page, vmf.page, address, vma); | 3214 | copy_user_highpage(page, vmf.page, address, vma); |
3164 | __SetPageUptodate(page); | 3215 | __SetPageUptodate(page); |
3165 | } else { | 3216 | } else { |
@@ -3228,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3228 | /* no need to invalidate: a not-present page won't be cached */ | 3279 | /* no need to invalidate: a not-present page won't be cached */ |
3229 | update_mmu_cache(vma, address, page_table); | 3280 | update_mmu_cache(vma, address, page_table); |
3230 | } else { | 3281 | } else { |
3231 | if (charged) | 3282 | if (cow_page) |
3232 | mem_cgroup_uncharge_page(page); | 3283 | mem_cgroup_uncharge_page(cow_page); |
3233 | if (anon) | 3284 | if (anon) |
3234 | page_cache_release(page); | 3285 | page_cache_release(page); |
3235 | else | 3286 | else |
@@ -3238,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3238 | 3289 | ||
3239 | pte_unmap_unlock(page_table, ptl); | 3290 | pte_unmap_unlock(page_table, ptl); |
3240 | 3291 | ||
3241 | out: | ||
3242 | if (dirty_page) { | 3292 | if (dirty_page) { |
3243 | struct address_space *mapping = page->mapping; | 3293 | struct address_space *mapping = page->mapping; |
3244 | 3294 | ||
@@ -3268,6 +3318,13 @@ out: | |||
3268 | unwritable_page: | 3318 | unwritable_page: |
3269 | page_cache_release(page); | 3319 | page_cache_release(page); |
3270 | return ret; | 3320 | return ret; |
3321 | uncharge_out: | ||
3322 | /* the fs's fault handler returned an error */ | ||
3323 | if (cow_page) { | ||
3324 | mem_cgroup_uncharge_page(cow_page); | ||
3325 | page_cache_release(cow_page); | ||
3326 | } | ||
3327 | return ret; | ||
3271 | } | 3328 | } |
3272 | 3329 | ||
3273 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3330 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c46887b5a11e..2168489c0bc9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -11,7 +11,7 @@ | |||
11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
12 | #include <linux/bootmem.h> | 12 | #include <linux/bootmem.h> |
13 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> |
16 | #include <linux/writeback.h> | 16 | #include <linux/writeback.h> |
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
@@ -34,6 +34,17 @@ | |||
34 | 34 | ||
35 | #include "internal.h" | 35 | #include "internal.h" |
36 | 36 | ||
37 | /* | ||
38 | * online_page_callback contains a pointer to the current page-onlining function. | ||
39 | * Initially it is generic_online_page(). If required, it can be changed | ||
40 | * by calling set_online_page_callback() to register a callback and | ||
41 | * restore_online_page_callback() to restore the generic callback. | ||
42 | */ | ||
43 | |||
44 | static void generic_online_page(struct page *page); | ||
45 | |||
46 | static online_page_callback_t online_page_callback = generic_online_page; | ||
47 | |||
37 | DEFINE_MUTEX(mem_hotplug_mutex); | 48 | DEFINE_MUTEX(mem_hotplug_mutex); |
38 | 49 | ||
39 | void lock_memory_hotplug(void) | 50 | void lock_memory_hotplug(void) |
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
361 | } | 372 | } |
362 | EXPORT_SYMBOL_GPL(__remove_pages); | 373 | EXPORT_SYMBOL_GPL(__remove_pages); |
363 | 374 | ||
364 | void online_page(struct page *page) | 375 | int set_online_page_callback(online_page_callback_t callback) |
376 | { | ||
377 | int rc = -EINVAL; | ||
378 | |||
379 | lock_memory_hotplug(); | ||
380 | |||
381 | if (online_page_callback == generic_online_page) { | ||
382 | online_page_callback = callback; | ||
383 | rc = 0; | ||
384 | } | ||
385 | |||
386 | unlock_memory_hotplug(); | ||
387 | |||
388 | return rc; | ||
389 | } | ||
390 | EXPORT_SYMBOL_GPL(set_online_page_callback); | ||
391 | |||
392 | int restore_online_page_callback(online_page_callback_t callback) | ||
393 | { | ||
394 | int rc = -EINVAL; | ||
395 | |||
396 | lock_memory_hotplug(); | ||
397 | |||
398 | if (online_page_callback == callback) { | ||
399 | online_page_callback = generic_online_page; | ||
400 | rc = 0; | ||
401 | } | ||
402 | |||
403 | unlock_memory_hotplug(); | ||
404 | |||
405 | return rc; | ||
406 | } | ||
407 | EXPORT_SYMBOL_GPL(restore_online_page_callback); | ||
408 | |||
409 | void __online_page_set_limits(struct page *page) | ||
365 | { | 410 | { |
366 | unsigned long pfn = page_to_pfn(page); | 411 | unsigned long pfn = page_to_pfn(page); |
367 | 412 | ||
368 | totalram_pages++; | ||
369 | if (pfn >= num_physpages) | 413 | if (pfn >= num_physpages) |
370 | num_physpages = pfn + 1; | 414 | num_physpages = pfn + 1; |
415 | } | ||
416 | EXPORT_SYMBOL_GPL(__online_page_set_limits); | ||
417 | |||
418 | void __online_page_increment_counters(struct page *page) | ||
419 | { | ||
420 | totalram_pages++; | ||
371 | 421 | ||
372 | #ifdef CONFIG_HIGHMEM | 422 | #ifdef CONFIG_HIGHMEM |
373 | if (PageHighMem(page)) | 423 | if (PageHighMem(page)) |
374 | totalhigh_pages++; | 424 | totalhigh_pages++; |
375 | #endif | 425 | #endif |
426 | } | ||
427 | EXPORT_SYMBOL_GPL(__online_page_increment_counters); | ||
376 | 428 | ||
429 | void __online_page_free(struct page *page) | ||
430 | { | ||
377 | ClearPageReserved(page); | 431 | ClearPageReserved(page); |
378 | init_page_count(page); | 432 | init_page_count(page); |
379 | __free_page(page); | 433 | __free_page(page); |
380 | } | 434 | } |
435 | EXPORT_SYMBOL_GPL(__online_page_free); | ||
436 | |||
437 | static void generic_online_page(struct page *page) | ||
438 | { | ||
439 | __online_page_set_limits(page); | ||
440 | __online_page_increment_counters(page); | ||
441 | __online_page_free(page); | ||
442 | } | ||
381 | 443 | ||
382 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | 444 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
383 | void *arg) | 445 | void *arg) |
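The callback split above lets a driver intercept page onlining. A sketch of how a hypothetical balloon-style driver might use it (the driver names are made up; the helpers are the ones exported in this hunk, assumed to be declared via linux/memory_hotplug.h):

#include <linux/init.h>
#include <linux/memory_hotplug.h>

static void example_balloon_online_page(struct page *page)
{
        __online_page_set_limits(page);
        /* A real driver could keep the page in its own pool here instead of
         * releasing it; this sketch just mirrors generic_online_page(). */
        __online_page_increment_counters(page);
        __online_page_free(page);
}

static int __init example_balloon_init(void)
{
        return set_online_page_callback(&example_balloon_online_page);
}

static void __exit example_balloon_exit(void)
{
        restore_online_page_callback(&example_balloon_online_page);
}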
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
388 | if (PageReserved(pfn_to_page(start_pfn))) | 450 | if (PageReserved(pfn_to_page(start_pfn))) |
389 | for (i = 0; i < nr_pages; i++) { | 451 | for (i = 0; i < nr_pages; i++) { |
390 | page = pfn_to_page(start_pfn + i); | 452 | page = pfn_to_page(start_pfn + i); |
391 | online_page(page); | 453 | (*online_page_callback)(page); |
392 | onlined_pages++; | 454 | onlined_pages++; |
393 | } | 455 | } |
394 | *(unsigned long *)arg = onlined_pages; | 456 | *(unsigned long *)arg = onlined_pages; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e7fb9d25c54e..adc395481813 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -75,7 +75,7 @@ | |||
75 | #include <linux/cpuset.h> | 75 | #include <linux/cpuset.h> |
76 | #include <linux/slab.h> | 76 | #include <linux/slab.h> |
77 | #include <linux/string.h> | 77 | #include <linux/string.h> |
78 | #include <linux/module.h> | 78 | #include <linux/export.h> |
79 | #include <linux/nsproxy.h> | 79 | #include <linux/nsproxy.h> |
80 | #include <linux/interrupt.h> | 80 | #include <linux/interrupt.h> |
81 | #include <linux/init.h> | 81 | #include <linux/init.h> |
@@ -93,6 +93,7 @@ | |||
93 | 93 | ||
94 | #include <asm/tlbflush.h> | 94 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 95 | #include <asm/uaccess.h> |
96 | #include <linux/random.h> | ||
96 | 97 | ||
97 | #include "internal.h" | 98 | #include "internal.h" |
98 | 99 | ||
@@ -110,7 +111,7 @@ enum zone_type policy_zone = 0; | |||
110 | /* | 111 | /* |
111 | * run-time system-wide default policy => local allocation | 112 | * run-time system-wide default policy => local allocation |
112 | */ | 113 | */ |
113 | struct mempolicy default_policy = { | 114 | static struct mempolicy default_policy = { |
114 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 115 | .refcnt = ATOMIC_INIT(1), /* never free it */ |
115 | .mode = MPOL_PREFERRED, | 116 | .mode = MPOL_PREFERRED, |
116 | .flags = MPOL_F_LOCAL, | 117 | .flags = MPOL_F_LOCAL, |
@@ -635,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
635 | struct vm_area_struct *prev; | 636 | struct vm_area_struct *prev; |
636 | struct vm_area_struct *vma; | 637 | struct vm_area_struct *vma; |
637 | int err = 0; | 638 | int err = 0; |
638 | pgoff_t pgoff; | ||
639 | unsigned long vmstart; | 639 | unsigned long vmstart; |
640 | unsigned long vmend; | 640 | unsigned long vmend; |
641 | 641 | ||
@@ -648,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
648 | vmstart = max(start, vma->vm_start); | 648 | vmstart = max(start, vma->vm_start); |
649 | vmend = min(end, vma->vm_end); | 649 | vmend = min(end, vma->vm_end); |
650 | 650 | ||
651 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | ||
652 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, | 651 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
653 | vma->anon_vma, vma->vm_file, pgoff, new_pol); | 652 | vma->anon_vma, vma->vm_file, vma->vm_pgoff, |
653 | new_pol); | ||
654 | if (prev) { | 654 | if (prev) { |
655 | vma = prev; | 655 | vma = prev; |
656 | next = vma->vm_next; | 656 | next = vma->vm_next; |
@@ -1411,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, | |||
1411 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); | 1411 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); |
1412 | 1412 | ||
1413 | if (!err && nmask) { | 1413 | if (!err && nmask) { |
1414 | err = copy_from_user(bm, nm, alloc_size); | 1414 | unsigned long copy_size; |
1415 | copy_size = min_t(unsigned long, sizeof(bm), alloc_size); | ||
1416 | err = copy_from_user(bm, nm, copy_size); | ||
1415 | /* ensure entire bitmap is zeroed */ | 1417 | /* ensure entire bitmap is zeroed */ |
1416 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); | 1418 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); |
1417 | err |= compat_put_bitmap(nmask, bm, nr_bits); | 1419 | err |= compat_put_bitmap(nmask, bm, nr_bits); |
@@ -1645,6 +1647,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
1645 | return interleave_nodes(pol); | 1647 | return interleave_nodes(pol); |
1646 | } | 1648 | } |
1647 | 1649 | ||
1650 | /* | ||
1651 | * Return the bit number of a random bit set in the nodemask. | ||
1652 | * (returns -1 if nodemask is empty) | ||
1653 | */ | ||
1654 | int node_random(const nodemask_t *maskp) | ||
1655 | { | ||
1656 | int w, bit = -1; | ||
1657 | |||
1658 | w = nodes_weight(*maskp); | ||
1659 | if (w) | ||
1660 | bit = bitmap_ord_to_pos(maskp->bits, | ||
1661 | get_random_int() % w, MAX_NUMNODES); | ||
1662 | return bit; | ||
1663 | } | ||
1664 | |||
1648 | #ifdef CONFIG_HUGETLBFS | 1665 | #ifdef CONFIG_HUGETLBFS |
1649 | /* | 1666 | /* |
1650 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) | 1667 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) |
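A small usage sketch for the node_random() helper added above; the wrapper is hypothetical, and node_random() is assumed to be declared via linux/nodemask.h:

#include <linux/nodemask.h>
#include <linux/topology.h>

static int example_pick_node(const nodemask_t *allowed)
{
        int nid = node_random(allowed);         /* -1 when the mask is empty */

        return nid >= 0 ? nid : numa_node_id(); /* fall back to the local node */
}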
diff --git a/mm/mempool.c b/mm/mempool.c index 1a3bc3d4d554..e73641b79bb5 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -10,7 +10,7 @@ | |||
10 | 10 | ||
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/module.h> | 13 | #include <linux/export.h> |
14 | #include <linux/mempool.h> | 14 | #include <linux/mempool.h> |
15 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
16 | #include <linux/writeback.h> | 16 | #include <linux/writeback.h> |
diff --git a/mm/migrate.c b/mm/migrate.c index 666e4e677414..578e29174fa6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -13,7 +13,7 @@ | |||
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/migrate.h> | 15 | #include <linux/migrate.h> |
16 | #include <linux/module.h> | 16 | #include <linux/export.h> |
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/swapops.h> | 18 | #include <linux/swapops.h> |
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
120 | 120 | ||
121 | ptep = pte_offset_map(pmd, addr); | 121 | ptep = pte_offset_map(pmd, addr); |
122 | 122 | ||
123 | if (!is_swap_pte(*ptep)) { | 123 | /* |
124 | pte_unmap(ptep); | 124 | * Peek to check is_swap_pte() before taking ptlock? No, we |
125 | goto out; | 125 | * can race mremap's move_ptes(), which skips anon_vma lock. |
126 | } | 126 | */ |
127 | 127 | ||
128 | ptl = pte_lockptr(mm, pmd); | 128 | ptl = pte_lockptr(mm, pmd); |
129 | } | 129 | } |
@@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
621 | return rc; | 621 | return rc; |
622 | } | 622 | } |
623 | 623 | ||
624 | /* | 624 | static int __unmap_and_move(struct page *page, struct page *newpage, |
625 | * Obtain the lock on page, remove all ptes and migrate the page | 625 | int force, bool offlining, bool sync) |
626 | * to the newly allocated page in newpage. | ||
627 | */ | ||
628 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
629 | struct page *page, int force, bool offlining, bool sync) | ||
630 | { | 626 | { |
631 | int rc = 0; | 627 | int rc = -EAGAIN; |
632 | int *result = NULL; | ||
633 | struct page *newpage = get_new_page(page, private, &result); | ||
634 | int remap_swapcache = 1; | 628 | int remap_swapcache = 1; |
635 | int charge = 0; | 629 | int charge = 0; |
636 | struct mem_cgroup *mem; | 630 | struct mem_cgroup *mem; |
637 | struct anon_vma *anon_vma = NULL; | 631 | struct anon_vma *anon_vma = NULL; |
638 | 632 | ||
639 | if (!newpage) | ||
640 | return -ENOMEM; | ||
641 | |||
642 | if (page_count(page) == 1) { | ||
643 | /* page was freed from under us. So we are done. */ | ||
644 | goto move_newpage; | ||
645 | } | ||
646 | if (unlikely(PageTransHuge(page))) | ||
647 | if (unlikely(split_huge_page(page))) | ||
648 | goto move_newpage; | ||
649 | |||
650 | /* prepare cgroup just returns 0 or -ENOMEM */ | ||
651 | rc = -EAGAIN; | ||
652 | |||
653 | if (!trylock_page(page)) { | 633 | if (!trylock_page(page)) { |
654 | if (!force || !sync) | 634 | if (!force || !sync) |
655 | goto move_newpage; | 635 | goto out; |
656 | 636 | ||
657 | /* | 637 | /* |
658 | * It's not safe for direct compaction to call lock_page. | 638 | * It's not safe for direct compaction to call lock_page. |
@@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
668 | * altogether. | 648 | * altogether. |
669 | */ | 649 | */ |
670 | if (current->flags & PF_MEMALLOC) | 650 | if (current->flags & PF_MEMALLOC) |
671 | goto move_newpage; | 651 | goto out; |
672 | 652 | ||
673 | lock_page(page); | 653 | lock_page(page); |
674 | } | 654 | } |
@@ -785,27 +765,52 @@ uncharge: | |||
785 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | 765 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
786 | unlock: | 766 | unlock: |
787 | unlock_page(page); | 767 | unlock_page(page); |
768 | out: | ||
769 | return rc; | ||
770 | } | ||
788 | 771 | ||
789 | move_newpage: | 772 | /* |
773 | * Obtain the lock on page, remove all ptes and migrate the page | ||
774 | * to the newly allocated page in newpage. | ||
775 | */ | ||
776 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
777 | struct page *page, int force, bool offlining, bool sync) | ||
778 | { | ||
779 | int rc = 0; | ||
780 | int *result = NULL; | ||
781 | struct page *newpage = get_new_page(page, private, &result); | ||
782 | |||
783 | if (!newpage) | ||
784 | return -ENOMEM; | ||
785 | |||
786 | if (page_count(page) == 1) { | ||
787 | /* page was freed from under us. So we are done. */ | ||
788 | goto out; | ||
789 | } | ||
790 | |||
791 | if (unlikely(PageTransHuge(page))) | ||
792 | if (unlikely(split_huge_page(page))) | ||
793 | goto out; | ||
794 | |||
795 | rc = __unmap_and_move(page, newpage, force, offlining, sync); | ||
796 | out: | ||
790 | if (rc != -EAGAIN) { | 797 | if (rc != -EAGAIN) { |
791 | /* | 798 | /* |
792 | * A page that has been migrated has all references | 799 | * A page that has been migrated has all references |
793 | * removed and will be freed. A page that has not been | 800 | * removed and will be freed. A page that has not been |
794 | * migrated will have kept its references and be | 801 | 
795 | * restored. | 802 | * restored. |
796 | */ | 803 | */ |
797 | list_del(&page->lru); | 804 | list_del(&page->lru); |
798 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 805 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
799 | page_is_file_cache(page)); | 806 | page_is_file_cache(page)); |
800 | putback_lru_page(page); | 807 | putback_lru_page(page); |
801 | } | 808 | } |
802 | |||
803 | /* | 809 | /* |
804 | * Move the new page to the LRU. If migration was not successful | 810 | * Move the new page to the LRU. If migration was not successful |
805 | * then this will free the page. | 811 | * then this will free the page. |
806 | */ | 812 | */ |
807 | putback_lru_page(newpage); | 813 | putback_lru_page(newpage); |
808 | |||
809 | if (result) { | 814 | if (result) { |
810 | if (rc) | 815 | if (rc) |
811 | *result = rc; | 816 | *result = rc; |
diff --git a/mm/mincore.c b/mm/mincore.c index a4e6b9d75c76..636a86876ff2 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
69 | * file will not get a swp_entry_t in its pte, but rather it is like | 69 | * file will not get a swp_entry_t in its pte, but rather it is like |
70 | * any other file mapping (ie. marked !present and faulted in with | 70 | * any other file mapping (ie. marked !present and faulted in with |
71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. | 71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. |
72 | * | ||
73 | * However when tmpfs moves the page from pagecache and into swapcache, | ||
74 | * it is still in core, but the find_get_page below won't find it. | ||
75 | * No big deal, but make a note of it. | ||
76 | */ | 72 | */ |
77 | page = find_get_page(mapping, pgoff); | 73 | page = find_get_page(mapping, pgoff); |
74 | #ifdef CONFIG_SWAP | ||
75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ | ||
76 | if (radix_tree_exceptional_entry(page)) { | ||
77 | swp_entry_t swap = radix_to_swp_entry(page); | ||
78 | page = find_get_page(&swapper_space, swap.val); | ||
79 | } | ||
80 | #endif | ||
78 | if (page) { | 81 | if (page) { |
79 | present = PageUptodate(page); | 82 | present = PageUptodate(page); |
80 | page_cache_release(page); | 83 | page_cache_release(page); |
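Both this mincore hunk and the memcontrol.c hunk earlier follow the same lookup pattern for shmem/tmpfs pages that have been pushed to swap. A consolidated sketch with a hypothetical wrapper name (the called functions are the kernel APIs used in the hunks):

#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

static struct page *example_find_shmem_page(struct address_space *mapping,
                                            pgoff_t index)
{
        struct page *page = find_get_page(mapping, index);

#ifdef CONFIG_SWAP
        /* Once shmem swaps a page out it leaves a swap entry in the radix
         * tree; decode it and look the page up in the swap cache instead. */
        if (radix_tree_exceptional_entry(page)) {
                swp_entry_t swap = radix_to_swp_entry(page);

                page = find_get_page(&swapper_space, swap.val);
        }
#endif
        return page;
}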
diff --git a/mm/mlock.c b/mm/mlock.c index 048260c4e02e..4f4f53bdc65d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -14,7 +14,7 @@ | |||
14 | #include <linux/mempolicy.h> | 14 | #include <linux/mempolicy.h> |
15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/module.h> | 17 | #include <linux/export.h> |
18 | #include <linux/rmap.h> | 18 | #include <linux/rmap.h> |
19 | #include <linux/mmzone.h> | 19 | #include <linux/mmzone.h> |
20 | #include <linux/hugetlb.h> | 20 | #include <linux/hugetlb.h> |
@@ -110,7 +110,15 @@ void munlock_vma_page(struct page *page) | |||
110 | if (TestClearPageMlocked(page)) { | 110 | if (TestClearPageMlocked(page)) { |
111 | dec_zone_page_state(page, NR_MLOCK); | 111 | dec_zone_page_state(page, NR_MLOCK); |
112 | if (!isolate_lru_page(page)) { | 112 | if (!isolate_lru_page(page)) { |
113 | int ret = try_to_munlock(page); | 113 | int ret = SWAP_AGAIN; |
114 | |||
115 | /* | ||
116 | * Optimization: if the page was mapped just once, | ||
117 | * that's our mapping and we don't need to check all the | ||
118 | * other vmas. | ||
119 | */ | ||
120 | if (page_mapcount(page) > 1) | ||
121 | ret = try_to_munlock(page); | ||
114 | /* | 122 | /* |
115 | * did try_to_munlock() succeed or punt? | 123 | 
116 | */ | 124 | */ |
@@ -549,7 +557,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
549 | if (!can_do_mlock()) | 557 | if (!can_do_mlock()) |
550 | goto out; | 558 | goto out; |
551 | 559 | ||
552 | lru_add_drain_all(); /* flush pagevec */ | 560 | if (flags & MCL_CURRENT) |
561 | lru_add_drain_all(); /* flush pagevec */ | ||
553 | 562 | ||
554 | down_write(&current->mm->mmap_sem); | 563 | 
555 | 564 | ||
diff --git a/mm/mm_init.c b/mm/mm_init.c index 4e0e26591dfa..1ffd97ae26d7 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -8,7 +8,7 @@ | |||
8 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
9 | #include <linux/init.h> | 9 | #include <linux/init.h> |
10 | #include <linux/kobject.h> | 10 | #include <linux/kobject.h> |
11 | #include <linux/module.h> | 11 | #include <linux/export.h> |
12 | #include "internal.h" | 12 | #include "internal.h" |
13 | 13 | ||
14 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 14 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/hugetlb.h> | 23 | #include <linux/hugetlb.h> |
24 | #include <linux/profile.h> | 24 | #include <linux/profile.h> |
25 | #include <linux/module.h> | 25 | #include <linux/export.h> |
26 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
27 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
28 | #include <linux/rmap.h> | 28 | #include <linux/rmap.h> |
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
122 | return 0; | 122 | return 0; |
123 | 123 | ||
124 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 124 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
125 | unsigned long n; | 125 | free = global_page_state(NR_FREE_PAGES); |
126 | free += global_page_state(NR_FILE_PAGES); | ||
127 | |||
128 | /* | ||
129 | * shmem pages shouldn't be counted as free in this | ||
130 | * case: they can't be purged, only swapped out, and | ||
131 | * that won't affect the overall amount of available | ||
132 | * memory in the system. | ||
133 | */ | ||
134 | free -= global_page_state(NR_SHMEM); | ||
126 | 135 | ||
127 | free = global_page_state(NR_FILE_PAGES); | ||
128 | free += nr_swap_pages; | 136 | free += nr_swap_pages; |
129 | 137 | ||
130 | /* | 138 | /* |
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
136 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 144 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
137 | 145 | ||
138 | /* | 146 | /* |
139 | * Leave the last 3% for root | ||
140 | */ | ||
141 | if (!cap_sys_admin) | ||
142 | free -= free / 32; | ||
143 | |||
144 | if (free > pages) | ||
145 | return 0; | ||
146 | |||
147 | /* | ||
148 | * nr_free_pages() is very expensive on large systems, | ||
149 | * only call if we're about to fail. | ||
150 | */ | ||
151 | n = nr_free_pages(); | ||
152 | |||
153 | /* | ||
154 | * Leave reserved pages. The pages are not for anonymous pages. | 147 | * Leave reserved pages. The pages are not for anonymous pages. |
155 | */ | 148 | */ |
156 | if (n <= totalreserve_pages) | 149 | if (free <= totalreserve_pages) |
157 | goto error; | 150 | goto error; |
158 | else | 151 | else |
159 | n -= totalreserve_pages; | 152 | free -= totalreserve_pages; |
160 | 153 | ||
161 | /* | 154 | /* |
162 | * Leave the last 3% for root | 155 | * Leave the last 3% for root |
163 | */ | 156 | */ |
164 | if (!cap_sys_admin) | 157 | if (!cap_sys_admin) |
165 | n -= n / 32; | 158 | free -= free / 32; |
166 | free += n; | ||
167 | 159 | ||
168 | if (free > pages) | 160 | if (free > pages) |
169 | return 0; | 161 | return 0; |
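Read together, the two __vm_enough_memory() hunks above make the OVERCOMMIT_GUESS path build its estimate in a single pass over "free". A condensed restatement, not the kernel function itself, assuming the usual declarations from linux/swap.h and linux/vmstat.h:

#include <linux/swap.h>
#include <linux/vmstat.h>

static bool example_guess_enough_memory(long pages, int cap_sys_admin)
{
        unsigned long free;

        free  = global_page_state(NR_FREE_PAGES);
        free += global_page_state(NR_FILE_PAGES);
        free -= global_page_state(NR_SHMEM);            /* shmem can only swap */
        free += nr_swap_pages;
        free += global_page_state(NR_SLAB_RECLAIMABLE);

        if (free <= totalreserve_pages)                 /* keep the reserves */
                return false;
        free -= totalreserve_pages;

        if (!cap_sys_admin)                             /* last 3% for root */
                free -= free / 32;

        return free > pages;
}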
@@ -2566,7 +2558,6 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
2566 | { | 2558 | { |
2567 | struct vm_area_struct *vma; | 2559 | struct vm_area_struct *vma; |
2568 | struct anon_vma_chain *avc; | 2560 | struct anon_vma_chain *avc; |
2569 | int ret = -EINTR; | ||
2570 | 2561 | ||
2571 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2562 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
2572 | 2563 | ||
@@ -2587,13 +2578,11 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
2587 | vm_lock_anon_vma(mm, avc->anon_vma); | 2578 | vm_lock_anon_vma(mm, avc->anon_vma); |
2588 | } | 2579 | } |
2589 | 2580 | ||
2590 | ret = 0; | 2581 | return 0; |
2591 | 2582 | ||
2592 | out_unlock: | 2583 | out_unlock: |
2593 | if (ret) | 2584 | mm_drop_all_locks(mm); |
2594 | mm_drop_all_locks(mm); | 2585 | return -EINTR; |
2595 | |||
2596 | return ret; | ||
2597 | } | 2586 | } |
2598 | 2587 | ||
2599 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | 2588 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) |
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 9e82e937000e..cf332bc0080a 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
@@ -5,7 +5,7 @@ | |||
5 | 5 | ||
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/mmu_context.h> | 7 | #include <linux/mmu_context.h> |
8 | #include <linux/module.h> | 8 | #include <linux/export.h> |
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | 10 | ||
11 | #include <asm/mmu_context.h> | 11 | #include <asm/mmu_context.h> |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8d032de4088e..9a611d3a1848 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -11,7 +11,7 @@ | |||
11 | 11 | ||
12 | #include <linux/rculist.h> | 12 | #include <linux/rculist.h> |
13 | #include <linux/mmu_notifier.h> | 13 | #include <linux/mmu_notifier.h> |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
17 | #include <linux/rcupdate.h> | 17 | #include <linux/rcupdate.h> |
diff --git a/mm/mmzone.c b/mm/mmzone.c index f5b7d1760213..7cf7b7ddc7c5 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/stddef.h> | 8 | #include <linux/stddef.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
11 | #include <linux/module.h> | ||
12 | 11 | ||
13 | struct pglist_data *first_online_pgdat(void) | 12 | struct pglist_data *first_online_pgdat(void) |
14 | { | 13 | { |
diff --git a/mm/mremap.c b/mm/mremap.c index 506fa44403df..d6959cb4df58 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) | |||
41 | return NULL; | 41 | return NULL; |
42 | 42 | ||
43 | pmd = pmd_offset(pud, addr); | 43 | pmd = pmd_offset(pud, addr); |
44 | split_huge_page_pmd(mm, pmd); | 44 | if (pmd_none(*pmd)) |
45 | if (pmd_none_or_clear_bad(pmd)) | ||
46 | return NULL; | 45 | return NULL; |
47 | 46 | ||
48 | return pmd; | 47 | return pmd; |
@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
65 | return NULL; | 64 | return NULL; |
66 | 65 | ||
67 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 66 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
68 | if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) | ||
69 | return NULL; | ||
70 | 67 | ||
71 | return pmd; | 68 | return pmd; |
72 | } | 69 | } |
@@ -80,11 +77,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
80 | struct mm_struct *mm = vma->vm_mm; | 77 | struct mm_struct *mm = vma->vm_mm; |
81 | pte_t *old_pte, *new_pte, pte; | 78 | pte_t *old_pte, *new_pte, pte; |
82 | spinlock_t *old_ptl, *new_ptl; | 79 | spinlock_t *old_ptl, *new_ptl; |
83 | unsigned long old_start; | ||
84 | 80 | ||
85 | old_start = old_addr; | ||
86 | mmu_notifier_invalidate_range_start(vma->vm_mm, | ||
87 | old_start, old_end); | ||
88 | if (vma->vm_file) { | 81 | if (vma->vm_file) { |
89 | /* | 82 | /* |
90 | * Subtle point from Rajesh Venkatasubramanian: before | 83 | * Subtle point from Rajesh Venkatasubramanian: before |
@@ -111,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
111 | new_pte++, new_addr += PAGE_SIZE) { | 104 | new_pte++, new_addr += PAGE_SIZE) { |
112 | if (pte_none(*old_pte)) | 105 | if (pte_none(*old_pte)) |
113 | continue; | 106 | continue; |
114 | pte = ptep_clear_flush(vma, old_addr, old_pte); | 107 | pte = ptep_get_and_clear(mm, old_addr, old_pte); |
115 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); | 108 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); |
116 | set_pte_at(mm, new_addr, new_pte, pte); | 109 | set_pte_at(mm, new_addr, new_pte, pte); |
117 | } | 110 | } |
@@ -123,7 +116,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
123 | pte_unmap_unlock(old_pte - 1, old_ptl); | 116 | pte_unmap_unlock(old_pte - 1, old_ptl); |
124 | if (mapping) | 117 | if (mapping) |
125 | mutex_unlock(&mapping->i_mmap_mutex); | 118 | mutex_unlock(&mapping->i_mmap_mutex); |
126 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); | ||
127 | } | 119 | } |
128 | 120 | ||
129 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | 121 | #define LATENCY_LIMIT (64 * PAGE_SIZE) |
@@ -134,22 +126,43 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
134 | { | 126 | { |
135 | unsigned long extent, next, old_end; | 127 | unsigned long extent, next, old_end; |
136 | pmd_t *old_pmd, *new_pmd; | 128 | pmd_t *old_pmd, *new_pmd; |
129 | bool need_flush = false; | ||
137 | 130 | ||
138 | old_end = old_addr + len; | 131 | old_end = old_addr + len; |
139 | flush_cache_range(vma, old_addr, old_end); | 132 | flush_cache_range(vma, old_addr, old_end); |
140 | 133 | ||
134 | mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); | ||
135 | |||
141 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { | 136 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { |
142 | cond_resched(); | 137 | cond_resched(); |
143 | next = (old_addr + PMD_SIZE) & PMD_MASK; | 138 | next = (old_addr + PMD_SIZE) & PMD_MASK; |
144 | if (next - 1 > old_end) | 139 | /* even if next overflowed, extent below will be ok */ |
145 | next = old_end; | ||
146 | extent = next - old_addr; | 140 | extent = next - old_addr; |
141 | if (extent > old_end - old_addr) | ||
142 | extent = old_end - old_addr; | ||
147 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); | 143 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); |
148 | if (!old_pmd) | 144 | if (!old_pmd) |
149 | continue; | 145 | continue; |
150 | new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); | 146 | new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); |
151 | if (!new_pmd) | 147 | if (!new_pmd) |
152 | break; | 148 | break; |
149 | if (pmd_trans_huge(*old_pmd)) { | ||
150 | int err = 0; | ||
151 | if (extent == HPAGE_PMD_SIZE) | ||
152 | err = move_huge_pmd(vma, new_vma, old_addr, | ||
153 | new_addr, old_end, | ||
154 | old_pmd, new_pmd); | ||
155 | if (err > 0) { | ||
156 | need_flush = true; | ||
157 | continue; | ||
158 | } else if (!err) { | ||
159 | split_huge_page_pmd(vma->vm_mm, old_pmd); | ||
160 | } | ||
161 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); | ||
162 | } | ||
163 | if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, | ||
164 | new_pmd, new_addr)) | ||
165 | break; | ||
153 | next = (new_addr + PMD_SIZE) & PMD_MASK; | 166 | next = (new_addr + PMD_SIZE) & PMD_MASK; |
154 | if (extent > next - new_addr) | 167 | if (extent > next - new_addr) |
155 | extent = next - new_addr; | 168 | extent = next - new_addr; |
@@ -157,7 +170,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
157 | extent = LATENCY_LIMIT; | 170 | extent = LATENCY_LIMIT; |
158 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, | 171 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, |
159 | new_vma, new_pmd, new_addr); | 172 | new_vma, new_pmd, new_addr); |
173 | need_flush = true; | ||
160 | } | 174 | } |
175 | if (likely(need_flush)) | ||
176 | flush_tlb_range(vma, old_end-len, old_addr); | ||
177 | |||
178 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); | ||
161 | 179 | ||
162 | return len + old_addr - old_end; /* how much done */ | 180 | return len + old_addr - old_end; /* how much done */ |
163 | } | 181 | } |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 29d948ce6d0f..24f0fc1a56d6 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/pfn.h> | 12 | #include <linux/pfn.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/bootmem.h> | 14 | #include <linux/bootmem.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
17 | #include <linux/range.h> | 17 | #include <linux/range.h> |
18 | #include <linux/memblock.h> | 18 | #include <linux/memblock.h> |
diff --git a/mm/nommu.c b/mm/nommu.c index 9edc897a3970..b982290fd962 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -13,7 +13,7 @@ | |||
13 | * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> | 13 | * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/module.h> | 16 | #include <linux/export.h> |
17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
18 | #include <linux/mman.h> | 18 | #include <linux/mman.h> |
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
@@ -22,7 +22,6 @@ | |||
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
25 | #include <linux/tracehook.h> | ||
26 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
27 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
28 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
@@ -455,7 +454,7 @@ void __attribute__((weak)) vmalloc_sync_all(void) | |||
455 | * between processes, it syncs the pagetable across all | 454 | * between processes, it syncs the pagetable across all |
456 | * processes. | 455 | * processes. |
457 | */ | 456 | */ |
458 | struct vm_struct *alloc_vm_area(size_t size) | 457 | struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) |
459 | { | 458 | { |
460 | BUG(); | 459 | BUG(); |
461 | return NULL; | 460 | return NULL; |
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
1087 | * it's being traced - otherwise breakpoints set in it may interfere | 1086 | * it's being traced - otherwise breakpoints set in it may interfere |
1088 | * with another untraced process | 1087 | * with another untraced process |
1089 | */ | 1088 | */ |
1090 | if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) | 1089 | if ((flags & MAP_PRIVATE) && current->ptrace) |
1091 | vm_flags &= ~VM_MAYSHARE; | 1090 | vm_flags &= ~VM_MAYSHARE; |
1092 | 1091 | ||
1093 | return vm_flags; | 1092 | return vm_flags; |
@@ -1885,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1885 | return 0; | 1884 | return 0; |
1886 | 1885 | ||
1887 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 1886 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
1888 | unsigned long n; | 1887 | free = global_page_state(NR_FREE_PAGES); |
1888 | free += global_page_state(NR_FILE_PAGES); | ||
1889 | |||
1890 | /* | ||
1891 | * shmem pages shouldn't be counted as free in this | ||
1892 | * case: they can't be purged, only swapped out, and | ||
1893 | * that won't affect the overall amount of available | ||
1894 | * memory in the system. | ||
1895 | */ | ||
1896 | free -= global_page_state(NR_SHMEM); | ||
1889 | 1897 | ||
1890 | free = global_page_state(NR_FILE_PAGES); | ||
1891 | free += nr_swap_pages; | 1898 | free += nr_swap_pages; |
1892 | 1899 | ||
1893 | /* | 1900 | /* |
@@ -1899,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1899 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 1906 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
1900 | 1907 | ||
1901 | /* | 1908 | /* |
1902 | * Leave the last 3% for root | ||
1903 | */ | ||
1904 | if (!cap_sys_admin) | ||
1905 | free -= free / 32; | ||
1906 | |||
1907 | if (free > pages) | ||
1908 | return 0; | ||
1909 | |||
1910 | /* | ||
1911 | * nr_free_pages() is very expensive on large systems, | ||
1912 | * only call if we're about to fail. | ||
1913 | */ | ||
1914 | n = nr_free_pages(); | ||
1915 | |||
1916 | /* | ||
1917 | * Leave reserved pages. The pages are not for anonymous pages. | 1909 | * Leave reserved pages. The pages are not for anonymous pages. |
1918 | */ | 1910 | */ |
1919 | if (n <= totalreserve_pages) | 1911 | if (free <= totalreserve_pages) |
1920 | goto error; | 1912 | goto error; |
1921 | else | 1913 | else |
1922 | n -= totalreserve_pages; | 1914 | free -= totalreserve_pages; |
1923 | 1915 | ||
1924 | /* | 1916 | /* |
1925 | * Leave the last 3% for root | 1917 | * Leave the last 3% for root |
1926 | */ | 1918 | */ |
1927 | if (!cap_sys_admin) | 1919 | if (!cap_sys_admin) |
1928 | n -= n / 32; | 1920 | free -= free / 32; |
1929 | free += n; | ||
1930 | 1921 | ||
1931 | if (free > pages) | 1922 | if (free > pages) |
1932 | return 0; | 1923 | return 0; |
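The reworked OVERCOMMIT_GUESS path above folds everything into one estimate: free pages plus page cache, minus shmem (swap-backed, not purgeable), plus swap and reclaimable slab, with the reserved pages and the 3% root cushion subtracted once before the comparison. A minimal userspace sketch of that arithmetic, with made-up page counts standing in for the kernel's global counters:

    /*
     * Illustrative model of the new OVERCOMMIT_GUESS accounting; all inputs
     * are made-up page counts, not values read from a live kernel.
     */
    #include <stdbool.h>
    #include <stdio.h>

    static bool guess_enough_memory(unsigned long request, bool cap_sys_admin,
                                    unsigned long free_pages,
                                    unsigned long file_pages,
                                    unsigned long shmem_pages,
                                    unsigned long swap_pages,
                                    unsigned long slab_reclaimable,
                                    unsigned long totalreserve)
    {
        unsigned long avail = free_pages + file_pages;

        avail -= shmem_pages;       /* shmem is swap-backed, not purgeable */
        avail += swap_pages;
        avail += slab_reclaimable;

        if (avail <= totalreserve)  /* reserved pages are not for anon use */
            return false;
        avail -= totalreserve;

        if (!cap_sys_admin)         /* leave the last 3% for root */
            avail -= avail / 32;

        return avail > request;
    }

    int main(void)
    {
        printf("%s\n", guess_enough_memory(1000, false, 400, 900, 150, 256,
                                           64, 100) ? "ok" : "ENOMEM");
        return 0;
    }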
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e4b0991ca351..76f2c5ae908e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -26,18 +26,38 @@ | |||
26 | #include <linux/timex.h> | 26 | #include <linux/timex.h> |
27 | #include <linux/jiffies.h> | 27 | #include <linux/jiffies.h> |
28 | #include <linux/cpuset.h> | 28 | #include <linux/cpuset.h> |
29 | #include <linux/module.h> | 29 | #include <linux/export.h> |
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/mempolicy.h> | 32 | #include <linux/mempolicy.h> |
33 | #include <linux/security.h> | 33 | #include <linux/security.h> |
34 | #include <linux/ptrace.h> | 34 | #include <linux/ptrace.h> |
35 | #include <linux/freezer.h> | ||
35 | 36 | ||
36 | int sysctl_panic_on_oom; | 37 | int sysctl_panic_on_oom; |
37 | int sysctl_oom_kill_allocating_task; | 38 | int sysctl_oom_kill_allocating_task; |
38 | int sysctl_oom_dump_tasks = 1; | 39 | int sysctl_oom_dump_tasks = 1; |
39 | static DEFINE_SPINLOCK(zone_scan_lock); | 40 | static DEFINE_SPINLOCK(zone_scan_lock); |
40 | 41 | ||
42 | /* | ||
43 | * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj | ||
44 | * @old_val: old oom_score_adj for compare | ||
45 | * @new_val: new oom_score_adj for swap | ||
46 | * | ||
47 | * Sets the oom_score_adj value for current to @new_val iff its present value is | ||
48 | * @old_val. Usually used to reinstate a previous value to prevent racing with | ||
49 | * userspace tuning the value in the interim. | ||
50 | */ | ||
51 | void compare_swap_oom_score_adj(int old_val, int new_val) | ||
52 | { | ||
53 | struct sighand_struct *sighand = current->sighand; | ||
54 | |||
55 | spin_lock_irq(&sighand->siglock); | ||
56 | if (current->signal->oom_score_adj == old_val) | ||
57 | current->signal->oom_score_adj = new_val; | ||
58 | spin_unlock_irq(&sighand->siglock); | ||
59 | } | ||
60 | |||
41 | /** | 61 | /** |
42 | * test_set_oom_score_adj() - set current's oom_score_adj and return old value | 62 | * test_set_oom_score_adj() - set current's oom_score_adj and return old value |
43 | * @new_val: new oom_score_adj value | 63 | * @new_val: new oom_score_adj value |
@@ -53,13 +73,7 @@ int test_set_oom_score_adj(int new_val) | |||
53 | 73 | ||
54 | spin_lock_irq(&sighand->siglock); | 74 | spin_lock_irq(&sighand->siglock); |
55 | old_val = current->signal->oom_score_adj; | 75 | old_val = current->signal->oom_score_adj; |
56 | if (new_val != old_val) { | 76 | current->signal->oom_score_adj = new_val; |
57 | if (new_val == OOM_SCORE_ADJ_MIN) | ||
58 | atomic_inc(¤t->mm->oom_disable_count); | ||
59 | else if (old_val == OOM_SCORE_ADJ_MIN) | ||
60 | atomic_dec(¤t->mm->oom_disable_count); | ||
61 | current->signal->oom_score_adj = new_val; | ||
62 | } | ||
63 | spin_unlock_irq(&sighand->siglock); | 77 | spin_unlock_irq(&sighand->siglock); |
64 | 78 | ||
65 | return old_val; | 79 | return old_val; |
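Together with test_set_oom_score_adj() above, the new compare_swap_oom_score_adj() gives callers a save/restore pattern that will not clobber a value userspace wrote in the meantime. Below is a small standalone model of that pattern; the mutex stands in for siglock and the caller shown is purely hypothetical, not a real kernel path:

    /*
     * Userspace model of the save/restore pattern; the mutex stands in for
     * current->sighand->siglock and the plain int for signal->oom_score_adj.
     */
    #include <pthread.h>
    #include <stdio.h>

    #define OOM_SCORE_ADJ_MIN (-1000)

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int oom_score_adj;

    static int test_set(int new_val)
    {
        pthread_mutex_lock(&lock);
        int old_val = oom_score_adj;

        oom_score_adj = new_val;
        pthread_mutex_unlock(&lock);
        return old_val;
    }

    static void compare_swap(int old_val, int new_val)
    {
        pthread_mutex_lock(&lock);
        if (oom_score_adj == old_val)   /* untouched since we set it */
            oom_score_adj = new_val;
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        int saved = test_set(OOM_SCORE_ADJ_MIN);  /* make "current" unkillable */

        /* ... critical section that must not be OOM-killed ... */

        compare_swap(OOM_SCORE_ADJ_MIN, saved);   /* restore unless user changed it */
        printf("oom_score_adj = %d\n", oom_score_adj);
        return 0;
    }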
@@ -171,12 +185,7 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | |||
171 | if (!p) | 185 | if (!p) |
172 | return 0; | 186 | return 0; |
173 | 187 | ||
174 | /* | 188 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { |
175 | * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN | ||
176 | * so the entire heuristic doesn't need to be executed for something | ||
177 | * that cannot be killed. | ||
178 | */ | ||
179 | if (atomic_read(&p->mm->oom_disable_count)) { | ||
180 | task_unlock(p); | 189 | task_unlock(p); |
181 | return 0; | 190 | return 0; |
182 | } | 191 | } |
@@ -303,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
303 | do_each_thread(g, p) { | 312 | do_each_thread(g, p) { |
304 | unsigned int points; | 313 | unsigned int points; |
305 | 314 | ||
306 | if (!p->mm) | 315 | if (p->exit_state) |
307 | continue; | 316 | continue; |
308 | if (oom_unkillable_task(p, mem, nodemask)) | 317 | if (oom_unkillable_task(p, mem, nodemask)) |
309 | continue; | 318 | continue; |
@@ -317,8 +326,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
317 | * blocked waiting for another task which itself is waiting | 326 | * blocked waiting for another task which itself is waiting |
318 | * for memory. Is there a better alternative? | 327 | * for memory. Is there a better alternative? |
319 | */ | 328 | */ |
320 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) | 329 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { |
330 | if (unlikely(frozen(p))) | ||
331 | thaw_process(p); | ||
321 | return ERR_PTR(-1UL); | 332 | return ERR_PTR(-1UL); |
333 | } | ||
334 | if (!p->mm) | ||
335 | continue; | ||
322 | 336 | ||
323 | if (p->flags & PF_EXITING) { | 337 | if (p->flags & PF_EXITING) { |
324 | /* | 338 | /* |
@@ -339,8 +353,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
339 | * then wait for it to finish before killing | 353 | * then wait for it to finish before killing |
340 | * some other task unnecessarily. | 354 | * some other task unnecessarily. |
341 | */ | 355 | */ |
342 | if (!(task_ptrace(p->group_leader) & | 356 | if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) |
343 | PT_TRACE_EXIT)) | ||
344 | return ERR_PTR(-1UL); | 357 | return ERR_PTR(-1UL); |
345 | } | 358 | } |
346 | } | 359 | } |
@@ -434,7 +447,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) | |||
434 | task_unlock(p); | 447 | task_unlock(p); |
435 | 448 | ||
436 | /* | 449 | /* |
437 | * Kill all processes sharing p->mm in other thread groups, if any. | 450 | * Kill all user processes sharing p->mm in other thread groups, if any. |
438 | * They don't get access to memory reserves or a higher scheduler | 451 | * They don't get access to memory reserves or a higher scheduler |
439 | * priority, though, to avoid depletion of all memory or task | 452 | * priority, though, to avoid depletion of all memory or task |
440 | * starvation. This prevents mm->mmap_sem livelock when an oom killed | 453 | * starvation. This prevents mm->mmap_sem livelock when an oom killed |
@@ -444,7 +457,11 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) | |||
444 | * signal. | 457 | * signal. |
445 | */ | 458 | */ |
446 | for_each_process(q) | 459 | for_each_process(q) |
447 | if (q->mm == mm && !same_thread_group(q, p)) { | 460 | if (q->mm == mm && !same_thread_group(q, p) && |
461 | !(q->flags & PF_KTHREAD)) { | ||
462 | if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
463 | continue; | ||
464 | |||
448 | task_lock(q); /* Protect ->comm from prctl() */ | 465 | task_lock(q); /* Protect ->comm from prctl() */ |
449 | pr_err("Kill process %d (%s) sharing same memory\n", | 466 | pr_err("Kill process %d (%s) sharing same memory\n", |
450 | task_pid_nr(q), q->comm); | 467 | task_pid_nr(q), q->comm); |
@@ -488,7 +505,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
488 | 505 | ||
489 | /* | 506 | /* |
490 | * If any of p's children has a different mm and is eligible for kill, | 507 | * If any of p's children has a different mm and is eligible for kill, |
491 | * the one with the highest badness() score is sacrificed for its | 508 | * the one with the highest oom_badness() score is sacrificed for its |
492 | * parent. This attempts to lose the minimal amount of work done while | 509 | * parent. This attempts to lose the minimal amount of work done while |
493 | * still freeing memory. | 510 | * still freeing memory. |
494 | */ | 511 | */ |
@@ -721,7 +738,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
721 | read_lock(&tasklist_lock); | 738 | read_lock(&tasklist_lock); |
722 | if (sysctl_oom_kill_allocating_task && | 739 | if (sysctl_oom_kill_allocating_task && |
723 | !oom_unkillable_task(current, NULL, nodemask) && | 740 | !oom_unkillable_task(current, NULL, nodemask) && |
724 | current->mm && !atomic_read(¤t->mm->oom_disable_count)) { | 741 | current->mm) { |
725 | /* | 742 | /* |
726 | * oom_kill_process() needs tasklist_lock held. If it returns | 743 | * oom_kill_process() needs tasklist_lock held. If it returns |
727 | * non-zero, current could not be killed so we must fallback to | 744 | * non-zero, current could not be killed so we must fallback to |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 31f698862420..71252486bc6f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -12,7 +12,7 @@ | |||
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
@@ -37,24 +37,22 @@ | |||
37 | #include <trace/events/writeback.h> | 37 | #include <trace/events/writeback.h> |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 40 | * Sleep at most 200ms at a time in balance_dirty_pages(). |
41 | * will look to see if it needs to force writeback or throttling. | ||
42 | */ | 41 | */ |
43 | static long ratelimit_pages = 32; | 42 | #define MAX_PAUSE max(HZ/5, 1) |
44 | 43 | ||
45 | /* | 44 | /* |
46 | * When balance_dirty_pages decides that the caller needs to perform some | 45 | * Estimate write bandwidth at 200ms intervals. |
47 | * non-background writeback, this is how many pages it will attempt to write. | ||
48 | * It should be somewhat larger than dirtied pages to ensure that reasonably | ||
49 | * large amounts of I/O are submitted. | ||
50 | */ | 46 | */ |
51 | static inline long sync_writeback_pages(unsigned long dirtied) | 47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) |
52 | { | ||
53 | if (dirtied < ratelimit_pages) | ||
54 | dirtied = ratelimit_pages; | ||
55 | 48 | ||
56 | return dirtied + dirtied / 2; | 49 | #define RATELIMIT_CALC_SHIFT 10 |
57 | } | 50 | |
51 | /* | ||
52 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | ||
53 | * will look to see if it needs to force writeback or throttling. | ||
54 | */ | ||
55 | static long ratelimit_pages = 32; | ||
58 | 56 | ||
59 | /* The following parameters are exported via /proc/sys/vm */ | 57 | /* The following parameters are exported via /proc/sys/vm */ |
60 | 58 | ||
@@ -111,6 +109,7 @@ EXPORT_SYMBOL(laptop_mode); | |||
111 | 109 | ||
112 | /* End of sysctl-exported parameters */ | 110 | /* End of sysctl-exported parameters */ |
113 | 111 | ||
112 | unsigned long global_dirty_limit; | ||
114 | 113 | ||
115 | /* | 114 | /* |
116 | * Scale the writeback cache size proportional to the relative writeout speeds. | 115 | * Scale the writeback cache size proportional to the relative writeout speeds. |
@@ -129,7 +128,6 @@ EXPORT_SYMBOL(laptop_mode); | |||
129 | * | 128 | * |
130 | */ | 129 | */ |
131 | static struct prop_descriptor vm_completions; | 130 | static struct prop_descriptor vm_completions; |
132 | static struct prop_descriptor vm_dirties; | ||
133 | 131 | ||
134 | /* | 132 | /* |
135 | * couple the period to the dirty_ratio: | 133 | * couple the period to the dirty_ratio: |
@@ -155,7 +153,8 @@ static void update_completion_period(void) | |||
155 | { | 153 | { |
156 | int shift = calc_period_shift(); | 154 | int shift = calc_period_shift(); |
157 | prop_change_shift(&vm_completions, shift); | 155 | prop_change_shift(&vm_completions, shift); |
158 | prop_change_shift(&vm_dirties, shift); | 156 | |
157 | writeback_set_ratelimit(); | ||
159 | } | 158 | } |
160 | 159 | ||
161 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 160 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
@@ -219,6 +218,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
219 | */ | 218 | */ |
220 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | 219 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
221 | { | 220 | { |
221 | __inc_bdi_stat(bdi, BDI_WRITTEN); | ||
222 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, | 222 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, |
223 | bdi->max_prop_frac); | 223 | bdi->max_prop_frac); |
224 | } | 224 | } |
@@ -233,65 +233,20 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) | |||
233 | } | 233 | } |
234 | EXPORT_SYMBOL_GPL(bdi_writeout_inc); | 234 | EXPORT_SYMBOL_GPL(bdi_writeout_inc); |
235 | 235 | ||
236 | void task_dirty_inc(struct task_struct *tsk) | ||
237 | { | ||
238 | prop_inc_single(&vm_dirties, &tsk->dirties); | ||
239 | } | ||
240 | |||
241 | /* | 236 | /* |
242 | * Obtain an accurate fraction of the BDI's portion. | 237 | * Obtain an accurate fraction of the BDI's portion. |
243 | */ | 238 | */ |
244 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | 239 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
245 | long *numerator, long *denominator) | 240 | long *numerator, long *denominator) |
246 | { | 241 | { |
247 | if (bdi_cap_writeback_dirty(bdi)) { | 242 | prop_fraction_percpu(&vm_completions, &bdi->completions, |
248 | prop_fraction_percpu(&vm_completions, &bdi->completions, | ||
249 | numerator, denominator); | 243 | numerator, denominator); |
250 | } else { | ||
251 | *numerator = 0; | ||
252 | *denominator = 1; | ||
253 | } | ||
254 | } | ||
255 | |||
256 | static inline void task_dirties_fraction(struct task_struct *tsk, | ||
257 | long *numerator, long *denominator) | ||
258 | { | ||
259 | prop_fraction_single(&vm_dirties, &tsk->dirties, | ||
260 | numerator, denominator); | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * task_dirty_limit - scale down dirty throttling threshold for one task | ||
265 | * | ||
266 | * task specific dirty limit: | ||
267 | * | ||
268 | * dirty -= (dirty/8) * p_{t} | ||
269 | * | ||
270 | * To protect light/slow dirtying tasks from heavier/fast ones, we start | ||
271 | * throttling individual tasks before reaching the bdi dirty limit. | ||
272 | * Relatively low thresholds will be allocated to heavy dirtiers. So when | ||
273 | * dirty pages grow large, heavy dirtiers will be throttled first, which will | ||
274 | * effectively curb the growth of dirty pages. Light dirtiers with high enough | ||
275 | * dirty threshold may never get throttled. | ||
276 | */ | ||
277 | static unsigned long task_dirty_limit(struct task_struct *tsk, | ||
278 | unsigned long bdi_dirty) | ||
279 | { | ||
280 | long numerator, denominator; | ||
281 | unsigned long dirty = bdi_dirty; | ||
282 | u64 inv = dirty >> 3; | ||
283 | |||
284 | task_dirties_fraction(tsk, &numerator, &denominator); | ||
285 | inv *= numerator; | ||
286 | do_div(inv, denominator); | ||
287 | |||
288 | dirty -= inv; | ||
289 | |||
290 | return max(dirty, bdi_dirty/2); | ||
291 | } | 244 | } |
292 | 245 | ||
293 | /* | 246 | /* |
294 | * | 247 | * bdi_min_ratio keeps the sum of the minimum dirty shares of all |
248 | * registered backing devices, which, for obvious reasons, can not | ||
249 | * exceed 100%. | ||
295 | */ | 250 | */ |
296 | static unsigned int bdi_min_ratio; | 251 | static unsigned int bdi_min_ratio; |
297 | 252 | ||
@@ -397,6 +352,17 @@ unsigned long determine_dirtyable_memory(void) | |||
397 | return x + 1; /* Ensure that we never return 0 */ | 352 | return x + 1; /* Ensure that we never return 0 */ |
398 | } | 353 | } |
399 | 354 | ||
355 | static unsigned long dirty_freerun_ceiling(unsigned long thresh, | ||
356 | unsigned long bg_thresh) | ||
357 | { | ||
358 | return (thresh + bg_thresh) / 2; | ||
359 | } | ||
360 | |||
361 | static unsigned long hard_dirty_limit(unsigned long thresh) | ||
362 | { | ||
363 | return max(thresh, global_dirty_limit); | ||
364 | } | ||
365 | |||
400 | /* | 366 | /* |
401 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | 367 | * global_dirty_limits - background-writeback and dirty-throttling thresholds |
402 | * | 368 | * |
@@ -435,12 +401,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
435 | } | 401 | } |
436 | *pbackground = background; | 402 | *pbackground = background; |
437 | *pdirty = dirty; | 403 | *pdirty = dirty; |
404 | trace_global_dirty_state(background, dirty); | ||
438 | } | 405 | } |
439 | 406 | ||
440 | /* | 407 | /** |
441 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold | 408 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold |
409 | * @bdi: the backing_dev_info to query | ||
410 | * @dirty: global dirty limit in pages | ||
442 | * | 411 | * |
443 | * Allocate high/low dirty limits to fast/slow devices, in order to prevent | 412 | * Returns @bdi's dirty limit in pages. The term "dirty" in the context of |
413 | * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. | ||
414 | * And the "limit" in the name is not seriously taken as hard limit in | ||
415 | * balance_dirty_pages(). | ||
416 | * | ||
417 | * It allocates high/low dirty limits to fast/slow devices, in order to prevent | ||
444 | * - starving fast devices | 418 | * - starving fast devices |
445 | * - piling up dirty pages (that will take long time to sync) on slow devices | 419 | * - piling up dirty pages (that will take long time to sync) on slow devices |
446 | * | 420 | * |
@@ -469,36 +443,587 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
469 | } | 443 | } |
470 | 444 | ||
471 | /* | 445 | /* |
446 | * Dirty position control. | ||
447 | * | ||
448 | * (o) global/bdi setpoints | ||
449 | * | ||
450 | * We want the dirty pages be balanced around the global/bdi setpoints. | ||
451 | * When the number of dirty pages is higher/lower than the setpoint, the | ||
452 | * dirty position control ratio (and hence task dirty ratelimit) will be | ||
453 | * decreased/increased to bring the dirty pages back to the setpoint. | ||
454 | * | ||
455 | * pos_ratio = 1 << RATELIMIT_CALC_SHIFT | ||
456 | * | ||
457 | * if (dirty < setpoint) scale up pos_ratio | ||
458 | * if (dirty > setpoint) scale down pos_ratio | ||
459 | * | ||
460 | * if (bdi_dirty < bdi_setpoint) scale up pos_ratio | ||
461 | * if (bdi_dirty > bdi_setpoint) scale down pos_ratio | ||
462 | * | ||
463 | * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT | ||
464 | * | ||
465 | * (o) global control line | ||
466 | * | ||
467 | * ^ pos_ratio | ||
468 | * | | ||
469 | * | |<===== global dirty control scope ======>| | ||
470 | * 2.0 .............* | ||
471 | * | .* | ||
472 | * | . * | ||
473 | * | . * | ||
474 | * | . * | ||
475 | * | . * | ||
476 | * | . * | ||
477 | * 1.0 ................................* | ||
478 | * | . . * | ||
479 | * | . . * | ||
480 | * | . . * | ||
481 | * | . . * | ||
482 | * | . . * | ||
483 | * 0 +------------.------------------.----------------------*-------------> | ||
484 | * freerun^ setpoint^ limit^ dirty pages | ||
485 | * | ||
486 | * (o) bdi control line | ||
487 | * | ||
488 | * ^ pos_ratio | ||
489 | * | | ||
490 | * | * | ||
491 | * | * | ||
492 | * | * | ||
493 | * | * | ||
494 | * | * |<=========== span ============>| | ||
495 | * 1.0 .......................* | ||
496 | * | . * | ||
497 | * | . * | ||
498 | * | . * | ||
499 | * | . * | ||
500 | * | . * | ||
501 | * | . * | ||
502 | * | . * | ||
503 | * | . * | ||
504 | * | . * | ||
505 | * | . * | ||
506 | * | . * | ||
507 | * 1/4 ...............................................* * * * * * * * * * * * | ||
508 | * | . . | ||
509 | * | . . | ||
510 | * | . . | ||
511 | * 0 +----------------------.-------------------------------.-------------> | ||
512 | * bdi_setpoint^ x_intercept^ | ||
513 | * | ||
514 | * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can | ||
515 | * be smoothly throttled down to normal if it starts high in situations like | ||
516 | * - start writing to a slow SD card and a fast disk at the same time. The SD | ||
517 | * card's bdi_dirty may rush to many times higher than bdi_setpoint. | ||
518 | * - the bdi dirty thresh drops quickly due to change of JBOD workload | ||
519 | */ | ||
520 | static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, | ||
521 | unsigned long thresh, | ||
522 | unsigned long bg_thresh, | ||
523 | unsigned long dirty, | ||
524 | unsigned long bdi_thresh, | ||
525 | unsigned long bdi_dirty) | ||
526 | { | ||
527 | unsigned long write_bw = bdi->avg_write_bandwidth; | ||
528 | unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); | ||
529 | unsigned long limit = hard_dirty_limit(thresh); | ||
530 | unsigned long x_intercept; | ||
531 | unsigned long setpoint; /* dirty pages' target balance point */ | ||
532 | unsigned long bdi_setpoint; | ||
533 | unsigned long span; | ||
534 | long long pos_ratio; /* for scaling up/down the rate limit */ | ||
535 | long x; | ||
536 | |||
537 | if (unlikely(dirty >= limit)) | ||
538 | return 0; | ||
539 | |||
540 | /* | ||
541 | * global setpoint | ||
542 | * | ||
543 | * setpoint - dirty 3 | ||
544 | * f(dirty) := 1.0 + (----------------) | ||
545 | * limit - setpoint | ||
546 | * | ||
547 | * it's a 3rd order polynomial that subjects to | ||
548 | * | ||
549 | * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast | ||
550 | * (2) f(setpoint) = 1.0 => the balance point | ||
551 | * (3) f(limit) = 0 => the hard limit | ||
552 | * (4) df/dx <= 0 => negative feedback control | ||
553 | * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) | ||
554 | * => fast response on large errors; small oscillation near setpoint | ||
555 | */ | ||
556 | setpoint = (freerun + limit) / 2; | ||
557 | x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT, | ||
558 | limit - setpoint + 1); | ||
559 | pos_ratio = x; | ||
560 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
561 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
562 | pos_ratio += 1 << RATELIMIT_CALC_SHIFT; | ||
563 | |||
564 | /* | ||
565 | * We have computed basic pos_ratio above based on global situation. If | ||
566 | * the bdi is over/under its share of dirty pages, we want to scale | ||
567 | * pos_ratio further down/up. That is done by the following mechanism. | ||
568 | */ | ||
569 | |||
570 | /* | ||
571 | * bdi setpoint | ||
572 | * | ||
573 | * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint) | ||
574 | * | ||
575 | * x_intercept - bdi_dirty | ||
576 | * := -------------------------- | ||
577 | * x_intercept - bdi_setpoint | ||
578 | * | ||
579 | * The main bdi control line is a linear function that subjects to | ||
580 | * | ||
581 | * (1) f(bdi_setpoint) = 1.0 | ||
582 | * (2) k = - 1 / (8 * write_bw) (in single bdi case) | ||
583 | * or equally: x_intercept = bdi_setpoint + 8 * write_bw | ||
584 | * | ||
585 | * For single bdi case, the dirty pages are observed to fluctuate | ||
586 | * regularly within range | ||
587 | * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2] | ||
588 | * for various filesystems, where (2) can yield a reasonable 12.5% | ||
589 | * fluctuation range for pos_ratio. | ||
590 | * | ||
591 | * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its | ||
592 | * own size, so move the slope over accordingly and choose a slope that | ||
593 | * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh. | ||
594 | */ | ||
595 | if (unlikely(bdi_thresh > thresh)) | ||
596 | bdi_thresh = thresh; | ||
597 | bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); | ||
598 | /* | ||
599 | * scale global setpoint to bdi's: | ||
600 | * bdi_setpoint = setpoint * bdi_thresh / thresh | ||
601 | */ | ||
602 | x = div_u64((u64)bdi_thresh << 16, thresh + 1); | ||
603 | bdi_setpoint = setpoint * (u64)x >> 16; | ||
604 | /* | ||
605 | * Use span=(8*write_bw) in single bdi case as indicated by | ||
606 | * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. | ||
607 | * | ||
608 | * bdi_thresh thresh - bdi_thresh | ||
609 | * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh | ||
610 | * thresh thresh | ||
611 | */ | ||
612 | span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; | ||
613 | x_intercept = bdi_setpoint + span; | ||
614 | |||
615 | if (bdi_dirty < x_intercept - span / 4) { | ||
616 | pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), | ||
617 | x_intercept - bdi_setpoint + 1); | ||
618 | } else | ||
619 | pos_ratio /= 4; | ||
620 | |||
621 | /* | ||
622 | * bdi reserve area, safeguard against dirty pool underrun and disk idle | ||
623 | * It may push the desired control point of global dirty pages higher | ||
624 | * than setpoint. | ||
625 | */ | ||
626 | x_intercept = bdi_thresh / 2; | ||
627 | if (bdi_dirty < x_intercept) { | ||
628 | if (bdi_dirty > x_intercept / 8) | ||
629 | pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty); | ||
630 | else | ||
631 | pos_ratio *= 8; | ||
632 | } | ||
633 | |||
634 | return pos_ratio; | ||
635 | } | ||
636 | |||
637 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | ||
638 | unsigned long elapsed, | ||
639 | unsigned long written) | ||
640 | { | ||
641 | const unsigned long period = roundup_pow_of_two(3 * HZ); | ||
642 | unsigned long avg = bdi->avg_write_bandwidth; | ||
643 | unsigned long old = bdi->write_bandwidth; | ||
644 | u64 bw; | ||
645 | |||
646 | /* | ||
647 | * bw = written * HZ / elapsed | ||
648 | * | ||
649 | * bw * elapsed + write_bandwidth * (period - elapsed) | ||
650 | * write_bandwidth = --------------------------------------------------- | ||
651 | * period | ||
652 | */ | ||
653 | bw = written - bdi->written_stamp; | ||
654 | bw *= HZ; | ||
655 | if (unlikely(elapsed > period)) { | ||
656 | do_div(bw, elapsed); | ||
657 | avg = bw; | ||
658 | goto out; | ||
659 | } | ||
660 | bw += (u64)bdi->write_bandwidth * (period - elapsed); | ||
661 | bw >>= ilog2(period); | ||
662 | |||
663 | /* | ||
664 | * one more level of smoothing, for filtering out sudden spikes | ||
665 | */ | ||
666 | if (avg > old && old >= (unsigned long)bw) | ||
667 | avg -= (avg - old) >> 3; | ||
668 | |||
669 | if (avg < old && old <= (unsigned long)bw) | ||
670 | avg += (old - avg) >> 3; | ||
671 | |||
672 | out: | ||
673 | bdi->write_bandwidth = bw; | ||
674 | bdi->avg_write_bandwidth = avg; | ||
675 | } | ||
676 | |||
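bdi_update_write_bandwidth() above blends the rate observed over the last interval into a roughly 3-second window and then applies one more smoothing pass to avg. The userspace model below follows the same arithmetic; HZ, the period and the sample numbers are assumptions for illustration:

    /*
     * Model of the write bandwidth estimation: blend the last interval's rate
     * into a ~3s window, then nudge the doubly smoothed average toward it.
     */
    #include <stdio.h>

    #define HZ     100
    #define PERIOD 512   /* roundup_pow_of_two(3 * HZ) for HZ = 100 */

    struct bw_state {
        unsigned long write_bw;      /* pages/s, smoothed over PERIOD */
        unsigned long avg_write_bw;  /* doubly smoothed */
    };

    static void update_bw(struct bw_state *s, unsigned long elapsed,
                          unsigned long written_delta)
    {
        unsigned long old = s->write_bw;
        unsigned long avg = s->avg_write_bw;
        unsigned long long bw = (unsigned long long)written_delta * HZ;

        if (elapsed > PERIOD) {      /* stale window: restart from the raw rate */
            bw /= elapsed;
            avg = (unsigned long)bw;
            goto out;
        }
        bw += (unsigned long long)old * (PERIOD - elapsed);
        bw /= PERIOD;

        /* extra smoothing: only move avg when it lags behind the trend */
        if (avg > old && old >= (unsigned long)bw)
            avg -= (avg - old) >> 3;
        if (avg < old && old <= (unsigned long)bw)
            avg += (old - avg) >> 3;
    out:
        s->write_bw = (unsigned long)bw;
        s->avg_write_bw = avg;
    }

    int main(void)
    {
        struct bw_state s = { .write_bw = 25000, .avg_write_bw = 25000 };

        update_bw(&s, HZ / 5, 6000);   /* 6000 pages written in 200ms */
        printf("write_bw = %lu pages/s, avg = %lu pages/s\n",
               s.write_bw, s.avg_write_bw);
        return 0;
    }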
677 | /* | ||
678 | * The global dirtyable memory and dirty threshold could be suddenly knocked | ||
679 | * down by a large amount (eg. on the startup of KVM in a swapless system). | ||
680 | * This may throw the system into deep dirty exceeded state and throttle | ||
681 | * heavy/light dirtiers alike. To retain good responsiveness, maintain | ||
682 | * global_dirty_limit for tracking slowly down to the knocked down dirty | ||
683 | * threshold. | ||
684 | */ | ||
685 | static void update_dirty_limit(unsigned long thresh, unsigned long dirty) | ||
686 | { | ||
687 | unsigned long limit = global_dirty_limit; | ||
688 | |||
689 | /* | ||
690 | * Follow up in one step. | ||
691 | */ | ||
692 | if (limit < thresh) { | ||
693 | limit = thresh; | ||
694 | goto update; | ||
695 | } | ||
696 | |||
697 | /* | ||
698 | * Follow down slowly. Use the higher one as the target, because thresh | ||
699 | * may drop below dirty. This is exactly the reason to introduce | ||
700 | * global_dirty_limit which is guaranteed to lie above the dirty pages. | ||
701 | */ | ||
702 | thresh = max(thresh, dirty); | ||
703 | if (limit > thresh) { | ||
704 | limit -= (limit - thresh) >> 5; | ||
705 | goto update; | ||
706 | } | ||
707 | return; | ||
708 | update: | ||
709 | global_dirty_limit = limit; | ||
710 | } | ||
711 | |||
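update_dirty_limit() lets global_dirty_limit jump up to a raised threshold immediately but decay only by 1/32 of the gap per update toward a lowered one, so a sudden drop in dirtyable memory does not instantly throw every dirtier into the exceeded state. A few iterations of that decay, modelled directly with arbitrary page counts:

    /*
     * Model of the asymmetric tracking in update_dirty_limit(): follow a
     * raised threshold at once, follow a lowered one by 1/32 of the gap per
     * update (one update per BANDWIDTH_INTERVAL).
     */
    #include <stdio.h>

    static unsigned long track_limit(unsigned long limit, unsigned long thresh,
                                     unsigned long dirty)
    {
        if (limit < thresh)
            return thresh;           /* follow up in one step */

        if (thresh < dirty)          /* never track below the dirty count */
            thresh = dirty;
        if (limit > thresh)
            limit -= (limit - thresh) >> 5;
        return limit;
    }

    int main(void)
    {
        unsigned long limit = 100000, thresh = 20000, dirty = 30000;

        for (int i = 1; i <= 5; i++) {
            limit = track_limit(limit, thresh, dirty);
            printf("after update %d: global_dirty_limit = %lu\n", i, limit);
        }
        return 0;
    }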
712 | static void global_update_bandwidth(unsigned long thresh, | ||
713 | unsigned long dirty, | ||
714 | unsigned long now) | ||
715 | { | ||
716 | static DEFINE_SPINLOCK(dirty_lock); | ||
717 | static unsigned long update_time; | ||
718 | |||
719 | /* | ||
720 | * check locklessly first to optimize away locking most of the time | ||
721 | */ | ||
722 | if (time_before(now, update_time + BANDWIDTH_INTERVAL)) | ||
723 | return; | ||
724 | |||
725 | spin_lock(&dirty_lock); | ||
726 | if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { | ||
727 | update_dirty_limit(thresh, dirty); | ||
728 | update_time = now; | ||
729 | } | ||
730 | spin_unlock(&dirty_lock); | ||
731 | } | ||
732 | |||
733 | /* | ||
734 | * Maintain bdi->dirty_ratelimit, the base dirty throttle rate. | ||
735 | * | ||
736 | * Normal bdi tasks will be curbed at or below it in long term. | ||
737 | * Obviously it should be around (write_bw / N) when there are N dd tasks. | ||
738 | */ | ||
739 | static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | ||
740 | unsigned long thresh, | ||
741 | unsigned long bg_thresh, | ||
742 | unsigned long dirty, | ||
743 | unsigned long bdi_thresh, | ||
744 | unsigned long bdi_dirty, | ||
745 | unsigned long dirtied, | ||
746 | unsigned long elapsed) | ||
747 | { | ||
748 | unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); | ||
749 | unsigned long limit = hard_dirty_limit(thresh); | ||
750 | unsigned long setpoint = (freerun + limit) / 2; | ||
751 | unsigned long write_bw = bdi->avg_write_bandwidth; | ||
752 | unsigned long dirty_ratelimit = bdi->dirty_ratelimit; | ||
753 | unsigned long dirty_rate; | ||
754 | unsigned long task_ratelimit; | ||
755 | unsigned long balanced_dirty_ratelimit; | ||
756 | unsigned long pos_ratio; | ||
757 | unsigned long step; | ||
758 | unsigned long x; | ||
759 | |||
760 | /* | ||
761 | * The dirty rate will match the writeout rate in long term, except | ||
762 | * when dirty pages are truncated by userspace or re-dirtied by FS. | ||
763 | */ | ||
764 | dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; | ||
765 | |||
766 | pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, | ||
767 | bdi_thresh, bdi_dirty); | ||
768 | /* | ||
769 | * task_ratelimit reflects each dd's dirty rate for the past 200ms. | ||
770 | */ | ||
771 | task_ratelimit = (u64)dirty_ratelimit * | ||
772 | pos_ratio >> RATELIMIT_CALC_SHIFT; | ||
773 | task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ | ||
774 | |||
775 | /* | ||
776 | * A linear estimation of the "balanced" throttle rate. The theory is, | ||
777 | * if there are N dd tasks, each throttled at task_ratelimit, the bdi's | ||
778 | * dirty_rate will be measured to be (N * task_ratelimit). So the below | ||
779 | * formula will yield the balanced rate limit (write_bw / N). | ||
780 | * | ||
781 | * Note that the expanded form is not a pure rate feedback: | ||
782 | * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) | ||
783 | * but also takes pos_ratio into account: | ||
784 | * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) | ||
785 | * | ||
786 | * (1) is not realistic because pos_ratio also takes part in balancing | ||
787 | * the dirty rate. Consider the state | ||
788 | * pos_ratio = 0.5 (3) | ||
789 | * rate = 2 * (write_bw / N) (4) | ||
790 | * If (1) is used, it will get stuck in that state! Because each dd will | ||
791 | * be throttled at | ||
792 | * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) | ||
793 | * yielding | ||
794 | * dirty_rate = N * task_ratelimit = write_bw (6) | ||
795 | * put (6) into (1) we get | ||
796 | * rate_(i+1) = rate_(i) (7) | ||
797 | * | ||
798 | * So we end up using (2) to always keep | ||
799 | * rate_(i+1) ~= (write_bw / N) (8) | ||
800 | * regardless of the value of pos_ratio. As long as (8) is satisfied, | ||
801 | * pos_ratio is able to drive itself to 1.0, which is not only where | ||
802 | * the dirty count meet the setpoint, but also where the slope of | ||
803 | * pos_ratio is most flat and hence task_ratelimit is least fluctuated. | ||
804 | */ | ||
805 | balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, | ||
806 | dirty_rate | 1); | ||
807 | |||
808 | /* | ||
809 | * We could safely do this and return immediately: | ||
810 | * | ||
811 | * bdi->dirty_ratelimit = balanced_dirty_ratelimit; | ||
812 | * | ||
813 | * However to get a more stable dirty_ratelimit, the below elaborated | ||
814 | * code makes use of task_ratelimit to filter out singular points and | ||
815 | * limit the step size. | ||
816 | * | ||
817 | * The below code essentially only uses the relative value of | ||
818 | * | ||
819 | * task_ratelimit - dirty_ratelimit | ||
820 | * = (pos_ratio - 1) * dirty_ratelimit | ||
821 | * | ||
822 | * which reflects the direction and size of dirty position error. | ||
823 | */ | ||
824 | |||
825 | /* | ||
826 | * dirty_ratelimit will follow balanced_dirty_ratelimit iff | ||
827 | * task_ratelimit is on the same side of dirty_ratelimit, too. | ||
828 | * For example, when | ||
829 | * - dirty_ratelimit > balanced_dirty_ratelimit | ||
830 | * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) | ||
831 | * lowering dirty_ratelimit will help meet both the position and rate | ||
832 | * control targets. Otherwise, don't update dirty_ratelimit if it will | ||
833 | * only help meet the rate target. After all, what the users ultimately | ||
834 | * feel and care are stable dirty rate and small position error. | ||
835 | * | ||
836 | * |task_ratelimit - dirty_ratelimit| is used to limit the step size | ||
837 | * and filter out the singular points of balanced_dirty_ratelimit, which | ||
838 | * keeps jumping around randomly and can even leap far away at times | ||
839 | * due to the small 200ms estimation period of dirty_rate (we want to | ||
840 | * keep that period small to reduce time lags). | ||
841 | */ | ||
842 | step = 0; | ||
843 | if (dirty < setpoint) { | ||
844 | x = min(bdi->balanced_dirty_ratelimit, | ||
845 | min(balanced_dirty_ratelimit, task_ratelimit)); | ||
846 | if (dirty_ratelimit < x) | ||
847 | step = x - dirty_ratelimit; | ||
848 | } else { | ||
849 | x = max(bdi->balanced_dirty_ratelimit, | ||
850 | max(balanced_dirty_ratelimit, task_ratelimit)); | ||
851 | if (dirty_ratelimit > x) | ||
852 | step = dirty_ratelimit - x; | ||
853 | } | ||
854 | |||
855 | /* | ||
856 | * Don't pursue 100% rate matching. It's impossible since the balanced | ||
857 | * rate itself is constantly fluctuating. So decrease the track speed | ||
858 | * when it gets close to the target. Helps eliminate pointless tremors. | ||
859 | */ | ||
860 | step >>= dirty_ratelimit / (2 * step + 1); | ||
861 | /* | ||
862 | * Limit the tracking speed to avoid overshooting. | ||
863 | */ | ||
864 | step = (step + 7) / 8; | ||
865 | |||
866 | if (dirty_ratelimit < balanced_dirty_ratelimit) | ||
867 | dirty_ratelimit += step; | ||
868 | else | ||
869 | dirty_ratelimit -= step; | ||
870 | |||
871 | bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); | ||
872 | bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit; | ||
873 | |||
874 | trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit); | ||
875 | } | ||
876 | |||
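The heart of bdi_update_dirty_ratelimit() is equation (2) from the comment: the next base rate is the per-task rate scaled by write_bw / dirty_rate, which drives the rate toward write_bw / N for N dirtiers regardless of where pos_ratio currently sits. The toy loop below (assumed bandwidth and task count, no fixed point, and with dirty_rate taken as exactly N * task_ratelimit instead of a 200ms measurement) shows that fixed point; the step-limiting filter is left out:

    /*
     * Toy model of balanced_dirty_ratelimit converging on write_bw / N.
     */
    #include <stdio.h>

    int main(void)
    {
        const double write_bw = 25000.0;   /* pages/s the device sustains */
        const int nr_tasks = 4;            /* N concurrent dirtiers */
        double rate = 1000.0;              /* initial dirty_ratelimit guess */
        double pos_ratio = 0.8;            /* held constant for the example */

        for (int i = 0; i < 4; i++) {
            double task_ratelimit = rate * pos_ratio;
            double dirty_rate = nr_tasks * task_ratelimit;

            /* rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio */
            rate = task_ratelimit * write_bw / dirty_rate;
            printf("step %d: rate = %.1f pages/s (target %.1f)\n",
                   i, rate, write_bw / nr_tasks);
        }
        return 0;
    }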
877 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
878 | unsigned long thresh, | ||
879 | unsigned long bg_thresh, | ||
880 | unsigned long dirty, | ||
881 | unsigned long bdi_thresh, | ||
882 | unsigned long bdi_dirty, | ||
883 | unsigned long start_time) | ||
884 | { | ||
885 | unsigned long now = jiffies; | ||
886 | unsigned long elapsed = now - bdi->bw_time_stamp; | ||
887 | unsigned long dirtied; | ||
888 | unsigned long written; | ||
889 | |||
890 | /* | ||
891 | * rate-limit, only update once every 200ms. | ||
892 | */ | ||
893 | if (elapsed < BANDWIDTH_INTERVAL) | ||
894 | return; | ||
895 | |||
896 | dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); | ||
897 | written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); | ||
898 | |||
899 | /* | ||
900 | * Skip quiet periods when disk bandwidth is under-utilized. | ||
901 | * (at least 1s idle time between two flusher runs) | ||
902 | */ | ||
903 | if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) | ||
904 | goto snapshot; | ||
905 | |||
906 | if (thresh) { | ||
907 | global_update_bandwidth(thresh, dirty, now); | ||
908 | bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, | ||
909 | bdi_thresh, bdi_dirty, | ||
910 | dirtied, elapsed); | ||
911 | } | ||
912 | bdi_update_write_bandwidth(bdi, elapsed, written); | ||
913 | |||
914 | snapshot: | ||
915 | bdi->dirtied_stamp = dirtied; | ||
916 | bdi->written_stamp = written; | ||
917 | bdi->bw_time_stamp = now; | ||
918 | } | ||
919 | |||
920 | static void bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
921 | unsigned long thresh, | ||
922 | unsigned long bg_thresh, | ||
923 | unsigned long dirty, | ||
924 | unsigned long bdi_thresh, | ||
925 | unsigned long bdi_dirty, | ||
926 | unsigned long start_time) | ||
927 | { | ||
928 | if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) | ||
929 | return; | ||
930 | spin_lock(&bdi->wb.list_lock); | ||
931 | __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, | ||
932 | bdi_thresh, bdi_dirty, start_time); | ||
933 | spin_unlock(&bdi->wb.list_lock); | ||
934 | } | ||
935 | |||
936 | /* | ||
937 | * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() | ||
938 | * will look to see if it needs to start dirty throttling. | ||
939 | * | ||
940 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive | ||
941 | * global_page_state() too often. So scale it near-sqrt to the safety margin | ||
942 | * (the number of pages we may dirty without exceeding the dirty limits). | ||
943 | */ | ||
944 | static unsigned long dirty_poll_interval(unsigned long dirty, | ||
945 | unsigned long thresh) | ||
946 | { | ||
947 | if (thresh > dirty) | ||
948 | return 1UL << (ilog2(thresh - dirty) >> 1); | ||
949 | |||
950 | return 1; | ||
951 | } | ||
952 | |||
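dirty_poll_interval() keeps the per-task poll interval near the square root of the remaining safety margin, so a task far below the limit dirties many pages between checks while one close to the limit checks almost every page. A couple of sample values, using a local ilog2 helper in place of the kernel's:

    /* Sample values of the near-sqrt poll interval; page counts are examples. */
    #include <stdio.h>

    static unsigned long ilog2_ul(unsigned long v)
    {
        unsigned long r = 0;

        while (v >>= 1)
            r++;
        return r;
    }

    static unsigned long dirty_poll_interval(unsigned long dirty,
                                             unsigned long thresh)
    {
        if (thresh > dirty)
            return 1UL << (ilog2_ul(thresh - dirty) >> 1);
        return 1;
    }

    int main(void)
    {
        unsigned long thresh = 1UL << 20;   /* ~4GB of dirtyable memory */

        printf("margin %lu pages -> recheck every %lu pages\n",
               thresh, dirty_poll_interval(0, thresh));        /* 1024 */
        printf("margin 256 pages -> recheck every %lu pages\n",
               dirty_poll_interval(thresh - 256, thresh));     /* 16 */
        printf("at the limit -> recheck every %lu page\n",
               dirty_poll_interval(thresh, thresh));           /* 1 */
        return 0;
    }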
953 | static unsigned long bdi_max_pause(struct backing_dev_info *bdi, | ||
954 | unsigned long bdi_dirty) | ||
955 | { | ||
956 | unsigned long bw = bdi->avg_write_bandwidth; | ||
957 | unsigned long hi = ilog2(bw); | ||
958 | unsigned long lo = ilog2(bdi->dirty_ratelimit); | ||
959 | unsigned long t; | ||
960 | |||
961 | /* target for 20ms max pause on 1-dd case */ | ||
962 | t = HZ / 50; | ||
963 | |||
964 | /* | ||
965 | * Scale up pause time for concurrent dirtiers in order to reduce CPU | ||
966 | * overheads. | ||
967 | * | ||
968 | * (N * 20ms) on 2^N concurrent tasks. | ||
969 | */ | ||
970 | if (hi > lo) | ||
971 | t += (hi - lo) * (20 * HZ) / 1024; | ||
972 | |||
973 | /* | ||
974 | * Limit pause time for small memory systems. If sleeping for too long | ||
975 | * time, a small pool of dirty/writeback pages may go empty and disk go | ||
976 | * idle. | ||
977 | * | ||
978 | * 8 serves as the safety ratio. | ||
979 | */ | ||
980 | if (bdi_dirty) | ||
981 | t = min(t, bdi_dirty * HZ / (8 * bw + 1)); | ||
982 | |||
983 | /* | ||
984 | * The pause time will settle within the range (max_pause/4, max_pause). | ||
985 | * Apply a minimal value of 4 to get a non-zero max_pause/4. | ||
986 | */ | ||
987 | return clamp_val(t, 4, MAX_PAUSE); | ||
988 | } | ||
989 | |||
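bdi_max_pause() starts from 20ms, adds 20ms per doubling of the estimated number of concurrent dirtiers (inferred from how far dirty_ratelimit has dropped below the device bandwidth), and caps the result so a small dirty pool cannot drain while everyone sleeps. A standalone sketch with assumed HZ and sample numbers:

    /*
     * Model of the max-pause heuristic; hi - lo compares device bandwidth
     * with the base ratelimit to estimate how many dirtiers share the bdi
     * (roughly 2^(hi - lo) of them).
     */
    #include <stdio.h>

    #define HZ        100
    #define MAX_PAUSE (HZ / 5)   /* simplified max(HZ/5, 1) from the hunk above */

    static unsigned long ilog2_ul(unsigned long v)
    {
        unsigned long r = 0;

        while (v >>= 1)
            r++;
        return r;
    }

    static unsigned long max_pause(unsigned long avg_write_bw,
                                   unsigned long dirty_ratelimit,
                                   unsigned long bdi_dirty)
    {
        unsigned long hi = ilog2_ul(avg_write_bw);
        unsigned long lo = ilog2_ul(dirty_ratelimit);
        unsigned long t = HZ / 50;           /* 20ms target for the 1-dd case */

        if (hi > lo)                         /* +20ms per doubling of dirtiers */
            t += (hi - lo) * (20 * HZ) / 1024;

        if (bdi_dirty) {                     /* don't let a small pool drain dry */
            unsigned long cap = bdi_dirty * HZ / (8 * avg_write_bw + 1);

            if (t > cap)
                t = cap;
        }
        return t < 4 ? 4 : (t > MAX_PAUSE ? MAX_PAUSE : t);
    }

    int main(void)
    {
        /* ~100MB/s device (25600 4K pages/s), 8 dirtiers, a large dirty pool */
        printf("max pause = %lu jiffies\n", max_pause(25600, 25600 / 8, 100000));
        return 0;
    }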
990 | /* | ||
472 | * balance_dirty_pages() must be called by processes which are generating dirty | 991 | * balance_dirty_pages() must be called by processes which are generating dirty |
473 | * data. It looks at the number of dirty pages in the machine and will force | 992 | * data. It looks at the number of dirty pages in the machine and will force |
474 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. | 993 | * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. |
475 | * If we're over `background_thresh' then the writeback threads are woken to | 994 | * If we're over `background_thresh' then the writeback threads are woken to |
476 | * perform some writeout. | 995 | * perform some writeout. |
477 | */ | 996 | */ |
478 | static void balance_dirty_pages(struct address_space *mapping, | 997 | static void balance_dirty_pages(struct address_space *mapping, |
479 | unsigned long write_chunk) | 998 | unsigned long pages_dirtied) |
480 | { | 999 | { |
481 | long nr_reclaimable, bdi_nr_reclaimable; | 1000 | unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ |
482 | long nr_writeback, bdi_nr_writeback; | 1001 | unsigned long bdi_reclaimable; |
1002 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ | ||
1003 | unsigned long bdi_dirty; | ||
1004 | unsigned long freerun; | ||
483 | unsigned long background_thresh; | 1005 | unsigned long background_thresh; |
484 | unsigned long dirty_thresh; | 1006 | unsigned long dirty_thresh; |
485 | unsigned long bdi_thresh; | 1007 | unsigned long bdi_thresh; |
486 | unsigned long pages_written = 0; | 1008 | long pause = 0; |
487 | unsigned long pause = 1; | 1009 | long uninitialized_var(max_pause); |
488 | bool dirty_exceeded = false; | 1010 | bool dirty_exceeded = false; |
1011 | unsigned long task_ratelimit; | ||
1012 | unsigned long uninitialized_var(dirty_ratelimit); | ||
1013 | unsigned long pos_ratio; | ||
489 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1014 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1015 | unsigned long start_time = jiffies; | ||
490 | 1016 | ||
491 | for (;;) { | 1017 | for (;;) { |
492 | struct writeback_control wbc = { | 1018 | /* |
493 | .sync_mode = WB_SYNC_NONE, | 1019 | * Unstable writes are a feature of certain networked |
494 | .older_than_this = NULL, | 1020 | * filesystems (i.e. NFS) in which data may have been |
495 | .nr_to_write = write_chunk, | 1021 | * written to the server's write cache, but has not yet |
496 | .range_cyclic = 1, | 1022 | * been flushed to permanent storage. |
497 | }; | 1023 | */ |
498 | |||
499 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 1024 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
500 | global_page_state(NR_UNSTABLE_NFS); | 1025 | global_page_state(NR_UNSTABLE_NFS); |
501 | nr_writeback = global_page_state(NR_WRITEBACK); | 1026 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); |
502 | 1027 | ||
503 | global_dirty_limits(&background_thresh, &dirty_thresh); | 1028 | global_dirty_limits(&background_thresh, &dirty_thresh); |
504 | 1029 | ||
@@ -507,12 +1032,28 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
507 | * catch-up. This avoids (excessively) small writeouts | 1032 | * catch-up. This avoids (excessively) small writeouts |
508 | * when the bdi limits are ramping up. | 1033 | * when the bdi limits are ramping up. |
509 | */ | 1034 | */ |
510 | if (nr_reclaimable + nr_writeback <= | 1035 | freerun = dirty_freerun_ceiling(dirty_thresh, |
511 | (background_thresh + dirty_thresh) / 2) | 1036 | background_thresh); |
1037 | if (nr_dirty <= freerun) | ||
512 | break; | 1038 | break; |
513 | 1039 | ||
1040 | if (unlikely(!writeback_in_progress(bdi))) | ||
1041 | bdi_start_background_writeback(bdi); | ||
1042 | |||
1043 | /* | ||
1044 | * bdi_thresh is not treated as some limiting factor as | ||
1045 | * dirty_thresh, due to reasons | ||
1046 | * - in JBOD setup, bdi_thresh can fluctuate a lot | ||
1047 | * - in a system with HDD and USB key, the USB key may somehow | ||
1048 | * go into state (bdi_dirty >> bdi_thresh) either because | ||
1049 | * bdi_dirty starts high, or because bdi_thresh drops low. | ||
1050 | * In this case we don't want to hard throttle the USB key | ||
1051 | * dirtiers for 100 seconds until bdi_dirty drops under | ||
1052 | * bdi_thresh. Instead the auxiliary bdi control line in | ||
1053 | * bdi_position_ratio() will let the dirtier task progress | ||
1054 | * at some rate <= (write_bw / 2) for bringing down bdi_dirty. | ||
1055 | */ | ||
514 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 1056 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
515 | bdi_thresh = task_dirty_limit(current, bdi_thresh); | ||
516 | 1057 | ||
517 | /* | 1058 | /* |
518 | * In order to avoid the stacked BDI deadlock we need | 1059 | * In order to avoid the stacked BDI deadlock we need |
@@ -524,63 +1065,101 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
524 | * actually dirty; with m+n sitting in the percpu | 1065 | * actually dirty; with m+n sitting in the percpu |
525 | * deltas. | 1066 | * deltas. |
526 | */ | 1067 | */ |
527 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | 1068 | if (bdi_thresh < 2 * bdi_stat_error(bdi)) { |
528 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | 1069 | bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
529 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | 1070 | bdi_dirty = bdi_reclaimable + |
1071 | bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
530 | } else { | 1072 | } else { |
531 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 1073 | bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
532 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | 1074 | bdi_dirty = bdi_reclaimable + |
1075 | bdi_stat(bdi, BDI_WRITEBACK); | ||
533 | } | 1076 | } |
534 | 1077 | ||
535 | /* | 1078 | dirty_exceeded = (bdi_dirty > bdi_thresh) || |
536 | * The bdi thresh is somehow "soft" limit derived from the | 1079 | (nr_dirty > dirty_thresh); |
537 | * global "hard" limit. The former helps to prevent heavy IO | 1080 | if (dirty_exceeded && !bdi->dirty_exceeded) |
538 | * bdi or process from holding back light ones; The latter is | ||
539 | * the last resort safeguard. | ||
540 | */ | ||
541 | dirty_exceeded = | ||
542 | (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) | ||
543 | || (nr_reclaimable + nr_writeback > dirty_thresh); | ||
544 | |||
545 | if (!dirty_exceeded) | ||
546 | break; | ||
547 | |||
548 | if (!bdi->dirty_exceeded) | ||
549 | bdi->dirty_exceeded = 1; | 1081 | bdi->dirty_exceeded = 1; |
550 | 1082 | ||
551 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 1083 | bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, |
552 | * Unstable writes are a feature of certain networked | 1084 | nr_dirty, bdi_thresh, bdi_dirty, |
553 | * filesystems (i.e. NFS) in which data may have been | 1085 | start_time); |
554 | * written to the server's write cache, but has not yet | 1086 | |
555 | * been flushed to permanent storage. | 1087 | max_pause = bdi_max_pause(bdi, bdi_dirty); |
556 | * Only move pages to writeback if this bdi is over its | 1088 | |
557 | * threshold otherwise wait until the disk writes catch | 1089 | dirty_ratelimit = bdi->dirty_ratelimit; |
558 | * up. | 1090 | pos_ratio = bdi_position_ratio(bdi, dirty_thresh, |
559 | */ | 1091 | background_thresh, nr_dirty, |
560 | trace_wbc_balance_dirty_start(&wbc, bdi); | 1092 | bdi_thresh, bdi_dirty); |
561 | if (bdi_nr_reclaimable > bdi_thresh) { | 1093 | task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> |
562 | writeback_inodes_wb(&bdi->wb, &wbc); | 1094 | RATELIMIT_CALC_SHIFT; |
563 | pages_written += write_chunk - wbc.nr_to_write; | 1095 | if (unlikely(task_ratelimit == 0)) { |
564 | trace_wbc_balance_dirty_written(&wbc, bdi); | 1096 | pause = max_pause; |
565 | if (pages_written >= write_chunk) | 1097 | goto pause; |
566 | break; /* We've done our duty */ | ||
567 | } | 1098 | } |
568 | trace_wbc_balance_dirty_wait(&wbc, bdi); | 1099 | pause = HZ * pages_dirtied / task_ratelimit; |
569 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1100 | if (unlikely(pause <= 0)) { |
1101 | trace_balance_dirty_pages(bdi, | ||
1102 | dirty_thresh, | ||
1103 | background_thresh, | ||
1104 | nr_dirty, | ||
1105 | bdi_thresh, | ||
1106 | bdi_dirty, | ||
1107 | dirty_ratelimit, | ||
1108 | task_ratelimit, | ||
1109 | pages_dirtied, | ||
1110 | pause, | ||
1111 | start_time); | ||
1112 | pause = 1; /* avoid resetting nr_dirtied_pause below */ | ||
1113 | break; | ||
1114 | } | ||
1115 | pause = min(pause, max_pause); | ||
1116 | |||
1117 | pause: | ||
1118 | trace_balance_dirty_pages(bdi, | ||
1119 | dirty_thresh, | ||
1120 | background_thresh, | ||
1121 | nr_dirty, | ||
1122 | bdi_thresh, | ||
1123 | bdi_dirty, | ||
1124 | dirty_ratelimit, | ||
1125 | task_ratelimit, | ||
1126 | pages_dirtied, | ||
1127 | pause, | ||
1128 | start_time); | ||
1129 | __set_current_state(TASK_KILLABLE); | ||
570 | io_schedule_timeout(pause); | 1130 | io_schedule_timeout(pause); |
571 | 1131 | ||
572 | /* | 1132 | /* |
573 | * Increase the delay for each loop, up to our previous | 1133 | * This is typically equal to (nr_dirty < dirty_thresh) and can |
574 | * default of taking a 100ms nap. | 1134 | * also keep "1000+ dd on a slow USB stick" under control. |
575 | */ | 1135 | */ |
576 | pause <<= 1; | 1136 | if (task_ratelimit) |
577 | if (pause > HZ / 10) | 1137 | break; |
578 | pause = HZ / 10; | 1138 | |
1139 | if (fatal_signal_pending(current)) | ||
1140 | break; | ||
579 | } | 1141 | } |
580 | 1142 | ||
581 | if (!dirty_exceeded && bdi->dirty_exceeded) | 1143 | if (!dirty_exceeded && bdi->dirty_exceeded) |
582 | bdi->dirty_exceeded = 0; | 1144 | bdi->dirty_exceeded = 0; |
583 | 1145 | ||
1146 | current->nr_dirtied = 0; | ||
1147 | if (pause == 0) { /* in freerun area */ | ||
1148 | current->nr_dirtied_pause = | ||
1149 | dirty_poll_interval(nr_dirty, dirty_thresh); | ||
1150 | } else if (pause <= max_pause / 4 && | ||
1151 | pages_dirtied >= current->nr_dirtied_pause) { | ||
1152 | current->nr_dirtied_pause = clamp_val( | ||
1153 | dirty_ratelimit * (max_pause / 2) / HZ, | ||
1154 | pages_dirtied + pages_dirtied / 8, | ||
1155 | pages_dirtied * 4); | ||
1156 | } else if (pause >= max_pause) { | ||
1157 | current->nr_dirtied_pause = 1 | clamp_val( | ||
1158 | dirty_ratelimit * (max_pause / 2) / HZ, | ||
1159 | pages_dirtied / 4, | ||
1160 | pages_dirtied - pages_dirtied / 8); | ||
1161 | } | ||
1162 | |||
584 | if (writeback_in_progress(bdi)) | 1163 | if (writeback_in_progress(bdi)) |
585 | return; | 1164 | return; |
586 | 1165 | ||
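With the rework above, a dirtier no longer writes pages back itself; it simply sleeps for pause = HZ * pages_dirtied / task_ratelimit, where task_ratelimit is the bdi's base rate scaled by pos_ratio. A quick worked example with assumed values:

    /*
     * Worked example of the pause computation: how long a task that just
     * dirtied pages_dirtied pages sleeps so that it averages task_ratelimit
     * pages per second. All numbers are assumed.
     */
    #include <stdio.h>

    #define HZ                   100
    #define RATELIMIT_CALC_SHIFT 10

    int main(void)
    {
        unsigned long dirty_ratelimit = 6400;  /* bdi base rate, pages/s */
        unsigned long pos_ratio = 768;         /* 0.75 << RATELIMIT_CALC_SHIFT */
        unsigned long pages_dirtied = 96;      /* current->nr_dirtied_pause */

        unsigned long task_ratelimit =
            ((unsigned long long)dirty_ratelimit * pos_ratio) >>
            RATELIMIT_CALC_SHIFT;
        long pause = HZ * pages_dirtied / task_ratelimit;

        printf("task_ratelimit = %lu pages/s, pause = %ld jiffies (%ld ms)\n",
               task_ratelimit, pause, pause * 1000 / HZ);
        return 0;
    }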
@@ -592,8 +1171,10 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
592 | * In normal mode, we start background writeout at the lower | 1171 | * In normal mode, we start background writeout at the lower |
593 | * background_thresh, to keep the amount of dirty memory low. | 1172 | * background_thresh, to keep the amount of dirty memory low. |
594 | */ | 1173 | */ |
595 | if ((laptop_mode && pages_written) || | 1174 | if (laptop_mode) |
596 | (!laptop_mode && (nr_reclaimable > background_thresh))) | 1175 | return; |
1176 | |||
1177 | if (nr_reclaimable > background_thresh) | ||
597 | bdi_start_background_writeback(bdi); | 1178 | bdi_start_background_writeback(bdi); |
598 | } | 1179 | } |
599 | 1180 | ||
@@ -607,7 +1188,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) | |||
607 | } | 1188 | } |
608 | } | 1189 | } |
609 | 1190 | ||
610 | static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | 1191 | static DEFINE_PER_CPU(int, bdp_ratelimits); |
611 | 1192 | ||
612 | /** | 1193 | /** |
613 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 1194 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
@@ -626,28 +1207,40 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | |||
626 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 1207 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
627 | unsigned long nr_pages_dirtied) | 1208 | unsigned long nr_pages_dirtied) |
628 | { | 1209 | { |
629 | unsigned long ratelimit; | 1210 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
630 | unsigned long *p; | 1211 | int ratelimit; |
1212 | int *p; | ||
1213 | |||
1214 | if (!bdi_cap_account_dirty(bdi)) | ||
1215 | return; | ||
631 | 1216 | ||
632 | ratelimit = ratelimit_pages; | 1217 | ratelimit = current->nr_dirtied_pause; |
633 | if (mapping->backing_dev_info->dirty_exceeded) | 1218 | if (bdi->dirty_exceeded) |
634 | ratelimit = 8; | 1219 | ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); |
635 | 1220 | ||
1221 | current->nr_dirtied += nr_pages_dirtied; | ||
1222 | |||
1223 | preempt_disable(); | ||
636 | /* | 1224 | /* |
637 | * Check the rate limiting. Also, we do not want to throttle real-time | 1225 | * This prevents one CPU from accumulating too many dirtied pages without |
638 | * tasks in balance_dirty_pages(). Period. | 1226 | * calling into balance_dirty_pages(), which can happen when there are |
1227 | * 1000+ tasks, all of which start dirtying pages at exactly the same | ||
1228 | * time, hence all honoured a too-large initial task->nr_dirtied_pause. | ||
639 | */ | 1229 | */ |
640 | preempt_disable(); | ||
641 | p = &__get_cpu_var(bdp_ratelimits); | 1230 | p = &__get_cpu_var(bdp_ratelimits); |
642 | *p += nr_pages_dirtied; | 1231 | if (unlikely(current->nr_dirtied >= ratelimit)) |
643 | if (unlikely(*p >= ratelimit)) { | ||
644 | ratelimit = sync_writeback_pages(*p); | ||
645 | *p = 0; | 1232 | *p = 0; |
646 | preempt_enable(); | 1233 | else { |
647 | balance_dirty_pages(mapping, ratelimit); | 1234 | *p += nr_pages_dirtied; |
648 | return; | 1235 | if (unlikely(*p >= ratelimit_pages)) { |
1236 | *p = 0; | ||
1237 | ratelimit = 0; | ||
1238 | } | ||
649 | } | 1239 | } |
650 | preempt_enable(); | 1240 | preempt_enable(); |
1241 | |||
1242 | if (unlikely(current->nr_dirtied >= ratelimit)) | ||
1243 | balance_dirty_pages(mapping, current->nr_dirtied); | ||
651 | } | 1244 | } |
652 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | 1245 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); |
653 | 1246 | ||
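After this change, balance_dirty_pages_ratelimited_nr() budgets dirtied pages per task (current->nr_dirtied against nr_dirtied_pause) and keeps the per-CPU counter only as a backstop for the 1000+-tasks case. A single-threaded sketch of that decision logic, with the limits as assumed constants and one variable standing in for this CPU's bdp_ratelimits slot:

    /*
     * Sketch of the throttle-entry decision: per-task budget first, per-CPU
     * counter as a backstop. Single-threaded, so "per-CPU" is one variable.
     */
    #include <stdbool.h>
    #include <stdio.h>

    static unsigned long nr_dirtied;              /* current->nr_dirtied */
    static unsigned long nr_dirtied_pause = 32;   /* current->nr_dirtied_pause */
    static unsigned long cpu_dirtied;             /* this CPU's bdp_ratelimits */
    static const unsigned long ratelimit_pages = 1024;

    static bool should_balance(unsigned long pages, bool dirty_exceeded)
    {
        unsigned long ratelimit = nr_dirtied_pause;

        if (dirty_exceeded)            /* poll much more often when over limit */
            ratelimit = ratelimit < 8 ? ratelimit : 8;

        nr_dirtied += pages;

        if (nr_dirtied >= ratelimit) {
            cpu_dirtied = 0;
        } else {
            cpu_dirtied += pages;
            if (cpu_dirtied >= ratelimit_pages) {
                cpu_dirtied = 0;
                ratelimit = 0;         /* force a balance call */
            }
        }
        return nr_dirtied >= ratelimit;
    }

    int main(void)
    {
        for (int i = 1; i <= 100; i++) {
            if (should_balance(1, false)) {
                printf("balance_dirty_pages() after page %d\n", i);
                nr_dirtied = 0;        /* balance_dirty_pages() resets it */
            }
        }
        return 0;
    }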
@@ -703,7 +1296,8 @@ void laptop_mode_timer_fn(unsigned long data) | |||
703 | * threshold | 1296 | * threshold |
704 | */ | 1297 | */ |
705 | if (bdi_has_dirty_io(&q->backing_dev_info)) | 1298 | if (bdi_has_dirty_io(&q->backing_dev_info)) |
706 | bdi_start_writeback(&q->backing_dev_info, nr_pages); | 1299 | bdi_start_writeback(&q->backing_dev_info, nr_pages, |
1300 | WB_REASON_LAPTOP_TIMER); | ||
707 | } | 1301 | } |
708 | 1302 | ||
709 | /* | 1303 | /* |
@@ -742,22 +1336,17 @@ void laptop_sync_completion(void) | |||
742 | * | 1336 | * |
743 | * Here we set ratelimit_pages to a level which ensures that when all CPUs are | 1337 | * Here we set ratelimit_pages to a level which ensures that when all CPUs are |
744 | * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory | 1338 | * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory |
745 | * thresholds before writeback cuts in. | 1339 | * thresholds. |
746 | * | ||
747 | * But the limit should not be set too high. Because it also controls the | ||
748 | * amount of memory which the balance_dirty_pages() caller has to write back. | ||
749 | * If this is too large then the caller will block on the IO queue all the | ||
750 | * time. So limit it to four megabytes - the balance_dirty_pages() caller | ||
751 | * will write six megabyte chunks, max. | ||
752 | */ | 1340 | */ |
753 | 1341 | ||
754 | void writeback_set_ratelimit(void) | 1342 | void writeback_set_ratelimit(void) |
755 | { | 1343 | { |
756 | ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); | 1344 | unsigned long background_thresh; |
1345 | unsigned long dirty_thresh; | ||
1346 | global_dirty_limits(&background_thresh, &dirty_thresh); | ||
1347 | ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); | ||
757 | if (ratelimit_pages < 16) | 1348 | if (ratelimit_pages < 16) |
758 | ratelimit_pages = 16; | 1349 | ratelimit_pages = 16; |
759 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) | ||
760 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; | ||
761 | } | 1350 | } |
762 | 1351 | ||
763 | static int __cpuinit | 1352 | static int __cpuinit |
@@ -799,7 +1388,6 @@ void __init page_writeback_init(void) | |||
799 | 1388 | ||
800 | shift = calc_period_shift(); | 1389 | shift = calc_period_shift(); |
801 | prop_descriptor_init(&vm_completions, shift); | 1390 | prop_descriptor_init(&vm_completions, shift); |
802 | prop_descriptor_init(&vm_dirties, shift); | ||
803 | } | 1391 | } |
804 | 1392 | ||
805 | /** | 1393 | /** |
@@ -892,12 +1480,12 @@ int write_cache_pages(struct address_space *mapping, | |||
892 | range_whole = 1; | 1480 | range_whole = 1; |
893 | cycled = 1; /* ignore range_cyclic tests */ | 1481 | cycled = 1; /* ignore range_cyclic tests */ |
894 | } | 1482 | } |
895 | if (wbc->sync_mode == WB_SYNC_ALL) | 1483 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
896 | tag = PAGECACHE_TAG_TOWRITE; | 1484 | tag = PAGECACHE_TAG_TOWRITE; |
897 | else | 1485 | else |
898 | tag = PAGECACHE_TAG_DIRTY; | 1486 | tag = PAGECACHE_TAG_DIRTY; |
899 | retry: | 1487 | retry: |
900 | if (wbc->sync_mode == WB_SYNC_ALL) | 1488 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
901 | tag_pages_for_writeback(mapping, index, end); | 1489 | tag_pages_for_writeback(mapping, index, end); |
902 | done_index = index; | 1490 | done_index = index; |
903 | while (!done && (index <= end)) { | 1491 | while (!done && (index <= end)) { |
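Both WB_SYNC_ALL and the new tagged_writepages mode snapshot the currently dirty pages under the TOWRITE tag before writing, so pages dirtied while the walk is in flight are left for the next pass rather than chased indefinitely. A toy userspace model of that livelock-avoidance idea follows; plain arrays stand in for the radix-tree tags and the page numbers are arbitrary:

#include <stdio.h>
#include <string.h>

#define NPAGES 8

static int dirty[NPAGES];
static int towrite[NPAGES];

int main(void)
{
	int i;

	dirty[1] = dirty[3] = dirty[4] = 1;

	/* tag_pages_for_writeback(): snapshot the dirty set */
	memcpy(towrite, dirty, sizeof(towrite));

	for (i = 0; i < NPAGES; i++) {
		if (!towrite[i])
			continue;
		if (i == 3)
			dirty[6] = 1;	/* dirtied mid-walk: not chased */
		printf("writing page %d\n", i);
		dirty[i] = towrite[i] = 0;
	}

	printf("page 6 left dirty for the next pass: %d\n", dirty[6]);
	return 0;
}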
@@ -1127,7 +1715,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
1127 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 1715 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
1128 | __inc_zone_page_state(page, NR_DIRTIED); | 1716 | __inc_zone_page_state(page, NR_DIRTIED); |
1129 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 1717 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); |
1130 | task_dirty_inc(current); | 1718 | __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); |
1131 | task_io_account_write(PAGE_CACHE_SIZE); | 1719 | task_io_account_write(PAGE_CACHE_SIZE); |
1132 | } | 1720 | } |
1133 | } | 1721 | } |
@@ -1141,7 +1729,6 @@ EXPORT_SYMBOL(account_page_dirtied); | |||
1141 | void account_page_writeback(struct page *page) | 1729 | void account_page_writeback(struct page *page) |
1142 | { | 1730 | { |
1143 | inc_zone_page_state(page, NR_WRITEBACK); | 1731 | inc_zone_page_state(page, NR_WRITEBACK); |
1144 | inc_zone_page_state(page, NR_WRITTEN); | ||
1145 | } | 1732 | } |
1146 | EXPORT_SYMBOL(account_page_writeback); | 1733 | EXPORT_SYMBOL(account_page_writeback); |
1147 | 1734 | ||
@@ -1358,8 +1945,10 @@ int test_clear_page_writeback(struct page *page) | |||
1358 | } else { | 1945 | } else { |
1359 | ret = TestClearPageWriteback(page); | 1946 | ret = TestClearPageWriteback(page); |
1360 | } | 1947 | } |
1361 | if (ret) | 1948 | if (ret) { |
1362 | dec_zone_page_state(page, NR_WRITEBACK); | 1949 | dec_zone_page_state(page, NR_WRITEBACK); |
1950 | inc_zone_page_state(page, NR_WRITTEN); | ||
1951 | } | ||
1363 | return ret; | 1952 | return ret; |
1364 | } | 1953 | } |
1365 | 1954 | ||
@@ -1405,10 +1994,6 @@ EXPORT_SYMBOL(test_set_page_writeback); | |||
1405 | */ | 1994 | */ |
1406 | int mapping_tagged(struct address_space *mapping, int tag) | 1995 | int mapping_tagged(struct address_space *mapping, int tag) |
1407 | { | 1996 | { |
1408 | int ret; | 1997 | return radix_tree_tagged(&mapping->page_tree, tag); |
1409 | rcu_read_lock(); | ||
1410 | ret = radix_tree_tagged(&mapping->page_tree, tag); | ||
1411 | rcu_read_unlock(); | ||
1412 | return ret; | ||
1413 | } | 1998 | } |
1414 | EXPORT_SYMBOL(mapping_tagged); | 1999 | EXPORT_SYMBOL(mapping_tagged); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c7ea45ffba9..6ce27331834c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -321,6 +321,7 @@ static void bad_page(struct page *page) | |||
321 | current->comm, page_to_pfn(page)); | 321 | current->comm, page_to_pfn(page)); |
322 | dump_page(page); | 322 | dump_page(page); |
323 | 323 | ||
324 | print_modules(); | ||
324 | dump_stack(); | 325 | dump_stack(); |
325 | out: | 326 | out: |
326 | /* Leave bad fields for debug, except PageBuddy could make trouble */ | 327 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
@@ -1373,21 +1374,12 @@ failed: | |||
1373 | 1374 | ||
1374 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1375 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1375 | 1376 | ||
1376 | static struct fail_page_alloc_attr { | 1377 | static struct { |
1377 | struct fault_attr attr; | 1378 | struct fault_attr attr; |
1378 | 1379 | ||
1379 | u32 ignore_gfp_highmem; | 1380 | u32 ignore_gfp_highmem; |
1380 | u32 ignore_gfp_wait; | 1381 | u32 ignore_gfp_wait; |
1381 | u32 min_order; | 1382 | u32 min_order; |
1382 | |||
1383 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
1384 | |||
1385 | struct dentry *ignore_gfp_highmem_file; | ||
1386 | struct dentry *ignore_gfp_wait_file; | ||
1387 | struct dentry *min_order_file; | ||
1388 | |||
1389 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
1390 | |||
1391 | } fail_page_alloc = { | 1383 | } fail_page_alloc = { |
1392 | .attr = FAULT_ATTR_INITIALIZER, | 1384 | .attr = FAULT_ATTR_INITIALIZER, |
1393 | .ignore_gfp_wait = 1, | 1385 | .ignore_gfp_wait = 1, |
@@ -1421,36 +1413,27 @@ static int __init fail_page_alloc_debugfs(void) | |||
1421 | { | 1413 | { |
1422 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 1414 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
1423 | struct dentry *dir; | 1415 | struct dentry *dir; |
1424 | int err; | ||
1425 | |||
1426 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | ||
1427 | "fail_page_alloc"); | ||
1428 | if (err) | ||
1429 | return err; | ||
1430 | dir = fail_page_alloc.attr.dentries.dir; | ||
1431 | |||
1432 | fail_page_alloc.ignore_gfp_wait_file = | ||
1433 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
1434 | &fail_page_alloc.ignore_gfp_wait); | ||
1435 | |||
1436 | fail_page_alloc.ignore_gfp_highmem_file = | ||
1437 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
1438 | &fail_page_alloc.ignore_gfp_highmem); | ||
1439 | fail_page_alloc.min_order_file = | ||
1440 | debugfs_create_u32("min-order", mode, dir, | ||
1441 | &fail_page_alloc.min_order); | ||
1442 | |||
1443 | if (!fail_page_alloc.ignore_gfp_wait_file || | ||
1444 | !fail_page_alloc.ignore_gfp_highmem_file || | ||
1445 | !fail_page_alloc.min_order_file) { | ||
1446 | err = -ENOMEM; | ||
1447 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | ||
1448 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | ||
1449 | debugfs_remove(fail_page_alloc.min_order_file); | ||
1450 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | ||
1451 | } | ||
1452 | 1416 | ||
1453 | return err; | 1417 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, |
1418 | &fail_page_alloc.attr); | ||
1419 | if (IS_ERR(dir)) | ||
1420 | return PTR_ERR(dir); | ||
1421 | |||
1422 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
1423 | &fail_page_alloc.ignore_gfp_wait)) | ||
1424 | goto fail; | ||
1425 | if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
1426 | &fail_page_alloc.ignore_gfp_highmem)) | ||
1427 | goto fail; | ||
1428 | if (!debugfs_create_u32("min-order", mode, dir, | ||
1429 | &fail_page_alloc.min_order)) | ||
1430 | goto fail; | ||
1431 | |||
1432 | return 0; | ||
1433 | fail: | ||
1434 | debugfs_remove_recursive(dir); | ||
1435 | |||
1436 | return -ENOMEM; | ||
1454 | } | 1437 | } |
1455 | 1438 | ||
1456 | late_initcall(fail_page_alloc_debugfs); | 1439 | late_initcall(fail_page_alloc_debugfs); |
@@ -1619,6 +1602,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1619 | set_bit(i, zlc->fullzones); | 1602 | set_bit(i, zlc->fullzones); |
1620 | } | 1603 | } |
1621 | 1604 | ||
1605 | /* | ||
1606 | * clear all zones full, called after direct reclaim makes progress so that | ||
1607 | * a zone that was recently full is not skipped over for up to a second | ||
1608 | */ | ||
1609 | static void zlc_clear_zones_full(struct zonelist *zonelist) | ||
1610 | { | ||
1611 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1612 | |||
1613 | zlc = zonelist->zlcache_ptr; | ||
1614 | if (!zlc) | ||
1615 | return; | ||
1616 | |||
1617 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1618 | } | ||
1619 | |||
1622 | #else /* CONFIG_NUMA */ | 1620 | #else /* CONFIG_NUMA */ |
1623 | 1621 | ||
1624 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1622 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
@@ -1635,6 +1633,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | |||
1635 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1633 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1636 | { | 1634 | { |
1637 | } | 1635 | } |
1636 | |||
1637 | static void zlc_clear_zones_full(struct zonelist *zonelist) | ||
1638 | { | ||
1639 | } | ||
1638 | #endif /* CONFIG_NUMA */ | 1640 | #endif /* CONFIG_NUMA */ |
1639 | 1641 | ||
1640 | /* | 1642 | /* |
@@ -1667,7 +1669,7 @@ zonelist_scan: | |||
1667 | continue; | 1669 | continue; |
1668 | if ((alloc_flags & ALLOC_CPUSET) && | 1670 | if ((alloc_flags & ALLOC_CPUSET) && |
1669 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1671 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1670 | goto try_next_zone; | 1672 | continue; |
1671 | 1673 | ||
1672 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | 1674 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); |
1673 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1675 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
@@ -1679,17 +1681,36 @@ zonelist_scan: | |||
1679 | classzone_idx, alloc_flags)) | 1681 | classzone_idx, alloc_flags)) |
1680 | goto try_this_zone; | 1682 | goto try_this_zone; |
1681 | 1683 | ||
1684 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | ||
1685 | /* | ||
1686 | * we do zlc_setup if there are multiple nodes | ||
1687 | * and before considering the first zone allowed | ||
1688 | * by the cpuset. | ||
1689 | */ | ||
1690 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1691 | zlc_active = 1; | ||
1692 | did_zlc_setup = 1; | ||
1693 | } | ||
1694 | |||
1682 | if (zone_reclaim_mode == 0) | 1695 | if (zone_reclaim_mode == 0) |
1683 | goto this_zone_full; | 1696 | goto this_zone_full; |
1684 | 1697 | ||
1698 | /* | ||
1699 | * As we may have just activated ZLC, check if the first | ||
1700 | * eligible zone has failed zone_reclaim recently. | ||
1701 | */ | ||
1702 | if (NUMA_BUILD && zlc_active && | ||
1703 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | ||
1704 | continue; | ||
1705 | |||
1685 | ret = zone_reclaim(zone, gfp_mask, order); | 1706 | ret = zone_reclaim(zone, gfp_mask, order); |
1686 | switch (ret) { | 1707 | switch (ret) { |
1687 | case ZONE_RECLAIM_NOSCAN: | 1708 | case ZONE_RECLAIM_NOSCAN: |
1688 | /* did not scan */ | 1709 | /* did not scan */ |
1689 | goto try_next_zone; | 1710 | continue; |
1690 | case ZONE_RECLAIM_FULL: | 1711 | case ZONE_RECLAIM_FULL: |
1691 | /* scanned but unreclaimable */ | 1712 | /* scanned but unreclaimable */ |
1692 | goto this_zone_full; | 1713 | continue; |
1693 | default: | 1714 | default: |
1694 | /* did we reclaim enough */ | 1715 | /* did we reclaim enough */ |
1695 | if (!zone_watermark_ok(zone, order, mark, | 1716 | if (!zone_watermark_ok(zone, order, mark, |
@@ -1706,16 +1727,6 @@ try_this_zone: | |||
1706 | this_zone_full: | 1727 | this_zone_full: |
1707 | if (NUMA_BUILD) | 1728 | if (NUMA_BUILD) |
1708 | zlc_mark_zone_full(zonelist, z); | 1729 | zlc_mark_zone_full(zonelist, z); |
1709 | try_next_zone: | ||
1710 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | ||
1711 | /* | ||
1712 | * we do zlc_setup after the first zone is tried but only | ||
1713 | * if there are multiple nodes make it worthwhile | ||
1714 | */ | ||
1715 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1716 | zlc_active = 1; | ||
1717 | did_zlc_setup = 1; | ||
1718 | } | ||
1719 | } | 1730 | } |
1720 | 1731 | ||
1721 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1732 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
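The hunks above move zlc_setup() ahead of the first zone_reclaim() decision and consult the zonelist cache before reclaiming, so zones recently found full are skipped; a later hunk clears the whole cache once direct reclaim makes progress. A toy model of that full-zone bitmap, with made-up zone indices and a pretend reclaim event:

#include <stdio.h>
#include <string.h>

#define MAX_ZONES 4

static unsigned char fullzones[MAX_ZONES];

static void scan_zonelist(const char *when)
{
	int z;

	printf("%s:\n", when);
	for (z = 0; z < MAX_ZONES; z++) {
		if (fullzones[z])
			printf("  zone %d skipped (cached as full)\n", z);
		else
			printf("  zone %d considered\n", z);
	}
}

int main(void)
{
	fullzones[0] = fullzones[1] = 1;	/* marked full on earlier failures */
	scan_zonelist("before reclaim");

	/* zlc_clear_zones_full(): direct reclaim made progress */
	memset(fullzones, 0, sizeof(fullzones));
	scan_zonelist("after reclaim");
	return 0;
}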
@@ -1746,7 +1757,6 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, | |||
1746 | 1757 | ||
1747 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | 1758 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) |
1748 | { | 1759 | { |
1749 | va_list args; | ||
1750 | unsigned int filter = SHOW_MEM_FILTER_NODES; | 1760 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
1751 | 1761 | ||
1752 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) | 1762 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) |
@@ -1765,14 +1775,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
1765 | filter &= ~SHOW_MEM_FILTER_NODES; | 1775 | filter &= ~SHOW_MEM_FILTER_NODES; |
1766 | 1776 | ||
1767 | if (fmt) { | 1777 | if (fmt) { |
1768 | printk(KERN_WARNING); | 1778 | struct va_format vaf; |
1779 | va_list args; | ||
1780 | |||
1769 | va_start(args, fmt); | 1781 | va_start(args, fmt); |
1770 | vprintk(fmt, args); | 1782 | |
1783 | vaf.fmt = fmt; | ||
1784 | vaf.va = &args; | ||
1785 | |||
1786 | pr_warn("%pV", &vaf); | ||
1787 | |||
1771 | va_end(args); | 1788 | va_end(args); |
1772 | } | 1789 | } |
1773 | 1790 | ||
1774 | pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", | 1791 | pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", |
1775 | current->comm, order, gfp_mask); | 1792 | current->comm, order, gfp_mask); |
1776 | 1793 | ||
1777 | dump_stack(); | 1794 | dump_stack(); |
1778 | if (!should_suppress_show_mem()) | 1795 | if (!should_suppress_show_mem()) |
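warn_alloc_failed() now wraps the caller's format string and va_list in a struct va_format and emits it with a single pr_warn("%pV", ...), so the message carries one log level and prefix instead of a bare KERN_WARNING followed by vprintk(). %pV itself is a kernel-only vsnprintf extension; the userspace analogue below just forwards the va_list through one printing call to show the same idea:

#include <stdarg.h>
#include <stdio.h>

static void warn(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	fputs("WARNING: ", stderr);	/* prefix and body stay together */
	vfprintf(stderr, fmt, args);
	va_end(args);
}

int main(void)
{
	warn("order:%d, mode:0x%x\n", 2, 0x20);
	return 0;
}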
@@ -1957,6 +1974,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
1957 | if (unlikely(!(*did_some_progress))) | 1974 | if (unlikely(!(*did_some_progress))) |
1958 | return NULL; | 1975 | return NULL; |
1959 | 1976 | ||
1977 | /* After successful reclaim, reconsider all zones for allocation */ | ||
1978 | if (NUMA_BUILD) | ||
1979 | zlc_clear_zones_full(zonelist); | ||
1980 | |||
1960 | retry: | 1981 | retry: |
1961 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1982 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
1962 | zonelist, high_zoneidx, | 1983 | zonelist, high_zoneidx, |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 53bffc6c293e..2d123f94a8df 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc) | |||
133 | static void *__meminit alloc_page_cgroup(size_t size, int nid) | 133 | static void *__meminit alloc_page_cgroup(size_t size, int nid) |
134 | { | 134 | { |
135 | void *addr = NULL; | 135 | void *addr = NULL; |
136 | gfp_t flags = GFP_KERNEL | __GFP_NOWARN; | ||
136 | 137 | ||
137 | addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); | 138 | addr = alloc_pages_exact_nid(nid, size, flags); |
138 | if (addr) | 139 | if (addr) { |
140 | kmemleak_alloc(addr, size, 1, flags); | ||
139 | return addr; | 141 | return addr; |
142 | } | ||
140 | 143 | ||
141 | if (node_state(nid, N_HIGH_MEMORY)) | 144 | if (node_state(nid, N_HIGH_MEMORY)) |
142 | addr = vmalloc_node(size, nid); | 145 | addr = vmalloc_node(size, nid); |
@@ -225,8 +228,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, | |||
225 | unsigned long start, end, pfn; | 228 | unsigned long start, end, pfn; |
226 | int fail = 0; | 229 | int fail = 0; |
227 | 230 | ||
228 | start = start_pfn & ~(PAGES_PER_SECTION - 1); | 231 | start = SECTION_ALIGN_DOWN(start_pfn); |
229 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | 232 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); |
230 | 233 | ||
231 | if (nid == -1) { | 234 | if (nid == -1) { |
232 | /* | 235 | /* |
@@ -258,8 +261,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn, | |||
258 | { | 261 | { |
259 | unsigned long start, end, pfn; | 262 | unsigned long start, end, pfn; |
260 | 263 | ||
261 | start = start_pfn & ~(PAGES_PER_SECTION - 1); | 264 | start = SECTION_ALIGN_DOWN(start_pfn); |
262 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | 265 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); |
263 | 266 | ||
264 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | 267 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) |
265 | __free_page_cgroup(pfn); | 268 | __free_page_cgroup(pfn); |
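SECTION_ALIGN_DOWN()/SECTION_ALIGN_UP() replace the open-coded mask and ALIGN() arithmetic for rounding a pfn to a memory-section boundary. A sketch of the equivalent arithmetic, assuming PAGES_PER_SECTION is a power of two; the value below is only an example, and the kernel's own macros live in mmzone.h and may be spelled differently:

#include <stdio.h>

#define PAGES_PER_SECTION	32768UL		/* example value */
#define SECTION_ALIGN_DOWN(pfn)	((pfn) & ~(PAGES_PER_SECTION - 1))
#define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & \
				 ~(PAGES_PER_SECTION - 1))

int main(void)
{
	unsigned long start_pfn = 40000, nr_pages = 100000;

	printf("start = %lu\n", SECTION_ALIGN_DOWN(start_pfn));		/* 32768 */
	printf("end   = %lu\n", SECTION_ALIGN_UP(start_pfn + nr_pages));	/* 163840 */
	return 0;
}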
@@ -357,7 +360,7 @@ struct swap_cgroup_ctrl { | |||
357 | spinlock_t lock; | 360 | spinlock_t lock; |
358 | }; | 361 | }; |
359 | 362 | ||
360 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | 363 | static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; |
361 | 364 | ||
362 | struct swap_cgroup { | 365 | struct swap_cgroup { |
363 | unsigned short id; | 366 | unsigned short id; |
@@ -513,11 +516,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
513 | length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); | 516 | length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); |
514 | array_size = length * sizeof(void *); | 517 | array_size = length * sizeof(void *); |
515 | 518 | ||
516 | array = vmalloc(array_size); | 519 | array = vzalloc(array_size); |
517 | if (!array) | 520 | if (!array) |
518 | goto nomem; | 521 | goto nomem; |
519 | 522 | ||
520 | memset(array, 0, array_size); | ||
521 | ctrl = &swap_cgroup_ctrl[type]; | 523 | ctrl = &swap_cgroup_ctrl[type]; |
522 | mutex_lock(&swap_cgroup_mutex); | 524 | mutex_lock(&swap_cgroup_mutex); |
523 | ctrl->length = length; | 525 | ctrl->length = length; |
@@ -537,7 +539,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
537 | nomem: | 539 | nomem: |
538 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | 540 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); |
539 | printk(KERN_INFO | 541 | printk(KERN_INFO |
540 | "swap_cgroup can be disabled by noswapaccount boot option\n"); | 542 | "swap_cgroup can be disabled by swapaccount=0 boot option\n"); |
541 | return -ENOMEM; | 543 | return -ENOMEM; |
542 | } | 544 | } |
543 | 545 | ||
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3450d533611..2f5cf10ff660 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
126 | 126 | ||
127 | return 0; | 127 | return 0; |
128 | } | 128 | } |
129 | #endif | 129 | |
130 | static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) | ||
131 | { | ||
132 | struct vm_area_struct *vma; | ||
133 | |||
134 | /* We don't need vma lookup at all. */ | ||
135 | if (!walk->hugetlb_entry) | ||
136 | return NULL; | ||
137 | |||
138 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | ||
139 | vma = find_vma(walk->mm, addr); | ||
140 | if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) | ||
141 | return vma; | ||
142 | |||
143 | return NULL; | ||
144 | } | ||
145 | |||
146 | #else /* CONFIG_HUGETLB_PAGE */ | ||
147 | static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) | ||
148 | { | ||
149 | return NULL; | ||
150 | } | ||
151 | |||
152 | static int walk_hugetlb_range(struct vm_area_struct *vma, | ||
153 | unsigned long addr, unsigned long end, | ||
154 | struct mm_walk *walk) | ||
155 | { | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
160 | |||
161 | |||
130 | 162 | ||
131 | /** | 163 | /** |
132 | * walk_page_range - walk a memory map's page tables with a callback | 164 | * walk_page_range - walk a memory map's page tables with a callback |
@@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
144 | * associated range, and a copy of the original mm_walk for access to | 176 | * associated range, and a copy of the original mm_walk for access to |
145 | * the ->private or ->mm fields. | 177 | * the ->private or ->mm fields. |
146 | * | 178 | * |
147 | * No locks are taken, but the bottom level iterator will map PTE | 179 | * Usually no locks are taken, but splitting transparent huge page may |
180 | * take page table lock. And the bottom level iterator will map PTE | ||
148 | * directories from highmem if necessary. | 181 | * directories from highmem if necessary. |
149 | * | 182 | * |
150 | * If any callback returns a non-zero value, the walk is aborted and | 183 | * If any callback returns a non-zero value, the walk is aborted and |
151 | * the return value is propagated back to the caller. Otherwise 0 is returned. | 184 | * the return value is propagated back to the caller. Otherwise 0 is returned. |
185 | * | ||
186 | * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry | ||
187 | * is !NULL. | ||
152 | */ | 188 | */ |
153 | int walk_page_range(unsigned long addr, unsigned long end, | 189 | int walk_page_range(unsigned long addr, unsigned long end, |
154 | struct mm_walk *walk) | 190 | struct mm_walk *walk) |
@@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
165 | 201 | ||
166 | pgd = pgd_offset(walk->mm, addr); | 202 | pgd = pgd_offset(walk->mm, addr); |
167 | do { | 203 | do { |
168 | struct vm_area_struct *uninitialized_var(vma); | 204 | struct vm_area_struct *vma; |
169 | 205 | ||
170 | next = pgd_addr_end(addr, end); | 206 | next = pgd_addr_end(addr, end); |
171 | 207 | ||
172 | #ifdef CONFIG_HUGETLB_PAGE | ||
173 | /* | 208 | /* |
174 | * handle hugetlb vma individually because pagetable walk for | 209 | * handle hugetlb vma individually because pagetable walk for |
175 | * the hugetlb page is dependent on the architecture and | 210 | * the hugetlb page is dependent on the architecture and |
176 | * we can't handle it in the same manner as non-huge pages. | 211 | * we can't handle it in the same manner as non-huge pages. |
177 | */ | 212 | */ |
178 | vma = find_vma(walk->mm, addr); | 213 | vma = hugetlb_vma(addr, walk); |
179 | if (vma && is_vm_hugetlb_page(vma)) { | 214 | if (vma) { |
180 | if (vma->vm_end < next) | 215 | if (vma->vm_end < next) |
181 | next = vma->vm_end; | 216 | next = vma->vm_end; |
182 | /* | 217 | /* |
@@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
189 | pgd = pgd_offset(walk->mm, next); | 224 | pgd = pgd_offset(walk->mm, next); |
190 | continue; | 225 | continue; |
191 | } | 226 | } |
192 | #endif | 227 | |
193 | if (pgd_none_or_clear_bad(pgd)) { | 228 | if (pgd_none_or_clear_bad(pgd)) { |
194 | if (walk->pte_hole) | 229 | if (walk->pte_hole) |
195 | err = walk->pte_hole(addr, next, walk); | 230 | err = walk->pte_hole(addr, next, walk); |
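The refactored walk now documents that mmap_sem must be held for read whenever ->hugetlb_entry is set, since hugetlb_vma() performs a find_vma() lookup. An illustrative in-kernel sketch of a conforming caller, not a standalone program; the function names are made up and only the mm_walk API used by this series is assumed:

#include <linux/mm.h>
#include <linux/rwsem.h>

static int example_hugetlb_entry(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	return 0;		/* keep walking */
}

static void example_walk(struct mm_struct *mm,
			 unsigned long addr, unsigned long end)
{
	struct mm_walk walk = {
		.hugetlb_entry	= example_hugetlb_entry,
		.mm		= mm,
	};

	down_read(&mm->mmap_sem);	/* required when hugetlb_entry != NULL */
	walk_page_range(addr, end, &walk);
	up_read(&mm->mmap_sem);
}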
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c new file mode 100644 index 000000000000..e920aa3ce104 --- /dev/null +++ b/mm/process_vm_access.c | |||
@@ -0,0 +1,496 @@ | |||
1 | /* | ||
2 | * linux/mm/process_vm_access.c | ||
3 | * | ||
4 | * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/mm.h> | ||
13 | #include <linux/uio.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/highmem.h> | ||
16 | #include <linux/ptrace.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/syscalls.h> | ||
19 | |||
20 | #ifdef CONFIG_COMPAT | ||
21 | #include <linux/compat.h> | ||
22 | #endif | ||
23 | |||
24 | /** | ||
25 | * process_vm_rw_pages - read/write pages from task specified | ||
26 | * @task: task to read/write from | ||
27 | * @mm: mm for task | ||
28 | * @process_pages: struct pages area that can store at least | ||
29 | * nr_pages_to_copy struct page pointers | ||
30 | * @pa: address of page in task to start copying from/to | ||
31 | * @start_offset: offset in page to start copying from/to | ||
32 | * @len: number of bytes to copy | ||
33 | * @lvec: iovec array specifying where to copy to/from | ||
34 | * @lvec_cnt: number of elements in iovec array | ||
35 | * @lvec_current: index in iovec array we are up to | ||
36 | * @lvec_offset: offset in bytes from current iovec iov_base we are up to | ||
37 | * @vm_write: 0 means copy from, 1 means copy to | ||
38 | * @nr_pages_to_copy: number of pages to copy | ||
39 | * @bytes_copied: returns number of bytes successfully copied | ||
40 | * Returns 0 on success, error code otherwise | ||
41 | */ | ||
42 | static int process_vm_rw_pages(struct task_struct *task, | ||
43 | struct mm_struct *mm, | ||
44 | struct page **process_pages, | ||
45 | unsigned long pa, | ||
46 | unsigned long start_offset, | ||
47 | unsigned long len, | ||
48 | const struct iovec *lvec, | ||
49 | unsigned long lvec_cnt, | ||
50 | unsigned long *lvec_current, | ||
51 | size_t *lvec_offset, | ||
52 | int vm_write, | ||
53 | unsigned int nr_pages_to_copy, | ||
54 | ssize_t *bytes_copied) | ||
55 | { | ||
56 | int pages_pinned; | ||
57 | void *target_kaddr; | ||
58 | int pgs_copied = 0; | ||
59 | int j; | ||
60 | int ret; | ||
61 | ssize_t bytes_to_copy; | ||
62 | ssize_t rc = 0; | ||
63 | |||
64 | *bytes_copied = 0; | ||
65 | |||
66 | /* Get the pages we're interested in */ | ||
67 | down_read(&mm->mmap_sem); | ||
68 | pages_pinned = get_user_pages(task, mm, pa, | ||
69 | nr_pages_to_copy, | ||
70 | vm_write, 0, process_pages, NULL); | ||
71 | up_read(&mm->mmap_sem); | ||
72 | |||
73 | if (pages_pinned != nr_pages_to_copy) { | ||
74 | rc = -EFAULT; | ||
75 | goto end; | ||
76 | } | ||
77 | |||
78 | /* Do the copy for each page */ | ||
79 | for (pgs_copied = 0; | ||
80 | (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); | ||
81 | pgs_copied++) { | ||
82 | /* Make sure we have a non zero length iovec */ | ||
83 | while (*lvec_current < lvec_cnt | ||
84 | && lvec[*lvec_current].iov_len == 0) | ||
85 | (*lvec_current)++; | ||
86 | if (*lvec_current == lvec_cnt) | ||
87 | break; | ||
88 | |||
89 | /* | ||
90 | * Will copy smallest of: | ||
91 | * - bytes remaining in page | ||
92 | * - bytes remaining in destination iovec | ||
93 | */ | ||
94 | bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, | ||
95 | len - *bytes_copied); | ||
96 | bytes_to_copy = min_t(ssize_t, bytes_to_copy, | ||
97 | lvec[*lvec_current].iov_len | ||
98 | - *lvec_offset); | ||
99 | |||
100 | target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; | ||
101 | |||
102 | if (vm_write) | ||
103 | ret = copy_from_user(target_kaddr, | ||
104 | lvec[*lvec_current].iov_base | ||
105 | + *lvec_offset, | ||
106 | bytes_to_copy); | ||
107 | else | ||
108 | ret = copy_to_user(lvec[*lvec_current].iov_base | ||
109 | + *lvec_offset, | ||
110 | target_kaddr, bytes_to_copy); | ||
111 | kunmap(process_pages[pgs_copied]); | ||
112 | if (ret) { | ||
113 | *bytes_copied += bytes_to_copy - ret; | ||
114 | pgs_copied++; | ||
115 | rc = -EFAULT; | ||
116 | goto end; | ||
117 | } | ||
118 | *bytes_copied += bytes_to_copy; | ||
119 | *lvec_offset += bytes_to_copy; | ||
120 | if (*lvec_offset == lvec[*lvec_current].iov_len) { | ||
121 | /* | ||
122 | * Need to copy remaining part of page into the | ||
123 | * next iovec if there are any bytes left in page | ||
124 | */ | ||
125 | (*lvec_current)++; | ||
126 | *lvec_offset = 0; | ||
127 | start_offset = (start_offset + bytes_to_copy) | ||
128 | % PAGE_SIZE; | ||
129 | if (start_offset) | ||
130 | pgs_copied--; | ||
131 | } else { | ||
132 | start_offset = 0; | ||
133 | } | ||
134 | } | ||
135 | |||
136 | end: | ||
137 | if (vm_write) { | ||
138 | for (j = 0; j < pages_pinned; j++) { | ||
139 | if (j < pgs_copied) | ||
140 | set_page_dirty_lock(process_pages[j]); | ||
141 | put_page(process_pages[j]); | ||
142 | } | ||
143 | } else { | ||
144 | for (j = 0; j < pages_pinned; j++) | ||
145 | put_page(process_pages[j]); | ||
146 | } | ||
147 | |||
148 | return rc; | ||
149 | } | ||
150 | |||
151 | /* Maximum number of pages kmalloc'd to hold struct page's during copy */ | ||
152 | #define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2) | ||
153 | |||
154 | /** | ||
155 | * process_vm_rw_single_vec - read/write pages from task specified | ||
156 | * @addr: start memory address of target process | ||
157 | * @len: size of area to copy to/from | ||
158 | * @lvec: iovec array specifying where to copy to/from locally | ||
159 | * @lvec_cnt: number of elements in iovec array | ||
160 | * @lvec_current: index in iovec array we are up to | ||
161 | * @lvec_offset: offset in bytes from current iovec iov_base we are up to | ||
162 | * @process_pages: struct pages area that can store at least | ||
163 | * nr_pages_to_copy struct page pointers | ||
164 | * @mm: mm for task | ||
165 | * @task: task to read/write from | ||
166 | * @vm_write: 0 means copy from, 1 means copy to | ||
167 | * @bytes_copied: returns number of bytes successfully copied | ||
168 | * Returns 0 on success, or an error code on failure | ||
169 | */ | ||
170 | static int process_vm_rw_single_vec(unsigned long addr, | ||
171 | unsigned long len, | ||
172 | const struct iovec *lvec, | ||
173 | unsigned long lvec_cnt, | ||
174 | unsigned long *lvec_current, | ||
175 | size_t *lvec_offset, | ||
176 | struct page **process_pages, | ||
177 | struct mm_struct *mm, | ||
178 | struct task_struct *task, | ||
179 | int vm_write, | ||
180 | ssize_t *bytes_copied) | ||
181 | { | ||
182 | unsigned long pa = addr & PAGE_MASK; | ||
183 | unsigned long start_offset = addr - pa; | ||
184 | unsigned long nr_pages; | ||
185 | ssize_t bytes_copied_loop; | ||
186 | ssize_t rc = 0; | ||
187 | unsigned long nr_pages_copied = 0; | ||
188 | unsigned long nr_pages_to_copy; | ||
189 | unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES | ||
190 | / sizeof(struct pages *); | ||
191 | |||
192 | *bytes_copied = 0; | ||
193 | |||
194 | /* Work out address and page range required */ | ||
195 | if (len == 0) | ||
196 | return 0; | ||
197 | nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; | ||
198 | |||
199 | while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { | ||
200 | nr_pages_to_copy = min(nr_pages - nr_pages_copied, | ||
201 | max_pages_per_loop); | ||
202 | |||
203 | rc = process_vm_rw_pages(task, mm, process_pages, pa, | ||
204 | start_offset, len, | ||
205 | lvec, lvec_cnt, | ||
206 | lvec_current, lvec_offset, | ||
207 | vm_write, nr_pages_to_copy, | ||
208 | &bytes_copied_loop); | ||
209 | start_offset = 0; | ||
210 | *bytes_copied += bytes_copied_loop; | ||
211 | |||
212 | if (rc < 0) { | ||
213 | return rc; | ||
214 | } else { | ||
215 | len -= bytes_copied_loop; | ||
216 | nr_pages_copied += nr_pages_to_copy; | ||
217 | pa += nr_pages_to_copy * PAGE_SIZE; | ||
218 | } | ||
219 | } | ||
220 | |||
221 | return rc; | ||
222 | } | ||
223 | |||
224 | /* Maximum number of entries for process pages array | ||
225 | which lives on stack */ | ||
226 | #define PVM_MAX_PP_ARRAY_COUNT 16 | ||
227 | |||
228 | /** | ||
229 | * process_vm_rw_core - core of reading/writing pages from task specified | ||
230 | * @pid: PID of process to read/write from/to | ||
231 | * @lvec: iovec array specifying where to copy to/from locally | ||
232 | * @liovcnt: size of lvec array | ||
233 | * @rvec: iovec array specifying where to copy to/from in the other process | ||
234 | * @riovcnt: size of rvec array | ||
235 | * @flags: currently unused | ||
236 | * @vm_write: 0 if reading from other process, 1 if writing to other process | ||
237 | * Returns the number of bytes read/written or error code. May | ||
238 | * return fewer bytes than expected if an error occurs during the copying | ||
239 | * process. | ||
240 | */ | ||
241 | static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, | ||
242 | unsigned long liovcnt, | ||
243 | const struct iovec *rvec, | ||
244 | unsigned long riovcnt, | ||
245 | unsigned long flags, int vm_write) | ||
246 | { | ||
247 | struct task_struct *task; | ||
248 | struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; | ||
249 | struct page **process_pages = pp_stack; | ||
250 | struct mm_struct *mm; | ||
251 | unsigned long i; | ||
252 | ssize_t rc = 0; | ||
253 | ssize_t bytes_copied_loop; | ||
254 | ssize_t bytes_copied = 0; | ||
255 | unsigned long nr_pages = 0; | ||
256 | unsigned long nr_pages_iov; | ||
257 | unsigned long iov_l_curr_idx = 0; | ||
258 | size_t iov_l_curr_offset = 0; | ||
259 | ssize_t iov_len; | ||
260 | |||
261 | /* | ||
262 | * Work out how many pages of struct pages we're going to need | ||
263 | * when eventually calling get_user_pages | ||
264 | */ | ||
265 | for (i = 0; i < riovcnt; i++) { | ||
266 | iov_len = rvec[i].iov_len; | ||
267 | if (iov_len > 0) { | ||
268 | nr_pages_iov = ((unsigned long)rvec[i].iov_base | ||
269 | + iov_len) | ||
270 | / PAGE_SIZE - (unsigned long)rvec[i].iov_base | ||
271 | / PAGE_SIZE + 1; | ||
272 | nr_pages = max(nr_pages, nr_pages_iov); | ||
273 | } | ||
274 | } | ||
275 | |||
276 | if (nr_pages == 0) | ||
277 | return 0; | ||
278 | |||
279 | if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { | ||
280 | /* For reliability don't try to kmalloc more than | ||
281 | 2 pages worth */ | ||
282 | process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, | ||
283 | sizeof(struct pages *)*nr_pages), | ||
284 | GFP_KERNEL); | ||
285 | |||
286 | if (!process_pages) | ||
287 | return -ENOMEM; | ||
288 | } | ||
289 | |||
290 | /* Get process information */ | ||
291 | rcu_read_lock(); | ||
292 | task = find_task_by_vpid(pid); | ||
293 | if (task) | ||
294 | get_task_struct(task); | ||
295 | rcu_read_unlock(); | ||
296 | if (!task) { | ||
297 | rc = -ESRCH; | ||
298 | goto free_proc_pages; | ||
299 | } | ||
300 | |||
301 | task_lock(task); | ||
302 | if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { | ||
303 | task_unlock(task); | ||
304 | rc = -EPERM; | ||
305 | goto put_task_struct; | ||
306 | } | ||
307 | mm = task->mm; | ||
308 | |||
309 | if (!mm || (task->flags & PF_KTHREAD)) { | ||
310 | task_unlock(task); | ||
311 | rc = -EINVAL; | ||
312 | goto put_task_struct; | ||
313 | } | ||
314 | |||
315 | atomic_inc(&mm->mm_users); | ||
316 | task_unlock(task); | ||
317 | |||
318 | for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { | ||
319 | rc = process_vm_rw_single_vec( | ||
320 | (unsigned long)rvec[i].iov_base, rvec[i].iov_len, | ||
321 | lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, | ||
322 | process_pages, mm, task, vm_write, &bytes_copied_loop); | ||
323 | bytes_copied += bytes_copied_loop; | ||
324 | if (rc != 0) { | ||
325 | /* If we have managed to copy any data at all then | ||
326 | we return the number of bytes copied. Otherwise | ||
327 | we return the error code */ | ||
328 | if (bytes_copied) | ||
329 | rc = bytes_copied; | ||
330 | goto put_mm; | ||
331 | } | ||
332 | } | ||
333 | |||
334 | rc = bytes_copied; | ||
335 | put_mm: | ||
336 | mmput(mm); | ||
337 | |||
338 | put_task_struct: | ||
339 | put_task_struct(task); | ||
340 | |||
341 | free_proc_pages: | ||
342 | if (process_pages != pp_stack) | ||
343 | kfree(process_pages); | ||
344 | return rc; | ||
345 | } | ||
346 | |||
347 | /** | ||
348 | * process_vm_rw - check iovecs before calling core routine | ||
349 | * @pid: PID of process to read/write from/to | ||
350 | * @lvec: iovec array specifying where to copy to/from locally | ||
351 | * @liovcnt: size of lvec array | ||
352 | * @rvec: iovec array specifying where to copy to/from in the other process | ||
353 | * @riovcnt: size of rvec array | ||
354 | * @flags: currently unused | ||
355 | * @vm_write: 0 if reading from other process, 1 if writing to other process | ||
356 | * Returns the number of bytes read/written or error code. May | ||
357 | * return less bytes than expected if an error occurs during the copying | ||
358 | * process. | ||
359 | */ | ||
360 | static ssize_t process_vm_rw(pid_t pid, | ||
361 | const struct iovec __user *lvec, | ||
362 | unsigned long liovcnt, | ||
363 | const struct iovec __user *rvec, | ||
364 | unsigned long riovcnt, | ||
365 | unsigned long flags, int vm_write) | ||
366 | { | ||
367 | struct iovec iovstack_l[UIO_FASTIOV]; | ||
368 | struct iovec iovstack_r[UIO_FASTIOV]; | ||
369 | struct iovec *iov_l = iovstack_l; | ||
370 | struct iovec *iov_r = iovstack_r; | ||
371 | ssize_t rc; | ||
372 | |||
373 | if (flags != 0) | ||
374 | return -EINVAL; | ||
375 | |||
376 | /* Check iovecs */ | ||
377 | if (vm_write) | ||
378 | rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, | ||
379 | iovstack_l, &iov_l, 1); | ||
380 | else | ||
381 | rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, | ||
382 | iovstack_l, &iov_l, 1); | ||
383 | if (rc <= 0) | ||
384 | goto free_iovecs; | ||
385 | |||
386 | rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, | ||
387 | iovstack_r, &iov_r, 0); | ||
388 | if (rc <= 0) | ||
389 | goto free_iovecs; | ||
390 | |||
391 | rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, | ||
392 | vm_write); | ||
393 | |||
394 | free_iovecs: | ||
395 | if (iov_r != iovstack_r) | ||
396 | kfree(iov_r); | ||
397 | if (iov_l != iovstack_l) | ||
398 | kfree(iov_l); | ||
399 | |||
400 | return rc; | ||
401 | } | ||
402 | |||
403 | SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, | ||
404 | unsigned long, liovcnt, const struct iovec __user *, rvec, | ||
405 | unsigned long, riovcnt, unsigned long, flags) | ||
406 | { | ||
407 | return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); | ||
408 | } | ||
409 | |||
410 | SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, | ||
411 | const struct iovec __user *, lvec, | ||
412 | unsigned long, liovcnt, const struct iovec __user *, rvec, | ||
413 | unsigned long, riovcnt, unsigned long, flags) | ||
414 | { | ||
415 | return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); | ||
416 | } | ||
417 | |||
418 | #ifdef CONFIG_COMPAT | ||
419 | |||
420 | asmlinkage ssize_t | ||
421 | compat_process_vm_rw(compat_pid_t pid, | ||
422 | const struct compat_iovec __user *lvec, | ||
423 | unsigned long liovcnt, | ||
424 | const struct compat_iovec __user *rvec, | ||
425 | unsigned long riovcnt, | ||
426 | unsigned long flags, int vm_write) | ||
427 | { | ||
428 | struct iovec iovstack_l[UIO_FASTIOV]; | ||
429 | struct iovec iovstack_r[UIO_FASTIOV]; | ||
430 | struct iovec *iov_l = iovstack_l; | ||
431 | struct iovec *iov_r = iovstack_r; | ||
432 | ssize_t rc = -EFAULT; | ||
433 | |||
434 | if (flags != 0) | ||
435 | return -EINVAL; | ||
436 | |||
437 | if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) | ||
438 | goto out; | ||
439 | |||
440 | if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) | ||
441 | goto out; | ||
442 | |||
443 | if (vm_write) | ||
444 | rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, | ||
445 | UIO_FASTIOV, iovstack_l, | ||
446 | &iov_l, 1); | ||
447 | else | ||
448 | rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, | ||
449 | UIO_FASTIOV, iovstack_l, | ||
450 | &iov_l, 1); | ||
451 | if (rc <= 0) | ||
452 | goto free_iovecs; | ||
453 | rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, | ||
454 | UIO_FASTIOV, iovstack_r, | ||
455 | &iov_r, 0); | ||
456 | if (rc <= 0) | ||
457 | goto free_iovecs; | ||
458 | |||
459 | rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, | ||
460 | vm_write); | ||
461 | |||
462 | free_iovecs: | ||
463 | if (iov_r != iovstack_r) | ||
464 | kfree(iov_r); | ||
465 | if (iov_l != iovstack_l) | ||
466 | kfree(iov_l); | ||
467 | |||
468 | out: | ||
469 | return rc; | ||
470 | } | ||
471 | |||
472 | asmlinkage ssize_t | ||
473 | compat_sys_process_vm_readv(compat_pid_t pid, | ||
474 | const struct compat_iovec __user *lvec, | ||
475 | unsigned long liovcnt, | ||
476 | const struct compat_iovec __user *rvec, | ||
477 | unsigned long riovcnt, | ||
478 | unsigned long flags) | ||
479 | { | ||
480 | return compat_process_vm_rw(pid, lvec, liovcnt, rvec, | ||
481 | riovcnt, flags, 0); | ||
482 | } | ||
483 | |||
484 | asmlinkage ssize_t | ||
485 | compat_sys_process_vm_writev(compat_pid_t pid, | ||
486 | const struct compat_iovec __user *lvec, | ||
487 | unsigned long liovcnt, | ||
488 | const struct compat_iovec __user *rvec, | ||
489 | unsigned long riovcnt, | ||
490 | unsigned long flags) | ||
491 | { | ||
492 | return compat_process_vm_rw(pid, lvec, liovcnt, rvec, | ||
493 | riovcnt, flags, 1); | ||
494 | } | ||
495 | |||
496 | #endif | ||
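For context, the new syscalls are driven from userspace roughly as below. This is a hedged example: it assumes a libc recent enough to expose the process_vm_readv() wrapper in <sys/uio.h>, and the target pid and remote address are placeholders that a real caller would obtain elsewhere (e.g. from /proc/<pid>/maps).

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

int main(void)
{
	pid_t pid = 1234;			/* placeholder target pid */
	char buf[64];
	struct iovec local = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct iovec remote = {
		.iov_base = (void *)0x400000,	/* placeholder remote address */
		.iov_len  = sizeof(buf),
	};
	ssize_t n;

	n = process_vm_readv(pid, &local, 1, &remote, 1, 0);
	if (n < 0)
		perror("process_vm_readv");
	else
		printf("copied %zd bytes from pid %d\n", n, (int)pid);
	return 0;
}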
diff --git a/mm/quicklist.c b/mm/quicklist.c index 2876349339a7..942212970529 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/gfp.h> | 17 | #include <linux/gfp.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/mmzone.h> | 19 | #include <linux/mmzone.h> |
20 | #include <linux/module.h> | ||
21 | #include <linux/quicklist.h> | 20 | #include <linux/quicklist.h> |
22 | 21 | ||
23 | DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); | 22 | DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); |
diff --git a/mm/readahead.c b/mm/readahead.c index 867f9dd82dcd..cbcbb02f3e28 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -11,7 +11,7 @@ | |||
11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
12 | #include <linux/gfp.h> | 12 | #include <linux/gfp.h> |
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/task_io_accounting_ops.h> | 17 | #include <linux/task_io_accounting_ops.h> |
@@ -21,7 +21,6 @@ | |||
21 | * Lock ordering in mm: | 21 | * Lock ordering in mm: |
22 | * | 22 | * |
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * inode->i_alloc_sem (vmtruncate_range) | ||
25 | * mm->mmap_sem | 24 | * mm->mmap_sem |
26 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
27 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
@@ -32,11 +31,11 @@ | |||
32 | * mmlist_lock (in mmput, drain_mmlist and others) | 31 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mapping->private_lock (in __set_page_dirty_buffers) | 32 | * mapping->private_lock (in __set_page_dirty_buffers) |
34 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) | 33 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | 34 | * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) |
36 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 35 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
37 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
38 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
39 | * within inode_wb_list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
40 | * | 39 | * |
41 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) |
42 | * ->tasklist_lock | 41 | * ->tasklist_lock |
@@ -52,7 +51,7 @@ | |||
52 | #include <linux/ksm.h> | 51 | #include <linux/ksm.h> |
53 | #include <linux/rmap.h> | 52 | #include <linux/rmap.h> |
54 | #include <linux/rcupdate.h> | 53 | #include <linux/rcupdate.h> |
55 | #include <linux/module.h> | 54 | #include <linux/export.h> |
56 | #include <linux/memcontrol.h> | 55 | #include <linux/memcontrol.h> |
57 | #include <linux/mmu_notifier.h> | 56 | #include <linux/mmu_notifier.h> |
58 | #include <linux/migrate.h> | 57 | #include <linux/migrate.h> |
@@ -870,11 +869,11 @@ int page_referenced(struct page *page, | |||
870 | vm_flags); | 869 | vm_flags); |
871 | if (we_locked) | 870 | if (we_locked) |
872 | unlock_page(page); | 871 | unlock_page(page); |
872 | |||
873 | if (page_test_and_clear_young(page_to_pfn(page))) | ||
874 | referenced++; | ||
873 | } | 875 | } |
874 | out: | 876 | out: |
875 | if (page_test_and_clear_young(page_to_pfn(page))) | ||
876 | referenced++; | ||
877 | |||
878 | return referenced; | 877 | return referenced; |
879 | } | 878 | } |
880 | 879 | ||
@@ -1165,7 +1164,7 @@ void page_remove_rmap(struct page *page) | |||
1165 | 1164 | ||
1166 | /* | 1165 | /* |
1167 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 1166 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
1168 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 1167 | * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. |
1169 | */ | 1168 | */ |
1170 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1169 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1171 | unsigned long address, enum ttu_flags flags) | 1170 | unsigned long address, enum ttu_flags flags) |
diff --git a/mm/shmem.c b/mm/shmem.c index fcedf5464eb7..d6722506d2da 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -6,7 +6,8 @@ | |||
6 | * 2000-2001 Christoph Rohland | 6 | * 2000-2001 Christoph Rohland |
7 | * 2000-2001 SAP AG | 7 | * 2000-2001 SAP AG |
8 | * 2002 Red Hat Inc. | 8 | * 2002 Red Hat Inc. |
9 | * Copyright (C) 2002-2005 Hugh Dickins. | 9 | * Copyright (C) 2002-2011 Hugh Dickins. |
10 | * Copyright (C) 2011 Google Inc. | ||
10 | * Copyright (C) 2002-2005 VERITAS Software Corporation. | 11 | * Copyright (C) 2002-2005 VERITAS Software Corporation. |
11 | * Copyright (C) 2004 Andi Kleen, SuSE Labs | 12 | * Copyright (C) 2004 Andi Kleen, SuSE Labs |
12 | * | 13 | * |
@@ -27,8 +28,7 @@ | |||
27 | #include <linux/pagemap.h> | 28 | #include <linux/pagemap.h> |
28 | #include <linux/file.h> | 29 | #include <linux/file.h> |
29 | #include <linux/mm.h> | 30 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 31 | #include <linux/export.h> |
31 | #include <linux/percpu_counter.h> | ||
32 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
33 | 33 | ||
34 | static struct vfsmount *shm_mnt; | 34 | static struct vfsmount *shm_mnt; |
@@ -51,6 +51,9 @@ static struct vfsmount *shm_mnt; | |||
51 | #include <linux/shmem_fs.h> | 51 | #include <linux/shmem_fs.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/pagevec.h> | ||
55 | #include <linux/percpu_counter.h> | ||
56 | #include <linux/splice.h> | ||
54 | #include <linux/security.h> | 57 | #include <linux/security.h> |
55 | #include <linux/swapops.h> | 58 | #include <linux/swapops.h> |
56 | #include <linux/mempolicy.h> | 59 | #include <linux/mempolicy.h> |
@@ -62,43 +65,17 @@ static struct vfsmount *shm_mnt; | |||
62 | #include <linux/magic.h> | 65 | #include <linux/magic.h> |
63 | 66 | ||
64 | #include <asm/uaccess.h> | 67 | #include <asm/uaccess.h> |
65 | #include <asm/div64.h> | ||
66 | #include <asm/pgtable.h> | 68 | #include <asm/pgtable.h> |
67 | 69 | ||
68 | /* | ||
69 | * The maximum size of a shmem/tmpfs file is limited by the maximum size of | ||
70 | * its triple-indirect swap vector - see illustration at shmem_swp_entry(). | ||
71 | * | ||
72 | * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, | ||
73 | * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum | ||
74 | * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, | ||
75 | * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. | ||
76 | * | ||
77 | * We use / and * instead of shifts in the definitions below, so that the swap | ||
78 | * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. | ||
79 | */ | ||
80 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) | ||
81 | #define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) | ||
82 | |||
83 | #define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) | ||
84 | #define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) | ||
85 | |||
86 | #define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) | ||
87 | #define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) | ||
88 | |||
89 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) | 70 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) |
90 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) | 71 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) |
91 | 72 | ||
92 | /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ | ||
93 | #define SHMEM_PAGEIN VM_READ | ||
94 | #define SHMEM_TRUNCATE VM_WRITE | ||
95 | |||
96 | /* Definition to limit shmem_truncate's steps between cond_rescheds */ | ||
97 | #define LATENCY_LIMIT 64 | ||
98 | |||
99 | /* Pretend that each entry is of this size in directory's i_size */ | 73 | /* Pretend that each entry is of this size in directory's i_size */ |
100 | #define BOGO_DIRENT_SIZE 20 | 74 | #define BOGO_DIRENT_SIZE 20 |
101 | 75 | ||
76 | /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ | ||
77 | #define SHORT_SYMLINK_LEN 128 | ||
78 | |||
102 | struct shmem_xattr { | 79 | struct shmem_xattr { |
103 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ | 80 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ |
104 | char *name; /* xattr name */ | 81 | char *name; /* xattr name */ |
@@ -106,7 +83,7 @@ struct shmem_xattr { | |||
106 | char value[0]; | 83 | char value[0]; |
107 | }; | 84 | }; |
108 | 85 | ||
109 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ | 86 | /* Flag allocation requirements to shmem_getpage */ |
110 | enum sgp_type { | 87 | enum sgp_type { |
111 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 88 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
112 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | 89 | SGP_CACHE, /* don't exceed i_size, may allocate page */ |
@@ -126,57 +103,14 @@ static unsigned long shmem_default_max_inodes(void) | |||
126 | } | 103 | } |
127 | #endif | 104 | #endif |
128 | 105 | ||
129 | static int shmem_getpage(struct inode *inode, unsigned long idx, | 106 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
130 | struct page **pagep, enum sgp_type sgp, int *type); | 107 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); |
131 | |||
132 | static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | ||
133 | { | ||
134 | /* | ||
135 | * The above definition of ENTRIES_PER_PAGE, and the use of | ||
136 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: | ||
137 | * might be reconsidered if it ever diverges from PAGE_SIZE. | ||
138 | * | ||
139 | * Mobility flags are masked out as swap vectors cannot move | ||
140 | */ | ||
141 | return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, | ||
142 | PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
143 | } | ||
144 | |||
145 | static inline void shmem_dir_free(struct page *page) | ||
146 | { | ||
147 | __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
148 | } | ||
149 | |||
150 | static struct page **shmem_dir_map(struct page *page) | ||
151 | { | ||
152 | return (struct page **)kmap_atomic(page, KM_USER0); | ||
153 | } | ||
154 | |||
155 | static inline void shmem_dir_unmap(struct page **dir) | ||
156 | { | ||
157 | kunmap_atomic(dir, KM_USER0); | ||
158 | } | ||
159 | |||
160 | static swp_entry_t *shmem_swp_map(struct page *page) | ||
161 | { | ||
162 | return (swp_entry_t *)kmap_atomic(page, KM_USER1); | ||
163 | } | ||
164 | |||
165 | static inline void shmem_swp_balance_unmap(void) | ||
166 | { | ||
167 | /* | ||
168 | * When passing a pointer to an i_direct entry, to code which | ||
169 | * also handles indirect entries and so will shmem_swp_unmap, | ||
170 | * we must arrange for the preempt count to remain in balance. | ||
171 | * What kmap_atomic of a lowmem page does depends on config | ||
172 | * and architecture, so pretend to kmap_atomic some lowmem page. | ||
173 | */ | ||
174 | (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); | ||
175 | } | ||
176 | 108 | ||
177 | static inline void shmem_swp_unmap(swp_entry_t *entry) | 109 | static inline int shmem_getpage(struct inode *inode, pgoff_t index, |
110 | struct page **pagep, enum sgp_type sgp, int *fault_type) | ||
178 | { | 111 | { |
179 | kunmap_atomic(entry, KM_USER1); | 112 | return shmem_getpage_gfp(inode, index, pagep, sgp, |
113 | mapping_gfp_mask(inode->i_mapping), fault_type); | ||
180 | } | 114 | } |
181 | 115 | ||
182 | static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) | 116 | static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) |
@@ -236,17 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | |||
236 | static LIST_HEAD(shmem_swaplist); | 170 | static LIST_HEAD(shmem_swaplist); |
237 | static DEFINE_MUTEX(shmem_swaplist_mutex); | 171 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
238 | 172 | ||
239 | static void shmem_free_blocks(struct inode *inode, long pages) | ||
240 | { | ||
241 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
242 | if (sbinfo->max_blocks) { | ||
243 | percpu_counter_add(&sbinfo->used_blocks, -pages); | ||
244 | spin_lock(&inode->i_lock); | ||
245 | inode->i_blocks -= pages*BLOCKS_PER_PAGE; | ||
246 | spin_unlock(&inode->i_lock); | ||
247 | } | ||
248 | } | ||
249 | |||
250 | static int shmem_reserve_inode(struct super_block *sb) | 173 | static int shmem_reserve_inode(struct super_block *sb) |
251 | { | 174 | { |
252 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 175 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
@@ -273,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb) | |||
273 | } | 196 | } |
274 | 197 | ||
275 | /** | 198 | /** |
276 | * shmem_recalc_inode - recalculate the size of an inode | 199 | * shmem_recalc_inode - recalculate the block usage of an inode |
277 | * @inode: inode to recalc | 200 | * @inode: inode to recalc |
278 | * | 201 | * |
279 | * We have to calculate the free blocks since the mm can drop | 202 | * We have to calculate the free blocks since the mm can drop |
@@ -291,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode) | |||
291 | 214 | ||
292 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; | 215 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; |
293 | if (freed > 0) { | 216 | if (freed > 0) { |
217 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
218 | if (sbinfo->max_blocks) | ||
219 | percpu_counter_add(&sbinfo->used_blocks, -freed); | ||
294 | info->alloced -= freed; | 220 | info->alloced -= freed; |
221 | inode->i_blocks -= freed * BLOCKS_PER_PAGE; | ||
295 | shmem_unacct_blocks(info->flags, freed); | 222 | shmem_unacct_blocks(info->flags, freed); |
296 | shmem_free_blocks(inode, freed); | ||
297 | } | 223 | } |
298 | } | 224 | } |
299 | 225 | ||
300 | /** | 226 | /* |
301 | * shmem_swp_entry - find the swap vector position in the info structure | 227 | * Replace item expected in radix tree by a new item, while holding tree lock. |
302 | * @info: info structure for the inode | ||
303 | * @index: index of the page to find | ||
304 | * @page: optional page to add to the structure. Has to be preset to | ||
305 | * all zeros | ||
306 | * | ||
307 | * If there is no space allocated yet it will return NULL when | ||
308 | * page is NULL, else it will use the page for the needed block, | ||
309 | * setting it to NULL on return to indicate that it has been used. | ||
310 | * | ||
311 | * The swap vector is organized the following way: | ||
312 | * | ||
313 | * There are SHMEM_NR_DIRECT entries directly stored in the | ||
314 | * shmem_inode_info structure. So small files do not need an addional | ||
315 | * allocation. | ||
316 | * | ||
317 | * For pages with index > SHMEM_NR_DIRECT there is the pointer | ||
318 | * i_indirect which points to a page which holds in the first half | ||
319 | * doubly indirect blocks, in the second half triple indirect blocks: | ||
320 | * | ||
321 | * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the | ||
322 | * following layout (for SHMEM_NR_DIRECT == 16): | ||
323 | * | ||
324 | * i_indirect -> dir --> 16-19 | ||
325 | * | +-> 20-23 | ||
326 | * | | ||
327 | * +-->dir2 --> 24-27 | ||
328 | * | +-> 28-31 | ||
329 | * | +-> 32-35 | ||
330 | * | +-> 36-39 | ||
331 | * | | ||
332 | * +-->dir3 --> 40-43 | ||
333 | * +-> 44-47 | ||
334 | * +-> 48-51 | ||
335 | * +-> 52-55 | ||
336 | */ | 228 | */ |
337 | static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) | 229 | static int shmem_radix_tree_replace(struct address_space *mapping, |
338 | { | 230 | pgoff_t index, void *expected, void *replacement) |
339 | unsigned long offset; | 231 | { |
340 | struct page **dir; | 232 | void **pslot; |
341 | struct page *subdir; | 233 | void *item = NULL; |
234 | |||
235 | VM_BUG_ON(!expected); | ||
236 | pslot = radix_tree_lookup_slot(&mapping->page_tree, index); | ||
237 | if (pslot) | ||
238 | item = radix_tree_deref_slot_protected(pslot, | ||
239 | &mapping->tree_lock); | ||
240 | if (item != expected) | ||
241 | return -ENOENT; | ||
242 | if (replacement) | ||
243 | radix_tree_replace_slot(pslot, replacement); | ||
244 | else | ||
245 | radix_tree_delete(&mapping->page_tree, index); | ||
246 | return 0; | ||
247 | } | ||
342 | 248 | ||
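shmem_radix_tree_replace() above only touches the slot while it still holds the item the caller expects, returning -ENOENT if something else got there first, and treating a NULL replacement as deletion. A hedged userspace sketch of that expected-compare-and-replace shape, using a mutex-protected single slot instead of the radix tree (struct slot and slot_replace() are invented for illustration):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

/*
 * Userspace analogue of shmem_radix_tree_replace(): swap the contents
 * of a slot for a new item only while it still holds the item the
 * caller expects, all under one lock.  A NULL replacement deletes.
 */
struct slot {
	pthread_mutex_t lock;
	void *item;
};

static int slot_replace(struct slot *s, void *expected, void *replacement)
{
	int error = 0;

	pthread_mutex_lock(&s->lock);
	if (s->item != expected)
		error = -ENOENT;	/* someone changed it under us */
	else
		s->item = replacement;	/* NULL here means "delete" */
	pthread_mutex_unlock(&s->lock);
	return error;
}

int main(void)
{
	static int page, swap;
	struct slot s = { PTHREAD_MUTEX_INITIALIZER, &page };

	/* Page -> swap entry succeeds: the slot still holds &page. */
	printf("%d\n", slot_replace(&s, &page, &swap));	/* 0 */
	/* A second attempt fails: the slot now holds &swap. */
	printf("%d\n", slot_replace(&s, &page, NULL));	/* -ENOENT */
	return 0;
}
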
343 | if (index < SHMEM_NR_DIRECT) { | 249 | /* |
344 | shmem_swp_balance_unmap(); | 250 | * Like add_to_page_cache_locked, but error if expected item has gone. |
345 | return info->i_direct+index; | 251 | */ |
346 | } | 252 | static int shmem_add_to_page_cache(struct page *page, |
347 | if (!info->i_indirect) { | 253 | struct address_space *mapping, |
348 | if (page) { | 254 | pgoff_t index, gfp_t gfp, void *expected) |
349 | info->i_indirect = *page; | 255 | { |
350 | *page = NULL; | 256 | int error = 0; |
351 | } | ||
352 | return NULL; /* need another page */ | ||
353 | } | ||
354 | 257 | ||
355 | index -= SHMEM_NR_DIRECT; | 258 | VM_BUG_ON(!PageLocked(page)); |
356 | offset = index % ENTRIES_PER_PAGE; | 259 | VM_BUG_ON(!PageSwapBacked(page)); |
357 | index /= ENTRIES_PER_PAGE; | ||
358 | dir = shmem_dir_map(info->i_indirect); | ||
359 | |||
360 | if (index >= ENTRIES_PER_PAGE/2) { | ||
361 | index -= ENTRIES_PER_PAGE/2; | ||
362 | dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; | ||
363 | index %= ENTRIES_PER_PAGE; | ||
364 | subdir = *dir; | ||
365 | if (!subdir) { | ||
366 | if (page) { | ||
367 | *dir = *page; | ||
368 | *page = NULL; | ||
369 | } | ||
370 | shmem_dir_unmap(dir); | ||
371 | return NULL; /* need another page */ | ||
372 | } | ||
373 | shmem_dir_unmap(dir); | ||
374 | dir = shmem_dir_map(subdir); | ||
375 | } | ||
376 | 260 | ||
377 | dir += index; | 261 | if (!expected) |
378 | subdir = *dir; | 262 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); |
379 | if (!subdir) { | 263 | if (!error) { |
380 | if (!page || !(subdir = *page)) { | 264 | page_cache_get(page); |
381 | shmem_dir_unmap(dir); | 265 | page->mapping = mapping; |
382 | return NULL; /* need a page */ | 266 | page->index = index; |
267 | |||
268 | spin_lock_irq(&mapping->tree_lock); | ||
269 | if (!expected) | ||
270 | error = radix_tree_insert(&mapping->page_tree, | ||
271 | index, page); | ||
272 | else | ||
273 | error = shmem_radix_tree_replace(mapping, index, | ||
274 | expected, page); | ||
275 | if (!error) { | ||
276 | mapping->nrpages++; | ||
277 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
278 | __inc_zone_page_state(page, NR_SHMEM); | ||
279 | spin_unlock_irq(&mapping->tree_lock); | ||
280 | } else { | ||
281 | page->mapping = NULL; | ||
282 | spin_unlock_irq(&mapping->tree_lock); | ||
283 | page_cache_release(page); | ||
383 | } | 284 | } |
384 | *dir = subdir; | 285 | if (!expected) |
385 | *page = NULL; | 286 | radix_tree_preload_end(); |
386 | } | 287 | } |
387 | shmem_dir_unmap(dir); | 288 | if (error) |
388 | return shmem_swp_map(subdir) + offset; | 289 | mem_cgroup_uncharge_cache_page(page); |
290 | return error; | ||
389 | } | 291 | } |
390 | 292 | ||
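shmem_add_to_page_cache() splits its work so that the part that may sleep (radix_tree_preload()) happens before mapping->tree_lock is taken, and the critical section only links pointers and bumps counters. The same preallocate-then-lock shape, reduced to a userspace linked list (the names here are invented, not kernel APIs):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Sketch of the "preload, then insert under the lock" shape: the
 * allocation (which may sleep in the kernel) happens before the lock
 * is taken, so the critical section only links pointers.
 */
struct node {
	struct node *next;
	int value;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *list_head;

static int insert_value(int value)
{
	struct node *n = malloc(sizeof(*n));	/* "preload" outside the lock */

	if (!n)
		return -1;
	n->value = value;

	pthread_mutex_lock(&list_lock);		/* short, non-sleeping section */
	n->next = list_head;
	list_head = n;
	pthread_mutex_unlock(&list_lock);
	return 0;
}

int main(void)
{
	insert_value(1);
	insert_value(2);
	for (struct node *n = list_head; n; n = n->next)
		printf("%d\n", n->value);
	return 0;
}
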
391 | static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) | 293 | /* |
294 | * Like delete_from_page_cache, but substitutes swap for page. | ||
295 | */ | ||
296 | static void shmem_delete_from_page_cache(struct page *page, void *radswap) | ||
392 | { | 297 | { |
393 | long incdec = value? 1: -1; | 298 | struct address_space *mapping = page->mapping; |
299 | int error; | ||
394 | 300 | ||
395 | entry->val = value; | 301 | spin_lock_irq(&mapping->tree_lock); |
396 | info->swapped += incdec; | 302 | error = shmem_radix_tree_replace(mapping, page->index, page, radswap); |
397 | if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { | 303 | page->mapping = NULL; |
398 | struct page *page = kmap_atomic_to_page(entry); | 304 | mapping->nrpages--; |
399 | set_page_private(page, page_private(page) + incdec); | 305 | __dec_zone_page_state(page, NR_FILE_PAGES); |
400 | } | 306 | __dec_zone_page_state(page, NR_SHMEM); |
307 | spin_unlock_irq(&mapping->tree_lock); | ||
308 | page_cache_release(page); | ||
309 | BUG_ON(error); | ||
401 | } | 310 | } |
402 | 311 | ||
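shmem_delete_from_page_cache() leaves swp_to_radix_entry(swap) behind in the slot that used to hold the page pointer: the swap value is shifted up and tagged with a low bit so it can never be confused with an aligned pointer. A userspace sketch of that tagged-pointer convention; the shift and tag constants below mirror the exceptional-entry scheme of this kernel series, but the helper names and exact values are for illustration only:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the "exceptional entry" convention: a swap value is shifted
 * up and tagged with bit 1 so the same slot can hold either a pointer
 * (assumed at least 4-byte aligned, as struct page pointers are) or a
 * small integer, and the two can be told apart by that bit.
 */
#define EXCEPTIONAL_ENTRY	2UL	/* bit 1 set: not a pointer */
#define EXCEPTIONAL_SHIFT	2

static void *encode_swap(unsigned long swp_val)
{
	return (void *)((swp_val << EXCEPTIONAL_SHIFT) | EXCEPTIONAL_ENTRY);
}

static int is_exceptional(void *entry)
{
	return ((uintptr_t)entry & EXCEPTIONAL_ENTRY) != 0;
}

static unsigned long decode_swap(void *entry)
{
	return (uintptr_t)entry >> EXCEPTIONAL_SHIFT;
}

int main(void)
{
	static int page;			/* stands in for a struct page */
	void *slot;

	slot = &page;				/* ordinary page pointer */
	assert(!is_exceptional(slot));

	slot = encode_swap(12345);		/* swap entry stored in place */
	assert(is_exceptional(slot));
	assert(decode_swap(slot) == 12345);

	printf("tagged-pointer roundtrip ok\n");
	return 0;
}
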
403 | /** | 312 | /* |
404 | * shmem_swp_alloc - get the position of the swap entry for the page. | 313 | * Like find_get_pages, but collecting swap entries as well as pages. |
405 | * @info: info structure for the inode | ||
406 | * @index: index of the page to find | ||
407 | * @sgp: check and recheck i_size? skip allocation? | ||
408 | * | ||
409 | * If the entry does not exist, allocate it. | ||
410 | */ | 314 | */ |
411 | static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) | 315 | static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, |
412 | { | 316 | pgoff_t start, unsigned int nr_pages, |
413 | struct inode *inode = &info->vfs_inode; | 317 | struct page **pages, pgoff_t *indices) |
414 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 318 | { |
415 | struct page *page = NULL; | 319 | unsigned int i; |
416 | swp_entry_t *entry; | 320 | unsigned int ret; |
417 | 321 | unsigned int nr_found; | |
418 | if (sgp != SGP_WRITE && | 322 | |
419 | ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | 323 | rcu_read_lock(); |
420 | return ERR_PTR(-EINVAL); | 324 | restart: |
421 | 325 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | |
422 | while (!(entry = shmem_swp_entry(info, index, &page))) { | 326 | (void ***)pages, indices, start, nr_pages); |
423 | if (sgp == SGP_READ) | 327 | ret = 0; |
424 | return shmem_swp_map(ZERO_PAGE(0)); | 328 | for (i = 0; i < nr_found; i++) { |
425 | /* | 329 | struct page *page; |
426 | * Test used_blocks against 1 less max_blocks, since we have 1 data | 330 | repeat: |
427 | * page (and perhaps indirect index pages) yet to allocate: | 331 | page = radix_tree_deref_slot((void **)pages[i]); |
428 | * a waste to allocate index if we cannot allocate data. | 332 | if (unlikely(!page)) |
429 | */ | 333 | continue; |
430 | if (sbinfo->max_blocks) { | 334 | if (radix_tree_exception(page)) { |
431 | if (percpu_counter_compare(&sbinfo->used_blocks, | 335 | if (radix_tree_deref_retry(page)) |
432 | sbinfo->max_blocks - 1) >= 0) | 336 | goto restart; |
433 | return ERR_PTR(-ENOSPC); | 337 | /* |
434 | percpu_counter_inc(&sbinfo->used_blocks); | 338 | * Otherwise, we must be storing a swap entry |
435 | spin_lock(&inode->i_lock); | 339 | * here as an exceptional entry: so return it |
436 | inode->i_blocks += BLOCKS_PER_PAGE; | 340 | * without attempting to raise page count. |
437 | spin_unlock(&inode->i_lock); | 341 | */ |
342 | goto export; | ||
438 | } | 343 | } |
344 | if (!page_cache_get_speculative(page)) | ||
345 | goto repeat; | ||
439 | 346 | ||
440 | spin_unlock(&info->lock); | 347 | /* Has the page moved? */ |
441 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); | 348 | if (unlikely(page != *((void **)pages[i]))) { |
442 | spin_lock(&info->lock); | 349 | page_cache_release(page); |
443 | 350 | goto repeat; | |
444 | if (!page) { | ||
445 | shmem_free_blocks(inode, 1); | ||
446 | return ERR_PTR(-ENOMEM); | ||
447 | } | ||
448 | if (sgp != SGP_WRITE && | ||
449 | ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | ||
450 | entry = ERR_PTR(-EINVAL); | ||
451 | break; | ||
452 | } | 351 | } |
453 | if (info->next_index <= index) | 352 | export: |
454 | info->next_index = index + 1; | 353 | indices[ret] = indices[i]; |
455 | } | 354 | pages[ret] = page; |
456 | if (page) { | 355 | ret++; |
457 | /* another task gave its page, or truncated the file */ | 356 | } |
458 | shmem_free_blocks(inode, 1); | 357 | if (unlikely(!ret && nr_found)) |
459 | shmem_dir_free(page); | 358 | goto restart; |
460 | } | 359 | rcu_read_unlock(); |
461 | if (info->next_index <= index && !IS_ERR(entry)) | 360 | return ret; |
462 | info->next_index = index + 1; | ||
463 | return entry; | ||
464 | } | 361 | } |
465 | 362 | ||
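shmem_find_get_pages_and_swap() above walks the tree under rcu_read_lock(), takes a speculative reference on each page, and then rechecks that the slot still points at the same page, retrying if it moved. A simplified userspace sketch of that get-then-recheck pattern using C11 atomics; it ignores freeing (which the kernel defers via RCU) and, unlike page_cache_get_speculative(), does not refuse objects whose count has already dropped to zero:

#include <stdatomic.h>
#include <stdio.h>

/*
 * Speculative get: take a reference on whatever the slot points at,
 * then verify the slot still points at the same object; if it moved
 * in the meantime, drop the reference and retry.
 */
struct obj {
	atomic_int refcount;
	int id;
};

static _Atomic(struct obj *) slot;

static struct obj *slot_get(void)
{
	struct obj *o;

	for (;;) {
		o = atomic_load(&slot);
		if (!o)
			return NULL;
		atomic_fetch_add(&o->refcount, 1);	/* speculative get */
		if (atomic_load(&slot) == o)		/* has it moved? */
			return o;
		atomic_fetch_sub(&o->refcount, 1);	/* raced: retry */
	}
}

int main(void)
{
	static struct obj a = { .refcount = 1, .id = 42 };
	struct obj *o;

	atomic_store(&slot, &a);
	o = slot_get();
	printf("got id %d, refcount now %d\n", o->id,
	       atomic_load(&o->refcount));
	return 0;
}
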
466 | /** | 363 | /* |
467 | * shmem_free_swp - free some swap entries in a directory | 364 | * Remove swap entry from radix tree, free the swap and its page cache. |
468 | * @dir: pointer to the directory | ||
469 | * @edir: pointer after last entry of the directory | ||
470 | * @punch_lock: pointer to spinlock when needed for the holepunch case | ||
471 | */ | 365 | */ |
472 | static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, | 366 | static int shmem_free_swap(struct address_space *mapping, |
473 | spinlock_t *punch_lock) | 367 | pgoff_t index, void *radswap) |
474 | { | 368 | { |
475 | spinlock_t *punch_unlock = NULL; | 369 | int error; |
476 | swp_entry_t *ptr; | 370 | |
477 | int freed = 0; | 371 | spin_lock_irq(&mapping->tree_lock); |
478 | 372 | error = shmem_radix_tree_replace(mapping, index, radswap, NULL); | |
479 | for (ptr = dir; ptr < edir; ptr++) { | 373 | spin_unlock_irq(&mapping->tree_lock); |
480 | if (ptr->val) { | 374 | if (!error) |
481 | if (unlikely(punch_lock)) { | 375 | free_swap_and_cache(radix_to_swp_entry(radswap)); |
482 | punch_unlock = punch_lock; | 376 | return error; |
483 | punch_lock = NULL; | ||
484 | spin_lock(punch_unlock); | ||
485 | if (!ptr->val) | ||
486 | continue; | ||
487 | } | ||
488 | free_swap_and_cache(*ptr); | ||
489 | *ptr = (swp_entry_t){0}; | ||
490 | freed++; | ||
491 | } | ||
492 | } | ||
493 | if (punch_unlock) | ||
494 | spin_unlock(punch_unlock); | ||
495 | return freed; | ||
496 | } | ||
497 | |||
498 | static int shmem_map_and_free_swp(struct page *subdir, int offset, | ||
499 | int limit, struct page ***dir, spinlock_t *punch_lock) | ||
500 | { | ||
501 | swp_entry_t *ptr; | ||
502 | int freed = 0; | ||
503 | |||
504 | ptr = shmem_swp_map(subdir); | ||
505 | for (; offset < limit; offset += LATENCY_LIMIT) { | ||
506 | int size = limit - offset; | ||
507 | if (size > LATENCY_LIMIT) | ||
508 | size = LATENCY_LIMIT; | ||
509 | freed += shmem_free_swp(ptr+offset, ptr+offset+size, | ||
510 | punch_lock); | ||
511 | if (need_resched()) { | ||
512 | shmem_swp_unmap(ptr); | ||
513 | if (*dir) { | ||
514 | shmem_dir_unmap(*dir); | ||
515 | *dir = NULL; | ||
516 | } | ||
517 | cond_resched(); | ||
518 | ptr = shmem_swp_map(subdir); | ||
519 | } | ||
520 | } | ||
521 | shmem_swp_unmap(ptr); | ||
522 | return freed; | ||
523 | } | 377 | } |
524 | 378 | ||
525 | static void shmem_free_pages(struct list_head *next) | 379 | /* |
380 | * Pagevec may contain swap entries, so shuffle up pages before releasing. | ||
381 | */ | ||
382 | static void shmem_pagevec_release(struct pagevec *pvec) | ||
526 | { | 383 | { |
527 | struct page *page; | 384 | int i, j; |
528 | int freed = 0; | 385 | |
529 | 386 | for (i = 0, j = 0; i < pagevec_count(pvec); i++) { | |
530 | do { | 387 | struct page *page = pvec->pages[i]; |
531 | page = container_of(next, struct page, lru); | 388 | if (!radix_tree_exceptional_entry(page)) |
532 | next = next->next; | 389 | pvec->pages[j++] = page; |
533 | shmem_dir_free(page); | 390 | } |
534 | freed++; | 391 | pvec->nr = j; |
535 | if (freed >= LATENCY_LIMIT) { | 392 | pagevec_release(pvec); |
536 | cond_resched(); | ||
537 | freed = 0; | ||
538 | } | ||
539 | } while (next); | ||
540 | } | 393 | } |
541 | 394 | ||
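shmem_pagevec_release() compacts the pagevec in place, squeezing out the exceptional swap entries so that only real pages reach pagevec_release(). The same two-index, in-place filtering idiom on a plain array (negative numbers stand in for the entries radix_tree_exceptional_entry() would reject):

#include <stdio.h>

/*
 * Walk the array with two indices, keeping only entries that pass the
 * test and shrinking the count to match (like pvec->nr = j above).
 */
static int keep_non_negative(int *vec, int nr)
{
	int i, j;

	for (i = 0, j = 0; i < nr; i++)
		if (vec[i] >= 0)	/* a "real page": keep it */
			vec[j++] = vec[i];
	return j;
}

int main(void)
{
	int vec[] = { 3, -1, 7, -5, 9 };
	int nr = keep_non_negative(vec, 5);

	for (int i = 0; i < nr; i++)
		printf("%d ", vec[i]);	/* prints: 3 7 9 */
	printf("\n");
	return 0;
}
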
542 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 395 | /* |
396 | * Remove range of pages and swap entries from radix tree, and free them. | ||
397 | */ | ||
398 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
543 | { | 399 | { |
400 | struct address_space *mapping = inode->i_mapping; | ||
544 | struct shmem_inode_info *info = SHMEM_I(inode); | 401 | struct shmem_inode_info *info = SHMEM_I(inode); |
545 | unsigned long idx; | 402 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
546 | unsigned long size; | 403 | unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
547 | unsigned long limit; | 404 | pgoff_t end = (lend >> PAGE_CACHE_SHIFT); |
548 | unsigned long stage; | 405 | struct pagevec pvec; |
549 | unsigned long diroff; | 406 | pgoff_t indices[PAGEVEC_SIZE]; |
550 | struct page **dir; | ||
551 | struct page *topdir; | ||
552 | struct page *middir; | ||
553 | struct page *subdir; | ||
554 | swp_entry_t *ptr; | ||
555 | LIST_HEAD(pages_to_free); | ||
556 | long nr_pages_to_free = 0; | ||
557 | long nr_swaps_freed = 0; | 407 | long nr_swaps_freed = 0; |
558 | int offset; | 408 | pgoff_t index; |
559 | int freed; | 409 | int i; |
560 | int punch_hole; | ||
561 | spinlock_t *needs_lock; | ||
562 | spinlock_t *punch_lock; | ||
563 | unsigned long upper_limit; | ||
564 | 410 | ||
565 | truncate_inode_pages_range(inode->i_mapping, start, end); | 411 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); |
566 | 412 | ||
567 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 413 | pagevec_init(&pvec, 0); |
568 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 414 | index = start; |
569 | if (idx >= info->next_index) | 415 | while (index <= end) { |
570 | return; | 416 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
417 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | ||
418 | pvec.pages, indices); | ||
419 | if (!pvec.nr) | ||
420 | break; | ||
421 | mem_cgroup_uncharge_start(); | ||
422 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
423 | struct page *page = pvec.pages[i]; | ||
571 | 424 | ||
572 | spin_lock(&info->lock); | 425 | index = indices[i]; |
573 | info->flags |= SHMEM_TRUNCATE; | 426 | if (index > end) |
574 | if (likely(end == (loff_t) -1)) { | 427 | break; |
575 | limit = info->next_index; | ||
576 | upper_limit = SHMEM_MAX_INDEX; | ||
577 | info->next_index = idx; | ||
578 | needs_lock = NULL; | ||
579 | punch_hole = 0; | ||
580 | } else { | ||
581 | if (end + 1 >= inode->i_size) { /* we may free a little more */ | ||
582 | limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> | ||
583 | PAGE_CACHE_SHIFT; | ||
584 | upper_limit = SHMEM_MAX_INDEX; | ||
585 | } else { | ||
586 | limit = (end + 1) >> PAGE_CACHE_SHIFT; | ||
587 | upper_limit = limit; | ||
588 | } | ||
589 | needs_lock = &info->lock; | ||
590 | punch_hole = 1; | ||
591 | } | ||
592 | 428 | ||
593 | topdir = info->i_indirect; | 429 | if (radix_tree_exceptional_entry(page)) { |
594 | if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { | 430 | nr_swaps_freed += !shmem_free_swap(mapping, |
595 | info->i_indirect = NULL; | 431 | index, page); |
596 | nr_pages_to_free++; | 432 | continue; |
597 | list_add(&topdir->lru, &pages_to_free); | 433 | } |
434 | |||
435 | if (!trylock_page(page)) | ||
436 | continue; | ||
437 | if (page->mapping == mapping) { | ||
438 | VM_BUG_ON(PageWriteback(page)); | ||
439 | truncate_inode_page(mapping, page); | ||
440 | } | ||
441 | unlock_page(page); | ||
442 | } | ||
443 | shmem_pagevec_release(&pvec); | ||
444 | mem_cgroup_uncharge_end(); | ||
445 | cond_resched(); | ||
446 | index++; | ||
598 | } | 447 | } |
599 | spin_unlock(&info->lock); | ||
600 | 448 | ||
601 | if (info->swapped && idx < SHMEM_NR_DIRECT) { | 449 | if (partial) { |
602 | ptr = info->i_direct; | 450 | struct page *page = NULL; |
603 | size = limit; | 451 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); |
604 | if (size > SHMEM_NR_DIRECT) | 452 | if (page) { |
605 | size = SHMEM_NR_DIRECT; | 453 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); |
606 | nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); | 454 | set_page_dirty(page); |
455 | unlock_page(page); | ||
456 | page_cache_release(page); | ||
457 | } | ||
607 | } | 458 | } |
608 | 459 | ||
609 | /* | 460 | index = start; |
610 | * If there are no indirect blocks or we are punching a hole | 461 | for ( ; ; ) { |
611 | * below indirect blocks, nothing to be done. | 462 | cond_resched(); |
612 | */ | 463 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
613 | if (!topdir || limit <= SHMEM_NR_DIRECT) | 464 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, |
614 | goto done2; | 465 | pvec.pages, indices); |
466 | if (!pvec.nr) { | ||
467 | if (index == start) | ||
468 | break; | ||
469 | index = start; | ||
470 | continue; | ||
471 | } | ||
472 | if (index == start && indices[0] > end) { | ||
473 | shmem_pagevec_release(&pvec); | ||
474 | break; | ||
475 | } | ||
476 | mem_cgroup_uncharge_start(); | ||
477 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
478 | struct page *page = pvec.pages[i]; | ||
615 | 479 | ||
616 | /* | 480 | index = indices[i]; |
617 | * The truncation case has already dropped info->lock, and we're safe | 481 | if (index > end) |
618 | * because i_size and next_index have already been lowered, preventing | 482 | break; |
619 | * access beyond. But in the punch_hole case, we still need to take | ||
620 | * the lock when updating the swap directory, because there might be | ||
621 | * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or | ||
622 | * shmem_writepage. However, whenever we find we can remove a whole | ||
623 | * directory page (not at the misaligned start or end of the range), | ||
624 | * we first NULLify its pointer in the level above, and then have no | ||
625 | * need to take the lock when updating its contents: needs_lock and | ||
626 | * punch_lock (either pointing to info->lock or NULL) manage this. | ||
627 | */ | ||
628 | 483 | ||
629 | upper_limit -= SHMEM_NR_DIRECT; | 484 | if (radix_tree_exceptional_entry(page)) { |
630 | limit -= SHMEM_NR_DIRECT; | 485 | nr_swaps_freed += !shmem_free_swap(mapping, |
631 | idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; | 486 | index, page); |
632 | offset = idx % ENTRIES_PER_PAGE; | 487 | continue; |
633 | idx -= offset; | ||
634 | |||
635 | dir = shmem_dir_map(topdir); | ||
636 | stage = ENTRIES_PER_PAGEPAGE/2; | ||
637 | if (idx < ENTRIES_PER_PAGEPAGE/2) { | ||
638 | middir = topdir; | ||
639 | diroff = idx/ENTRIES_PER_PAGE; | ||
640 | } else { | ||
641 | dir += ENTRIES_PER_PAGE/2; | ||
642 | dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; | ||
643 | while (stage <= idx) | ||
644 | stage += ENTRIES_PER_PAGEPAGE; | ||
645 | middir = *dir; | ||
646 | if (*dir) { | ||
647 | diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % | ||
648 | ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; | ||
649 | if (!diroff && !offset && upper_limit >= stage) { | ||
650 | if (needs_lock) { | ||
651 | spin_lock(needs_lock); | ||
652 | *dir = NULL; | ||
653 | spin_unlock(needs_lock); | ||
654 | needs_lock = NULL; | ||
655 | } else | ||
656 | *dir = NULL; | ||
657 | nr_pages_to_free++; | ||
658 | list_add(&middir->lru, &pages_to_free); | ||
659 | } | 488 | } |
660 | shmem_dir_unmap(dir); | ||
661 | dir = shmem_dir_map(middir); | ||
662 | } else { | ||
663 | diroff = 0; | ||
664 | offset = 0; | ||
665 | idx = stage; | ||
666 | } | ||
667 | } | ||
668 | 489 | ||
669 | for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { | 490 | lock_page(page); |
670 | if (unlikely(idx == stage)) { | 491 | if (page->mapping == mapping) { |
671 | shmem_dir_unmap(dir); | 492 | VM_BUG_ON(PageWriteback(page)); |
672 | dir = shmem_dir_map(topdir) + | 493 | truncate_inode_page(mapping, page); |
673 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; | ||
674 | while (!*dir) { | ||
675 | dir++; | ||
676 | idx += ENTRIES_PER_PAGEPAGE; | ||
677 | if (idx >= limit) | ||
678 | goto done1; | ||
679 | } | 494 | } |
680 | stage = idx + ENTRIES_PER_PAGEPAGE; | 495 | unlock_page(page); |
681 | middir = *dir; | ||
682 | if (punch_hole) | ||
683 | needs_lock = &info->lock; | ||
684 | if (upper_limit >= stage) { | ||
685 | if (needs_lock) { | ||
686 | spin_lock(needs_lock); | ||
687 | *dir = NULL; | ||
688 | spin_unlock(needs_lock); | ||
689 | needs_lock = NULL; | ||
690 | } else | ||
691 | *dir = NULL; | ||
692 | nr_pages_to_free++; | ||
693 | list_add(&middir->lru, &pages_to_free); | ||
694 | } | ||
695 | shmem_dir_unmap(dir); | ||
696 | cond_resched(); | ||
697 | dir = shmem_dir_map(middir); | ||
698 | diroff = 0; | ||
699 | } | ||
700 | punch_lock = needs_lock; | ||
701 | subdir = dir[diroff]; | ||
702 | if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { | ||
703 | if (needs_lock) { | ||
704 | spin_lock(needs_lock); | ||
705 | dir[diroff] = NULL; | ||
706 | spin_unlock(needs_lock); | ||
707 | punch_lock = NULL; | ||
708 | } else | ||
709 | dir[diroff] = NULL; | ||
710 | nr_pages_to_free++; | ||
711 | list_add(&subdir->lru, &pages_to_free); | ||
712 | } | ||
713 | if (subdir && page_private(subdir) /* has swap entries */) { | ||
714 | size = limit - idx; | ||
715 | if (size > ENTRIES_PER_PAGE) | ||
716 | size = ENTRIES_PER_PAGE; | ||
717 | freed = shmem_map_and_free_swp(subdir, | ||
718 | offset, size, &dir, punch_lock); | ||
719 | if (!dir) | ||
720 | dir = shmem_dir_map(middir); | ||
721 | nr_swaps_freed += freed; | ||
722 | if (offset || punch_lock) { | ||
723 | spin_lock(&info->lock); | ||
724 | set_page_private(subdir, | ||
725 | page_private(subdir) - freed); | ||
726 | spin_unlock(&info->lock); | ||
727 | } else | ||
728 | BUG_ON(page_private(subdir) != freed); | ||
729 | } | 496 | } |
730 | offset = 0; | 497 | shmem_pagevec_release(&pvec); |
731 | } | 498 | mem_cgroup_uncharge_end(); |
732 | done1: | 499 | index++; |
733 | shmem_dir_unmap(dir); | ||
734 | done2: | ||
735 | if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { | ||
736 | /* | ||
737 | * Call truncate_inode_pages again: racing shmem_unuse_inode | ||
738 | * may have swizzled a page in from swap since | ||
739 | * truncate_pagecache or generic_delete_inode did it, before we | ||
740 | * lowered next_index. Also, though shmem_getpage checks | ||
741 | * i_size before adding to cache, no recheck after: so fix the | ||
742 | * narrow window there too. | ||
743 | */ | ||
744 | truncate_inode_pages_range(inode->i_mapping, start, end); | ||
745 | } | 500 | } |
746 | 501 | ||
747 | spin_lock(&info->lock); | 502 | spin_lock(&info->lock); |
748 | info->flags &= ~SHMEM_TRUNCATE; | ||
749 | info->swapped -= nr_swaps_freed; | 503 | info->swapped -= nr_swaps_freed; |
750 | if (nr_pages_to_free) | ||
751 | shmem_free_blocks(inode, nr_pages_to_free); | ||
752 | shmem_recalc_inode(inode); | 504 | shmem_recalc_inode(inode); |
753 | spin_unlock(&info->lock); | 505 | spin_unlock(&info->lock); |
754 | 506 | ||
755 | /* | 507 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
756 | * Empty swap vector directory pages to be freed? | ||
757 | */ | ||
758 | if (!list_empty(&pages_to_free)) { | ||
759 | pages_to_free.prev->next = NULL; | ||
760 | shmem_free_pages(pages_to_free.next); | ||
761 | } | ||
762 | } | 508 | } |
763 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 509 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
764 | 510 | ||
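The index arithmetic at the top of the new shmem_truncate_range() converts the byte range into page terms: start is lstart rounded up to a page boundary, partial is the offset of lstart inside its page (which is zeroed rather than dropped), and end is the page containing lend. A worked example under the usual 4 KiB PAGE_CACHE_SIZE, with values chosen purely for illustration:

#include <stdio.h>

/*
 * The index arithmetic of the new shmem_truncate_range(), assuming a
 * 4096-byte PAGE_CACHE_SIZE.  lstart/lend are the inclusive byte range
 * being truncated; ~0ULL stands in for the kernel's (loff_t)-1.
 */
#define PAGE_CACHE_SHIFT	12
#define PAGE_CACHE_SIZE		(1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long long lstart = 5000;	/* e.g. the new i_size */
	unsigned long long lend = ~0ULL;	/* "to end of file" */

	unsigned long long start = (lstart + PAGE_CACHE_SIZE - 1)
						>> PAGE_CACHE_SHIFT;
	unsigned long partial = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned long long end = lend >> PAGE_CACHE_SHIFT;

	/*
	 * start = 2: whole pages from index 2 onwards are dropped.
	 * partial = 904: bytes 904..4095 of page 1 are zeroed instead.
	 * end is huge, i.e. effectively "truncate to EOF".
	 */
	printf("start=%llu partial=%lu end=%llu\n", start, partial, end);
	return 0;
}
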
@@ -774,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
774 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { | 520 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
775 | loff_t oldsize = inode->i_size; | 521 | loff_t oldsize = inode->i_size; |
776 | loff_t newsize = attr->ia_size; | 522 | loff_t newsize = attr->ia_size; |
777 | struct page *page = NULL; | ||
778 | 523 | ||
779 | if (newsize < oldsize) { | ||
780 | /* | ||
781 | * If truncating down to a partial page, then | ||
782 | * if that page is already allocated, hold it | ||
783 | * in memory until the truncation is over, so | ||
784 | * truncate_partial_page cannot miss it were | ||
785 | * it assigned to swap. | ||
786 | */ | ||
787 | if (newsize & (PAGE_CACHE_SIZE-1)) { | ||
788 | (void) shmem_getpage(inode, | ||
789 | newsize >> PAGE_CACHE_SHIFT, | ||
790 | &page, SGP_READ, NULL); | ||
791 | if (page) | ||
792 | unlock_page(page); | ||
793 | } | ||
794 | /* | ||
795 | * Reset SHMEM_PAGEIN flag so that shmem_truncate can | ||
796 | * detect if any pages might have been added to cache | ||
797 | * after truncate_inode_pages. But we needn't bother | ||
798 | * if it's being fully truncated to zero-length: the | ||
799 | * nrpages check is efficient enough in that case. | ||
800 | */ | ||
801 | if (newsize) { | ||
802 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
803 | spin_lock(&info->lock); | ||
804 | info->flags &= ~SHMEM_PAGEIN; | ||
805 | spin_unlock(&info->lock); | ||
806 | } | ||
807 | } | ||
808 | if (newsize != oldsize) { | 524 | if (newsize != oldsize) { |
809 | i_size_write(inode, newsize); | 525 | i_size_write(inode, newsize); |
810 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 526 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
@@ -816,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
816 | /* unmap again to remove racily COWed private pages */ | 532 | /* unmap again to remove racily COWed private pages */ |
817 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | 533 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); |
818 | } | 534 | } |
819 | if (page) | ||
820 | page_cache_release(page); | ||
821 | } | 535 | } |
822 | 536 | ||
823 | setattr_copy(inode, attr); | 537 | setattr_copy(inode, attr); |
@@ -842,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode) | |||
842 | list_del_init(&info->swaplist); | 556 | list_del_init(&info->swaplist); |
843 | mutex_unlock(&shmem_swaplist_mutex); | 557 | mutex_unlock(&shmem_swaplist_mutex); |
844 | } | 558 | } |
845 | } | 559 | } else |
560 | kfree(info->symlink); | ||
846 | 561 | ||
847 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { | 562 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { |
848 | kfree(xattr->name); | 563 | kfree(xattr->name); |
@@ -853,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode) | |||
853 | end_writeback(inode); | 568 | end_writeback(inode); |
854 | } | 569 | } |
855 | 570 | ||
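The eviction change above adds kfree(info->symlink) for non-regular inodes, which suggests the symlink target is kept in a separately allocated buffer that the inode owns and must release when it is evicted. A rough userspace sketch of that ownership rule (struct fake_inode and its helpers are invented, and this is only an analogy for the kernel behaviour implied by the diff):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* The inode owns an optional heap buffer; eviction frees it. */
struct fake_inode {
	char *symlink;		/* NULL for regular files */
};

static int set_symlink(struct fake_inode *inode, const char *target)
{
	inode->symlink = malloc(strlen(target) + 1);
	if (!inode->symlink)
		return -1;
	strcpy(inode->symlink, target);
	return 0;
}

static void evict(struct fake_inode *inode)
{
	free(inode->symlink);	/* free(NULL) is a harmless no-op */
	inode->symlink = NULL;
}

int main(void)
{
	struct fake_inode inode = { NULL };

	set_symlink(&inode, "/tmp/target");
	printf("%s\n", inode.symlink);
	evict(&inode);
	return 0;
}
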
856 | static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) | 571 | /* |
857 | { | 572 | * If swap found in inode, free it and move page from swapcache to filecache. |
858 | swp_entry_t *ptr; | 573 | */ |
859 | 574 | static int shmem_unuse_inode(struct shmem_inode_info *info, | |
860 | for (ptr = dir; ptr < edir; ptr++) { | 575 | swp_entry_t swap, struct page *page) |
861 | if (ptr->val == entry.val) | ||
862 | return ptr - dir; | ||
863 | } | ||
864 | return -1; | ||
865 | } | ||
866 | |||
867 | static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) | ||
868 | { | 576 | { |
869 | struct address_space *mapping; | 577 | struct address_space *mapping = info->vfs_inode.i_mapping; |
870 | unsigned long idx; | 578 | void *radswap; |
871 | unsigned long size; | 579 | pgoff_t index; |
872 | unsigned long limit; | ||
873 | unsigned long stage; | ||
874 | struct page **dir; | ||
875 | struct page *subdir; | ||
876 | swp_entry_t *ptr; | ||
877 | int offset; | ||
878 | int error; | 580 | int error; |
879 | 581 | ||
880 | idx = 0; | 582 | radswap = swp_to_radix_entry(swap); |
881 | ptr = info->i_direct; | 583 | index = radix_tree_locate_item(&mapping->page_tree, radswap); |
882 | spin_lock(&info->lock); | 584 | if (index == -1) |
883 | if (!info->swapped) { | 585 | return 0; |
884 | list_del_init(&info->swaplist); | ||
885 | goto lost2; | ||
886 | } | ||
887 | limit = info->next_index; | ||
888 | size = limit; | ||
889 | if (size > SHMEM_NR_DIRECT) | ||
890 | size = SHMEM_NR_DIRECT; | ||
891 | offset = shmem_find_swp(entry, ptr, ptr+size); | ||
892 | if (offset >= 0) { | ||
893 | shmem_swp_balance_unmap(); | ||
894 | goto found; | ||
895 | } | ||
896 | if (!info->i_indirect) | ||
897 | goto lost2; | ||
898 | |||
899 | dir = shmem_dir_map(info->i_indirect); | ||
900 | stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; | ||
901 | |||
902 | for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { | ||
903 | if (unlikely(idx == stage)) { | ||
904 | shmem_dir_unmap(dir-1); | ||
905 | if (cond_resched_lock(&info->lock)) { | ||
906 | /* check it has not been truncated */ | ||
907 | if (limit > info->next_index) { | ||
908 | limit = info->next_index; | ||
909 | if (idx >= limit) | ||
910 | goto lost2; | ||
911 | } | ||
912 | } | ||
913 | dir = shmem_dir_map(info->i_indirect) + | ||
914 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; | ||
915 | while (!*dir) { | ||
916 | dir++; | ||
917 | idx += ENTRIES_PER_PAGEPAGE; | ||
918 | if (idx >= limit) | ||
919 | goto lost1; | ||
920 | } | ||
921 | stage = idx + ENTRIES_PER_PAGEPAGE; | ||
922 | subdir = *dir; | ||
923 | shmem_dir_unmap(dir); | ||
924 | dir = shmem_dir_map(subdir); | ||
925 | } | ||
926 | subdir = *dir; | ||
927 | if (subdir && page_private(subdir)) { | ||
928 | ptr = shmem_swp_map(subdir); | ||
929 | size = limit - idx; | ||
930 | if (size > ENTRIES_PER_PAGE) | ||
931 | size = ENTRIES_PER_PAGE; | ||
932 | offset = shmem_find_swp(entry, ptr, ptr+size); | ||
933 | shmem_swp_unmap(ptr); | ||
934 | if (offset >= 0) { | ||
935 | shmem_dir_unmap(dir); | ||
936 | ptr = shmem_swp_map(subdir); | ||
937 | goto found; | ||
938 | } | ||
939 | } | ||
940 | } | ||
941 | lost1: | ||
942 | shmem_dir_unmap(dir-1); | ||
943 | lost2: | ||
944 | spin_unlock(&info->lock); | ||
945 | return 0; | ||
946 | found: | ||
947 | idx += offset; | ||
948 | ptr += offset; | ||
949 | 586 | ||
950 | /* | 587 | /* |
951 | * Move _head_ to start search for next from here. | 588 | * Move _head_ to start search for next from here. |
952 | * But be careful: shmem_evict_inode checks list_empty without taking | 589 | * But be careful: shmem_evict_inode checks list_empty without taking |
953 | * mutex, and there's an instant in list_move_tail when info->swaplist | 590 | * mutex, and there's an instant in list_move_tail when info->swaplist |
954 | * would appear empty, if it were the only one on shmem_swaplist. We | 591 | * would appear empty, if it were the only one on shmem_swaplist. |
955 | * could avoid doing it if inode NULL; or use this minor optimization. | ||
956 | */ | 592 | */ |
957 | if (shmem_swaplist.next != &info->swaplist) | 593 | if (shmem_swaplist.next != &info->swaplist) |
958 | list_move_tail(&shmem_swaplist, &info->swaplist); | 594 | list_move_tail(&shmem_swaplist, &info->swaplist); |
@@ -962,42 +598,34 @@ found: | |||
962 | * but also to hold up shmem_evict_inode(): so inode cannot be freed | 598 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
963 | * beneath us (pagelock doesn't help until the page is in pagecache). | 599 | * beneath us (pagelock doesn't help until the page is in pagecache). |
964 | */ | 600 | */ |
965 | mapping = info->vfs_inode.i_mapping; | 601 | error = shmem_add_to_page_cache(page, mapping, index, |
966 | error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); | 602 | GFP_NOWAIT, radswap); |
967 | /* which does mem_cgroup_uncharge_cache_page on error */ | 603 | /* which does mem_cgroup_uncharge_cache_page on error */ |
968 | 604 | ||
969 | if (error == -EEXIST) { | 605 | if (error != -ENOMEM) { |
970 | struct page *filepage = find_get_page(mapping, idx); | 606 | /* |
971 | error = 1; | 607 | * Truncation and eviction use free_swap_and_cache(), which |
972 | if (filepage) { | 608 | * only does trylock page: if we raced, best clean up here. |
973 | /* | 609 | */ |
974 | * There might be a more uptodate page coming down | ||
975 | * from a stacked writepage: forget our swappage if so. | ||
976 | */ | ||
977 | if (PageUptodate(filepage)) | ||
978 | error = 0; | ||
979 | page_cache_release(filepage); | ||
980 | } | ||
981 | } | ||
982 | if (!error) { | ||
983 | delete_from_swap_cache(page); | 610 | delete_from_swap_cache(page); |
984 | set_page_dirty(page); | 611 | set_page_dirty(page); |
985 | info->flags |= SHMEM_PAGEIN; | 612 | if (!error) { |
986 | shmem_swp_set(info, ptr, 0); | 613 | spin_lock(&info->lock); |
987 | swap_free(entry); | 614 | info->swapped--; |
615 | spin_unlock(&info->lock); | ||
616 | swap_free(swap); | ||
617 | } | ||
988 | error = 1; /* not an error, but entry was found */ | 618 | error = 1; /* not an error, but entry was found */ |
989 | } | 619 | } |
990 | shmem_swp_unmap(ptr); | ||
991 | spin_unlock(&info->lock); | ||
992 | return error; | 620 | return error; |
993 | } | 621 | } |
994 | 622 | ||
995 | /* | 623 | /* |
996 | * shmem_unuse() search for an eventually swapped out shmem page. | 624 | * Search through swapped inodes to find and replace swap by page. |
997 | */ | 625 | */ |
998 | int shmem_unuse(swp_entry_t entry, struct page *page) | 626 | int shmem_unuse(swp_entry_t swap, struct page *page) |
999 | { | 627 | { |
1000 | struct list_head *p, *next; | 628 | struct list_head *this, *next; |
1001 | struct shmem_inode_info *info; | 629 | struct shmem_inode_info *info; |
1002 | int found = 0; | 630 | int found = 0; |
1003 | int error; | 631 | int error; |
@@ -1006,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
1006 | * Charge page using GFP_KERNEL while we can wait, before taking | 634 | * Charge page using GFP_KERNEL while we can wait, before taking |
1007 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | 635 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). |
1008 | * Charged back to the user (not to caller) when swap account is used. | 636 | * Charged back to the user (not to caller) when swap account is used. |
1009 | * add_to_page_cache() will be called with GFP_NOWAIT. | ||
1010 | */ | 637 | */ |
1011 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 638 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); |
1012 | if (error) | 639 | if (error) |
1013 | goto out; | 640 | goto out; |
1014 | /* | 641 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ |
1015 | * Try to preload while we can wait, to not make a habit of | ||
1016 | * draining atomic reserves; but don't latch on to this cpu, | ||
1017 | * it's okay if sometimes we get rescheduled after this. | ||
1018 | */ | ||
1019 | error = radix_tree_preload(GFP_KERNEL); | ||
1020 | if (error) | ||
1021 | goto uncharge; | ||
1022 | radix_tree_preload_end(); | ||
1023 | 642 | ||
1024 | mutex_lock(&shmem_swaplist_mutex); | 643 | mutex_lock(&shmem_swaplist_mutex); |
1025 | list_for_each_safe(p, next, &shmem_swaplist) { | 644 | list_for_each_safe(this, next, &shmem_swaplist) { |
1026 | info = list_entry(p, struct shmem_inode_info, swaplist); | 645 | info = list_entry(this, struct shmem_inode_info, swaplist); |
1027 | found = shmem_unuse_inode(info, entry, page); | 646 | if (info->swapped) |
647 | found = shmem_unuse_inode(info, swap, page); | ||
648 | else | ||
649 | list_del_init(&info->swaplist); | ||
1028 | cond_resched(); | 650 | cond_resched(); |
1029 | if (found) | 651 | if (found) |
1030 | break; | 652 | break; |
1031 | } | 653 | } |
1032 | mutex_unlock(&shmem_swaplist_mutex); | 654 | mutex_unlock(&shmem_swaplist_mutex); |
1033 | 655 | ||
1034 | uncharge: | ||
1035 | if (!found) | 656 | if (!found) |
1036 | mem_cgroup_uncharge_cache_page(page); | 657 | mem_cgroup_uncharge_cache_page(page); |
1037 | if (found < 0) | 658 | if (found < 0) |
@@ -1048,10 +669,10 @@ out: | |||
1048 | static int shmem_writepage(struct page *page, struct writeback_control *wbc) | 669 | static int shmem_writepage(struct page *page, struct writeback_control *wbc) |
1049 | { | 670 | { |
1050 | struct shmem_inode_info *info; | 671 | struct shmem_inode_info *info; |
1051 | swp_entry_t *entry, swap; | ||
1052 | struct address_space *mapping; | 672 | struct address_space *mapping; |
1053 | unsigned long index; | ||
1054 | struct inode *inode; | 673 | struct inode *inode; |
674 | swp_entry_t swap; | ||
675 | pgoff_t index; | ||
1055 | 676 | ||
1056 | BUG_ON(!PageLocked(page)); | 677 | BUG_ON(!PageLocked(page)); |
1057 | mapping = page->mapping; | 678 | mapping = page->mapping; |
@@ -1066,69 +687,46 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1066 | /* | 687 | /* |
1067 | * shmem_backing_dev_info's capabilities prevent regular writeback or | 688 | * shmem_backing_dev_info's capabilities prevent regular writeback or |
1068 | * sync from ever calling shmem_writepage; but a stacking filesystem | 689 | * sync from ever calling shmem_writepage; but a stacking filesystem |
1069 | * may use the ->writepage of its underlying filesystem, in which case | 690 | * might use ->writepage of its underlying filesystem, in which case |
1070 | * tmpfs should write out to swap only in response to memory pressure, | 691 | * tmpfs should write out to swap only in response to memory pressure, |
1071 | * and not for the writeback threads or sync. However, in those cases, | 692 | * and not for the writeback threads or sync. |
1072 | * we do still want to check if there's a redundant swappage to be | ||
1073 | * discarded. | ||
1074 | */ | 693 | */ |
1075 | if (wbc->for_reclaim) | 694 | if (!wbc->for_reclaim) { |
1076 | swap = get_swap_page(); | 695 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
1077 | else | 696 | goto redirty; |
1078 | swap.val = 0; | 697 | } |
698 | swap = get_swap_page(); | ||
699 | if (!swap.val) | ||
700 | goto redirty; | ||
1079 | 701 | ||
1080 | /* | 702 | /* |
1081 | * Add inode to shmem_unuse()'s list of swapped-out inodes, | 703 | * Add inode to shmem_unuse()'s list of swapped-out inodes, |
1082 | * if it's not already there. Do it now because we cannot take | 704 | * if it's not already there. Do it now before the page is |
1083 | * mutex while holding spinlock, and must do so before the page | 705 | * moved to swap cache, when its pagelock no longer protects |
1084 | * is moved to swap cache, when its pagelock no longer protects | ||
1085 | * the inode from eviction. But don't unlock the mutex until | 706 | * the inode from eviction. But don't unlock the mutex until |
1086 | * we've taken the spinlock, because shmem_unuse_inode() will | 707 | * we've incremented swapped, because shmem_unuse_inode() will |
1087 | * prune a !swapped inode from the swaplist under both locks. | 708 | * prune a !swapped inode from the swaplist under this mutex. |
1088 | */ | 709 | */ |
1089 | if (swap.val) { | 710 | mutex_lock(&shmem_swaplist_mutex); |
1090 | mutex_lock(&shmem_swaplist_mutex); | 711 | if (list_empty(&info->swaplist)) |
1091 | if (list_empty(&info->swaplist)) | 712 | list_add_tail(&info->swaplist, &shmem_swaplist); |
1092 | list_add_tail(&info->swaplist, &shmem_swaplist); | ||
1093 | } | ||
1094 | |||
1095 | spin_lock(&info->lock); | ||
1096 | if (swap.val) | ||
1097 | mutex_unlock(&shmem_swaplist_mutex); | ||
1098 | |||
1099 | if (index >= info->next_index) { | ||
1100 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); | ||
1101 | goto unlock; | ||
1102 | } | ||
1103 | entry = shmem_swp_entry(info, index, NULL); | ||
1104 | if (entry->val) { | ||
1105 | /* | ||
1106 | * The more uptodate page coming down from a stacked | ||
1107 | * writepage should replace our old swappage. | ||
1108 | */ | ||
1109 | free_swap_and_cache(*entry); | ||
1110 | shmem_swp_set(info, entry, 0); | ||
1111 | } | ||
1112 | shmem_recalc_inode(inode); | ||
1113 | 713 | ||
1114 | if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { | 714 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
1115 | delete_from_page_cache(page); | ||
1116 | shmem_swp_set(info, entry, swap.val); | ||
1117 | shmem_swp_unmap(entry); | ||
1118 | swap_shmem_alloc(swap); | 715 | swap_shmem_alloc(swap); |
716 | shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); | ||
717 | |||
718 | spin_lock(&info->lock); | ||
719 | info->swapped++; | ||
720 | shmem_recalc_inode(inode); | ||
1119 | spin_unlock(&info->lock); | 721 | spin_unlock(&info->lock); |
722 | |||
723 | mutex_unlock(&shmem_swaplist_mutex); | ||
1120 | BUG_ON(page_mapped(page)); | 724 | BUG_ON(page_mapped(page)); |
1121 | swap_writepage(page, wbc); | 725 | swap_writepage(page, wbc); |
1122 | return 0; | 726 | return 0; |
1123 | } | 727 | } |
1124 | 728 | ||
1125 | shmem_swp_unmap(entry); | 729 | mutex_unlock(&shmem_swaplist_mutex); |
1126 | unlock: | ||
1127 | spin_unlock(&info->lock); | ||
1128 | /* | ||
1129 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | ||
1130 | * clear SWAP_HAS_CACHE flag. | ||
1131 | */ | ||
1132 | swapcache_free(swap, NULL); | 730 | swapcache_free(swap, NULL); |
1133 | redirty: | 731 | redirty: |
1134 | set_page_dirty(page); | 732 | set_page_dirty(page); |
@@ -1165,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1165 | } | 763 | } |
1166 | #endif /* CONFIG_TMPFS */ | 764 | #endif /* CONFIG_TMPFS */ |
1167 | 765 | ||
1168 | static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 766 | static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
1169 | struct shmem_inode_info *info, unsigned long idx) | 767 | struct shmem_inode_info *info, pgoff_t index) |
1170 | { | 768 | { |
1171 | struct mempolicy mpol, *spol; | 769 | struct mempolicy mpol, *spol; |
1172 | struct vm_area_struct pvma; | 770 | struct vm_area_struct pvma; |
1173 | struct page *page; | ||
1174 | 771 | ||
1175 | spol = mpol_cond_copy(&mpol, | 772 | spol = mpol_cond_copy(&mpol, |
1176 | mpol_shared_policy_lookup(&info->policy, idx)); | 773 | mpol_shared_policy_lookup(&info->policy, index)); |
1177 | 774 | ||
1178 | /* Create a pseudo vma that just contains the policy */ | 775 | /* Create a pseudo vma that just contains the policy */ |
1179 | pvma.vm_start = 0; | 776 | pvma.vm_start = 0; |
1180 | pvma.vm_pgoff = idx; | 777 | pvma.vm_pgoff = index; |
1181 | pvma.vm_ops = NULL; | 778 | pvma.vm_ops = NULL; |
1182 | pvma.vm_policy = spol; | 779 | pvma.vm_policy = spol; |
1183 | page = swapin_readahead(entry, gfp, &pvma, 0); | 780 | return swapin_readahead(swap, gfp, &pvma, 0); |
1184 | return page; | ||
1185 | } | 781 | } |
1186 | 782 | ||
1187 | static struct page *shmem_alloc_page(gfp_t gfp, | 783 | static struct page *shmem_alloc_page(gfp_t gfp, |
1188 | struct shmem_inode_info *info, unsigned long idx) | 784 | struct shmem_inode_info *info, pgoff_t index) |
1189 | { | 785 | { |
1190 | struct vm_area_struct pvma; | 786 | struct vm_area_struct pvma; |
1191 | 787 | ||
1192 | /* Create a pseudo vma that just contains the policy */ | 788 | /* Create a pseudo vma that just contains the policy */ |
1193 | pvma.vm_start = 0; | 789 | pvma.vm_start = 0; |
1194 | pvma.vm_pgoff = idx; | 790 | pvma.vm_pgoff = index; |
1195 | pvma.vm_ops = NULL; | 791 | pvma.vm_ops = NULL; |
1196 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | 792 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
1197 | 793 | ||
1198 | /* | 794 | /* |
1199 | * alloc_page_vma() will drop the shared policy reference | 795 | * alloc_page_vma() will drop the shared policy reference |
@@ -1202,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
1202 | } | 798 | } |
1203 | #else /* !CONFIG_NUMA */ | 799 | #else /* !CONFIG_NUMA */ |
1204 | #ifdef CONFIG_TMPFS | 800 | #ifdef CONFIG_TMPFS |
1205 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) | 801 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) |
1206 | { | 802 | { |
1207 | } | 803 | } |
1208 | #endif /* CONFIG_TMPFS */ | 804 | #endif /* CONFIG_TMPFS */ |
1209 | 805 | ||
1210 | static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 806 | static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
1211 | struct shmem_inode_info *info, unsigned long idx) | 807 | struct shmem_inode_info *info, pgoff_t index) |
1212 | { | 808 | { |
1213 | return swapin_readahead(entry, gfp, NULL, 0); | 809 | return swapin_readahead(swap, gfp, NULL, 0); |
1214 | } | 810 | } |
1215 | 811 | ||
1216 | static inline struct page *shmem_alloc_page(gfp_t gfp, | 812 | static inline struct page *shmem_alloc_page(gfp_t gfp, |
1217 | struct shmem_inode_info *info, unsigned long idx) | 813 | struct shmem_inode_info *info, pgoff_t index) |
1218 | { | 814 | { |
1219 | return alloc_page(gfp); | 815 | return alloc_page(gfp); |
1220 | } | 816 | } |
@@ -1228,311 +824,195 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1228 | #endif | 824 | #endif |
1229 | 825 | ||
1230 | /* | 826 | /* |
1231 | * shmem_getpage - either get the page from swap or allocate a new one | 827 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
1232 | * | 828 | * |
1233 | * If we allocate a new one we do not mark it dirty. That's up to the | 829 | * If we allocate a new one we do not mark it dirty. That's up to the |
1234 | * vm. If we swap it in we mark it dirty since we also free the swap | 830 | * vm. If we swap it in we mark it dirty since we also free the swap |
1235 | * entry since a page cannot live in both the swap and page cache | 831 | * entry since a page cannot live in both the swap and page cache |
1236 | */ | 832 | */ |
1237 | static int shmem_getpage(struct inode *inode, unsigned long idx, | 833 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
1238 | struct page **pagep, enum sgp_type sgp, int *type) | 834 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) |
1239 | { | 835 | { |
1240 | struct address_space *mapping = inode->i_mapping; | 836 | struct address_space *mapping = inode->i_mapping; |
1241 | struct shmem_inode_info *info = SHMEM_I(inode); | 837 | struct shmem_inode_info *info; |
1242 | struct shmem_sb_info *sbinfo; | 838 | struct shmem_sb_info *sbinfo; |
1243 | struct page *filepage = *pagep; | 839 | struct page *page; |
1244 | struct page *swappage; | ||
1245 | struct page *prealloc_page = NULL; | ||
1246 | swp_entry_t *entry; | ||
1247 | swp_entry_t swap; | 840 | swp_entry_t swap; |
1248 | gfp_t gfp; | ||
1249 | int error; | 841 | int error; |
842 | int once = 0; | ||
1250 | 843 | ||
1251 | if (idx >= SHMEM_MAX_INDEX) | 844 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) |
1252 | return -EFBIG; | 845 | return -EFBIG; |
846 | repeat: | ||
847 | swap.val = 0; | ||
848 | page = find_lock_page(mapping, index); | ||
849 | if (radix_tree_exceptional_entry(page)) { | ||
850 | swap = radix_to_swp_entry(page); | ||
851 | page = NULL; | ||
852 | } | ||
1253 | 853 | ||
1254 | if (type) | 854 | if (sgp != SGP_WRITE && |
1255 | *type = 0; | 855 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
856 | error = -EINVAL; | ||
857 | goto failed; | ||
858 | } | ||
1256 | 859 | ||
1257 | /* | 860 | if (page || (sgp == SGP_READ && !swap.val)) { |
1258 | * Normally, filepage is NULL on entry, and either found | ||
1259 | * uptodate immediately, or allocated and zeroed, or read | ||
1260 | * in under swappage, which is then assigned to filepage. | ||
1261 | * But shmem_readpage (required for splice) passes in a locked | ||
1262 | * filepage, which may be found not uptodate by other callers | ||
1263 | * too, and may need to be copied from the swappage read in. | ||
1264 | */ | ||
1265 | repeat: | ||
1266 | if (!filepage) | ||
1267 | filepage = find_lock_page(mapping, idx); | ||
1268 | if (filepage && PageUptodate(filepage)) | ||
1269 | goto done; | ||
1270 | gfp = mapping_gfp_mask(mapping); | ||
1271 | if (!filepage) { | ||
1272 | /* | 861 | /* |
1273 | * Try to preload while we can wait, to not make a habit of | 862 | * Once we can get the page lock, it must be uptodate: |
1274 | * draining atomic reserves; but don't latch on to this cpu. | 863 | * if there were an error in reading back from swap, |
864 | * the page would not be inserted into the filecache. | ||
1275 | */ | 865 | */ |
1276 | error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); | 866 | BUG_ON(page && !PageUptodate(page)); |
1277 | if (error) | 867 | *pagep = page; |
1278 | goto failed; | 868 | return 0; |
1279 | radix_tree_preload_end(); | ||
1280 | if (sgp != SGP_READ && !prealloc_page) { | ||
1281 | /* We don't care if this fails */ | ||
1282 | prealloc_page = shmem_alloc_page(gfp, info, idx); | ||
1283 | if (prealloc_page) { | ||
1284 | if (mem_cgroup_cache_charge(prealloc_page, | ||
1285 | current->mm, GFP_KERNEL)) { | ||
1286 | page_cache_release(prealloc_page); | ||
1287 | prealloc_page = NULL; | ||
1288 | } | ||
1289 | } | ||
1290 | } | ||
1291 | } | 869 | } |
1292 | error = 0; | ||
1293 | 870 | ||
1294 | spin_lock(&info->lock); | 871 | /* |
1295 | shmem_recalc_inode(inode); | 872 | * Fast cache lookup did not find it: |
1296 | entry = shmem_swp_alloc(info, idx, sgp); | 873 | * bring it back from swap or allocate. |
1297 | if (IS_ERR(entry)) { | 874 | */ |
1298 | spin_unlock(&info->lock); | 875 | info = SHMEM_I(inode); |
1299 | error = PTR_ERR(entry); | 876 | sbinfo = SHMEM_SB(inode->i_sb); |
1300 | goto failed; | ||
1301 | } | ||
1302 | swap = *entry; | ||
1303 | 877 | ||
1304 | if (swap.val) { | 878 | if (swap.val) { |
1305 | /* Look it up and read it in.. */ | 879 | /* Look it up and read it in.. */ |
1306 | swappage = lookup_swap_cache(swap); | 880 | page = lookup_swap_cache(swap); |
1307 | if (!swappage) { | 881 | if (!page) { |
1308 | shmem_swp_unmap(entry); | ||
1309 | spin_unlock(&info->lock); | ||
1310 | /* here we actually do the io */ | 882 | /* here we actually do the io */ |
1311 | if (type) | 883 | if (fault_type) |
1312 | *type |= VM_FAULT_MAJOR; | 884 | *fault_type |= VM_FAULT_MAJOR; |
1313 | swappage = shmem_swapin(swap, gfp, info, idx); | 885 | page = shmem_swapin(swap, gfp, info, index); |
1314 | if (!swappage) { | 886 | if (!page) { |
1315 | spin_lock(&info->lock); | 887 | error = -ENOMEM; |
1316 | entry = shmem_swp_alloc(info, idx, sgp); | 888 | goto failed; |
1317 | if (IS_ERR(entry)) | ||
1318 | error = PTR_ERR(entry); | ||
1319 | else { | ||
1320 | if (entry->val == swap.val) | ||
1321 | error = -ENOMEM; | ||
1322 | shmem_swp_unmap(entry); | ||
1323 | } | ||
1324 | spin_unlock(&info->lock); | ||
1325 | if (error) | ||
1326 | goto failed; | ||
1327 | goto repeat; | ||
1328 | } | 889 | } |
1329 | wait_on_page_locked(swappage); | ||
1330 | page_cache_release(swappage); | ||
1331 | goto repeat; | ||
1332 | } | 890 | } |
1333 | 891 | ||
1334 | /* We have to do this with page locked to prevent races */ | 892 | /* We have to do this with page locked to prevent races */ |
1335 | if (!trylock_page(swappage)) { | 893 | lock_page(page); |
1336 | shmem_swp_unmap(entry); | 894 | if (!PageUptodate(page)) { |
1337 | spin_unlock(&info->lock); | ||
1338 | wait_on_page_locked(swappage); | ||
1339 | page_cache_release(swappage); | ||
1340 | goto repeat; | ||
1341 | } | ||
1342 | if (PageWriteback(swappage)) { | ||
1343 | shmem_swp_unmap(entry); | ||
1344 | spin_unlock(&info->lock); | ||
1345 | wait_on_page_writeback(swappage); | ||
1346 | unlock_page(swappage); | ||
1347 | page_cache_release(swappage); | ||
1348 | goto repeat; | ||
1349 | } | ||
1350 | if (!PageUptodate(swappage)) { | ||
1351 | shmem_swp_unmap(entry); | ||
1352 | spin_unlock(&info->lock); | ||
1353 | unlock_page(swappage); | ||
1354 | page_cache_release(swappage); | ||
1355 | error = -EIO; | 895 | error = -EIO; |
1356 | goto failed; | 896 | goto failed; |
1357 | } | 897 | } |
1358 | 898 | wait_on_page_writeback(page); | |
1359 | if (filepage) { | 899 | |
1360 | shmem_swp_set(info, entry, 0); | 900 | /* Someone may have already done it for us */ |
1361 | shmem_swp_unmap(entry); | 901 | if (page->mapping) { |
1362 | delete_from_swap_cache(swappage); | 902 | if (page->mapping == mapping && |
1363 | spin_unlock(&info->lock); | 903 | page->index == index) |
1364 | copy_highpage(filepage, swappage); | 904 | goto done; |
1365 | unlock_page(swappage); | 905 | error = -EEXIST; |
1366 | page_cache_release(swappage); | 906 | goto failed; |
1367 | flush_dcache_page(filepage); | ||
1368 | SetPageUptodate(filepage); | ||
1369 | set_page_dirty(filepage); | ||
1370 | swap_free(swap); | ||
1371 | } else if (!(error = add_to_page_cache_locked(swappage, mapping, | ||
1372 | idx, GFP_NOWAIT))) { | ||
1373 | info->flags |= SHMEM_PAGEIN; | ||
1374 | shmem_swp_set(info, entry, 0); | ||
1375 | shmem_swp_unmap(entry); | ||
1376 | delete_from_swap_cache(swappage); | ||
1377 | spin_unlock(&info->lock); | ||
1378 | filepage = swappage; | ||
1379 | set_page_dirty(filepage); | ||
1380 | swap_free(swap); | ||
1381 | } else { | ||
1382 | shmem_swp_unmap(entry); | ||
1383 | spin_unlock(&info->lock); | ||
1384 | if (error == -ENOMEM) { | ||
1385 | /* | ||
1386 | * reclaim from proper memory cgroup and | ||
1387 | * call memcg's OOM if needed. | ||
1388 | */ | ||
1389 | error = mem_cgroup_shmem_charge_fallback( | ||
1390 | swappage, | ||
1391 | current->mm, | ||
1392 | gfp); | ||
1393 | if (error) { | ||
1394 | unlock_page(swappage); | ||
1395 | page_cache_release(swappage); | ||
1396 | goto failed; | ||
1397 | } | ||
1398 | } | ||
1399 | unlock_page(swappage); | ||
1400 | page_cache_release(swappage); | ||
1401 | goto repeat; | ||
1402 | } | ||
1403 | } else if (sgp == SGP_READ && !filepage) { | ||
1404 | shmem_swp_unmap(entry); | ||
1405 | filepage = find_get_page(mapping, idx); | ||
1406 | if (filepage && | ||
1407 | (!PageUptodate(filepage) || !trylock_page(filepage))) { | ||
1408 | spin_unlock(&info->lock); | ||
1409 | wait_on_page_locked(filepage); | ||
1410 | page_cache_release(filepage); | ||
1411 | filepage = NULL; | ||
1412 | goto repeat; | ||
1413 | } | 907 | } |
908 | |||
909 | error = mem_cgroup_cache_charge(page, current->mm, | ||
910 | gfp & GFP_RECLAIM_MASK); | ||
911 | if (!error) | ||
912 | error = shmem_add_to_page_cache(page, mapping, index, | ||
913 | gfp, swp_to_radix_entry(swap)); | ||
914 | if (error) | ||
915 | goto failed; | ||
916 | |||
917 | spin_lock(&info->lock); | ||
918 | info->swapped--; | ||
919 | shmem_recalc_inode(inode); | ||
1414 | spin_unlock(&info->lock); | 920 | spin_unlock(&info->lock); |
921 | |||
922 | delete_from_swap_cache(page); | ||
923 | set_page_dirty(page); | ||
924 | swap_free(swap); | ||
925 | |||
1415 | } else { | 926 | } else { |
1416 | shmem_swp_unmap(entry); | 927 | if (shmem_acct_block(info->flags)) { |
1417 | sbinfo = SHMEM_SB(inode->i_sb); | 928 | error = -ENOSPC; |
929 | goto failed; | ||
930 | } | ||
1418 | if (sbinfo->max_blocks) { | 931 | if (sbinfo->max_blocks) { |
1419 | if (percpu_counter_compare(&sbinfo->used_blocks, | 932 | if (percpu_counter_compare(&sbinfo->used_blocks, |
1420 | sbinfo->max_blocks) >= 0 || | 933 | sbinfo->max_blocks) >= 0) { |
1421 | shmem_acct_block(info->flags)) | 934 | error = -ENOSPC; |
1422 | goto nospace; | 935 | goto unacct; |
1423 | percpu_counter_inc(&sbinfo->used_blocks); | ||
1424 | spin_lock(&inode->i_lock); | ||
1425 | inode->i_blocks += BLOCKS_PER_PAGE; | ||
1426 | spin_unlock(&inode->i_lock); | ||
1427 | } else if (shmem_acct_block(info->flags)) | ||
1428 | goto nospace; | ||
1429 | |||
1430 | if (!filepage) { | ||
1431 | int ret; | ||
1432 | |||
1433 | if (!prealloc_page) { | ||
1434 | spin_unlock(&info->lock); | ||
1435 | filepage = shmem_alloc_page(gfp, info, idx); | ||
1436 | if (!filepage) { | ||
1437 | shmem_unacct_blocks(info->flags, 1); | ||
1438 | shmem_free_blocks(inode, 1); | ||
1439 | error = -ENOMEM; | ||
1440 | goto failed; | ||
1441 | } | ||
1442 | SetPageSwapBacked(filepage); | ||
1443 | |||
1444 | /* | ||
1445 | * Precharge page while we can wait, compensate | ||
1446 | * after | ||
1447 | */ | ||
1448 | error = mem_cgroup_cache_charge(filepage, | ||
1449 | current->mm, GFP_KERNEL); | ||
1450 | if (error) { | ||
1451 | page_cache_release(filepage); | ||
1452 | shmem_unacct_blocks(info->flags, 1); | ||
1453 | shmem_free_blocks(inode, 1); | ||
1454 | filepage = NULL; | ||
1455 | goto failed; | ||
1456 | } | ||
1457 | |||
1458 | spin_lock(&info->lock); | ||
1459 | } else { | ||
1460 | filepage = prealloc_page; | ||
1461 | prealloc_page = NULL; | ||
1462 | SetPageSwapBacked(filepage); | ||
1463 | } | 936 | } |
937 | percpu_counter_inc(&sbinfo->used_blocks); | ||
938 | } | ||
1464 | 939 | ||
1465 | entry = shmem_swp_alloc(info, idx, sgp); | 940 | page = shmem_alloc_page(gfp, info, index); |
1466 | if (IS_ERR(entry)) | 941 | if (!page) { |
1467 | error = PTR_ERR(entry); | 942 | error = -ENOMEM; |
1468 | else { | 943 | goto decused; |
1469 | swap = *entry; | ||
1470 | shmem_swp_unmap(entry); | ||
1471 | } | ||
1472 | ret = error || swap.val; | ||
1473 | if (ret) | ||
1474 | mem_cgroup_uncharge_cache_page(filepage); | ||
1475 | else | ||
1476 | ret = add_to_page_cache_lru(filepage, mapping, | ||
1477 | idx, GFP_NOWAIT); | ||
1478 | /* | ||
1479 | * At add_to_page_cache_lru() failure, uncharge will | ||
1480 | * be done automatically. | ||
1481 | */ | ||
1482 | if (ret) { | ||
1483 | spin_unlock(&info->lock); | ||
1484 | page_cache_release(filepage); | ||
1485 | shmem_unacct_blocks(info->flags, 1); | ||
1486 | shmem_free_blocks(inode, 1); | ||
1487 | filepage = NULL; | ||
1488 | if (error) | ||
1489 | goto failed; | ||
1490 | goto repeat; | ||
1491 | } | ||
1492 | info->flags |= SHMEM_PAGEIN; | ||
1493 | } | 944 | } |
1494 | 945 | ||
946 | SetPageSwapBacked(page); | ||
947 | __set_page_locked(page); | ||
948 | error = mem_cgroup_cache_charge(page, current->mm, | ||
949 | gfp & GFP_RECLAIM_MASK); | ||
950 | if (!error) | ||
951 | error = shmem_add_to_page_cache(page, mapping, index, | ||
952 | gfp, NULL); | ||
953 | if (error) | ||
954 | goto decused; | ||
955 | lru_cache_add_anon(page); | ||
956 | |||
957 | spin_lock(&info->lock); | ||
1495 | info->alloced++; | 958 | info->alloced++; |
959 | inode->i_blocks += BLOCKS_PER_PAGE; | ||
960 | shmem_recalc_inode(inode); | ||
1496 | spin_unlock(&info->lock); | 961 | spin_unlock(&info->lock); |
1497 | clear_highpage(filepage); | 962 | |
1498 | flush_dcache_page(filepage); | 963 | clear_highpage(page); |
1499 | SetPageUptodate(filepage); | 964 | flush_dcache_page(page); |
965 | SetPageUptodate(page); | ||
1500 | if (sgp == SGP_DIRTY) | 966 | if (sgp == SGP_DIRTY) |
1501 | set_page_dirty(filepage); | 967 | set_page_dirty(page); |
1502 | } | 968 | } |
1503 | done: | 969 | done: |
1504 | *pagep = filepage; | 970 | /* Perhaps the file has been truncated since we checked */ |
1505 | error = 0; | 971 | if (sgp != SGP_WRITE && |
1506 | goto out; | 972 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
973 | error = -EINVAL; | ||
974 | goto trunc; | ||
975 | } | ||
976 | *pagep = page; | ||
977 | return 0; | ||
1507 | 978 | ||
1508 | nospace: | ||
1509 | /* | 979 | /* |
1510 | * Perhaps the page was brought in from swap between find_lock_page | 980 | * Error recovery. |
1511 | * and taking info->lock? We allow for that at add_to_page_cache_lru, | ||
1512 | * but must also avoid reporting a spurious ENOSPC while working on a | ||
1513 | * full tmpfs. (When filepage has been passed in to shmem_getpage, it | ||
1514 | * is already in page cache, which prevents this race from occurring.) | ||
1515 | */ | 981 | */ |
1516 | if (!filepage) { | 982 | trunc: |
1517 | struct page *page = find_get_page(mapping, idx); | 983 | ClearPageDirty(page); |
1518 | if (page) { | 984 | delete_from_page_cache(page); |
1519 | spin_unlock(&info->lock); | 985 | spin_lock(&info->lock); |
1520 | page_cache_release(page); | 986 | info->alloced--; |
1521 | goto repeat; | 987 | inode->i_blocks -= BLOCKS_PER_PAGE; |
1522 | } | ||
1523 | } | ||
1524 | spin_unlock(&info->lock); | 988 | spin_unlock(&info->lock); |
1525 | error = -ENOSPC; | 989 | decused: |
990 | if (sbinfo->max_blocks) | ||
991 | percpu_counter_add(&sbinfo->used_blocks, -1); | ||
992 | unacct: | ||
993 | shmem_unacct_blocks(info->flags, 1); | ||
1526 | failed: | 994 | failed: |
1527 | if (*pagep != filepage) { | 995 | if (swap.val && error != -EINVAL) { |
1528 | unlock_page(filepage); | 996 | struct page *test = find_get_page(mapping, index); |
1529 | page_cache_release(filepage); | 997 | if (test && !radix_tree_exceptional_entry(test)) |
998 | page_cache_release(test); | ||
999 | /* Have another try if the entry has changed */ | ||
1000 | if (test != swp_to_radix_entry(swap)) | ||
1001 | error = -EEXIST; | ||
1530 | } | 1002 | } |
1531 | out: | 1003 | if (page) { |
1532 | if (prealloc_page) { | 1004 | unlock_page(page); |
1533 | mem_cgroup_uncharge_cache_page(prealloc_page); | 1005 | page_cache_release(page); |
1534 | page_cache_release(prealloc_page); | ||
1535 | } | 1006 | } |
1007 | if (error == -ENOSPC && !once++) { | ||
1008 | info = SHMEM_I(inode); | ||
1009 | spin_lock(&info->lock); | ||
1010 | shmem_recalc_inode(inode); | ||
1011 | spin_unlock(&info->lock); | ||
1012 | goto repeat; | ||
1013 | } | ||
1014 | if (error == -EEXIST) | ||
1015 | goto repeat; | ||
1536 | return error; | 1016 | return error; |
1537 | } | 1017 | } |
1538 | 1018 | ||
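The swap-in path above now stores swap entries directly in the page-cache radix tree as "exceptional" entries (swp_to_radix_entry() / radix_tree_exceptional_entry()) rather than in a separate shmem swap vector. The trick is that a page pointer is always at least word-aligned, so its low bits are free to carry a type tag. A minimal userspace sketch of that encoding, using an assumed tag bit rather than the kernel's actual RADIX_TREE_EXCEPTIONAL_ENTRY value:

#include <assert.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT 0x2UL   /* assumed tag bit; real pointers keep it clear */

/* Pack a small integer "swap entry" into a pointer-sized slot. */
static void *entry_to_slot(unsigned long swp_val)
{
        return (void *)((swp_val << 2) | EXCEPTIONAL_BIT);
}

static int slot_is_exceptional(void *slot)
{
        return ((unsigned long)slot & EXCEPTIONAL_BIT) != 0;
}

static unsigned long slot_to_entry(void *slot)
{
        return (unsigned long)slot >> 2;
}

int main(void)
{
        int x;
        void *slot = entry_to_slot(0x1234);     /* slot for a swapped-out page */
        void *page = &x;                        /* ordinary pointer, tag bit clear */

        assert(slot_is_exceptional(slot));
        assert(!slot_is_exceptional(page));
        printf("decoded swap value: %#lx\n", slot_to_entry(slot));
        return 0;
}

This is why the error path above checks radix_tree_exceptional_entry() on whatever find_get_page() returns before deciding whether the slot still holds the same swap entry.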
@@ -1540,36 +1020,34 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1540 | { | 1020 | { |
1541 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 1021 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1542 | int error; | 1022 | int error; |
1543 | int ret; | 1023 | int ret = VM_FAULT_LOCKED; |
1544 | |||
1545 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
1546 | return VM_FAULT_SIGBUS; | ||
1547 | 1024 | ||
1548 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1025 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1549 | if (error) | 1026 | if (error) |
1550 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1027 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1028 | |||
1551 | if (ret & VM_FAULT_MAJOR) { | 1029 | if (ret & VM_FAULT_MAJOR) { |
1552 | count_vm_event(PGMAJFAULT); | 1030 | count_vm_event(PGMAJFAULT); |
1553 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1031 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
1554 | } | 1032 | } |
1555 | return ret | VM_FAULT_LOCKED; | 1033 | return ret; |
1556 | } | 1034 | } |
1557 | 1035 | ||
1558 | #ifdef CONFIG_NUMA | 1036 | #ifdef CONFIG_NUMA |
1559 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1037 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) |
1560 | { | 1038 | { |
1561 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1039 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1562 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1040 | return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); |
1563 | } | 1041 | } |
1564 | 1042 | ||
1565 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, | 1043 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, |
1566 | unsigned long addr) | 1044 | unsigned long addr) |
1567 | { | 1045 | { |
1568 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1046 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1569 | unsigned long idx; | 1047 | pgoff_t index; |
1570 | 1048 | ||
1571 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 1049 | index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
1572 | return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); | 1050 | return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); |
1573 | } | 1051 | } |
1574 | #endif | 1052 | #endif |
1575 | 1053 | ||
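shmem_get_policy() above converts a faulting user address back into a file page index: the offset of the address within the VMA, in pages, plus the VMA's starting page offset into the file. A small userspace illustration of the same arithmetic, with made-up numbers and PAGE_SHIFT assumed to be 12:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

static unsigned long addr_to_index(unsigned long addr,
                                   unsigned long vm_start,
                                   unsigned long vm_pgoff)
{
        return ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
        /* Mapping starts at 0x700000000000 and covers the file from page 16. */
        unsigned long vm_start = 0x700000000000UL;
        unsigned long vm_pgoff = 16;
        unsigned long addr = vm_start + 5 * 4096 + 123; /* inside the 6th page */

        /* 16 + 5 = 21: the fault lands on file page index 21. */
        printf("index = %lu\n", addr_to_index(addr, vm_start, vm_pgoff));
        return 0;
}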
@@ -1590,6 +1068,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
1590 | user_shm_unlock(inode->i_size, user); | 1068 | user_shm_unlock(inode->i_size, user); |
1591 | info->flags &= ~VM_LOCKED; | 1069 | info->flags &= ~VM_LOCKED; |
1592 | mapping_clear_unevictable(file->f_mapping); | 1070 | mapping_clear_unevictable(file->f_mapping); |
1071 | /* | ||
1072 | * Ensure that a racing putback_lru_page() can see | ||
1073 | * the pages of this mapping are evictable when we | ||
1074 | * skip them due to !PageLRU during the scan. | ||
1075 | */ | ||
1076 | smp_mb__after_clear_bit(); | ||
1593 | scan_mapping_unevictable_pages(file->f_mapping); | 1077 | scan_mapping_unevictable_pages(file->f_mapping); |
1594 | } | 1078 | } |
1595 | retval = 0; | 1079 | retval = 0; |
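The smp_mb__after_clear_bit() added above pairs a full barrier with the clearing of the mapping's unevictable flag, so that either a concurrent putback_lru_page() sees the flag already clear, or the scan below sees the page on the LRU and handles it itself. The shape is the classic "store, full fence, then read the other side's store". A Dekker-style userspace analogue using C11 atomics and seq_cst fences (a sketch of the ordering argument only, not the kernel primitives):

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

static atomic_int mapping_unevictable = 1;      /* analogue of the mapping flag */
static atomic_int page_on_lru = 0;              /* analogue of PageLRU */

static int scanner(void *arg)
{
        (void)arg;
        /* shmem_lock(): clear the flag, fence, then look for the page. */
        atomic_store_explicit(&mapping_unevictable, 0, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&page_on_lru, memory_order_relaxed))
                printf("scanner: found the page on the LRU, will rescan it\n");
        return 0;
}

static int putback(void *arg)
{
        (void)arg;
        /* putback_lru_page(): add the page, fence, then re-check the flag. */
        atomic_store_explicit(&page_on_lru, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        if (!atomic_load_explicit(&mapping_unevictable, memory_order_relaxed))
                printf("putback: flag is clear, page goes to the evictable LRU\n");
        return 0;
}

int main(void)
{
        thrd_t a, b;

        thrd_create(&a, scanner, NULL);
        thrd_create(&b, putback, NULL);
        thrd_join(a, NULL);
        thrd_join(b, NULL);
        /* The two fences forbid the outcome where neither message is printed,
         * i.e. the page can never be stranded on the unevictable list. */
        return 0;
}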
@@ -1667,20 +1151,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1667 | 1151 | ||
1668 | #ifdef CONFIG_TMPFS | 1152 | #ifdef CONFIG_TMPFS |
1669 | static const struct inode_operations shmem_symlink_inode_operations; | 1153 | static const struct inode_operations shmem_symlink_inode_operations; |
1670 | static const struct inode_operations shmem_symlink_inline_operations; | 1154 | static const struct inode_operations shmem_short_symlink_operations; |
1671 | |||
1672 | /* | ||
1673 | * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; | ||
1674 | * but providing them allows a tmpfs file to be used for splice, sendfile, and | ||
1675 | * below the loop driver, in the generic fashion that many filesystems support. | ||
1676 | */ | ||
1677 | static int shmem_readpage(struct file *file, struct page *page) | ||
1678 | { | ||
1679 | struct inode *inode = page->mapping->host; | ||
1680 | int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); | ||
1681 | unlock_page(page); | ||
1682 | return error; | ||
1683 | } | ||
1684 | 1155 | ||
1685 | static int | 1156 | static int |
1686 | shmem_write_begin(struct file *file, struct address_space *mapping, | 1157 | shmem_write_begin(struct file *file, struct address_space *mapping, |
@@ -1689,7 +1160,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
1689 | { | 1160 | { |
1690 | struct inode *inode = mapping->host; | 1161 | struct inode *inode = mapping->host; |
1691 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1162 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1692 | *pagep = NULL; | ||
1693 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | 1163 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); |
1694 | } | 1164 | } |
1695 | 1165 | ||
@@ -1714,7 +1184,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1714 | { | 1184 | { |
1715 | struct inode *inode = filp->f_path.dentry->d_inode; | 1185 | struct inode *inode = filp->f_path.dentry->d_inode; |
1716 | struct address_space *mapping = inode->i_mapping; | 1186 | struct address_space *mapping = inode->i_mapping; |
1717 | unsigned long index, offset; | 1187 | pgoff_t index; |
1188 | unsigned long offset; | ||
1718 | enum sgp_type sgp = SGP_READ; | 1189 | enum sgp_type sgp = SGP_READ; |
1719 | 1190 | ||
1720 | /* | 1191 | /* |
@@ -1730,7 +1201,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1730 | 1201 | ||
1731 | for (;;) { | 1202 | for (;;) { |
1732 | struct page *page = NULL; | 1203 | struct page *page = NULL; |
1733 | unsigned long end_index, nr, ret; | 1204 | pgoff_t end_index; |
1205 | unsigned long nr, ret; | ||
1734 | loff_t i_size = i_size_read(inode); | 1206 | loff_t i_size = i_size_read(inode); |
1735 | 1207 | ||
1736 | end_index = i_size >> PAGE_CACHE_SHIFT; | 1208 | end_index = i_size >> PAGE_CACHE_SHIFT; |
@@ -1846,6 +1318,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, | |||
1846 | return retval; | 1318 | return retval; |
1847 | } | 1319 | } |
1848 | 1320 | ||
1321 | static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | ||
1322 | struct pipe_inode_info *pipe, size_t len, | ||
1323 | unsigned int flags) | ||
1324 | { | ||
1325 | struct address_space *mapping = in->f_mapping; | ||
1326 | struct inode *inode = mapping->host; | ||
1327 | unsigned int loff, nr_pages, req_pages; | ||
1328 | struct page *pages[PIPE_DEF_BUFFERS]; | ||
1329 | struct partial_page partial[PIPE_DEF_BUFFERS]; | ||
1330 | struct page *page; | ||
1331 | pgoff_t index, end_index; | ||
1332 | loff_t isize, left; | ||
1333 | int error, page_nr; | ||
1334 | struct splice_pipe_desc spd = { | ||
1335 | .pages = pages, | ||
1336 | .partial = partial, | ||
1337 | .flags = flags, | ||
1338 | .ops = &page_cache_pipe_buf_ops, | ||
1339 | .spd_release = spd_release_page, | ||
1340 | }; | ||
1341 | |||
1342 | isize = i_size_read(inode); | ||
1343 | if (unlikely(*ppos >= isize)) | ||
1344 | return 0; | ||
1345 | |||
1346 | left = isize - *ppos; | ||
1347 | if (unlikely(left < len)) | ||
1348 | len = left; | ||
1349 | |||
1350 | if (splice_grow_spd(pipe, &spd)) | ||
1351 | return -ENOMEM; | ||
1352 | |||
1353 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
1354 | loff = *ppos & ~PAGE_CACHE_MASK; | ||
1355 | req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1356 | nr_pages = min(req_pages, pipe->buffers); | ||
1357 | |||
1358 | spd.nr_pages = find_get_pages_contig(mapping, index, | ||
1359 | nr_pages, spd.pages); | ||
1360 | index += spd.nr_pages; | ||
1361 | error = 0; | ||
1362 | |||
1363 | while (spd.nr_pages < nr_pages) { | ||
1364 | error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); | ||
1365 | if (error) | ||
1366 | break; | ||
1367 | unlock_page(page); | ||
1368 | spd.pages[spd.nr_pages++] = page; | ||
1369 | index++; | ||
1370 | } | ||
1371 | |||
1372 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
1373 | nr_pages = spd.nr_pages; | ||
1374 | spd.nr_pages = 0; | ||
1375 | |||
1376 | for (page_nr = 0; page_nr < nr_pages; page_nr++) { | ||
1377 | unsigned int this_len; | ||
1378 | |||
1379 | if (!len) | ||
1380 | break; | ||
1381 | |||
1382 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); | ||
1383 | page = spd.pages[page_nr]; | ||
1384 | |||
1385 | if (!PageUptodate(page) || page->mapping != mapping) { | ||
1386 | error = shmem_getpage(inode, index, &page, | ||
1387 | SGP_CACHE, NULL); | ||
1388 | if (error) | ||
1389 | break; | ||
1390 | unlock_page(page); | ||
1391 | page_cache_release(spd.pages[page_nr]); | ||
1392 | spd.pages[page_nr] = page; | ||
1393 | } | ||
1394 | |||
1395 | isize = i_size_read(inode); | ||
1396 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | ||
1397 | if (unlikely(!isize || index > end_index)) | ||
1398 | break; | ||
1399 | |||
1400 | if (end_index == index) { | ||
1401 | unsigned int plen; | ||
1402 | |||
1403 | plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | ||
1404 | if (plen <= loff) | ||
1405 | break; | ||
1406 | |||
1407 | this_len = min(this_len, plen - loff); | ||
1408 | len = this_len; | ||
1409 | } | ||
1410 | |||
1411 | spd.partial[page_nr].offset = loff; | ||
1412 | spd.partial[page_nr].len = this_len; | ||
1413 | len -= this_len; | ||
1414 | loff = 0; | ||
1415 | spd.nr_pages++; | ||
1416 | index++; | ||
1417 | } | ||
1418 | |||
1419 | while (page_nr < nr_pages) | ||
1420 | page_cache_release(spd.pages[page_nr++]); | ||
1421 | |||
1422 | if (spd.nr_pages) | ||
1423 | error = splice_to_pipe(pipe, &spd); | ||
1424 | |||
1425 | splice_shrink_spd(pipe, &spd); | ||
1426 | |||
1427 | if (error > 0) { | ||
1428 | *ppos += error; | ||
1429 | file_accessed(in); | ||
1430 | } | ||
1431 | return error; | ||
1432 | } | ||
1433 | |||
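With shmem_file_splice_read() wired up further down in shmem_file_operations, splice(2) on a tmpfs file hands page-cache pages straight to a pipe instead of bouncing the data through a read copy. A hedged userspace example that splices the start of a tmpfs file into a pipe and drains it; the /dev/shm path is only an assumption about where tmpfs is mounted:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        int pipefd[2];
        ssize_t moved;
        int fd = open("/dev/shm/example", O_RDONLY);    /* assumed tmpfs file */

        if (fd < 0 || pipe(pipefd) < 0) {
                perror("setup");
                return 1;
        }

        /* Move up to one page from the file into the pipe without copying
         * through user space; the pipe buffers reference the cache pages. */
        moved = splice(fd, NULL, pipefd[1], NULL, sizeof(buf), 0);
        if (moved < 0) {
                perror("splice");
                return 1;
        }

        if (moved > 0) {
                ssize_t got = read(pipefd[0], buf, sizeof(buf));
                printf("spliced %zd bytes, read back %zd bytes\n", moved, got);
        } else {
                printf("nothing to splice (empty file?)\n");
        }

        close(fd);
        close(pipefd[0]);
        close(pipefd[1]);
        return 0;
}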
1849 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1434 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1850 | { | 1435 | { |
1851 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); | 1436 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
@@ -1855,8 +1440,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
1855 | buf->f_namelen = NAME_MAX; | 1440 | buf->f_namelen = NAME_MAX; |
1856 | if (sbinfo->max_blocks) { | 1441 | if (sbinfo->max_blocks) { |
1857 | buf->f_blocks = sbinfo->max_blocks; | 1442 | buf->f_blocks = sbinfo->max_blocks; |
1858 | buf->f_bavail = buf->f_bfree = | 1443 | buf->f_bavail = |
1859 | sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); | 1444 | buf->f_bfree = sbinfo->max_blocks - |
1445 | percpu_counter_sum(&sbinfo->used_blocks); | ||
1860 | } | 1446 | } |
1861 | if (sbinfo->max_inodes) { | 1447 | if (sbinfo->max_inodes) { |
1862 | buf->f_files = sbinfo->max_inodes; | 1448 | buf->f_files = sbinfo->max_inodes; |
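shmem_statfs() above reports free blocks with percpu_counter_sum(), the exact (but more expensive) fold of every per-CPU delta, while the allocation fast path gets by with the cheap approximate compare. The general idea: keep a per-thread delta and only fold when precision matters. A simplified userspace sketch, with the fold serialized by a mutex and driven from a single thread for brevity (the kernel version is per-CPU and lock-free on the fast path):

#include <pthread.h>
#include <stdio.h>

#define NR_THREADS 4

struct pcounter {
        long global;                    /* flushed, roughly-accurate total */
        long local[NR_THREADS];         /* per-thread deltas */
        pthread_mutex_t lock;
};

/* Fast path: bump the caller's own slot, flush to global only in batches. */
static void pcounter_add(struct pcounter *c, int tid, long delta)
{
        c->local[tid] += delta;
        if (c->local[tid] > 32 || c->local[tid] < -32) {
                pthread_mutex_lock(&c->lock);
                c->global += c->local[tid];
                c->local[tid] = 0;
                pthread_mutex_unlock(&c->lock);
        }
}

/* Approximate read: just the flushed total (cheap, may lag behind). */
static long pcounter_read(struct pcounter *c)
{
        return c->global;
}

/* Exact read: fold every per-thread delta in, like percpu_counter_sum(). */
static long pcounter_sum(struct pcounter *c)
{
        long sum;
        int i;

        pthread_mutex_lock(&c->lock);
        sum = c->global;
        for (i = 0; i < NR_THREADS; i++)
                sum += c->local[i];
        pthread_mutex_unlock(&c->lock);
        return sum;
}

int main(void)
{
        struct pcounter c = { .lock = PTHREAD_MUTEX_INITIALIZER };
        int i;

        for (i = 0; i < 10; i++)
                pcounter_add(&c, i % NR_THREADS, 1);
        printf("approx=%ld exact=%ld\n", pcounter_read(&c), pcounter_sum(&c));
        return 0;
}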
@@ -1878,7 +1464,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1878 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); | 1464 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); |
1879 | if (inode) { | 1465 | if (inode) { |
1880 | error = security_inode_init_security(inode, dir, | 1466 | error = security_inode_init_security(inode, dir, |
1881 | &dentry->d_name, NULL, | 1467 | &dentry->d_name, |
1882 | NULL, NULL); | 1468 | NULL, NULL); |
1883 | if (error) { | 1469 | if (error) { |
1884 | if (error != -EOPNOTSUPP) { | 1470 | if (error != -EOPNOTSUPP) { |
@@ -2006,7 +1592,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2006 | int error; | 1592 | int error; |
2007 | int len; | 1593 | int len; |
2008 | struct inode *inode; | 1594 | struct inode *inode; |
2009 | struct page *page = NULL; | 1595 | struct page *page; |
2010 | char *kaddr; | 1596 | char *kaddr; |
2011 | struct shmem_inode_info *info; | 1597 | struct shmem_inode_info *info; |
2012 | 1598 | ||
@@ -2018,7 +1604,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2018 | if (!inode) | 1604 | if (!inode) |
2019 | return -ENOSPC; | 1605 | return -ENOSPC; |
2020 | 1606 | ||
2021 | error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, | 1607 | error = security_inode_init_security(inode, dir, &dentry->d_name, |
2022 | NULL, NULL); | 1608 | NULL, NULL); |
2023 | if (error) { | 1609 | if (error) { |
2024 | if (error != -EOPNOTSUPP) { | 1610 | if (error != -EOPNOTSUPP) { |
@@ -2030,10 +1616,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2030 | 1616 | ||
2031 | info = SHMEM_I(inode); | 1617 | info = SHMEM_I(inode); |
2032 | inode->i_size = len-1; | 1618 | inode->i_size = len-1; |
2033 | if (len <= SHMEM_SYMLINK_INLINE_LEN) { | 1619 | if (len <= SHORT_SYMLINK_LEN) { |
2034 | /* do it inline */ | 1620 | info->symlink = kmemdup(symname, len, GFP_KERNEL); |
2035 | memcpy(info->inline_symlink, symname, len); | 1621 | if (!info->symlink) { |
2036 | inode->i_op = &shmem_symlink_inline_operations; | 1622 | iput(inode); |
1623 | return -ENOMEM; | ||
1624 | } | ||
1625 | inode->i_op = &shmem_short_symlink_operations; | ||
2037 | } else { | 1626 | } else { |
2038 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); | 1627 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); |
2039 | if (error) { | 1628 | if (error) { |
@@ -2056,17 +1645,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2056 | return 0; | 1645 | return 0; |
2057 | } | 1646 | } |
2058 | 1647 | ||
2059 | static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) | 1648 | static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) |
2060 | { | 1649 | { |
2061 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); | 1650 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); |
2062 | return NULL; | 1651 | return NULL; |
2063 | } | 1652 | } |
2064 | 1653 | ||
2065 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) | 1654 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) |
2066 | { | 1655 | { |
2067 | struct page *page = NULL; | 1656 | struct page *page = NULL; |
2068 | int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); | 1657 | int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); |
2069 | nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); | 1658 | nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); |
2070 | if (page) | 1659 | if (page) |
2071 | unlock_page(page); | 1660 | unlock_page(page); |
2072 | return page; | 1661 | return page; |
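Short symlink targets are now kept in a private kmemdup() copy hung off the inode instead of being written into a page, so following them never touches the page cache. kmemdup() is simply "allocate then memcpy"; a userspace sketch of the short/long split, with SHORT_LEN as an assumed threshold rather than the kernel's SHORT_SYMLINK_LEN:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SHORT_LEN 128   /* assumed inline threshold, not the kernel's value */

struct fake_inode {
        char *symlink;          /* short target, heap copy (kmemdup analogue) */
        /* a long target would live in the file's backing pages instead */
};

static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;
}

static int set_symlink(struct fake_inode *inode, const char *target)
{
        size_t len = strlen(target) + 1;

        if (len <= SHORT_LEN) {
                inode->symlink = memdup(target, len);
                return inode->symlink ? 0 : -1;
        }
        /* long case: would be written into the first page of the file */
        return -1;
}

int main(void)
{
        struct fake_inode inode = { 0 };

        if (set_symlink(&inode, "/tmp/some/target") == 0)
                printf("short symlink stored inline: %s\n", inode.symlink);
        free(inode.symlink);
        return 0;
}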
@@ -2177,7 +1766,6 @@ out: | |||
2177 | return err; | 1766 | return err; |
2178 | } | 1767 | } |
2179 | 1768 | ||
2180 | |||
2181 | static const struct xattr_handler *shmem_xattr_handlers[] = { | 1769 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
2182 | #ifdef CONFIG_TMPFS_POSIX_ACL | 1770 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2183 | &generic_acl_access_handler, | 1771 | &generic_acl_access_handler, |
@@ -2307,9 +1895,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) | |||
2307 | } | 1895 | } |
2308 | #endif /* CONFIG_TMPFS_XATTR */ | 1896 | #endif /* CONFIG_TMPFS_XATTR */ |
2309 | 1897 | ||
2310 | static const struct inode_operations shmem_symlink_inline_operations = { | 1898 | static const struct inode_operations shmem_short_symlink_operations = { |
2311 | .readlink = generic_readlink, | 1899 | .readlink = generic_readlink, |
2312 | .follow_link = shmem_follow_link_inline, | 1900 | .follow_link = shmem_follow_short_symlink, |
2313 | #ifdef CONFIG_TMPFS_XATTR | 1901 | #ifdef CONFIG_TMPFS_XATTR |
2314 | .setxattr = shmem_setxattr, | 1902 | .setxattr = shmem_setxattr, |
2315 | .getxattr = shmem_getxattr, | 1903 | .getxattr = shmem_getxattr, |
@@ -2509,8 +2097,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2509 | if (config.max_inodes < inodes) | 2097 | if (config.max_inodes < inodes) |
2510 | goto out; | 2098 | goto out; |
2511 | /* | 2099 | /* |
2512 | * Those tests also disallow limited->unlimited while any are in | 2100 | * Those tests disallow limited->unlimited while any are in use; |
2513 | * use, so i_blocks will always be zero when max_blocks is zero; | ||
2514 | * but we must separately disallow unlimited->limited, because | 2101 | * but we must separately disallow unlimited->limited, because |
2515 | * in that case we have no record of how much is already in use. | 2102 | * in that case we have no record of how much is already in use. |
2516 | */ | 2103 | */ |
@@ -2602,7 +2189,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2602 | goto failed; | 2189 | goto failed; |
2603 | sbinfo->free_inodes = sbinfo->max_inodes; | 2190 | sbinfo->free_inodes = sbinfo->max_inodes; |
2604 | 2191 | ||
2605 | sb->s_maxbytes = SHMEM_MAX_BYTES; | 2192 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
2606 | sb->s_blocksize = PAGE_CACHE_SIZE; | 2193 | sb->s_blocksize = PAGE_CACHE_SIZE; |
2607 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | 2194 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; |
2608 | sb->s_magic = TMPFS_MAGIC; | 2195 | sb->s_magic = TMPFS_MAGIC; |
@@ -2637,14 +2224,14 @@ static struct kmem_cache *shmem_inode_cachep; | |||
2637 | 2224 | ||
2638 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2225 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
2639 | { | 2226 | { |
2640 | struct shmem_inode_info *p; | 2227 | struct shmem_inode_info *info; |
2641 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); | 2228 | info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); |
2642 | if (!p) | 2229 | if (!info) |
2643 | return NULL; | 2230 | return NULL; |
2644 | return &p->vfs_inode; | 2231 | return &info->vfs_inode; |
2645 | } | 2232 | } |
2646 | 2233 | ||
2647 | static void shmem_i_callback(struct rcu_head *head) | 2234 | static void shmem_destroy_callback(struct rcu_head *head) |
2648 | { | 2235 | { |
2649 | struct inode *inode = container_of(head, struct inode, i_rcu); | 2236 | struct inode *inode = container_of(head, struct inode, i_rcu); |
2650 | INIT_LIST_HEAD(&inode->i_dentry); | 2237 | INIT_LIST_HEAD(&inode->i_dentry); |
@@ -2653,29 +2240,26 @@ static void shmem_i_callback(struct rcu_head *head) | |||
2653 | 2240 | ||
2654 | static void shmem_destroy_inode(struct inode *inode) | 2241 | static void shmem_destroy_inode(struct inode *inode) |
2655 | { | 2242 | { |
2656 | if ((inode->i_mode & S_IFMT) == S_IFREG) { | 2243 | if ((inode->i_mode & S_IFMT) == S_IFREG) |
2657 | /* only struct inode is valid if it's an inline symlink */ | ||
2658 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2244 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
2659 | } | 2245 | call_rcu(&inode->i_rcu, shmem_destroy_callback); |
2660 | call_rcu(&inode->i_rcu, shmem_i_callback); | ||
2661 | } | 2246 | } |
2662 | 2247 | ||
2663 | static void init_once(void *foo) | 2248 | static void shmem_init_inode(void *foo) |
2664 | { | 2249 | { |
2665 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2250 | struct shmem_inode_info *info = foo; |
2666 | 2251 | inode_init_once(&info->vfs_inode); | |
2667 | inode_init_once(&p->vfs_inode); | ||
2668 | } | 2252 | } |
2669 | 2253 | ||
2670 | static int init_inodecache(void) | 2254 | static int shmem_init_inodecache(void) |
2671 | { | 2255 | { |
2672 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", | 2256 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", |
2673 | sizeof(struct shmem_inode_info), | 2257 | sizeof(struct shmem_inode_info), |
2674 | 0, SLAB_PANIC, init_once); | 2258 | 0, SLAB_PANIC, shmem_init_inode); |
2675 | return 0; | 2259 | return 0; |
2676 | } | 2260 | } |
2677 | 2261 | ||
2678 | static void destroy_inodecache(void) | 2262 | static void shmem_destroy_inodecache(void) |
2679 | { | 2263 | { |
2680 | kmem_cache_destroy(shmem_inode_cachep); | 2264 | kmem_cache_destroy(shmem_inode_cachep); |
2681 | } | 2265 | } |
@@ -2684,7 +2268,6 @@ static const struct address_space_operations shmem_aops = { | |||
2684 | .writepage = shmem_writepage, | 2268 | .writepage = shmem_writepage, |
2685 | .set_page_dirty = __set_page_dirty_no_writeback, | 2269 | .set_page_dirty = __set_page_dirty_no_writeback, |
2686 | #ifdef CONFIG_TMPFS | 2270 | #ifdef CONFIG_TMPFS |
2687 | .readpage = shmem_readpage, | ||
2688 | .write_begin = shmem_write_begin, | 2271 | .write_begin = shmem_write_begin, |
2689 | .write_end = shmem_write_end, | 2272 | .write_end = shmem_write_end, |
2690 | #endif | 2273 | #endif |
@@ -2701,7 +2284,7 @@ static const struct file_operations shmem_file_operations = { | |||
2701 | .aio_read = shmem_file_aio_read, | 2284 | .aio_read = shmem_file_aio_read, |
2702 | .aio_write = generic_file_aio_write, | 2285 | .aio_write = generic_file_aio_write, |
2703 | .fsync = noop_fsync, | 2286 | .fsync = noop_fsync, |
2704 | .splice_read = generic_file_splice_read, | 2287 | .splice_read = shmem_file_splice_read, |
2705 | .splice_write = generic_file_splice_write, | 2288 | .splice_write = generic_file_splice_write, |
2706 | #endif | 2289 | #endif |
2707 | }; | 2290 | }; |
@@ -2715,10 +2298,6 @@ static const struct inode_operations shmem_inode_operations = { | |||
2715 | .listxattr = shmem_listxattr, | 2298 | .listxattr = shmem_listxattr, |
2716 | .removexattr = shmem_removexattr, | 2299 | .removexattr = shmem_removexattr, |
2717 | #endif | 2300 | #endif |
2718 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2719 | .check_acl = generic_check_acl, | ||
2720 | #endif | ||
2721 | |||
2722 | }; | 2301 | }; |
2723 | 2302 | ||
2724 | static const struct inode_operations shmem_dir_inode_operations = { | 2303 | static const struct inode_operations shmem_dir_inode_operations = { |
@@ -2741,7 +2320,6 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2741 | #endif | 2320 | #endif |
2742 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2321 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2743 | .setattr = shmem_setattr, | 2322 | .setattr = shmem_setattr, |
2744 | .check_acl = generic_check_acl, | ||
2745 | #endif | 2323 | #endif |
2746 | }; | 2324 | }; |
2747 | 2325 | ||
@@ -2754,7 +2332,6 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2754 | #endif | 2332 | #endif |
2755 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2333 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2756 | .setattr = shmem_setattr, | 2334 | .setattr = shmem_setattr, |
2757 | .check_acl = generic_check_acl, | ||
2758 | #endif | 2335 | #endif |
2759 | }; | 2336 | }; |
2760 | 2337 | ||
@@ -2779,21 +2356,20 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
2779 | #endif | 2356 | #endif |
2780 | }; | 2357 | }; |
2781 | 2358 | ||
2782 | |||
2783 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 2359 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
2784 | int flags, const char *dev_name, void *data) | 2360 | int flags, const char *dev_name, void *data) |
2785 | { | 2361 | { |
2786 | return mount_nodev(fs_type, flags, data, shmem_fill_super); | 2362 | return mount_nodev(fs_type, flags, data, shmem_fill_super); |
2787 | } | 2363 | } |
2788 | 2364 | ||
2789 | static struct file_system_type tmpfs_fs_type = { | 2365 | static struct file_system_type shmem_fs_type = { |
2790 | .owner = THIS_MODULE, | 2366 | .owner = THIS_MODULE, |
2791 | .name = "tmpfs", | 2367 | .name = "tmpfs", |
2792 | .mount = shmem_mount, | 2368 | .mount = shmem_mount, |
2793 | .kill_sb = kill_litter_super, | 2369 | .kill_sb = kill_litter_super, |
2794 | }; | 2370 | }; |
2795 | 2371 | ||
2796 | int __init init_tmpfs(void) | 2372 | int __init shmem_init(void) |
2797 | { | 2373 | { |
2798 | int error; | 2374 | int error; |
2799 | 2375 | ||
@@ -2801,18 +2377,18 @@ int __init init_tmpfs(void) | |||
2801 | if (error) | 2377 | if (error) |
2802 | goto out4; | 2378 | goto out4; |
2803 | 2379 | ||
2804 | error = init_inodecache(); | 2380 | error = shmem_init_inodecache(); |
2805 | if (error) | 2381 | if (error) |
2806 | goto out3; | 2382 | goto out3; |
2807 | 2383 | ||
2808 | error = register_filesystem(&tmpfs_fs_type); | 2384 | error = register_filesystem(&shmem_fs_type); |
2809 | if (error) { | 2385 | if (error) { |
2810 | printk(KERN_ERR "Could not register tmpfs\n"); | 2386 | printk(KERN_ERR "Could not register tmpfs\n"); |
2811 | goto out2; | 2387 | goto out2; |
2812 | } | 2388 | } |
2813 | 2389 | ||
2814 | shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, | 2390 | shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, |
2815 | tmpfs_fs_type.name, NULL); | 2391 | shmem_fs_type.name, NULL); |
2816 | if (IS_ERR(shm_mnt)) { | 2392 | if (IS_ERR(shm_mnt)) { |
2817 | error = PTR_ERR(shm_mnt); | 2393 | error = PTR_ERR(shm_mnt); |
2818 | printk(KERN_ERR "Could not kern_mount tmpfs\n"); | 2394 | printk(KERN_ERR "Could not kern_mount tmpfs\n"); |
@@ -2821,9 +2397,9 @@ int __init init_tmpfs(void) | |||
2821 | return 0; | 2397 | return 0; |
2822 | 2398 | ||
2823 | out1: | 2399 | out1: |
2824 | unregister_filesystem(&tmpfs_fs_type); | 2400 | unregister_filesystem(&shmem_fs_type); |
2825 | out2: | 2401 | out2: |
2826 | destroy_inodecache(); | 2402 | shmem_destroy_inodecache(); |
2827 | out3: | 2403 | out3: |
2828 | bdi_destroy(&shmem_backing_dev_info); | 2404 | bdi_destroy(&shmem_backing_dev_info); |
2829 | out4: | 2405 | out4: |
@@ -2831,45 +2407,6 @@ out4: | |||
2831 | return error; | 2407 | return error; |
2832 | } | 2408 | } |
2833 | 2409 | ||
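shmem_init() above follows the usual kernel error-unwind idiom: each setup step that can fail jumps to a label that tears down only the steps that already succeeded, in reverse order. A generic C sketch of the same shape; the step_* and undo_* names are placeholders, not kernel APIs:

#include <stdio.h>

static int step_a(void) { return 0; }
static int step_b(void) { return 0; }
static int step_c(void) { return -1; }  /* pretend the last step fails */

static void undo_a(void) { puts("undo a"); }
static void undo_b(void) { puts("undo b"); }

static int init_all(void)
{
        int error;

        error = step_a();
        if (error)
                goto out;
        error = step_b();
        if (error)
                goto undo_a;
        error = step_c();
        if (error)
                goto undo_b;
        return 0;               /* everything succeeded */

undo_b:
        undo_b();               /* tear down in reverse order of setup */
undo_a:
        undo_a();
out:
        return error;
}

int main(void)
{
        printf("init_all() = %d\n", init_all());
        return 0;
}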
2834 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2835 | /** | ||
2836 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2837 | * @inode: the inode to be searched | ||
2838 | * @pgoff: the offset to be searched | ||
2839 | * @pagep: the pointer for the found page to be stored | ||
2840 | * @ent: the pointer for the found swap entry to be stored | ||
2841 | * | ||
2842 | * If a page is found, refcount of it is incremented. Callers should handle | ||
2843 | * these refcount. | ||
2844 | */ | ||
2845 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2846 | struct page **pagep, swp_entry_t *ent) | ||
2847 | { | ||
2848 | swp_entry_t entry = { .val = 0 }, *ptr; | ||
2849 | struct page *page = NULL; | ||
2850 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
2851 | |||
2852 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2853 | goto out; | ||
2854 | |||
2855 | spin_lock(&info->lock); | ||
2856 | ptr = shmem_swp_entry(info, pgoff, NULL); | ||
2857 | #ifdef CONFIG_SWAP | ||
2858 | if (ptr && ptr->val) { | ||
2859 | entry.val = ptr->val; | ||
2860 | page = find_get_page(&swapper_space, entry.val); | ||
2861 | } else | ||
2862 | #endif | ||
2863 | page = find_get_page(inode->i_mapping, pgoff); | ||
2864 | if (ptr) | ||
2865 | shmem_swp_unmap(ptr); | ||
2866 | spin_unlock(&info->lock); | ||
2867 | out: | ||
2868 | *pagep = page; | ||
2869 | *ent = entry; | ||
2870 | } | ||
2871 | #endif | ||
2872 | |||
2873 | #else /* !CONFIG_SHMEM */ | 2410 | #else /* !CONFIG_SHMEM */ |
2874 | 2411 | ||
2875 | /* | 2412 | /* |
@@ -2883,23 +2420,23 @@ out: | |||
2883 | 2420 | ||
2884 | #include <linux/ramfs.h> | 2421 | #include <linux/ramfs.h> |
2885 | 2422 | ||
2886 | static struct file_system_type tmpfs_fs_type = { | 2423 | static struct file_system_type shmem_fs_type = { |
2887 | .name = "tmpfs", | 2424 | .name = "tmpfs", |
2888 | .mount = ramfs_mount, | 2425 | .mount = ramfs_mount, |
2889 | .kill_sb = kill_litter_super, | 2426 | .kill_sb = kill_litter_super, |
2890 | }; | 2427 | }; |
2891 | 2428 | ||
2892 | int __init init_tmpfs(void) | 2429 | int __init shmem_init(void) |
2893 | { | 2430 | { |
2894 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | 2431 | BUG_ON(register_filesystem(&shmem_fs_type) != 0); |
2895 | 2432 | ||
2896 | shm_mnt = kern_mount(&tmpfs_fs_type); | 2433 | shm_mnt = kern_mount(&shmem_fs_type); |
2897 | BUG_ON(IS_ERR(shm_mnt)); | 2434 | BUG_ON(IS_ERR(shm_mnt)); |
2898 | 2435 | ||
2899 | return 0; | 2436 | return 0; |
2900 | } | 2437 | } |
2901 | 2438 | ||
2902 | int shmem_unuse(swp_entry_t entry, struct page *page) | 2439 | int shmem_unuse(swp_entry_t swap, struct page *page) |
2903 | { | 2440 | { |
2904 | return 0; | 2441 | return 0; |
2905 | } | 2442 | } |
@@ -2909,43 +2446,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
2909 | return 0; | 2446 | return 0; |
2910 | } | 2447 | } |
2911 | 2448 | ||
2912 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 2449 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) |
2913 | { | 2450 | { |
2914 | truncate_inode_pages_range(inode->i_mapping, start, end); | 2451 | truncate_inode_pages_range(inode->i_mapping, lstart, lend); |
2915 | } | 2452 | } |
2916 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 2453 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
2917 | 2454 | ||
2918 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2919 | /** | ||
2920 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2921 | * @inode: the inode to be searched | ||
2922 | * @pgoff: the offset to be searched | ||
2923 | * @pagep: the pointer for the found page to be stored | ||
2924 | * @ent: the pointer for the found swap entry to be stored | ||
2925 | * | ||
2926 | * If a page is found, refcount of it is incremented. Callers should handle | ||
2927 | * these refcount. | ||
2928 | */ | ||
2929 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2930 | struct page **pagep, swp_entry_t *ent) | ||
2931 | { | ||
2932 | struct page *page = NULL; | ||
2933 | |||
2934 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2935 | goto out; | ||
2936 | page = find_get_page(inode->i_mapping, pgoff); | ||
2937 | out: | ||
2938 | *pagep = page; | ||
2939 | *ent = (swp_entry_t){ .val = 0 }; | ||
2940 | } | ||
2941 | #endif | ||
2942 | |||
2943 | #define shmem_vm_ops generic_file_vm_ops | 2455 | #define shmem_vm_ops generic_file_vm_ops |
2944 | #define shmem_file_operations ramfs_file_operations | 2456 | #define shmem_file_operations ramfs_file_operations |
2945 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) | 2457 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) |
2946 | #define shmem_acct_size(flags, size) 0 | 2458 | #define shmem_acct_size(flags, size) 0 |
2947 | #define shmem_unacct_size(flags, size) do {} while (0) | 2459 | #define shmem_unacct_size(flags, size) do {} while (0) |
2948 | #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE | ||
2949 | 2460 | ||
2950 | #endif /* CONFIG_SHMEM */ | 2461 | #endif /* CONFIG_SHMEM */ |
2951 | 2462 | ||
@@ -2969,7 +2480,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2969 | if (IS_ERR(shm_mnt)) | 2480 | if (IS_ERR(shm_mnt)) |
2970 | return (void *)shm_mnt; | 2481 | return (void *)shm_mnt; |
2971 | 2482 | ||
2972 | if (size < 0 || size > SHMEM_MAX_BYTES) | 2483 | if (size < 0 || size > MAX_LFS_FILESIZE) |
2973 | return ERR_PTR(-EINVAL); | 2484 | return ERR_PTR(-EINVAL); |
2974 | 2485 | ||
2975 | if (shmem_acct_size(flags, size)) | 2486 | if (shmem_acct_size(flags, size)) |
@@ -2992,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2992 | 2503 | ||
2993 | d_instantiate(path.dentry, inode); | 2504 | d_instantiate(path.dentry, inode); |
2994 | inode->i_size = size; | 2505 | inode->i_size = size; |
2995 | inode->i_nlink = 0; /* It is unlinked */ | 2506 | clear_nlink(inode); /* It is unlinked */ |
2996 | #ifndef CONFIG_MMU | 2507 | #ifndef CONFIG_MMU |
2997 | error = ramfs_nommu_expand_for_mapping(inode, size); | 2508 | error = ramfs_nommu_expand_for_mapping(inode, size); |
2998 | if (error) | 2509 | if (error) |
@@ -3048,13 +2559,29 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
3048 | * suit tmpfs, since it may have pages in swapcache, and needs to find those | 2559 | * suit tmpfs, since it may have pages in swapcache, and needs to find those |
3049 | * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. | 2560 | * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. |
3050 | * | 2561 | * |
3051 | * Provide a stub for those callers to start using now, then later | 2562 | * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in |
3052 | * flesh it out to call shmem_getpage() with additional gfp mask, when | 2563 | * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. |
3053 | * shmem_file_splice_read() is added and shmem_readpage() is removed. | ||
3054 | */ | 2564 | */ |
3055 | struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, | 2565 | struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, |
3056 | pgoff_t index, gfp_t gfp) | 2566 | pgoff_t index, gfp_t gfp) |
3057 | { | 2567 | { |
2568 | #ifdef CONFIG_SHMEM | ||
2569 | struct inode *inode = mapping->host; | ||
2570 | struct page *page; | ||
2571 | int error; | ||
2572 | |||
2573 | BUG_ON(mapping->a_ops != &shmem_aops); | ||
2574 | error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); | ||
2575 | if (error) | ||
2576 | page = ERR_PTR(error); | ||
2577 | else | ||
2578 | unlock_page(page); | ||
2579 | return page; | ||
2580 | #else | ||
2581 | /* | ||
2582 | * The tiny !SHMEM case uses ramfs without swap | ||
2583 | */ | ||
3058 | return read_cache_page_gfp(mapping, index, gfp); | 2584 | return read_cache_page_gfp(mapping, index, gfp); |
2585 | #endif | ||
3059 | } | 2586 | } |
3060 | EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); | 2587 | EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); |
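shmem_read_mapping_page_gfp() above gives drivers such as i915 a way to pull in a tmpfs-backed page with a caller-chosen gfp mask. A hedged, kernel-style sketch of how a caller might combine the mapping's mask with __GFP_NORETRY | __GFP_NOWARN, as the comment describes; the helper name and placement are illustrative, not taken from any driver:

#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

/* Illustrative helper only: fetch one object page without OOM-killing. */
static struct page *example_get_obj_page(struct address_space *mapping,
                                         pgoff_t index)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        gfp |= __GFP_NORETRY | __GFP_NOWARN;    /* fail fast, let the caller reclaim */
        /* On success the page comes back uptodate, unlocked and referenced;
         * on failure an ERR_PTR() is returned. */
        return shmem_read_mapping_page_gfp(mapping, index, gfp);
}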
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic = | |||
574 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 574 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
575 | 575 | ||
576 | /* internal cache of cache description objs */ | 576 | /* internal cache of cache description objs */ |
577 | static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; | ||
577 | static struct kmem_cache cache_cache = { | 578 | static struct kmem_cache cache_cache = { |
579 | .nodelists = cache_cache_nodelists, | ||
578 | .batchcount = 1, | 580 | .batchcount = 1, |
579 | .limit = BOOT_CPUCACHE_ENTRIES, | 581 | .limit = BOOT_CPUCACHE_ENTRIES, |
580 | .shared = 1, | 582 | .shared = 1, |
@@ -620,6 +622,51 @@ int slab_is_available(void) | |||
620 | static struct lock_class_key on_slab_l3_key; | 622 | static struct lock_class_key on_slab_l3_key; |
621 | static struct lock_class_key on_slab_alc_key; | 623 | static struct lock_class_key on_slab_alc_key; |
622 | 624 | ||
625 | static struct lock_class_key debugobj_l3_key; | ||
626 | static struct lock_class_key debugobj_alc_key; | ||
627 | |||
628 | static void slab_set_lock_classes(struct kmem_cache *cachep, | ||
629 | struct lock_class_key *l3_key, struct lock_class_key *alc_key, | ||
630 | int q) | ||
631 | { | ||
632 | struct array_cache **alc; | ||
633 | struct kmem_list3 *l3; | ||
634 | int r; | ||
635 | |||
636 | l3 = cachep->nodelists[q]; | ||
637 | if (!l3) | ||
638 | return; | ||
639 | |||
640 | lockdep_set_class(&l3->list_lock, l3_key); | ||
641 | alc = l3->alien; | ||
642 | /* | ||
643 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
644 | * should go away when common slab code is taught to | ||
645 | * work even without alien caches. | ||
646 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
647 | * for alloc_alien_cache, | ||
648 | */ | ||
649 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
650 | return; | ||
651 | for_each_node(r) { | ||
652 | if (alc[r]) | ||
653 | lockdep_set_class(&alc[r]->lock, alc_key); | ||
654 | } | ||
655 | } | ||
656 | |||
657 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
658 | { | ||
659 | slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); | ||
660 | } | ||
661 | |||
662 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
663 | { | ||
664 | int node; | ||
665 | |||
666 | for_each_online_node(node) | ||
667 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
668 | } | ||
669 | |||
623 | static void init_node_lock_keys(int q) | 670 | static void init_node_lock_keys(int q) |
624 | { | 671 | { |
625 | struct cache_sizes *s = malloc_sizes; | 672 | struct cache_sizes *s = malloc_sizes; |
@@ -628,29 +675,14 @@ static void init_node_lock_keys(int q) | |||
628 | return; | 675 | return; |
629 | 676 | ||
630 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { | 677 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
631 | struct array_cache **alc; | ||
632 | struct kmem_list3 *l3; | 678 | struct kmem_list3 *l3; |
633 | int r; | ||
634 | 679 | ||
635 | l3 = s->cs_cachep->nodelists[q]; | 680 | l3 = s->cs_cachep->nodelists[q]; |
636 | if (!l3 || OFF_SLAB(s->cs_cachep)) | 681 | if (!l3 || OFF_SLAB(s->cs_cachep)) |
637 | continue; | 682 | continue; |
638 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | 683 | |
639 | alc = l3->alien; | 684 | slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, |
640 | /* | 685 | &on_slab_alc_key, q); |
641 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
642 | * should go away when common slab code is taught to | ||
643 | * work even without alien caches. | ||
644 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
645 | * for alloc_alien_cache, | ||
646 | */ | ||
647 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
648 | continue; | ||
649 | for_each_node(r) { | ||
650 | if (alc[r]) | ||
651 | lockdep_set_class(&alc[r]->lock, | ||
652 | &on_slab_alc_key); | ||
653 | } | ||
654 | } | 686 | } |
655 | } | 687 | } |
656 | 688 | ||
@@ -669,6 +701,14 @@ static void init_node_lock_keys(int q) | |||
669 | static inline void init_lock_keys(void) | 701 | static inline void init_lock_keys(void) |
670 | { | 702 | { |
671 | } | 703 | } |
704 | |||
705 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
706 | { | ||
707 | } | ||
708 | |||
709 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
710 | { | ||
711 | } | ||
672 | #endif | 712 | #endif |
673 | 713 | ||
674 | /* | 714 | /* |
@@ -1262,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1262 | spin_unlock_irq(&l3->list_lock); | 1302 | spin_unlock_irq(&l3->list_lock); |
1263 | kfree(shared); | 1303 | kfree(shared); |
1264 | free_alien_cache(alien); | 1304 | free_alien_cache(alien); |
1305 | if (cachep->flags & SLAB_DEBUG_OBJECTS) | ||
1306 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
1265 | } | 1307 | } |
1266 | init_node_lock_keys(node); | 1308 | init_node_lock_keys(node); |
1267 | 1309 | ||
@@ -1492,11 +1534,10 @@ void __init kmem_cache_init(void) | |||
1492 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | 1534 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; |
1493 | 1535 | ||
1494 | /* | 1536 | /* |
1495 | * struct kmem_cache size depends on nr_node_ids, which | 1537 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1496 | * can be less than MAX_NUMNODES. | ||
1497 | */ | 1538 | */ |
1498 | cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + | 1539 | cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1499 | nr_node_ids * sizeof(struct kmem_list3 *); | 1540 | nr_node_ids * sizeof(struct kmem_list3 *); |
1500 | #if DEBUG | 1541 | #if DEBUG |
1501 | cache_cache.obj_size = cache_cache.buffer_size; | 1542 | cache_cache.obj_size = cache_cache.buffer_size; |
1502 | #endif | 1543 | #endif |
@@ -1625,6 +1666,9 @@ void __init kmem_cache_init_late(void) | |||
1625 | { | 1666 | { |
1626 | struct kmem_cache *cachep; | 1667 | struct kmem_cache *cachep; |
1627 | 1668 | ||
1669 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1670 | init_lock_keys(); | ||
1671 | |||
1628 | /* 6) resize the head arrays to their final sizes */ | 1672 | /* 6) resize the head arrays to their final sizes */ |
1629 | mutex_lock(&cache_chain_mutex); | 1673 | mutex_lock(&cache_chain_mutex); |
1630 | list_for_each_entry(cachep, &cache_chain, next) | 1674 | list_for_each_entry(cachep, &cache_chain, next) |
@@ -1635,9 +1679,6 @@ void __init kmem_cache_init_late(void) | |||
1635 | /* Done! */ | 1679 | /* Done! */ |
1636 | g_cpucache_up = FULL; | 1680 | g_cpucache_up = FULL; |
1637 | 1681 | ||
1638 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1639 | init_lock_keys(); | ||
1640 | |||
1641 | /* | 1682 | /* |
1642 | * Register a cpu startup notifier callback that initializes | 1683 | * Register a cpu startup notifier callback that initializes |
1643 | * cpu_cache_get for all new cpus | 1684 | * cpu_cache_get for all new cpus |
@@ -1810,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit) | |||
1810 | unsigned char error = 0; | 1851 | unsigned char error = 0; |
1811 | int bad_count = 0; | 1852 | int bad_count = 0; |
1812 | 1853 | ||
1813 | printk(KERN_ERR "%03x:", offset); | 1854 | printk(KERN_ERR "%03x: ", offset); |
1814 | for (i = 0; i < limit; i++) { | 1855 | for (i = 0; i < limit; i++) { |
1815 | if (data[offset + i] != POISON_FREE) { | 1856 | if (data[offset + i] != POISON_FREE) { |
1816 | error = data[offset + i]; | 1857 | error = data[offset + i]; |
1817 | bad_count++; | 1858 | bad_count++; |
1818 | } | 1859 | } |
1819 | printk(" %02x", (unsigned char)data[offset + i]); | ||
1820 | } | 1860 | } |
1821 | printk("\n"); | 1861 | print_hex_dump(KERN_CONT, "", 0, 16, 1, |
1862 | &data[offset], limit, 1); | ||
1822 | 1863 | ||
1823 | if (bad_count == 1) { | 1864 | if (bad_count == 1) { |
1824 | error ^= POISON_FREE; | 1865 | error ^= POISON_FREE; |
@@ -2308,6 +2349,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2308 | if (!cachep) | 2349 | if (!cachep) |
2309 | goto oops; | 2350 | goto oops; |
2310 | 2351 | ||
2352 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | ||
2311 | #if DEBUG | 2353 | #if DEBUG |
2312 | cachep->obj_size = size; | 2354 | cachep->obj_size = size; |
2313 | 2355 | ||
@@ -2424,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2424 | goto oops; | 2466 | goto oops; |
2425 | } | 2467 | } |
2426 | 2468 | ||
2469 | if (flags & SLAB_DEBUG_OBJECTS) { | ||
2470 | /* | ||
2471 | * Would deadlock through slab_destroy()->call_rcu()-> | ||
2472 | * debug_object_activate()->kmem_cache_alloc(). | ||
2473 | */ | ||
2474 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); | ||
2475 | |||
2476 | slab_set_debugobj_lock_classes(cachep); | ||
2477 | } | ||
2478 | |||
2427 | /* cache setup completed, link it into the list */ | 2479 | /* cache setup completed, link it into the list */ |
2428 | list_add(&cachep->next, &cache_chain); | 2480 | list_add(&cachep->next, &cache_chain); |
2429 | oops: | 2481 | oops: |
@@ -2987,14 +3039,9 @@ bad: | |||
2987 | printk(KERN_ERR "slab: Internal list corruption detected in " | 3039 | printk(KERN_ERR "slab: Internal list corruption detected in " |
2988 | "cache '%s'(%d), slabp %p(%d). Hexdump:\n", | 3040 | "cache '%s'(%d), slabp %p(%d). Hexdump:\n", |
2989 | cachep->name, cachep->num, slabp, slabp->inuse); | 3041 | cachep->name, cachep->num, slabp, slabp->inuse); |
2990 | for (i = 0; | 3042 | print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, |
2991 | i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); | 3043 | sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), |
2992 | i++) { | 3044 | 1); |
2993 | if (i % 16 == 0) | ||
2994 | printk("\n%03x:", i); | ||
2995 | printk(" %02x", ((unsigned char *)slabp)[i]); | ||
2996 | } | ||
2997 | printk("\n"); | ||
2998 | BUG(); | 3045 | BUG(); |
2999 | } | 3046 | } |
3000 | } | 3047 | } |
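Both hunks above replace hand-rolled "%02x" loops with print_hex_dump(), the kernel's shared hexdump helper (16 bytes per line with an offset prefix, matching the old output). For reference, a plain userspace function producing the same sort of layout:

#include <stdio.h>

/* Dump 'len' bytes, 16 per line, each line prefixed with its offset. */
static void hex_dump(const void *data, size_t len)
{
        const unsigned char *p = data;
        size_t i;

        for (i = 0; i < len; i++) {
                if (i % 16 == 0)
                        printf("%s%03zx:", i ? "\n" : "", i);
                printf(" %02x", p[i]);
        }
        printf("\n");
}

int main(void)
{
        unsigned char buf[40];
        size_t i;

        for (i = 0; i < sizeof(buf); i++)
                buf[i] = (unsigned char)(i * 7);
        hex_dump(buf, sizeof(buf));
        return 0;
}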
@@ -3153,12 +3200,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3153 | objp += obj_offset(cachep); | 3200 | objp += obj_offset(cachep); |
3154 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3201 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3155 | cachep->ctor(objp); | 3202 | cachep->ctor(objp); |
3156 | #if ARCH_SLAB_MINALIGN | 3203 | if (ARCH_SLAB_MINALIGN && |
3157 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3204 | ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { |
3158 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3205 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
3159 | objp, ARCH_SLAB_MINALIGN); | 3206 | objp, (int)ARCH_SLAB_MINALIGN); |
3160 | } | 3207 | } |
3161 | #endif | ||
3162 | return objp; | 3208 | return objp; |
3163 | } | 3209 | } |
3164 | #else | 3210 | #else |
@@ -3402,7 +3448,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3402 | cache_alloc_debugcheck_before(cachep, flags); | 3448 | cache_alloc_debugcheck_before(cachep, flags); |
3403 | local_irq_save(save_flags); | 3449 | local_irq_save(save_flags); |
3404 | 3450 | ||
3405 | if (nodeid == -1) | 3451 | if (nodeid == NUMA_NO_NODE) |
3406 | nodeid = slab_node; | 3452 | nodeid = slab_node; |
3407 | 3453 | ||
3408 | if (unlikely(!cachep->nodelists[nodeid])) { | 3454 | if (unlikely(!cachep->nodelists[nodeid])) { |
@@ -3933,7 +3979,7 @@ fail: | |||
3933 | 3979 | ||
3934 | struct ccupdate_struct { | 3980 | struct ccupdate_struct { |
3935 | struct kmem_cache *cachep; | 3981 | struct kmem_cache *cachep; |
3936 | struct array_cache *new[NR_CPUS]; | 3982 | struct array_cache *new[0]; |
3937 | }; | 3983 | }; |
3938 | 3984 | ||
3939 | static void do_ccupdate_local(void *info) | 3985 | static void do_ccupdate_local(void *info) |
@@ -3955,7 +4001,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3955 | struct ccupdate_struct *new; | 4001 | struct ccupdate_struct *new; |
3956 | int i; | 4002 | int i; |
3957 | 4003 | ||
3958 | new = kzalloc(sizeof(*new), gfp); | 4004 | new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), |
4005 | gfp); | ||
3959 | if (!new) | 4006 | if (!new) |
3960 | return -ENOMEM; | 4007 | return -ENOMEM; |
3961 | 4008 | ||
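ccupdate_struct above drops its fixed NR_CPUS-sized array in favour of a zero-length trailing array sized at allocation time with nr_cpu_ids, so the allocation shrinks to what the machine actually has. The same pattern in standard C uses a flexible array member:

#include <stdio.h>
#include <stdlib.h>

struct ccupdate {
        const char *name;       /* stand-in for the kmem_cache pointer */
        void *percpu[];         /* flexible array member, one slot per CPU */
};

static struct ccupdate *ccupdate_alloc(const char *name, int nr_cpus)
{
        /* One allocation covers the header plus nr_cpus trailing slots. */
        struct ccupdate *cc = calloc(1, sizeof(*cc) +
                                        nr_cpus * sizeof(cc->percpu[0]));

        if (cc)
                cc->name = name;
        return cc;
}

int main(void)
{
        int nr_cpus = 8;        /* runtime value, analogous to nr_cpu_ids */
        struct ccupdate *cc = ccupdate_alloc("demo", nr_cpus);

        if (!cc)
                return 1;
        printf("allocated %zu bytes for %d cpu slots\n",
               sizeof(*cc) + nr_cpus * sizeof(cc->percpu[0]), nr_cpus);
        free(cc);
        return 0;
}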
@@ -4532,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = { | |||
4532 | 4579 | ||
4533 | static int __init slab_proc_init(void) | 4580 | static int __init slab_proc_init(void) |
4534 | { | 4581 | { |
4535 | proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); | 4582 | proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); |
4536 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4583 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4537 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); | 4584 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); |
4538 | #endif | 4585 | #endif |
@@ -63,14 +63,14 @@ | |||
63 | #include <linux/swap.h> /* struct reclaim_state */ | 63 | #include <linux/swap.h> /* struct reclaim_state */ |
64 | #include <linux/cache.h> | 64 | #include <linux/cache.h> |
65 | #include <linux/init.h> | 65 | #include <linux/init.h> |
66 | #include <linux/module.h> | 66 | #include <linux/export.h> |
67 | #include <linux/rcupdate.h> | 67 | #include <linux/rcupdate.h> |
68 | #include <linux/list.h> | 68 | #include <linux/list.h> |
69 | #include <linux/kmemleak.h> | 69 | #include <linux/kmemleak.h> |
70 | 70 | ||
71 | #include <trace/events/kmem.h> | 71 | #include <trace/events/kmem.h> |
72 | 72 | ||
73 | #include <asm/atomic.h> | 73 | #include <linux/atomic.h> |
74 | 74 | ||
75 | /* | 75 | /* |
76 | * slob_block has a field 'units', which indicates size of block if +ve, | 76 | * slob_block has a field 'units', which indicates size of block if +ve, |
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
482 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 482 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
483 | void *ret; | 483 | void *ret; |
484 | 484 | ||
485 | gfp &= gfp_allowed_mask; | ||
486 | |||
485 | lockdep_trace_alloc(gfp); | 487 | lockdep_trace_alloc(gfp); |
486 | 488 | ||
487 | if (size < PAGE_SIZE - align) { | 489 | if (size < PAGE_SIZE - align) { |
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
608 | { | 610 | { |
609 | void *b; | 611 | void *b; |
610 | 612 | ||
613 | flags &= gfp_allowed_mask; | ||
614 | |||
615 | lockdep_trace_alloc(flags); | ||
616 | |||
611 | if (c->size < PAGE_SIZE) { | 617 | if (c->size < PAGE_SIZE) { |
612 | b = slob_alloc(c->size, flags, c->align, node); | 618 | b = slob_alloc(c->size, flags, c->align, node); |
613 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, | 619 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, |
@@ -2,10 +2,11 @@ | |||
2 | * SLUB: A slab allocator that limits cache line use instead of queuing | 2 | * SLUB: A slab allocator that limits cache line use instead of queuing |
3 | * objects in per cpu and per node lists. | 3 | * objects in per cpu and per node lists. |
4 | * | 4 | * |
5 | * The allocator synchronizes using per slab locks and only | 5 | * The allocator synchronizes using per slab locks or atomic operations |
6 | * uses a centralized lock to manage a pool of partial slabs. | 6 | * and only uses a centralized lock to manage a pool of partial slabs. |
7 | * | 7 | * |
8 | * (C) 2007 SGI, Christoph Lameter | 8 | * (C) 2007 SGI, Christoph Lameter |
9 | * (C) 2011 Linux Foundation, Christoph Lameter | ||
9 | */ | 10 | */ |
10 | 11 | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
@@ -27,20 +28,33 @@ | |||
27 | #include <linux/memory.h> | 28 | #include <linux/memory.h> |
28 | #include <linux/math64.h> | 29 | #include <linux/math64.h> |
29 | #include <linux/fault-inject.h> | 30 | #include <linux/fault-inject.h> |
31 | #include <linux/stacktrace.h> | ||
30 | 32 | ||
31 | #include <trace/events/kmem.h> | 33 | #include <trace/events/kmem.h> |
32 | 34 | ||
33 | /* | 35 | /* |
34 | * Lock order: | 36 | * Lock order: |
35 | * 1. slab_lock(page) | 37 | * 1. slub_lock (Global Semaphore) |
36 | * 2. slab->list_lock | 38 | * 2. node->list_lock |
39 | * 3. slab_lock(page) (Only on some arches and for debugging) | ||
37 | * | 40 | * |
38 | * The slab_lock protects operations on the object of a particular | 41 | * slub_lock |
39 | * slab and its metadata in the page struct. If the slab lock | 42 | * |
40 | * has been taken then no allocations nor frees can be performed | 43 | * The role of the slub_lock is to protect the list of all the slabs |
41 | * on the objects in the slab nor can the slab be added or removed | 44 | * and to synchronize major metadata changes to slab cache structures. |
42 | * from the partial or full lists since this would mean modifying | 45 | * |
43 | * the page_struct of the slab. | 46 | * The slab_lock is only used for debugging and on arches that do not |
47 | * have the ability to do a cmpxchg_double. It only protects the second | ||
48 | * double word in the page struct. Meaning | ||
49 | * A. page->freelist -> List of object free in a page | ||
50 | * B. page->counters -> Counters of objects | ||
51 | * C. page->frozen -> frozen state | ||
52 | * | ||
53 | * If a slab is frozen then it is exempt from list management. It is not | ||
54 | * on any list. The processor that froze the slab is the one who can | ||
55 | * perform list operations on the page. Other processors may put objects | ||
56 | * onto the freelist but the processor that froze the slab is the only | ||
57 | * one that can retrieve the objects from the page's freelist. | ||
44 | * | 58 | * |
45 | * The list_lock protects the partial and full list on each node and | 59 | * The list_lock protects the partial and full list on each node and |
46 | * the partial slab counter. If taken then no new slabs may be added or | 60 | * the partial slab counter. If taken then no new slabs may be added or |
@@ -53,20 +67,6 @@ | |||
53 | * slabs, operations can continue without any centralized lock. F.e. | 67 | * slabs, operations can continue without any centralized lock. F.e. |
54 | * allocating a long series of objects that fill up slabs does not require | 68 | * allocating a long series of objects that fill up slabs does not require |
55 | * the list lock. | 69 | * the list lock. |
56 | * | ||
57 | * The lock order is sometimes inverted when we are trying to get a slab | ||
58 | * off a list. We take the list_lock and then look for a page on the list | ||
59 | * to use. While we do that objects in the slabs may be freed. We can | ||
60 | * only operate on the slab if we have also taken the slab_lock. So we use | ||
61 | * a slab_trylock() on the slab. If trylock was successful then no frees | ||
62 | * can occur anymore and we can use the slab for allocations etc. If the | ||
63 | * slab_trylock() does not succeed then frees are in progress in the slab and | ||
64 | * we must stay away from it for a while since we may cause a bouncing | ||
65 | * cacheline if we try to acquire the lock. So go onto the next slab. | ||
66 | * If all pages are busy then we may allocate a new slab instead of reusing | ||
67 | * a partial slab. A new slab has no one operating on it and thus there is | ||
68 | * no danger of cacheline contention. | ||
69 | * | ||
70 | * Interrupts are disabled during allocation and deallocation in order to | 70 | * Interrupts are disabled during allocation and deallocation in order to |
71 | * make the slab allocator safe to use in the context of an irq. In addition | 71 | * make the slab allocator safe to use in the context of an irq. In addition |
72 | * interrupts are disabled to ensure that the processor does not change | 72 | * interrupts are disabled to ensure that the processor does not change |
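
The comment block above leans on page->freelist and page->counters (inuse, objects, frozen) behaving as a single unit that changes hands as a whole. As a stand-alone illustration only — not kernel code, with field widths inferred from the text above (objects is 15 bits, hence MAX_OBJS_PER_PAGE == 32767) — a small C program that packs those counters the same way:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Illustrative model of the slab "second double word": the counters
     * share one 32-bit value so they can be swapped together with the
     * freelist pointer. The exact widths are assumptions for this demo.
     */
    union slab_counters {
            struct {
                    unsigned inuse:16;      /* objects handed out */
                    unsigned objects:15;    /* total objects in the slab */
                    unsigned frozen:1;      /* owned by one cpu, off all lists */
            };
            uint32_t raw;                   /* read/written as a single value */
    };

    int main(void)
    {
            union slab_counters c = { .raw = 0 };

            c.objects = 32;
            c.inuse = 32;           /* a freshly allocated slab starts "full"... */
            c.frozen = 1;           /* ...and frozen: exempt from list management */

            printf("inuse=%u objects=%u frozen=%u raw=0x%08x\n",
                   c.inuse, c.objects, c.frozen, (unsigned)c.raw);
            return 0;
    }

Because all three counters live in one word next to the freelist pointer, a double-word compare-and-swap can update "which objects are free" and "how many are in use / who owns the slab" in one step.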
@@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
131 | /* Enable to test recovery from slab corruption on boot */ | 131 | /* Enable to test recovery from slab corruption on boot */ |
132 | #undef SLUB_RESILIENCY_TEST | 132 | #undef SLUB_RESILIENCY_TEST |
133 | 133 | ||
134 | /* Enable to log cmpxchg failures */ | ||
135 | #undef SLUB_DEBUG_CMPXCHG | ||
136 | |||
134 | /* | 137 | /* |
135 | * Minimum number of partial slabs. These will be left on the partial | 138 | * Minimum number of partial slabs. These will be left on the partial |
136 | * lists even if they are empty. kmem_cache_shrink may reclaim them. | 139 | * lists even if they are empty. kmem_cache_shrink may reclaim them. |
@@ -166,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
166 | 169 | ||
167 | #define OO_SHIFT 16 | 170 | #define OO_SHIFT 16 |
168 | #define OO_MASK ((1 << OO_SHIFT) - 1) | 171 | #define OO_MASK ((1 << OO_SHIFT) - 1) |
169 | #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ | 172 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ |
170 | 173 | ||
171 | /* Internal SLUB flags */ | 174 | /* Internal SLUB flags */ |
172 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ | 175 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ |
176 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ | ||
173 | 177 | ||
174 | static int kmem_size = sizeof(struct kmem_cache); | 178 | static int kmem_size = sizeof(struct kmem_cache); |
175 | 179 | ||
@@ -191,8 +195,12 @@ static LIST_HEAD(slab_caches); | |||
191 | /* | 195 | /* |
192 | * Tracking user of a slab. | 196 | * Tracking user of a slab. |
193 | */ | 197 | */ |
198 | #define TRACK_ADDRS_COUNT 16 | ||
194 | struct track { | 199 | struct track { |
195 | unsigned long addr; /* Called from address */ | 200 | unsigned long addr; /* Called from address */ |
201 | #ifdef CONFIG_STACKTRACE | ||
202 | unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ | ||
203 | #endif | ||
196 | int cpu; /* Was running on cpu */ | 204 | int cpu; /* Was running on cpu */ |
197 | int pid; /* Pid context */ | 205 | int pid; /* Pid context */ |
198 | unsigned long when; /* When did the operation occur */ | 206 | unsigned long when; /* When did the operation occur */ |
@@ -338,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x) | |||
338 | return x.x & OO_MASK; | 346 | return x.x & OO_MASK; |
339 | } | 347 | } |
340 | 348 | ||
349 | /* | ||
350 | * Per slab locking using the pagelock | ||
351 | */ | ||
352 | static __always_inline void slab_lock(struct page *page) | ||
353 | { | ||
354 | bit_spin_lock(PG_locked, &page->flags); | ||
355 | } | ||
356 | |||
357 | static __always_inline void slab_unlock(struct page *page) | ||
358 | { | ||
359 | __bit_spin_unlock(PG_locked, &page->flags); | ||
360 | } | ||
361 | |||
362 | /* Interrupts must be disabled (for the fallback code to work right) */ | ||
363 | static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
364 | void *freelist_old, unsigned long counters_old, | ||
365 | void *freelist_new, unsigned long counters_new, | ||
366 | const char *n) | ||
367 | { | ||
368 | VM_BUG_ON(!irqs_disabled()); | ||
369 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
370 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
371 | if (cmpxchg_double(&page->freelist, | ||
372 | freelist_old, counters_old, | ||
373 | freelist_new, counters_new)) | ||
374 | return 1; | ||
375 | } else | ||
376 | #endif | ||
377 | { | ||
378 | slab_lock(page); | ||
379 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
380 | page->freelist = freelist_new; | ||
381 | page->counters = counters_new; | ||
382 | slab_unlock(page); | ||
383 | return 1; | ||
384 | } | ||
385 | slab_unlock(page); | ||
386 | } | ||
387 | |||
388 | cpu_relax(); | ||
389 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
390 | |||
391 | #ifdef SLUB_DEBUG_CMPXCHG | ||
392 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
393 | #endif | ||
394 | |||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
399 | void *freelist_old, unsigned long counters_old, | ||
400 | void *freelist_new, unsigned long counters_new, | ||
401 | const char *n) | ||
402 | { | ||
403 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
404 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
405 | if (cmpxchg_double(&page->freelist, | ||
406 | freelist_old, counters_old, | ||
407 | freelist_new, counters_new)) | ||
408 | return 1; | ||
409 | } else | ||
410 | #endif | ||
411 | { | ||
412 | unsigned long flags; | ||
413 | |||
414 | local_irq_save(flags); | ||
415 | slab_lock(page); | ||
416 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
417 | page->freelist = freelist_new; | ||
418 | page->counters = counters_new; | ||
419 | slab_unlock(page); | ||
420 | local_irq_restore(flags); | ||
421 | return 1; | ||
422 | } | ||
423 | slab_unlock(page); | ||
424 | local_irq_restore(flags); | ||
425 | } | ||
426 | |||
427 | cpu_relax(); | ||
428 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
429 | |||
430 | #ifdef SLUB_DEBUG_CMPXCHG | ||
431 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
432 | #endif | ||
433 | |||
434 | return 0; | ||
435 | } | ||
436 | |||
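
Both helpers above follow the same two-path protocol: one cmpxchg_double over the (freelist, counters) pair when the cache carries __CMPXCHG_DOUBLE, otherwise compare-and-update of the two words under slab_lock() (with interrupts already off in the __-variant, or saved locally in the other). A rough user-space analogue of that protocol, with the pair collapsed into a single packed 64-bit word and a mutex standing in for the page bit lock — all names and the layout are invented for the sketch:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <pthread.h>

    struct fake_slab {
            _Atomic uint64_t word;          /* packed freelist index + counters */
            pthread_mutex_t lock;           /* fallback, mirrors slab_lock() */
            bool have_fast_cmpxchg;         /* mirrors the __CMPXCHG_DOUBLE flag */
    };

    /* Returns true when the pair was swapped, false when the caller must
     * re-read the current values and retry. */
    static bool fake_cmpxchg_double(struct fake_slab *s,
                                    uint64_t old_word, uint64_t new_word)
    {
            if (s->have_fast_cmpxchg)
                    return atomic_compare_exchange_strong(&s->word,
                                                          &old_word, new_word);

            /* Slow path: "lock, compare both words, store, unlock". */
            pthread_mutex_lock(&s->lock);
            if (atomic_load_explicit(&s->word, memory_order_relaxed) == old_word) {
                    atomic_store_explicit(&s->word, new_word,
                                          memory_order_relaxed);
                    pthread_mutex_unlock(&s->lock);
                    return true;
            }
            pthread_mutex_unlock(&s->lock);
            return false;
    }

    int main(void)
    {
            struct fake_slab s = {
                    .word = 0,
                    .lock = PTHREAD_MUTEX_INITIALIZER,
                    .have_fast_cmpxchg = true,
            };
            uint64_t old, new;

            do {                    /* typical caller loop, like acquire_slab() */
                    old = atomic_load(&s.word);
                    new = old + 1;  /* "bump a counter" stands in for real edits */
            } while (!fake_cmpxchg_double(&s, old, new));

            return 0;
    }

Callers always sit in a read/modify/retry loop like the one in main(), which is why a failed attempt only costs a cpu_relax() and another pass.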
341 | #ifdef CONFIG_SLUB_DEBUG | 437 | #ifdef CONFIG_SLUB_DEBUG |
342 | /* | 438 | /* |
343 | * Determine a map of object in use on a page. | 439 | * Determine a map of object in use on a page. |
344 | * | 440 | * |
345 | * Slab lock or node listlock must be held to guarantee that the page does | 441 | * Node listlock must be held to guarantee that the page does |
346 | * not vanish from under us. | 442 | * not vanish from under us. |
347 | */ | 443 | */ |
348 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) | 444 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) |
@@ -371,34 +467,8 @@ static int disable_higher_order_debug; | |||
371 | */ | 467 | */ |
372 | static void print_section(char *text, u8 *addr, unsigned int length) | 468 | static void print_section(char *text, u8 *addr, unsigned int length) |
373 | { | 469 | { |
374 | int i, offset; | 470 | print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, |
375 | int newline = 1; | 471 | length, 1); |
376 | char ascii[17]; | ||
377 | |||
378 | ascii[16] = 0; | ||
379 | |||
380 | for (i = 0; i < length; i++) { | ||
381 | if (newline) { | ||
382 | printk(KERN_ERR "%8s 0x%p: ", text, addr + i); | ||
383 | newline = 0; | ||
384 | } | ||
385 | printk(KERN_CONT " %02x", addr[i]); | ||
386 | offset = i % 16; | ||
387 | ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; | ||
388 | if (offset == 15) { | ||
389 | printk(KERN_CONT " %s\n", ascii); | ||
390 | newline = 1; | ||
391 | } | ||
392 | } | ||
393 | if (!newline) { | ||
394 | i %= 16; | ||
395 | while (i < 16) { | ||
396 | printk(KERN_CONT " "); | ||
397 | ascii[i] = ' '; | ||
398 | i++; | ||
399 | } | ||
400 | printk(KERN_CONT " %s\n", ascii); | ||
401 | } | ||
402 | } | 472 | } |
403 | 473 | ||
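
The rewritten print_section() above hands the work to print_hex_dump() with 16 bytes per row in one-byte groups. For readers without a kernel tree handy, a small user-space routine that produces roughly the hex-plus-ASCII layout the removed hand-rolled loop emitted (the exact prefix formatting is an approximation, not the kernel's output):

    #include <stdio.h>
    #include <ctype.h>
    #include <string.h>

    /* 16 bytes per line, one-byte groups, then an ASCII column. */
    static void print_section(const char *text, const unsigned char *addr,
                              unsigned int length)
    {
            for (unsigned int i = 0; i < length; i += 16) {
                    printf("%s %p:", text, (const void *)(addr + i));
                    for (unsigned int j = i; j < i + 16 && j < length; j++)
                            printf(" %02x", addr[j]);
                    printf("  ");
                    for (unsigned int j = i; j < i + 16 && j < length; j++)
                            putchar(isgraph(addr[j]) ? addr[j] : '.');
                    putchar('\n');
            }
    }

    int main(void)
    {
            unsigned char buf[40];

            memset(buf, 0x6b, sizeof(buf));         /* POISON_FREE-style filler */
            strcpy((char *)buf, "slub object");
            print_section("Object ", buf, sizeof(buf));
            return 0;
    }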
404 | static struct track *get_track(struct kmem_cache *s, void *object, | 474 | static struct track *get_track(struct kmem_cache *s, void *object, |
@@ -420,6 +490,24 @@ static void set_track(struct kmem_cache *s, void *object, | |||
420 | struct track *p = get_track(s, object, alloc); | 490 | struct track *p = get_track(s, object, alloc); |
421 | 491 | ||
422 | if (addr) { | 492 | if (addr) { |
493 | #ifdef CONFIG_STACKTRACE | ||
494 | struct stack_trace trace; | ||
495 | int i; | ||
496 | |||
497 | trace.nr_entries = 0; | ||
498 | trace.max_entries = TRACK_ADDRS_COUNT; | ||
499 | trace.entries = p->addrs; | ||
500 | trace.skip = 3; | ||
501 | save_stack_trace(&trace); | ||
502 | |||
503 | /* See rant in lockdep.c */ | ||
504 | if (trace.nr_entries != 0 && | ||
505 | trace.entries[trace.nr_entries - 1] == ULONG_MAX) | ||
506 | trace.nr_entries--; | ||
507 | |||
508 | for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) | ||
509 | p->addrs[i] = 0; | ||
510 | #endif | ||
423 | p->addr = addr; | 511 | p->addr = addr; |
424 | p->cpu = smp_processor_id(); | 512 | p->cpu = smp_processor_id(); |
425 | p->pid = current->pid; | 513 | p->pid = current->pid; |
@@ -444,6 +532,16 @@ static void print_track(const char *s, struct track *t) | |||
444 | 532 | ||
445 | printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", | 533 | printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", |
446 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); | 534 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); |
535 | #ifdef CONFIG_STACKTRACE | ||
536 | { | ||
537 | int i; | ||
538 | for (i = 0; i < TRACK_ADDRS_COUNT; i++) | ||
539 | if (t->addrs[i]) | ||
540 | printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); | ||
541 | else | ||
542 | break; | ||
543 | } | ||
544 | #endif | ||
447 | } | 545 | } |
448 | 546 | ||
449 | static void print_tracking(struct kmem_cache *s, void *object) | 547 | static void print_tracking(struct kmem_cache *s, void *object) |
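
The set_track()/print_track() changes above record a fixed-size array of return addresses for every tracked allocation or free and print one frame per line. The same idea in user space, with glibc's backtrace()/backtrace_symbols() standing in for save_stack_trace() and %pS — a hedged sketch, not the kernel API:

    #include <execinfo.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define TRACK_ADDRS_COUNT 16

    struct track {
            void *addrs[TRACK_ADDRS_COUNT];         /* call chain, zero padded */
            int nr;
    };

    /* Capture who called us, in the spirit of the new set_track() code:
     * fill a fixed-size array of return addresses, zero the unused tail. */
    static void set_track(struct track *t)
    {
            t->nr = backtrace(t->addrs, TRACK_ADDRS_COUNT);
            for (int i = t->nr; i < TRACK_ADDRS_COUNT; i++)
                    t->addrs[i] = NULL;
    }

    /* One frame per line until the first empty slot, like print_track(). */
    static void print_track(const struct track *t)
    {
            char **syms = backtrace_symbols(t->addrs, t->nr);

            for (int i = 0; syms && i < t->nr; i++)
                    printf("\t%s\n", syms[i]);
            free(syms);
    }

    int main(void)
    {
            struct track t;

            set_track(&t);
            print_track(&t);
            return 0;
    }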
@@ -501,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
501 | p, p - addr, get_freepointer(s, p)); | 599 | p, p - addr, get_freepointer(s, p)); |
502 | 600 | ||
503 | if (p > addr + 16) | 601 | if (p > addr + 16) |
504 | print_section("Bytes b4", p - 16, 16); | 602 | print_section("Bytes b4 ", p - 16, 16); |
505 | |||
506 | print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); | ||
507 | 603 | ||
604 | print_section("Object ", p, min_t(unsigned long, s->objsize, | ||
605 | PAGE_SIZE)); | ||
508 | if (s->flags & SLAB_RED_ZONE) | 606 | if (s->flags & SLAB_RED_ZONE) |
509 | print_section("Redzone", p + s->objsize, | 607 | print_section("Redzone ", p + s->objsize, |
510 | s->inuse - s->objsize); | 608 | s->inuse - s->objsize); |
511 | 609 | ||
512 | if (s->offset) | 610 | if (s->offset) |
@@ -519,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
519 | 617 | ||
520 | if (off != s->size) | 618 | if (off != s->size) |
521 | /* Beginning of the filler is the free pointer */ | 619 | /* Beginning of the filler is the free pointer */ |
522 | print_section("Padding", p + off, s->size - off); | 620 | print_section("Padding ", p + off, s->size - off); |
523 | 621 | ||
524 | dump_stack(); | 622 | dump_stack(); |
525 | } | 623 | } |
@@ -557,17 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) | |||
557 | memset(p + s->objsize, val, s->inuse - s->objsize); | 655 | memset(p + s->objsize, val, s->inuse - s->objsize); |
558 | } | 656 | } |
559 | 657 | ||
560 | static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) | ||
561 | { | ||
562 | while (bytes) { | ||
563 | if (*start != (u8)value) | ||
564 | return start; | ||
565 | start++; | ||
566 | bytes--; | ||
567 | } | ||
568 | return NULL; | ||
569 | } | ||
570 | |||
571 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, | 658 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, |
572 | void *from, void *to) | 659 | void *from, void *to) |
573 | { | 660 | { |
@@ -582,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
582 | u8 *fault; | 669 | u8 *fault; |
583 | u8 *end; | 670 | u8 *end; |
584 | 671 | ||
585 | fault = check_bytes(start, value, bytes); | 672 | fault = memchr_inv(start, value, bytes); |
586 | if (!fault) | 673 | if (!fault) |
587 | return 1; | 674 | return 1; |
588 | 675 | ||
@@ -675,14 +762,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
675 | if (!remainder) | 762 | if (!remainder) |
676 | return 1; | 763 | return 1; |
677 | 764 | ||
678 | fault = check_bytes(end - remainder, POISON_INUSE, remainder); | 765 | fault = memchr_inv(end - remainder, POISON_INUSE, remainder); |
679 | if (!fault) | 766 | if (!fault) |
680 | return 1; | 767 | return 1; |
681 | while (end > fault && end[-1] == POISON_INUSE) | 768 | while (end > fault && end[-1] == POISON_INUSE) |
682 | end--; | 769 | end--; |
683 | 770 | ||
684 | slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); | 771 | slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); |
685 | print_section("Padding", end - remainder, remainder); | 772 | print_section("Padding ", end - remainder, remainder); |
686 | 773 | ||
687 | restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); | 774 | restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); |
688 | return 0; | 775 | return 0; |
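
check_bytes() is dropped in favour of memchr_inv(), which returns the first byte that does not equal the given value, or NULL when the whole range is clean — exactly what the open-coded helper did. A minimal user-space equivalent plus a padding check in the style of slab_pad_check():

    #include <stdio.h>
    #include <string.h>

    /* First byte that does NOT match 'value', or NULL if none. */
    static const unsigned char *my_memchr_inv(const unsigned char *start,
                                              int value, size_t bytes)
    {
            for (size_t i = 0; i < bytes; i++)
                    if (start[i] != (unsigned char)value)
                            return start + i;
            return NULL;
    }

    int main(void)
    {
            unsigned char pad[32];
            const unsigned char *fault;

            memset(pad, 0x5a, sizeof(pad));         /* POISON_INUSE-style padding */
            pad[7] = 0x00;                          /* simulated corruption */

            fault = my_memchr_inv(pad, 0x5a, sizeof(pad));
            if (fault)
                    printf("Padding overwritten at offset %td\n", fault - pad);
            return 0;
    }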
@@ -773,10 +860,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
773 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 860 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
774 | { | 861 | { |
775 | int nr = 0; | 862 | int nr = 0; |
776 | void *fp = page->freelist; | 863 | void *fp; |
777 | void *object = NULL; | 864 | void *object = NULL; |
778 | unsigned long max_objects; | 865 | unsigned long max_objects; |
779 | 866 | ||
867 | fp = page->freelist; | ||
780 | while (fp && nr <= page->objects) { | 868 | while (fp && nr <= page->objects) { |
781 | if (fp == search) | 869 | if (fp == search) |
782 | return 1; | 870 | return 1; |
@@ -830,7 +918,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, | |||
830 | page->freelist); | 918 | page->freelist); |
831 | 919 | ||
832 | if (!alloc) | 920 | if (!alloc) |
833 | print_section("Object", (void *)object, s->objsize); | 921 | print_section("Object ", (void *)object, s->objsize); |
834 | 922 | ||
835 | dump_stack(); | 923 | dump_stack(); |
836 | } | 924 | } |
@@ -881,26 +969,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
881 | 969 | ||
882 | /* | 970 | /* |
883 | * Tracking of fully allocated slabs for debugging purposes. | 971 | * Tracking of fully allocated slabs for debugging purposes. |
972 | * | ||
973 | * list_lock must be held. | ||
884 | */ | 974 | */ |
885 | static void add_full(struct kmem_cache_node *n, struct page *page) | 975 | static void add_full(struct kmem_cache *s, |
976 | struct kmem_cache_node *n, struct page *page) | ||
886 | { | 977 | { |
887 | spin_lock(&n->list_lock); | 978 | if (!(s->flags & SLAB_STORE_USER)) |
979 | return; | ||
980 | |||
888 | list_add(&page->lru, &n->full); | 981 | list_add(&page->lru, &n->full); |
889 | spin_unlock(&n->list_lock); | ||
890 | } | 982 | } |
891 | 983 | ||
984 | /* | ||
985 | * list_lock must be held. | ||
986 | */ | ||
892 | static void remove_full(struct kmem_cache *s, struct page *page) | 987 | static void remove_full(struct kmem_cache *s, struct page *page) |
893 | { | 988 | { |
894 | struct kmem_cache_node *n; | ||
895 | |||
896 | if (!(s->flags & SLAB_STORE_USER)) | 989 | if (!(s->flags & SLAB_STORE_USER)) |
897 | return; | 990 | return; |
898 | 991 | ||
899 | n = get_node(s, page_to_nid(page)); | ||
900 | |||
901 | spin_lock(&n->list_lock); | ||
902 | list_del(&page->lru); | 992 | list_del(&page->lru); |
903 | spin_unlock(&n->list_lock); | ||
904 | } | 993 | } |
905 | 994 | ||
906 | /* Tracking of the number of slabs for debugging purposes */ | 995 | /* Tracking of the number of slabs for debugging purposes */ |
@@ -956,11 +1045,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa | |||
956 | if (!check_slab(s, page)) | 1045 | if (!check_slab(s, page)) |
957 | goto bad; | 1046 | goto bad; |
958 | 1047 | ||
959 | if (!on_freelist(s, page, object)) { | ||
960 | object_err(s, page, object, "Object already allocated"); | ||
961 | goto bad; | ||
962 | } | ||
963 | |||
964 | if (!check_valid_pointer(s, page, object)) { | 1048 | if (!check_valid_pointer(s, page, object)) { |
965 | object_err(s, page, object, "Freelist Pointer check fails"); | 1049 | object_err(s, page, object, "Freelist Pointer check fails"); |
966 | goto bad; | 1050 | goto bad; |
@@ -993,6 +1077,12 @@ bad: | |||
993 | static noinline int free_debug_processing(struct kmem_cache *s, | 1077 | static noinline int free_debug_processing(struct kmem_cache *s, |
994 | struct page *page, void *object, unsigned long addr) | 1078 | struct page *page, void *object, unsigned long addr) |
995 | { | 1079 | { |
1080 | unsigned long flags; | ||
1081 | int rc = 0; | ||
1082 | |||
1083 | local_irq_save(flags); | ||
1084 | slab_lock(page); | ||
1085 | |||
996 | if (!check_slab(s, page)) | 1086 | if (!check_slab(s, page)) |
997 | goto fail; | 1087 | goto fail; |
998 | 1088 | ||
@@ -1007,7 +1097,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1007 | } | 1097 | } |
1008 | 1098 | ||
1009 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) | 1099 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) |
1010 | return 0; | 1100 | goto out; |
1011 | 1101 | ||
1012 | if (unlikely(s != page->slab)) { | 1102 | if (unlikely(s != page->slab)) { |
1013 | if (!PageSlab(page)) { | 1103 | if (!PageSlab(page)) { |
@@ -1024,18 +1114,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1024 | goto fail; | 1114 | goto fail; |
1025 | } | 1115 | } |
1026 | 1116 | ||
1027 | /* Special debug activities for freeing objects */ | ||
1028 | if (!PageSlubFrozen(page) && !page->freelist) | ||
1029 | remove_full(s, page); | ||
1030 | if (s->flags & SLAB_STORE_USER) | 1117 | if (s->flags & SLAB_STORE_USER) |
1031 | set_track(s, object, TRACK_FREE, addr); | 1118 | set_track(s, object, TRACK_FREE, addr); |
1032 | trace(s, page, object, 0); | 1119 | trace(s, page, object, 0); |
1033 | init_object(s, object, SLUB_RED_INACTIVE); | 1120 | init_object(s, object, SLUB_RED_INACTIVE); |
1034 | return 1; | 1121 | rc = 1; |
1122 | out: | ||
1123 | slab_unlock(page); | ||
1124 | local_irq_restore(flags); | ||
1125 | return rc; | ||
1035 | 1126 | ||
1036 | fail: | 1127 | fail: |
1037 | slab_fix(s, "Object at 0x%p not freed", object); | 1128 | slab_fix(s, "Object at 0x%p not freed", object); |
1038 | return 0; | 1129 | goto out; |
1039 | } | 1130 | } |
1040 | 1131 | ||
1041 | static int __init setup_slub_debug(char *str) | 1132 | static int __init setup_slub_debug(char *str) |
@@ -1135,7 +1226,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
1135 | { return 1; } | 1226 | { return 1; } |
1136 | static inline int check_object(struct kmem_cache *s, struct page *page, | 1227 | static inline int check_object(struct kmem_cache *s, struct page *page, |
1137 | void *object, u8 val) { return 1; } | 1228 | void *object, u8 val) { return 1; } |
1138 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1229 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1230 | struct page *page) {} | ||
1231 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | ||
1139 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1232 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1140 | unsigned long flags, const char *name, | 1233 | unsigned long flags, const char *name, |
1141 | void (*ctor)(void *)) | 1234 | void (*ctor)(void *)) |
@@ -1187,6 +1280,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1187 | struct kmem_cache_order_objects oo = s->oo; | 1280 | struct kmem_cache_order_objects oo = s->oo; |
1188 | gfp_t alloc_gfp; | 1281 | gfp_t alloc_gfp; |
1189 | 1282 | ||
1283 | flags &= gfp_allowed_mask; | ||
1284 | |||
1285 | if (flags & __GFP_WAIT) | ||
1286 | local_irq_enable(); | ||
1287 | |||
1190 | flags |= s->allocflags; | 1288 | flags |= s->allocflags; |
1191 | 1289 | ||
1192 | /* | 1290 | /* |
@@ -1203,12 +1301,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1203 | * Try a lower order alloc if possible | 1301 | * Try a lower order alloc if possible |
1204 | */ | 1302 | */ |
1205 | page = alloc_slab_page(flags, node, oo); | 1303 | page = alloc_slab_page(flags, node, oo); |
1206 | if (!page) | ||
1207 | return NULL; | ||
1208 | 1304 | ||
1209 | stat(s, ORDER_FALLBACK); | 1305 | if (page) |
1306 | stat(s, ORDER_FALLBACK); | ||
1210 | } | 1307 | } |
1211 | 1308 | ||
1309 | if (flags & __GFP_WAIT) | ||
1310 | local_irq_disable(); | ||
1311 | |||
1312 | if (!page) | ||
1313 | return NULL; | ||
1314 | |||
1212 | if (kmemcheck_enabled | 1315 | if (kmemcheck_enabled |
1213 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | 1316 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1214 | int pages = 1 << oo_order(oo); | 1317 | int pages = 1 << oo_order(oo); |
@@ -1275,7 +1378,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1275 | set_freepointer(s, last, NULL); | 1378 | set_freepointer(s, last, NULL); |
1276 | 1379 | ||
1277 | page->freelist = start; | 1380 | page->freelist = start; |
1278 | page->inuse = 0; | 1381 | page->inuse = page->objects; |
1382 | page->frozen = 1; | ||
1279 | out: | 1383 | out: |
1280 | return page; | 1384 | return page; |
1281 | } | 1385 | } |
@@ -1353,79 +1457,80 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
1353 | } | 1457 | } |
1354 | 1458 | ||
1355 | /* | 1459 | /* |
1356 | * Per slab locking using the pagelock | 1460 | * Management of partially allocated slabs. |
1357 | */ | 1461 | * |
1358 | static __always_inline void slab_lock(struct page *page) | 1462 | * list_lock must be held. |
1359 | { | ||
1360 | bit_spin_lock(PG_locked, &page->flags); | ||
1361 | } | ||
1362 | |||
1363 | static __always_inline void slab_unlock(struct page *page) | ||
1364 | { | ||
1365 | __bit_spin_unlock(PG_locked, &page->flags); | ||
1366 | } | ||
1367 | |||
1368 | static __always_inline int slab_trylock(struct page *page) | ||
1369 | { | ||
1370 | int rc = 1; | ||
1371 | |||
1372 | rc = bit_spin_trylock(PG_locked, &page->flags); | ||
1373 | return rc; | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * Management of partially allocated slabs | ||
1378 | */ | 1463 | */ |
1379 | static void add_partial(struct kmem_cache_node *n, | 1464 | static inline void add_partial(struct kmem_cache_node *n, |
1380 | struct page *page, int tail) | 1465 | struct page *page, int tail) |
1381 | { | 1466 | { |
1382 | spin_lock(&n->list_lock); | ||
1383 | n->nr_partial++; | 1467 | n->nr_partial++; |
1384 | if (tail) | 1468 | if (tail == DEACTIVATE_TO_TAIL) |
1385 | list_add_tail(&page->lru, &n->partial); | 1469 | list_add_tail(&page->lru, &n->partial); |
1386 | else | 1470 | else |
1387 | list_add(&page->lru, &n->partial); | 1471 | list_add(&page->lru, &n->partial); |
1388 | spin_unlock(&n->list_lock); | ||
1389 | } | 1472 | } |
1390 | 1473 | ||
1391 | static inline void __remove_partial(struct kmem_cache_node *n, | 1474 | /* |
1475 | * list_lock must be held. | ||
1476 | */ | ||
1477 | static inline void remove_partial(struct kmem_cache_node *n, | ||
1392 | struct page *page) | 1478 | struct page *page) |
1393 | { | 1479 | { |
1394 | list_del(&page->lru); | 1480 | list_del(&page->lru); |
1395 | n->nr_partial--; | 1481 | n->nr_partial--; |
1396 | } | 1482 | } |
1397 | 1483 | ||
1398 | static void remove_partial(struct kmem_cache *s, struct page *page) | ||
1399 | { | ||
1400 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1401 | |||
1402 | spin_lock(&n->list_lock); | ||
1403 | __remove_partial(n, page); | ||
1404 | spin_unlock(&n->list_lock); | ||
1405 | } | ||
1406 | |||
1407 | /* | 1484 | /* |
1408 | * Lock slab and remove from the partial list. | 1485 | * Lock slab, remove from the partial list and put the object into the |
1486 | * per cpu freelist. | ||
1487 | * | ||
1488 | * Returns a list of objects or NULL if it fails. | ||
1409 | * | 1489 | * |
1410 | * Must hold list_lock. | 1490 | * Must hold list_lock. |
1411 | */ | 1491 | */ |
1412 | static inline int lock_and_freeze_slab(struct kmem_cache_node *n, | 1492 | static inline void *acquire_slab(struct kmem_cache *s, |
1413 | struct page *page) | 1493 | struct kmem_cache_node *n, struct page *page, |
1494 | int mode) | ||
1414 | { | 1495 | { |
1415 | if (slab_trylock(page)) { | 1496 | void *freelist; |
1416 | __remove_partial(n, page); | 1497 | unsigned long counters; |
1417 | __SetPageSlubFrozen(page); | 1498 | struct page new; |
1418 | return 1; | 1499 | |
1419 | } | 1500 | /* |
1420 | return 0; | 1501 | * Zap the freelist and set the frozen bit. |
1502 | * The old freelist is the list of objects for the | ||
1503 | * per cpu allocation list. | ||
1504 | */ | ||
1505 | do { | ||
1506 | freelist = page->freelist; | ||
1507 | counters = page->counters; | ||
1508 | new.counters = counters; | ||
1509 | if (mode) | ||
1510 | new.inuse = page->objects; | ||
1511 | |||
1512 | VM_BUG_ON(new.frozen); | ||
1513 | new.frozen = 1; | ||
1514 | |||
1515 | } while (!__cmpxchg_double_slab(s, page, | ||
1516 | freelist, counters, | ||
1517 | NULL, new.counters, | ||
1518 | "lock and freeze")); | ||
1519 | |||
1520 | remove_partial(n, page); | ||
1521 | return freelist; | ||
1421 | } | 1522 | } |
1422 | 1523 | ||
1524 | static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); | ||
1525 | |||
1423 | /* | 1526 | /* |
1424 | * Try to allocate a partial slab from a specific node. | 1527 | * Try to allocate a partial slab from a specific node. |
1425 | */ | 1528 | */ |
1426 | static struct page *get_partial_node(struct kmem_cache_node *n) | 1529 | static void *get_partial_node(struct kmem_cache *s, |
1530 | struct kmem_cache_node *n, struct kmem_cache_cpu *c) | ||
1427 | { | 1531 | { |
1428 | struct page *page; | 1532 | struct page *page, *page2; |
1533 | void *object = NULL; | ||
1429 | 1534 | ||
1430 | /* | 1535 | /* |
1431 | * Racy check. If we mistakenly see no partial slabs then we | 1536 | * Racy check. If we mistakenly see no partial slabs then we |
@@ -1437,26 +1542,43 @@ static struct page *get_partial_node(struct kmem_cache_node *n) | |||
1437 | return NULL; | 1542 | return NULL; |
1438 | 1543 | ||
1439 | spin_lock(&n->list_lock); | 1544 | spin_lock(&n->list_lock); |
1440 | list_for_each_entry(page, &n->partial, lru) | 1545 | list_for_each_entry_safe(page, page2, &n->partial, lru) { |
1441 | if (lock_and_freeze_slab(n, page)) | 1546 | void *t = acquire_slab(s, n, page, object == NULL); |
1442 | goto out; | 1547 | int available; |
1443 | page = NULL; | 1548 | |
1444 | out: | 1549 | if (!t) |
1550 | break; | ||
1551 | |||
1552 | if (!object) { | ||
1553 | c->page = page; | ||
1554 | c->node = page_to_nid(page); | ||
1555 | stat(s, ALLOC_FROM_PARTIAL); | ||
1556 | object = t; | ||
1557 | available = page->objects - page->inuse; | ||
1558 | } else { | ||
1559 | page->freelist = t; | ||
1560 | available = put_cpu_partial(s, page, 0); | ||
1561 | } | ||
1562 | if (kmem_cache_debug(s) || available > s->cpu_partial / 2) | ||
1563 | break; | ||
1564 | |||
1565 | } | ||
1445 | spin_unlock(&n->list_lock); | 1566 | spin_unlock(&n->list_lock); |
1446 | return page; | 1567 | return object; |
1447 | } | 1568 | } |
1448 | 1569 | ||
1449 | /* | 1570 | /* |
1450 | * Get a page from somewhere. Search in increasing NUMA distances. | 1571 | * Get a page from somewhere. Search in increasing NUMA distances. |
1451 | */ | 1572 | */ |
1452 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | 1573 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, |
1574 | struct kmem_cache_cpu *c) | ||
1453 | { | 1575 | { |
1454 | #ifdef CONFIG_NUMA | 1576 | #ifdef CONFIG_NUMA |
1455 | struct zonelist *zonelist; | 1577 | struct zonelist *zonelist; |
1456 | struct zoneref *z; | 1578 | struct zoneref *z; |
1457 | struct zone *zone; | 1579 | struct zone *zone; |
1458 | enum zone_type high_zoneidx = gfp_zone(flags); | 1580 | enum zone_type high_zoneidx = gfp_zone(flags); |
1459 | struct page *page; | 1581 | void *object; |
1460 | 1582 | ||
1461 | /* | 1583 | /* |
1462 | * The defrag ratio allows a configuration of the tradeoffs between | 1584 | * The defrag ratio allows a configuration of the tradeoffs between |
@@ -1489,10 +1611,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1489 | 1611 | ||
1490 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1612 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1491 | n->nr_partial > s->min_partial) { | 1613 | n->nr_partial > s->min_partial) { |
1492 | page = get_partial_node(n); | 1614 | object = get_partial_node(s, n, c); |
1493 | if (page) { | 1615 | if (object) { |
1494 | put_mems_allowed(); | 1616 | put_mems_allowed(); |
1495 | return page; | 1617 | return object; |
1496 | } | 1618 | } |
1497 | } | 1619 | } |
1498 | } | 1620 | } |
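
acquire_slab() above zaps page->freelist while setting the frozen bit, so the whole chain of free objects changes hands in one atomic step and can then be walked by the freezing cpu without further locking. A user-space model of that "take over the entire freelist at once" step, using a plain atomic exchange; the data structures are invented for the sketch:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct object {
            struct object *next;
            int id;
    };

    /* The whole chain of free objects is taken over in one atomic
     * exchange and then belongs to a single owner, so it can be walked
     * without any further synchronization. */
    static struct object *acquire_all(struct object *_Atomic *freelist)
    {
            return atomic_exchange(freelist, (struct object *)NULL);
    }

    int main(void)
    {
            struct object a = { NULL, 1 }, b = { &a, 2 }, c = { &b, 3 };
            struct object *_Atomic freelist = &c;

            for (struct object *p = acquire_all(&freelist); p; p = p->next)
                    printf("got object %d\n", p->id);
            return 0;
    }

get_partial_node() then keeps harvesting additional slabs this way until the debug case or the cpu_partial/2 threshold tells it to stop.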
@@ -1504,63 +1626,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1504 | /* | 1626 | /* |
1505 | * Get a partial page, lock it and return it. | 1627 | * Get a partial page, lock it and return it. |
1506 | */ | 1628 | */ |
1507 | static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | 1629 | static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, |
1630 | struct kmem_cache_cpu *c) | ||
1508 | { | 1631 | { |
1509 | struct page *page; | 1632 | void *object; |
1510 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; | 1633 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; |
1511 | 1634 | ||
1512 | page = get_partial_node(get_node(s, searchnode)); | 1635 | object = get_partial_node(s, get_node(s, searchnode), c); |
1513 | if (page || node != NUMA_NO_NODE) | 1636 | if (object || node != NUMA_NO_NODE) |
1514 | return page; | 1637 | return object; |
1515 | |||
1516 | return get_any_partial(s, flags); | ||
1517 | } | ||
1518 | |||
1519 | /* | ||
1520 | * Move a page back to the lists. | ||
1521 | * | ||
1522 | * Must be called with the slab lock held. | ||
1523 | * | ||
1524 | * On exit the slab lock will have been dropped. | ||
1525 | */ | ||
1526 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | ||
1527 | __releases(bitlock) | ||
1528 | { | ||
1529 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1530 | |||
1531 | __ClearPageSlubFrozen(page); | ||
1532 | if (page->inuse) { | ||
1533 | 1638 | ||
1534 | if (page->freelist) { | 1639 | return get_any_partial(s, flags, c); |
1535 | add_partial(n, page, tail); | ||
1536 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | ||
1537 | } else { | ||
1538 | stat(s, DEACTIVATE_FULL); | ||
1539 | if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) | ||
1540 | add_full(n, page); | ||
1541 | } | ||
1542 | slab_unlock(page); | ||
1543 | } else { | ||
1544 | stat(s, DEACTIVATE_EMPTY); | ||
1545 | if (n->nr_partial < s->min_partial) { | ||
1546 | /* | ||
1547 | * Adding an empty slab to the partial slabs in order | ||
1548 | * to avoid page allocator overhead. This slab needs | ||
1549 | * to come after the other slabs with objects in | ||
1550 | * so that the others get filled first. That way the | ||
1551 | * size of the partial list stays small. | ||
1552 | * | ||
1553 | * kmem_cache_shrink can reclaim any empty slabs from | ||
1554 | * the partial list. | ||
1555 | */ | ||
1556 | add_partial(n, page, 1); | ||
1557 | slab_unlock(page); | ||
1558 | } else { | ||
1559 | slab_unlock(page); | ||
1560 | stat(s, FREE_SLAB); | ||
1561 | discard_slab(s, page); | ||
1562 | } | ||
1563 | } | ||
1564 | } | 1640 | } |
1565 | 1641 | ||
1566 | #ifdef CONFIG_PREEMPT | 1642 | #ifdef CONFIG_PREEMPT |
@@ -1629,45 +1705,278 @@ void init_kmem_cache_cpus(struct kmem_cache *s) | |||
1629 | for_each_possible_cpu(cpu) | 1705 | for_each_possible_cpu(cpu) |
1630 | per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); | 1706 | per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); |
1631 | } | 1707 | } |
1708 | |||
1632 | /* | 1709 | /* |
1633 | * Remove the cpu slab | 1710 | * Remove the cpu slab |
1634 | */ | 1711 | */ |
1635 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1712 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1636 | __releases(bitlock) | ||
1637 | { | 1713 | { |
1714 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; | ||
1638 | struct page *page = c->page; | 1715 | struct page *page = c->page; |
1639 | int tail = 1; | 1716 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1640 | 1717 | int lock = 0; | |
1641 | if (page->freelist) | 1718 | enum slab_modes l = M_NONE, m = M_NONE; |
1719 | void *freelist; | ||
1720 | void *nextfree; | ||
1721 | int tail = DEACTIVATE_TO_HEAD; | ||
1722 | struct page new; | ||
1723 | struct page old; | ||
1724 | |||
1725 | if (page->freelist) { | ||
1642 | stat(s, DEACTIVATE_REMOTE_FREES); | 1726 | stat(s, DEACTIVATE_REMOTE_FREES); |
1727 | tail = DEACTIVATE_TO_TAIL; | ||
1728 | } | ||
1729 | |||
1730 | c->tid = next_tid(c->tid); | ||
1731 | c->page = NULL; | ||
1732 | freelist = c->freelist; | ||
1733 | c->freelist = NULL; | ||
1734 | |||
1735 | /* | ||
1736 | * Stage one: Free all available per cpu objects back | ||
1737 | * to the page freelist while it is still frozen. Leave the | ||
1738 | * last one. | ||
1739 | * | ||
1740 | * There is no need to take the list->lock because the page | ||
1741 | * is still frozen. | ||
1742 | */ | ||
1743 | while (freelist && (nextfree = get_freepointer(s, freelist))) { | ||
1744 | void *prior; | ||
1745 | unsigned long counters; | ||
1746 | |||
1747 | do { | ||
1748 | prior = page->freelist; | ||
1749 | counters = page->counters; | ||
1750 | set_freepointer(s, freelist, prior); | ||
1751 | new.counters = counters; | ||
1752 | new.inuse--; | ||
1753 | VM_BUG_ON(!new.frozen); | ||
1754 | |||
1755 | } while (!__cmpxchg_double_slab(s, page, | ||
1756 | prior, counters, | ||
1757 | freelist, new.counters, | ||
1758 | "drain percpu freelist")); | ||
1759 | |||
1760 | freelist = nextfree; | ||
1761 | } | ||
1762 | |||
1643 | /* | 1763 | /* |
1644 | * Merge cpu freelist into slab freelist. Typically we get here | 1764 | * Stage two: Ensure that the page is unfrozen while the |
1645 | * because both freelists are empty. So this is unlikely | 1765 | * list presence reflects the actual number of objects |
1646 | * to occur. | 1766 | * during unfreeze. |
1767 | * | ||
1768 | * We setup the list membership and then perform a cmpxchg | ||
1769 | * with the count. If there is a mismatch then the page | ||
1770 | * is not unfrozen but the page is on the wrong list. | ||
1771 | * | ||
1772 | * Then we restart the process which may have to remove | ||
1773 | * the page from the list that we just put it on again | ||
1774 | * because the number of objects in the slab may have | ||
1775 | * changed. | ||
1647 | */ | 1776 | */ |
1648 | while (unlikely(c->freelist)) { | 1777 | redo: |
1649 | void **object; | 1778 | |
1779 | old.freelist = page->freelist; | ||
1780 | old.counters = page->counters; | ||
1781 | VM_BUG_ON(!old.frozen); | ||
1650 | 1782 | ||
1651 | tail = 0; /* Hot objects. Put the slab first */ | 1783 | /* Determine target state of the slab */ |
1784 | new.counters = old.counters; | ||
1785 | if (freelist) { | ||
1786 | new.inuse--; | ||
1787 | set_freepointer(s, freelist, old.freelist); | ||
1788 | new.freelist = freelist; | ||
1789 | } else | ||
1790 | new.freelist = old.freelist; | ||
1652 | 1791 | ||
1653 | /* Retrieve object from cpu_freelist */ | 1792 | new.frozen = 0; |
1654 | object = c->freelist; | ||
1655 | c->freelist = get_freepointer(s, c->freelist); | ||
1656 | 1793 | ||
1657 | /* And put onto the regular freelist */ | 1794 | if (!new.inuse && n->nr_partial > s->min_partial) |
1658 | set_freepointer(s, object, page->freelist); | 1795 | m = M_FREE; |
1659 | page->freelist = object; | 1796 | else if (new.freelist) { |
1660 | page->inuse--; | 1797 | m = M_PARTIAL; |
1798 | if (!lock) { | ||
1799 | lock = 1; | ||
1800 | /* | ||
1801 | * Taking the spinlock removes the possibility | ||
1802 | * that acquire_slab() will see a slab page that | ||
1803 | * is frozen | ||
1804 | */ | ||
1805 | spin_lock(&n->list_lock); | ||
1806 | } | ||
1807 | } else { | ||
1808 | m = M_FULL; | ||
1809 | if (kmem_cache_debug(s) && !lock) { | ||
1810 | lock = 1; | ||
1811 | /* | ||
1812 | * This also ensures that the scanning of full | ||
1813 | * slabs from diagnostic functions will not see | ||
1814 | * any frozen slabs. | ||
1815 | */ | ||
1816 | spin_lock(&n->list_lock); | ||
1817 | } | ||
1818 | } | ||
1819 | |||
1820 | if (l != m) { | ||
1821 | |||
1822 | if (l == M_PARTIAL) | ||
1823 | |||
1824 | remove_partial(n, page); | ||
1825 | |||
1826 | else if (l == M_FULL) | ||
1827 | |||
1828 | remove_full(s, page); | ||
1829 | |||
1830 | if (m == M_PARTIAL) { | ||
1831 | |||
1832 | add_partial(n, page, tail); | ||
1833 | stat(s, tail); | ||
1834 | |||
1835 | } else if (m == M_FULL) { | ||
1836 | |||
1837 | stat(s, DEACTIVATE_FULL); | ||
1838 | add_full(s, n, page); | ||
1839 | |||
1840 | } | ||
1841 | } | ||
1842 | |||
1843 | l = m; | ||
1844 | if (!__cmpxchg_double_slab(s, page, | ||
1845 | old.freelist, old.counters, | ||
1846 | new.freelist, new.counters, | ||
1847 | "unfreezing slab")) | ||
1848 | goto redo; | ||
1849 | |||
1850 | if (lock) | ||
1851 | spin_unlock(&n->list_lock); | ||
1852 | |||
1853 | if (m == M_FREE) { | ||
1854 | stat(s, DEACTIVATE_EMPTY); | ||
1855 | discard_slab(s, page); | ||
1856 | stat(s, FREE_SLAB); | ||
1661 | } | 1857 | } |
1662 | c->page = NULL; | 1858 | } |
1663 | c->tid = next_tid(c->tid); | 1859 | |
1664 | unfreeze_slab(s, page, tail); | 1860 | /* Unfreeze all the cpu partial slabs */ |
1861 | static void unfreeze_partials(struct kmem_cache *s) | ||
1862 | { | ||
1863 | struct kmem_cache_node *n = NULL; | ||
1864 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); | ||
1865 | struct page *page; | ||
1866 | |||
1867 | while ((page = c->partial)) { | ||
1868 | enum slab_modes { M_PARTIAL, M_FREE }; | ||
1869 | enum slab_modes l, m; | ||
1870 | struct page new; | ||
1871 | struct page old; | ||
1872 | |||
1873 | c->partial = page->next; | ||
1874 | l = M_FREE; | ||
1875 | |||
1876 | do { | ||
1877 | |||
1878 | old.freelist = page->freelist; | ||
1879 | old.counters = page->counters; | ||
1880 | VM_BUG_ON(!old.frozen); | ||
1881 | |||
1882 | new.counters = old.counters; | ||
1883 | new.freelist = old.freelist; | ||
1884 | |||
1885 | new.frozen = 0; | ||
1886 | |||
1887 | if (!new.inuse && (!n || n->nr_partial > s->min_partial)) | ||
1888 | m = M_FREE; | ||
1889 | else { | ||
1890 | struct kmem_cache_node *n2 = get_node(s, | ||
1891 | page_to_nid(page)); | ||
1892 | |||
1893 | m = M_PARTIAL; | ||
1894 | if (n != n2) { | ||
1895 | if (n) | ||
1896 | spin_unlock(&n->list_lock); | ||
1897 | |||
1898 | n = n2; | ||
1899 | spin_lock(&n->list_lock); | ||
1900 | } | ||
1901 | } | ||
1902 | |||
1903 | if (l != m) { | ||
1904 | if (l == M_PARTIAL) | ||
1905 | remove_partial(n, page); | ||
1906 | else | ||
1907 | add_partial(n, page, 1); | ||
1908 | |||
1909 | l = m; | ||
1910 | } | ||
1911 | |||
1912 | } while (!cmpxchg_double_slab(s, page, | ||
1913 | old.freelist, old.counters, | ||
1914 | new.freelist, new.counters, | ||
1915 | "unfreezing slab")); | ||
1916 | |||
1917 | if (m == M_FREE) { | ||
1918 | stat(s, DEACTIVATE_EMPTY); | ||
1919 | discard_slab(s, page); | ||
1920 | stat(s, FREE_SLAB); | ||
1921 | } | ||
1922 | } | ||
1923 | |||
1924 | if (n) | ||
1925 | spin_unlock(&n->list_lock); | ||
1926 | } | ||
1927 | |||
1928 | /* | ||
1929 | * Put a page that was just frozen (in __slab_free) into a partial page | ||
1930 | * slot if available. This is done without interrupts disabled and without | ||
1931 | * preemption disabled. The cmpxchg is racy and may put the partial page | ||
1932 | * onto a random cpu's partial slot. | ||
1933 | * | ||
1934 | * If we did not find a slot then simply move all the partials to the | ||
1935 | * per node partial list. | ||
1936 | */ | ||
1937 | int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | ||
1938 | { | ||
1939 | struct page *oldpage; | ||
1940 | int pages; | ||
1941 | int pobjects; | ||
1942 | |||
1943 | do { | ||
1944 | pages = 0; | ||
1945 | pobjects = 0; | ||
1946 | oldpage = this_cpu_read(s->cpu_slab->partial); | ||
1947 | |||
1948 | if (oldpage) { | ||
1949 | pobjects = oldpage->pobjects; | ||
1950 | pages = oldpage->pages; | ||
1951 | if (drain && pobjects > s->cpu_partial) { | ||
1952 | unsigned long flags; | ||
1953 | /* | ||
1954 | * partial array is full. Move the existing | ||
1955 | * set to the per node partial list. | ||
1956 | */ | ||
1957 | local_irq_save(flags); | ||
1958 | unfreeze_partials(s); | ||
1959 | local_irq_restore(flags); | ||
1960 | pobjects = 0; | ||
1961 | pages = 0; | ||
1962 | } | ||
1963 | } | ||
1964 | |||
1965 | pages++; | ||
1966 | pobjects += page->objects - page->inuse; | ||
1967 | |||
1968 | page->pages = pages; | ||
1969 | page->pobjects = pobjects; | ||
1970 | page->next = oldpage; | ||
1971 | |||
1972 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); | ||
1973 | stat(s, CPU_PARTIAL_FREE); | ||
1974 | return pobjects; | ||
1665 | } | 1975 | } |
1666 | 1976 | ||
1667 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1977 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1668 | { | 1978 | { |
1669 | stat(s, CPUSLAB_FLUSH); | 1979 | stat(s, CPUSLAB_FLUSH); |
1670 | slab_lock(c->page); | ||
1671 | deactivate_slab(s, c); | 1980 | deactivate_slab(s, c); |
1672 | } | 1981 | } |
1673 | 1982 | ||
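
put_cpu_partial() above pushes a just-frozen page onto the per-cpu partial list with a bare cmpxchg on the head pointer, carrying accumulated pages/pobjects so the overflow check stays cheap. A simplified single-list user-space model of that push — no per-cpu arrays and no draining, names invented for the sketch:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct partial_page {
            struct partial_page *next;
            int pobjects;           /* running total, like page->pobjects */
    };

    static struct partial_page *_Atomic cpu_partial_head;

    /* Push with a compare-and-swap on the head pointer; the cumulative
     * object count rides along in the newly pushed page. */
    static int put_cpu_partial(struct partial_page *page, int free_objects)
    {
            struct partial_page *oldpage;

            do {
                    oldpage = atomic_load(&cpu_partial_head);
                    page->pobjects = free_objects +
                                     (oldpage ? oldpage->pobjects : 0);
                    page->next = oldpage;
            } while (!atomic_compare_exchange_weak(&cpu_partial_head,
                                                   &oldpage, page));
            return page->pobjects;
    }

    int main(void)
    {
            struct partial_page a = { 0 }, b = { 0 };

            printf("pobjects=%d\n", put_cpu_partial(&a, 5));
            printf("pobjects=%d\n", put_cpu_partial(&b, 3));
            return 0;
    }

In the kernel version the returned total is what triggers unfreeze_partials() once it exceeds s->cpu_partial.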
@@ -1680,8 +1989,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | |||
1680 | { | 1989 | { |
1681 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); | 1990 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
1682 | 1991 | ||
1683 | if (likely(c && c->page)) | 1992 | if (likely(c)) { |
1684 | flush_slab(s, c); | 1993 | if (c->page) |
1994 | flush_slab(s, c); | ||
1995 | |||
1996 | unfreeze_partials(s); | ||
1997 | } | ||
1685 | } | 1998 | } |
1686 | 1999 | ||
1687 | static void flush_cpu_slab(void *d) | 2000 | static void flush_cpu_slab(void *d) |
@@ -1772,12 +2085,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
1772 | } | 2085 | } |
1773 | } | 2086 | } |
1774 | 2087 | ||
2088 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | ||
2089 | int node, struct kmem_cache_cpu **pc) | ||
2090 | { | ||
2091 | void *object; | ||
2092 | struct kmem_cache_cpu *c; | ||
2093 | struct page *page = new_slab(s, flags, node); | ||
2094 | |||
2095 | if (page) { | ||
2096 | c = __this_cpu_ptr(s->cpu_slab); | ||
2097 | if (c->page) | ||
2098 | flush_slab(s, c); | ||
2099 | |||
2100 | /* | ||
2101 | * No other reference to the page yet so we can | ||
2102 | * muck around with it freely without cmpxchg | ||
2103 | */ | ||
2104 | object = page->freelist; | ||
2105 | page->freelist = NULL; | ||
2106 | |||
2107 | stat(s, ALLOC_SLAB); | ||
2108 | c->node = page_to_nid(page); | ||
2109 | c->page = page; | ||
2110 | *pc = c; | ||
2111 | } else | ||
2112 | object = NULL; | ||
2113 | |||
2114 | return object; | ||
2115 | } | ||
2116 | |||
1775 | /* | 2117 | /* |
1776 | * Slow path. The lockless freelist is empty or we need to perform | 2118 | * Slow path. The lockless freelist is empty or we need to perform |
1777 | * debugging duties. | 2119 | * debugging duties. |
1778 | * | 2120 | * |
1779 | * Interrupts are disabled. | ||
1780 | * | ||
1781 | * Processing is still very fast if new objects have been freed to the | 2121 | * Processing is still very fast if new objects have been freed to the |
1782 | * regular freelist. In that case we simply take over the regular freelist | 2122 | * regular freelist. In that case we simply take over the regular freelist |
1783 | * as the lockless freelist and zap the regular freelist. | 2123 | * as the lockless freelist and zap the regular freelist. |
@@ -1794,8 +2134,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1794 | unsigned long addr, struct kmem_cache_cpu *c) | 2134 | unsigned long addr, struct kmem_cache_cpu *c) |
1795 | { | 2135 | { |
1796 | void **object; | 2136 | void **object; |
1797 | struct page *page; | ||
1798 | unsigned long flags; | 2137 | unsigned long flags; |
2138 | struct page new; | ||
2139 | unsigned long counters; | ||
1799 | 2140 | ||
1800 | local_irq_save(flags); | 2141 | local_irq_save(flags); |
1801 | #ifdef CONFIG_PREEMPT | 2142 | #ifdef CONFIG_PREEMPT |
@@ -1807,81 +2148,91 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1807 | c = this_cpu_ptr(s->cpu_slab); | 2148 | c = this_cpu_ptr(s->cpu_slab); |
1808 | #endif | 2149 | #endif |
1809 | 2150 | ||
1810 | /* We handle __GFP_ZERO in the caller */ | 2151 | if (!c->page) |
1811 | gfpflags &= ~__GFP_ZERO; | 2152 | goto new_slab; |
1812 | 2153 | redo: | |
1813 | page = c->page; | 2154 | if (unlikely(!node_match(c, node))) { |
1814 | if (!page) | 2155 | stat(s, ALLOC_NODE_MISMATCH); |
2156 | deactivate_slab(s, c); | ||
1815 | goto new_slab; | 2157 | goto new_slab; |
2158 | } | ||
1816 | 2159 | ||
1817 | slab_lock(page); | 2160 | stat(s, ALLOC_SLOWPATH); |
1818 | if (unlikely(!node_match(c, node))) | 2161 | |
1819 | goto another_slab; | 2162 | do { |
2163 | object = c->page->freelist; | ||
2164 | counters = c->page->counters; | ||
2165 | new.counters = counters; | ||
2166 | VM_BUG_ON(!new.frozen); | ||
2167 | |||
2168 | /* | ||
2169 | * If there is no object left then we use this loop to | ||
2170 | * deactivate the slab which is simple since no objects | ||
2171 | * are left in the slab and therefore we do not need to | ||
2172 | * put the page back onto the partial list. | ||
2173 | * | ||
2174 | * If there are objects left then we retrieve them | ||
2175 | * and use them to refill the per cpu queue. | ||
2176 | */ | ||
2177 | |||
2178 | new.inuse = c->page->objects; | ||
2179 | new.frozen = object != NULL; | ||
2180 | |||
2181 | } while (!__cmpxchg_double_slab(s, c->page, | ||
2182 | object, counters, | ||
2183 | NULL, new.counters, | ||
2184 | "__slab_alloc")); | ||
2185 | |||
2186 | if (!object) { | ||
2187 | c->page = NULL; | ||
2188 | stat(s, DEACTIVATE_BYPASS); | ||
2189 | goto new_slab; | ||
2190 | } | ||
1820 | 2191 | ||
1821 | stat(s, ALLOC_REFILL); | 2192 | stat(s, ALLOC_REFILL); |
1822 | 2193 | ||
1823 | load_freelist: | 2194 | load_freelist: |
1824 | object = page->freelist; | ||
1825 | if (unlikely(!object)) | ||
1826 | goto another_slab; | ||
1827 | if (kmem_cache_debug(s)) | ||
1828 | goto debug; | ||
1829 | |||
1830 | c->freelist = get_freepointer(s, object); | 2195 | c->freelist = get_freepointer(s, object); |
1831 | page->inuse = page->objects; | ||
1832 | page->freelist = NULL; | ||
1833 | |||
1834 | slab_unlock(page); | ||
1835 | c->tid = next_tid(c->tid); | 2196 | c->tid = next_tid(c->tid); |
1836 | local_irq_restore(flags); | 2197 | local_irq_restore(flags); |
1837 | stat(s, ALLOC_SLOWPATH); | ||
1838 | return object; | 2198 | return object; |
1839 | 2199 | ||
1840 | another_slab: | ||
1841 | deactivate_slab(s, c); | ||
1842 | |||
1843 | new_slab: | 2200 | new_slab: |
1844 | page = get_partial(s, gfpflags, node); | 2201 | |
1845 | if (page) { | 2202 | if (c->partial) { |
1846 | stat(s, ALLOC_FROM_PARTIAL); | 2203 | c->page = c->partial; |
1847 | c->node = page_to_nid(page); | 2204 | c->partial = c->page->next; |
1848 | c->page = page; | 2205 | c->node = page_to_nid(c->page); |
1849 | goto load_freelist; | 2206 | stat(s, CPU_PARTIAL_ALLOC); |
2207 | c->freelist = NULL; | ||
2208 | goto redo; | ||
1850 | } | 2209 | } |
1851 | 2210 | ||
1852 | gfpflags &= gfp_allowed_mask; | 2211 | /* Then do expensive stuff like retrieving pages from the partial lists */ |
1853 | if (gfpflags & __GFP_WAIT) | 2212 | object = get_partial(s, gfpflags, node, c); |
1854 | local_irq_enable(); | ||
1855 | 2213 | ||
1856 | page = new_slab(s, gfpflags, node); | 2214 | if (unlikely(!object)) { |
1857 | 2215 | ||
1858 | if (gfpflags & __GFP_WAIT) | 2216 | object = new_slab_objects(s, gfpflags, node, &c); |
1859 | local_irq_disable(); | ||
1860 | 2217 | ||
1861 | if (page) { | 2218 | if (unlikely(!object)) { |
1862 | c = __this_cpu_ptr(s->cpu_slab); | 2219 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) |
1863 | stat(s, ALLOC_SLAB); | 2220 | slab_out_of_memory(s, gfpflags, node); |
1864 | if (c->page) | ||
1865 | flush_slab(s, c); | ||
1866 | 2221 | ||
1867 | slab_lock(page); | 2222 | local_irq_restore(flags); |
1868 | __SetPageSlubFrozen(page); | 2223 | return NULL; |
1869 | c->node = page_to_nid(page); | 2224 | } |
1870 | c->page = page; | ||
1871 | goto load_freelist; | ||
1872 | } | 2225 | } |
1873 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | ||
1874 | slab_out_of_memory(s, gfpflags, node); | ||
1875 | local_irq_restore(flags); | ||
1876 | return NULL; | ||
1877 | debug: | ||
1878 | if (!alloc_debug_processing(s, page, object, addr)) | ||
1879 | goto another_slab; | ||
1880 | 2226 | ||
1881 | page->inuse++; | 2227 | if (likely(!kmem_cache_debug(s))) |
1882 | page->freelist = get_freepointer(s, object); | 2228 | goto load_freelist; |
2229 | |||
2230 | /* Only entered in the debug case */ | ||
2231 | if (!alloc_debug_processing(s, c->page, object, addr)) | ||
2232 | goto new_slab; /* Slab failed checks. Next slab needed */ | ||
2233 | |||
2234 | c->freelist = get_freepointer(s, object); | ||
1883 | deactivate_slab(s, c); | 2235 | deactivate_slab(s, c); |
1884 | c->page = NULL; | ||
1885 | c->node = NUMA_NO_NODE; | 2236 | c->node = NUMA_NO_NODE; |
1886 | local_irq_restore(flags); | 2237 | local_irq_restore(flags); |
1887 | return object; | 2238 | return object; |
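
The reworked __slab_alloc() above still ends at load_freelist:, where c->freelist is advanced with get_freepointer(): while an object is free, its link to the next free object is stored inside the object itself, so the freelist costs no extra memory. A self-contained model of that encoding, with the free pointer assumed to sit at offset 0 of each object:

    #include <stdio.h>
    #include <string.h>

    #define OBJ_SIZE   32      /* assumed object size for the demo */
    #define FP_OFFSET  0       /* where the free pointer lives, like s->offset */

    static void *get_freepointer(void *object)
    {
            void *next;

            memcpy(&next, (char *)object + FP_OFFSET, sizeof(next));
            return next;
    }

    static void set_freepointer(void *object, void *next)
    {
            memcpy((char *)object + FP_OFFSET, &next, sizeof(next));
    }

    int main(void)
    {
            char slab[4 * OBJ_SIZE];
            void *freelist = NULL;

            /* Thread the objects together the way new_slab() does. */
            for (int i = 3; i >= 0; i--) {
                    set_freepointer(slab + i * OBJ_SIZE, freelist);
                    freelist = slab + i * OBJ_SIZE;
            }

            /* "Allocate" by walking the chain, as the fast path does. */
            while (freelist) {
                    printf("handing out object at offset %td\n",
                           (char *)freelist - slab);
                    freelist = get_freepointer(freelist);
            }
            return 0;
    }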
@@ -2031,52 +2382,110 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2031 | { | 2382 | { |
2032 | void *prior; | 2383 | void *prior; |
2033 | void **object = (void *)x; | 2384 | void **object = (void *)x; |
2034 | unsigned long flags; | 2385 | int was_frozen; |
2386 | int inuse; | ||
2387 | struct page new; | ||
2388 | unsigned long counters; | ||
2389 | struct kmem_cache_node *n = NULL; | ||
2390 | unsigned long uninitialized_var(flags); | ||
2035 | 2391 | ||
2036 | local_irq_save(flags); | ||
2037 | slab_lock(page); | ||
2038 | stat(s, FREE_SLOWPATH); | 2392 | stat(s, FREE_SLOWPATH); |
2039 | 2393 | ||
2040 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) | 2394 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) |
2041 | goto out_unlock; | 2395 | return; |
2042 | 2396 | ||
2043 | prior = page->freelist; | 2397 | do { |
2044 | set_freepointer(s, object, prior); | 2398 | prior = page->freelist; |
2045 | page->freelist = object; | 2399 | counters = page->counters; |
2046 | page->inuse--; | 2400 | set_freepointer(s, object, prior); |
2401 | new.counters = counters; | ||
2402 | was_frozen = new.frozen; | ||
2403 | new.inuse--; | ||
2404 | if ((!new.inuse || !prior) && !was_frozen && !n) { | ||
2047 | 2405 | ||
2048 | if (unlikely(PageSlubFrozen(page))) { | 2406 | if (!kmem_cache_debug(s) && !prior) |
2049 | stat(s, FREE_FROZEN); | 2407 | |
2050 | goto out_unlock; | 2408 | /* |
2051 | } | 2409 | * Slab was on no list before and will be partially empty |
2410 | * We can defer the list move and instead freeze it. | ||
2411 | */ | ||
2412 | new.frozen = 1; | ||
2413 | |||
2414 | else { /* Needs to be taken off a list */ | ||
2415 | |||
2416 | n = get_node(s, page_to_nid(page)); | ||
2417 | /* | ||
2418 | * Speculatively acquire the list_lock. | ||
2419 | * If the cmpxchg does not succeed then we may | ||
2420 | * drop the list_lock without any processing. | ||
2421 | * | ||
2422 | * Otherwise the list_lock will synchronize with | ||
2423 | * other processors updating the list of slabs. | ||
2424 | */ | ||
2425 | spin_lock_irqsave(&n->list_lock, flags); | ||
2426 | |||
2427 | } | ||
2428 | } | ||
2429 | inuse = new.inuse; | ||
2430 | |||
2431 | } while (!cmpxchg_double_slab(s, page, | ||
2432 | prior, counters, | ||
2433 | object, new.counters, | ||
2434 | "__slab_free")); | ||
2435 | |||
2436 | if (likely(!n)) { | ||
2437 | |||
2438 | /* | ||
2439 | * If we just froze the page then put it onto the | ||
2440 | * per cpu partial list. | ||
2441 | */ | ||
2442 | if (new.frozen && !was_frozen) | ||
2443 | put_cpu_partial(s, page, 1); | ||
2052 | 2444 | ||
2053 | if (unlikely(!page->inuse)) | 2445 | /* |
2054 | goto slab_empty; | 2446 | * The list lock was not taken therefore no list |
2447 | * activity can be necessary. | ||
2448 | */ | ||
2449 | if (was_frozen) | ||
2450 | stat(s, FREE_FROZEN); | ||
2451 | return; | ||
2452 | } | ||
2055 | 2453 | ||
2056 | /* | 2454 | /* |
2057 | * Objects left in the slab. If it was not on the partial list before | 2455 | * was_frozen may have been set after we acquired the list_lock in |
2058 | * then add it. | 2456 | * an earlier loop. So we need to check it here again. |
2059 | */ | 2457 | */ |
2060 | if (unlikely(!prior)) { | 2458 | if (was_frozen) |
2061 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 2459 | stat(s, FREE_FROZEN); |
2062 | stat(s, FREE_ADD_PARTIAL); | 2460 | else { |
2063 | } | 2461 | if (unlikely(!inuse && n->nr_partial > s->min_partial)) |
2462 | goto slab_empty; | ||
2064 | 2463 | ||
2065 | out_unlock: | 2464 | /* |
2066 | slab_unlock(page); | 2465 | * Objects left in the slab. If it was not on the partial list before |
2067 | local_irq_restore(flags); | 2466 | * then add it. |
2467 | */ | ||
2468 | if (unlikely(!prior)) { | ||
2469 | remove_full(s, page); | ||
2470 | add_partial(n, page, DEACTIVATE_TO_TAIL); | ||
2471 | stat(s, FREE_ADD_PARTIAL); | ||
2472 | } | ||
2473 | } | ||
2474 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2068 | return; | 2475 | return; |
2069 | 2476 | ||
2070 | slab_empty: | 2477 | slab_empty: |
2071 | if (prior) { | 2478 | if (prior) { |
2072 | /* | 2479 | /* |
2073 | * Slab still on the partial list. | 2480 | * Slab on the partial list. |
2074 | */ | 2481 | */ |
2075 | remove_partial(s, page); | 2482 | remove_partial(n, page); |
2076 | stat(s, FREE_REMOVE_PARTIAL); | 2483 | stat(s, FREE_REMOVE_PARTIAL); |
2077 | } | 2484 | } else |
2078 | slab_unlock(page); | 2485 | /* Slab must be on the full list */ |
2079 | local_irq_restore(flags); | 2486 | remove_full(s, page); |
2487 | |||
2488 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2080 | stat(s, FREE_SLAB); | 2489 | stat(s, FREE_SLAB); |
2081 | discard_slab(s, page); | 2490 | discard_slab(s, page); |
2082 | } | 2491 | } |
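
__slab_free() above only takes the node's list_lock speculatively: the lock is acquired when the observed (prior, counters) pair says list manipulation will be needed, and it may be released again without doing anything if the cmpxchg loses the race. A much-reduced user-space model of that pattern, with a single counter standing in for the slab state; unlike the kernel code, this sketch drops and retakes the lock on every retry:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static _Atomic int inuse = 1;                   /* objects still allocated */
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static void free_one(void)
    {
            bool locked = false;
            int old, new;

            do {
                    if (locked) {                   /* lost a race last round */
                            pthread_mutex_unlock(&list_lock);
                            locked = false;
                    }
                    old = atomic_load(&inuse);
                    new = old - 1;
                    if (new == 0) {                 /* slab would become empty: */
                            pthread_mutex_lock(&list_lock);  /* speculative lock */
                            locked = true;
                    }
            } while (!atomic_compare_exchange_strong(&inuse, &old, new));

            if (locked) {
                    printf("empty: would move the slab off its list\n");
                    pthread_mutex_unlock(&list_lock);
            }
    }

    int main(void)
    {
            free_one();
            return 0;
    }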
@@ -2102,7 +2511,6 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
2102 | slab_free_hook(s, x); | 2511 | slab_free_hook(s, x); |
2103 | 2512 | ||
2104 | redo: | 2513 | redo: |
2105 | |||
2106 | /* | 2514 | /* |
2107 | * Determine the currently cpus per cpu slab. | 2515 | * Determine the currently cpus per cpu slab. |
2108 | * The cpu may change afterward. However that does not matter since | 2516 | * The cpu may change afterward. However that does not matter since |
@@ -2350,7 +2758,6 @@ static void early_kmem_cache_node_alloc(int node) | |||
2350 | { | 2758 | { |
2351 | struct page *page; | 2759 | struct page *page; |
2352 | struct kmem_cache_node *n; | 2760 | struct kmem_cache_node *n; |
2353 | unsigned long flags; | ||
2354 | 2761 | ||
2355 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); | 2762 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); |
2356 | 2763 | ||
@@ -2367,7 +2774,8 @@ static void early_kmem_cache_node_alloc(int node) | |||
2367 | n = page->freelist; | 2774 | n = page->freelist; |
2368 | BUG_ON(!n); | 2775 | BUG_ON(!n); |
2369 | page->freelist = get_freepointer(kmem_cache_node, n); | 2776 | page->freelist = get_freepointer(kmem_cache_node, n); |
2370 | page->inuse++; | 2777 | page->inuse = 1; |
2778 | page->frozen = 0; | ||
2371 | kmem_cache_node->node[node] = n; | 2779 | kmem_cache_node->node[node] = n; |
2372 | #ifdef CONFIG_SLUB_DEBUG | 2780 | #ifdef CONFIG_SLUB_DEBUG |
2373 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2781 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
@@ -2376,14 +2784,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2376 | init_kmem_cache_node(n, kmem_cache_node); | 2784 | init_kmem_cache_node(n, kmem_cache_node); |
2377 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2785 | inc_slabs_node(kmem_cache_node, node, page->objects); |
2378 | 2786 | ||
2379 | /* | 2787 | add_partial(n, page, DEACTIVATE_TO_HEAD); |
2380 | * lockdep requires consistent irq usage for each lock | ||
2381 | * so even though there cannot be a race this early in | ||
2382 | * the boot sequence, we still disable irqs. | ||
2383 | */ | ||
2384 | local_irq_save(flags); | ||
2385 | add_partial(n, page, 0); | ||
2386 | local_irq_restore(flags); | ||
2387 | } | 2788 | } |
2388 | 2789 | ||
2389 | static void free_kmem_cache_nodes(struct kmem_cache *s) | 2790 | static void free_kmem_cache_nodes(struct kmem_cache *s) |
@@ -2589,11 +2990,44 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
2589 | } | 2990 | } |
2590 | } | 2991 | } |
2591 | 2992 | ||
2993 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
2994 | if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) | ||
2995 | /* Enable fast mode */ | ||
2996 | s->flags |= __CMPXCHG_DOUBLE; | ||
2997 | #endif | ||
2998 | |||
2592 | /* | 2999 | /* |
2593 | * The larger the object size is, the more pages we want on the partial | 3000 | * The larger the object size is, the more pages we want on the partial |
2594 | * list to avoid pounding the page allocator excessively. | 3001 | * list to avoid pounding the page allocator excessively. |
2595 | */ | 3002 | */ |
2596 | set_min_partial(s, ilog2(s->size)); | 3003 | set_min_partial(s, ilog2(s->size) / 2); |
3004 | |||
3005 | /* | ||
3006 | * cpu_partial determines the maximum number of objects kept in the | ||
3007 | * per cpu partial lists of a processor. | ||
3008 | * | ||
3009 | * Per cpu partial lists mainly contain slabs that just have one | ||
3010 | * object freed. If they are used for allocation then they can be | ||
3011 | * filled up again with minimal effort. The slab will never hit the | ||
3012 | * per node partial lists and therefore no locking will be required. | ||
3013 | * | ||
3014 | * This setting also determines | ||
3015 | * | ||
3016 | * A) The number of objects from per cpu partial slabs dumped to the | ||
3017 | * per node list when we reach the limit. | ||
3018 | * B) The number of objects in cpu partial slabs to extract from the | ||
3019 | * per node list when we run out of per cpu objects. We only fetch 50% | ||
3020 | * to keep some capacity around for frees. | ||
3021 | */ | ||
3022 | if (s->size >= PAGE_SIZE) | ||
3023 | s->cpu_partial = 2; | ||
3024 | else if (s->size >= 1024) | ||
3025 | s->cpu_partial = 6; | ||
3026 | else if (s->size >= 256) | ||
3027 | s->cpu_partial = 13; | ||
3028 | else | ||
3029 | s->cpu_partial = 30; | ||
3030 | |||
2597 | s->refcount = 1; | 3031 | s->refcount = 1; |
2598 | #ifdef CONFIG_NUMA | 3032 | #ifdef CONFIG_NUMA |
2599 | s->remote_node_defrag_ratio = 1000; | 3033 | s->remote_node_defrag_ratio = 1000; |
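
For readers skimming the kmem_cache_open() changes, the size thresholds above can be summarized by a tiny sizing function. The sketch below is illustrative only, written as userspace C with PAGE_SIZE assumed to be 4096 so it stays self-contained; it is not part of the patch.

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096   /* assumed page size for the example */

/* Mirror of the cpu_partial heuristic: bigger objects get smaller budgets. */
static unsigned int cpu_partial_for_size(unsigned long size)
{
	if (size >= EXAMPLE_PAGE_SIZE)
		return 2;
	else if (size >= 1024)
		return 6;
	else if (size >= 256)
		return 13;
	else
		return 30;
}

int main(void)
{
	unsigned long sizes[] = { 64, 256, 1024, 4096, 8192 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("object size %5lu -> cpu_partial %u\n",
		       sizes[i], cpu_partial_for_size(sizes[i]));
	return 0;
}
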
@@ -2652,23 +3086,22 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
2652 | 3086 | ||
2653 | /* | 3087 | /* |
2654 | * Attempt to free all partial slabs on a node. | 3088 | * Attempt to free all partial slabs on a node. |
3089 | * This is called from kmem_cache_close(). We must be the last thread | ||
3090 | * using the cache and therefore we do not need to lock anymore. | ||
2655 | */ | 3091 | */ |
2656 | static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) | 3092 | static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) |
2657 | { | 3093 | { |
2658 | unsigned long flags; | ||
2659 | struct page *page, *h; | 3094 | struct page *page, *h; |
2660 | 3095 | ||
2661 | spin_lock_irqsave(&n->list_lock, flags); | ||
2662 | list_for_each_entry_safe(page, h, &n->partial, lru) { | 3096 | list_for_each_entry_safe(page, h, &n->partial, lru) { |
2663 | if (!page->inuse) { | 3097 | if (!page->inuse) { |
2664 | __remove_partial(n, page); | 3098 | remove_partial(n, page); |
2665 | discard_slab(s, page); | 3099 | discard_slab(s, page); |
2666 | } else { | 3100 | } else { |
2667 | list_slab_objects(s, page, | 3101 | list_slab_objects(s, page, |
2668 | "Objects remaining on kmem_cache_close()"); | 3102 | "Objects remaining on kmem_cache_close()"); |
2669 | } | 3103 | } |
2670 | } | 3104 | } |
2671 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2672 | } | 3105 | } |
2673 | 3106 | ||
2674 | /* | 3107 | /* |
@@ -2702,6 +3135,7 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
2702 | s->refcount--; | 3135 | s->refcount--; |
2703 | if (!s->refcount) { | 3136 | if (!s->refcount) { |
2704 | list_del(&s->list); | 3137 | list_del(&s->list); |
3138 | up_write(&slub_lock); | ||
2705 | if (kmem_cache_close(s)) { | 3139 | if (kmem_cache_close(s)) { |
2706 | printk(KERN_ERR "SLUB %s: %s called for cache that " | 3140 | printk(KERN_ERR "SLUB %s: %s called for cache that " |
2707 | "still has objects.\n", s->name, __func__); | 3141 | "still has objects.\n", s->name, __func__); |
@@ -2710,8 +3144,8 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
2710 | if (s->flags & SLAB_DESTROY_BY_RCU) | 3144 | if (s->flags & SLAB_DESTROY_BY_RCU) |
2711 | rcu_barrier(); | 3145 | rcu_barrier(); |
2712 | sysfs_slab_remove(s); | 3146 | sysfs_slab_remove(s); |
2713 | } | 3147 | } else |
2714 | up_write(&slub_lock); | 3148 | up_write(&slub_lock); |
2715 | } | 3149 | } |
2716 | EXPORT_SYMBOL(kmem_cache_destroy); | 3150 | EXPORT_SYMBOL(kmem_cache_destroy); |
2717 | 3151 | ||
@@ -2928,6 +3362,42 @@ size_t ksize(const void *object) | |||
2928 | } | 3362 | } |
2929 | EXPORT_SYMBOL(ksize); | 3363 | EXPORT_SYMBOL(ksize); |
2930 | 3364 | ||
3365 | #ifdef CONFIG_SLUB_DEBUG | ||
3366 | bool verify_mem_not_deleted(const void *x) | ||
3367 | { | ||
3368 | struct page *page; | ||
3369 | void *object = (void *)x; | ||
3370 | unsigned long flags; | ||
3371 | bool rv; | ||
3372 | |||
3373 | if (unlikely(ZERO_OR_NULL_PTR(x))) | ||
3374 | return false; | ||
3375 | |||
3376 | local_irq_save(flags); | ||
3377 | |||
3378 | page = virt_to_head_page(x); | ||
3379 | if (unlikely(!PageSlab(page))) { | ||
3380 | /* maybe it was from stack? */ | ||
3381 | rv = true; | ||
3382 | goto out_unlock; | ||
3383 | } | ||
3384 | |||
3385 | slab_lock(page); | ||
3386 | if (on_freelist(page->slab, page, object)) { | ||
3387 | object_err(page->slab, page, object, "Object is on free-list"); | ||
3388 | rv = false; | ||
3389 | } else { | ||
3390 | rv = true; | ||
3391 | } | ||
3392 | slab_unlock(page); | ||
3393 | |||
3394 | out_unlock: | ||
3395 | local_irq_restore(flags); | ||
3396 | return rv; | ||
3397 | } | ||
3398 | EXPORT_SYMBOL(verify_mem_not_deleted); | ||
3399 | #endif | ||
3400 | |||
2931 | void kfree(const void *x) | 3401 | void kfree(const void *x) |
2932 | { | 3402 | { |
2933 | struct page *page; | 3403 | struct page *page; |
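
The new verify_mem_not_deleted() helper above gives SLUB_DEBUG builds a way to ask whether a pointer currently sits on a slab freelist. A hedged usage sketch follows: the caller, its struct, and the assumption that the declaration is reachable through <linux/slab.h> are all hypothetical, and the helper only exists with SLUB and CONFIG_SLUB_DEBUG.

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>

struct foo_ctx {                 /* hypothetical caller-side object */
	int refs;
};

static int foo_use(struct foo_ctx *ctx)
{
	/* Bail out if the object has already been returned to SLUB. */
	if (!verify_mem_not_deleted(ctx)) {
		WARN_ONCE(1, "foo_ctx %p used after free\n", ctx);
		return -EINVAL;
	}
	return ctx->refs;
}
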
@@ -2993,29 +3463,23 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2993 | * list_lock. page->inuse here is the upper limit. | 3463 | * list_lock. page->inuse here is the upper limit. |
2994 | */ | 3464 | */ |
2995 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 3465 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
2996 | if (!page->inuse && slab_trylock(page)) { | 3466 | list_move(&page->lru, slabs_by_inuse + page->inuse); |
2997 | /* | 3467 | if (!page->inuse) |
2998 | * Must hold slab lock here because slab_free | 3468 | n->nr_partial--; |
2999 | * may have freed the last object and be | ||
3000 | * waiting to release the slab. | ||
3001 | */ | ||
3002 | __remove_partial(n, page); | ||
3003 | slab_unlock(page); | ||
3004 | discard_slab(s, page); | ||
3005 | } else { | ||
3006 | list_move(&page->lru, | ||
3007 | slabs_by_inuse + page->inuse); | ||
3008 | } | ||
3009 | } | 3469 | } |
3010 | 3470 | ||
3011 | /* | 3471 | /* |
3012 | * Rebuild the partial list with the slabs filled up most | 3472 | * Rebuild the partial list with the slabs filled up most |
3013 | * first and the least used slabs at the end. | 3473 | * first and the least used slabs at the end. |
3014 | */ | 3474 | */ |
3015 | for (i = objects - 1; i >= 0; i--) | 3475 | for (i = objects - 1; i > 0; i--) |
3016 | list_splice(slabs_by_inuse + i, n->partial.prev); | 3476 | list_splice(slabs_by_inuse + i, n->partial.prev); |
3017 | 3477 | ||
3018 | spin_unlock_irqrestore(&n->list_lock, flags); | 3478 | spin_unlock_irqrestore(&n->list_lock, flags); |
3479 | |||
3480 | /* Release empty slabs */ | ||
3481 | list_for_each_entry_safe(page, t, slabs_by_inuse, lru) | ||
3482 | discard_slab(s, page); | ||
3019 | } | 3483 | } |
3020 | 3484 | ||
3021 | kfree(slabs_by_inuse); | 3485 | kfree(slabs_by_inuse); |
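
The reworked shrink path above no longer frees empty slabs inside the bucketing loop; it sorts the partial list by inuse count, splices the fullest slabs back first, and releases the empties afterwards. The userspace sketch below models only that ordering step; the slab capacity, the inuse values, and the fixed-size buckets are assumptions made to keep the example self-contained.

#include <stdio.h>

#define OBJECTS_PER_SLAB 4                    /* assumed slab capacity */

int main(void)
{
	int inuse[] = { 0, 3, 1, 2, 0, 2 };   /* snapshot of a partial list */
	int nslabs = sizeof(inuse) / sizeof(inuse[0]);
	int buckets[OBJECTS_PER_SLAB][8];     /* like slabs_by_inuse[] */
	int count[OBJECTS_PER_SLAB] = { 0 };

	for (int i = 0; i < nslabs; i++)      /* bucket each slab by inuse */
		buckets[inuse[i]][count[inuse[i]]++] = i;

	printf("rebuilt partial list (fullest first):");
	for (int b = OBJECTS_PER_SLAB - 1; b > 0; b--)   /* mirrors the i > 0 loop */
		for (int j = 0; j < count[b]; j++)
			printf(" slab%d(inuse=%d)", buckets[b][j], b);
	printf("\n");

	printf("released empty slabs:");
	for (int j = 0; j < count[0]; j++)
		printf(" slab%d", buckets[0][j]);
	printf("\n");
	return 0;
}
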
@@ -3588,12 +4052,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, | |||
3588 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, | 4052 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, |
3589 | unsigned long *map) | 4053 | unsigned long *map) |
3590 | { | 4054 | { |
3591 | if (slab_trylock(page)) { | 4055 | slab_lock(page); |
3592 | validate_slab(s, page, map); | 4056 | validate_slab(s, page, map); |
3593 | slab_unlock(page); | 4057 | slab_unlock(page); |
3594 | } else | ||
3595 | printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", | ||
3596 | s->name, page); | ||
3597 | } | 4058 | } |
3598 | 4059 | ||
3599 | static int validate_slab_node(struct kmem_cache *s, | 4060 | static int validate_slab_node(struct kmem_cache *s, |
@@ -3974,6 +4435,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
3974 | 4435 | ||
3975 | for_each_possible_cpu(cpu) { | 4436 | for_each_possible_cpu(cpu) { |
3976 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); | 4437 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
4438 | struct page *page; | ||
3977 | 4439 | ||
3978 | if (!c || c->node < 0) | 4440 | if (!c || c->node < 0) |
3979 | continue; | 4441 | continue; |
@@ -3989,6 +4451,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
3989 | total += x; | 4451 | total += x; |
3990 | nodes[c->node] += x; | 4452 | nodes[c->node] += x; |
3991 | } | 4453 | } |
4454 | page = c->partial; | ||
4455 | |||
4456 | if (page) { | ||
4457 | x = page->pobjects; | ||
4458 | total += x; | ||
4459 | nodes[c->node] += x; | ||
4460 | } | ||
3992 | per_cpu[c->node]++; | 4461 | per_cpu[c->node]++; |
3993 | } | 4462 | } |
3994 | } | 4463 | } |
@@ -4058,7 +4527,7 @@ static int any_slab_objects(struct kmem_cache *s) | |||
4058 | #endif | 4527 | #endif |
4059 | 4528 | ||
4060 | #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) | 4529 | #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) |
4061 | #define to_slab(n) container_of(n, struct kmem_cache, kobj); | 4530 | #define to_slab(n) container_of(n, struct kmem_cache, kobj) |
4062 | 4531 | ||
4063 | struct slab_attribute { | 4532 | struct slab_attribute { |
4064 | struct attribute attr; | 4533 | struct attribute attr; |
@@ -4067,11 +4536,12 @@ struct slab_attribute { | |||
4067 | }; | 4536 | }; |
4068 | 4537 | ||
4069 | #define SLAB_ATTR_RO(_name) \ | 4538 | #define SLAB_ATTR_RO(_name) \ |
4070 | static struct slab_attribute _name##_attr = __ATTR_RO(_name) | 4539 | static struct slab_attribute _name##_attr = \ |
4540 | __ATTR(_name, 0400, _name##_show, NULL) | ||
4071 | 4541 | ||
4072 | #define SLAB_ATTR(_name) \ | 4542 | #define SLAB_ATTR(_name) \ |
4073 | static struct slab_attribute _name##_attr = \ | 4543 | static struct slab_attribute _name##_attr = \ |
4074 | __ATTR(_name, 0644, _name##_show, _name##_store) | 4544 | __ATTR(_name, 0600, _name##_show, _name##_store) |
4075 | 4545 | ||
4076 | static ssize_t slab_size_show(struct kmem_cache *s, char *buf) | 4546 | static ssize_t slab_size_show(struct kmem_cache *s, char *buf) |
4077 | { | 4547 | { |
@@ -4140,6 +4610,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, | |||
4140 | } | 4610 | } |
4141 | SLAB_ATTR(min_partial); | 4611 | SLAB_ATTR(min_partial); |
4142 | 4612 | ||
4613 | static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) | ||
4614 | { | ||
4615 | return sprintf(buf, "%u\n", s->cpu_partial); | ||
4616 | } | ||
4617 | |||
4618 | static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, | ||
4619 | size_t length) | ||
4620 | { | ||
4621 | unsigned long objects; | ||
4622 | int err; | ||
4623 | |||
4624 | err = strict_strtoul(buf, 10, &objects); | ||
4625 | if (err) | ||
4626 | return err; | ||
4627 | |||
4628 | s->cpu_partial = objects; | ||
4629 | flush_all(s); | ||
4630 | return length; | ||
4631 | } | ||
4632 | SLAB_ATTR(cpu_partial); | ||
4633 | |||
4143 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) | 4634 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) |
4144 | { | 4635 | { |
4145 | if (!s->ctor) | 4636 | if (!s->ctor) |
@@ -4178,6 +4669,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) | |||
4178 | } | 4669 | } |
4179 | SLAB_ATTR_RO(objects_partial); | 4670 | SLAB_ATTR_RO(objects_partial); |
4180 | 4671 | ||
4672 | static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) | ||
4673 | { | ||
4674 | int objects = 0; | ||
4675 | int pages = 0; | ||
4676 | int cpu; | ||
4677 | int len; | ||
4678 | |||
4679 | for_each_online_cpu(cpu) { | ||
4680 | struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; | ||
4681 | |||
4682 | if (page) { | ||
4683 | pages += page->pages; | ||
4684 | objects += page->pobjects; | ||
4685 | } | ||
4686 | } | ||
4687 | |||
4688 | len = sprintf(buf, "%d(%d)", objects, pages); | ||
4689 | |||
4690 | #ifdef CONFIG_SMP | ||
4691 | for_each_online_cpu(cpu) { | ||
4692 | struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; | ||
4693 | |||
4694 | if (page && len < PAGE_SIZE - 20) | ||
4695 | len += sprintf(buf + len, " C%d=%d(%d)", cpu, | ||
4696 | page->pobjects, page->pages); | ||
4697 | } | ||
4698 | #endif | ||
4699 | return len + sprintf(buf + len, "\n"); | ||
4700 | } | ||
4701 | SLAB_ATTR_RO(slabs_cpu_partial); | ||
4702 | |||
4181 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) | 4703 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) |
4182 | { | 4704 | { |
4183 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); | 4705 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); |
@@ -4241,8 +4763,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, | |||
4241 | const char *buf, size_t length) | 4763 | const char *buf, size_t length) |
4242 | { | 4764 | { |
4243 | s->flags &= ~SLAB_DEBUG_FREE; | 4765 | s->flags &= ~SLAB_DEBUG_FREE; |
4244 | if (buf[0] == '1') | 4766 | if (buf[0] == '1') { |
4767 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4245 | s->flags |= SLAB_DEBUG_FREE; | 4768 | s->flags |= SLAB_DEBUG_FREE; |
4769 | } | ||
4246 | return length; | 4770 | return length; |
4247 | } | 4771 | } |
4248 | SLAB_ATTR(sanity_checks); | 4772 | SLAB_ATTR(sanity_checks); |
@@ -4256,8 +4780,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
4256 | size_t length) | 4780 | size_t length) |
4257 | { | 4781 | { |
4258 | s->flags &= ~SLAB_TRACE; | 4782 | s->flags &= ~SLAB_TRACE; |
4259 | if (buf[0] == '1') | 4783 | if (buf[0] == '1') { |
4784 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4260 | s->flags |= SLAB_TRACE; | 4785 | s->flags |= SLAB_TRACE; |
4786 | } | ||
4261 | return length; | 4787 | return length; |
4262 | } | 4788 | } |
4263 | SLAB_ATTR(trace); | 4789 | SLAB_ATTR(trace); |
@@ -4274,8 +4800,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, | |||
4274 | return -EBUSY; | 4800 | return -EBUSY; |
4275 | 4801 | ||
4276 | s->flags &= ~SLAB_RED_ZONE; | 4802 | s->flags &= ~SLAB_RED_ZONE; |
4277 | if (buf[0] == '1') | 4803 | if (buf[0] == '1') { |
4804 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4278 | s->flags |= SLAB_RED_ZONE; | 4805 | s->flags |= SLAB_RED_ZONE; |
4806 | } | ||
4279 | calculate_sizes(s, -1); | 4807 | calculate_sizes(s, -1); |
4280 | return length; | 4808 | return length; |
4281 | } | 4809 | } |
@@ -4293,8 +4821,10 @@ static ssize_t poison_store(struct kmem_cache *s, | |||
4293 | return -EBUSY; | 4821 | return -EBUSY; |
4294 | 4822 | ||
4295 | s->flags &= ~SLAB_POISON; | 4823 | s->flags &= ~SLAB_POISON; |
4296 | if (buf[0] == '1') | 4824 | if (buf[0] == '1') { |
4825 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4297 | s->flags |= SLAB_POISON; | 4826 | s->flags |= SLAB_POISON; |
4827 | } | ||
4298 | calculate_sizes(s, -1); | 4828 | calculate_sizes(s, -1); |
4299 | return length; | 4829 | return length; |
4300 | } | 4830 | } |
@@ -4312,8 +4842,10 @@ static ssize_t store_user_store(struct kmem_cache *s, | |||
4312 | return -EBUSY; | 4842 | return -EBUSY; |
4313 | 4843 | ||
4314 | s->flags &= ~SLAB_STORE_USER; | 4844 | s->flags &= ~SLAB_STORE_USER; |
4315 | if (buf[0] == '1') | 4845 | if (buf[0] == '1') { |
4846 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4316 | s->flags |= SLAB_STORE_USER; | 4847 | s->flags |= SLAB_STORE_USER; |
4848 | } | ||
4317 | calculate_sizes(s, -1); | 4849 | calculate_sizes(s, -1); |
4318 | return length; | 4850 | return length; |
4319 | } | 4851 | } |
@@ -4478,6 +5010,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); | |||
4478 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); | 5010 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); |
4479 | STAT_ATTR(ALLOC_SLAB, alloc_slab); | 5011 | STAT_ATTR(ALLOC_SLAB, alloc_slab); |
4480 | STAT_ATTR(ALLOC_REFILL, alloc_refill); | 5012 | STAT_ATTR(ALLOC_REFILL, alloc_refill); |
5013 | STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); | ||
4481 | STAT_ATTR(FREE_SLAB, free_slab); | 5014 | STAT_ATTR(FREE_SLAB, free_slab); |
4482 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); | 5015 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); |
4483 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); | 5016 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); |
@@ -4485,7 +5018,12 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); | |||
4485 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); | 5018 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); |
4486 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); | 5019 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); |
4487 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); | 5020 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); |
5021 | STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); | ||
4488 | STAT_ATTR(ORDER_FALLBACK, order_fallback); | 5022 | STAT_ATTR(ORDER_FALLBACK, order_fallback); |
5023 | STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); | ||
5024 | STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); | ||
5025 | STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); | ||
5026 | STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); | ||
4489 | #endif | 5027 | #endif |
4490 | 5028 | ||
4491 | static struct attribute *slab_attrs[] = { | 5029 | static struct attribute *slab_attrs[] = { |
@@ -4494,6 +5032,7 @@ static struct attribute *slab_attrs[] = { | |||
4494 | &objs_per_slab_attr.attr, | 5032 | &objs_per_slab_attr.attr, |
4495 | &order_attr.attr, | 5033 | &order_attr.attr, |
4496 | &min_partial_attr.attr, | 5034 | &min_partial_attr.attr, |
5035 | &cpu_partial_attr.attr, | ||
4497 | &objects_attr.attr, | 5036 | &objects_attr.attr, |
4498 | &objects_partial_attr.attr, | 5037 | &objects_partial_attr.attr, |
4499 | &partial_attr.attr, | 5038 | &partial_attr.attr, |
@@ -4506,6 +5045,7 @@ static struct attribute *slab_attrs[] = { | |||
4506 | &destroy_by_rcu_attr.attr, | 5045 | &destroy_by_rcu_attr.attr, |
4507 | &shrink_attr.attr, | 5046 | &shrink_attr.attr, |
4508 | &reserved_attr.attr, | 5047 | &reserved_attr.attr, |
5048 | &slabs_cpu_partial_attr.attr, | ||
4509 | #ifdef CONFIG_SLUB_DEBUG | 5049 | #ifdef CONFIG_SLUB_DEBUG |
4510 | &total_objects_attr.attr, | 5050 | &total_objects_attr.attr, |
4511 | &slabs_attr.attr, | 5051 | &slabs_attr.attr, |
@@ -4535,6 +5075,7 @@ static struct attribute *slab_attrs[] = { | |||
4535 | &alloc_from_partial_attr.attr, | 5075 | &alloc_from_partial_attr.attr, |
4536 | &alloc_slab_attr.attr, | 5076 | &alloc_slab_attr.attr, |
4537 | &alloc_refill_attr.attr, | 5077 | &alloc_refill_attr.attr, |
5078 | &alloc_node_mismatch_attr.attr, | ||
4538 | &free_slab_attr.attr, | 5079 | &free_slab_attr.attr, |
4539 | &cpuslab_flush_attr.attr, | 5080 | &cpuslab_flush_attr.attr, |
4540 | &deactivate_full_attr.attr, | 5081 | &deactivate_full_attr.attr, |
@@ -4542,7 +5083,12 @@ static struct attribute *slab_attrs[] = { | |||
4542 | &deactivate_to_head_attr.attr, | 5083 | &deactivate_to_head_attr.attr, |
4543 | &deactivate_to_tail_attr.attr, | 5084 | &deactivate_to_tail_attr.attr, |
4544 | &deactivate_remote_frees_attr.attr, | 5085 | &deactivate_remote_frees_attr.attr, |
5086 | &deactivate_bypass_attr.attr, | ||
4545 | &order_fallback_attr.attr, | 5087 | &order_fallback_attr.attr, |
5088 | &cmpxchg_double_fail_attr.attr, | ||
5089 | &cmpxchg_double_cpu_fail_attr.attr, | ||
5090 | &cpu_partial_alloc_attr.attr, | ||
5091 | &cpu_partial_free_attr.attr, | ||
4546 | #endif | 5092 | #endif |
4547 | #ifdef CONFIG_FAILSLAB | 5093 | #ifdef CONFIG_FAILSLAB |
4548 | &failslab_attr.attr, | 5094 | &failslab_attr.attr, |
@@ -4894,7 +5440,7 @@ static const struct file_operations proc_slabinfo_operations = { | |||
4894 | 5440 | ||
4895 | static int __init slab_proc_init(void) | 5441 | static int __init slab_proc_init(void) |
4896 | { | 5442 | { |
4897 | proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); | 5443 | proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); |
4898 | return 0; | 5444 | return 0; |
4899 | } | 5445 | } |
4900 | module_init(slab_proc_init); | 5446 | module_init(slab_proc_init); |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 64b984091edb..1b7e22ab9b09 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -21,7 +21,6 @@ | |||
21 | #include <linux/mmzone.h> | 21 | #include <linux/mmzone.h> |
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
24 | #include <linux/module.h> | ||
25 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
26 | #include <linux/spinlock.h> | 25 | #include <linux/spinlock.h> |
27 | #include <linux/vmalloc.h> | 26 | #include <linux/vmalloc.h> |
diff --git a/mm/sparse.c b/mm/sparse.c index aa64b12831a2..61d7cde23111 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -6,7 +6,7 @@ | |||
6 | #include <linux/mmzone.h> | 6 | #include <linux/mmzone.h> |
7 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
8 | #include <linux/highmem.h> | 8 | #include <linux/highmem.h> |
9 | #include <linux/module.h> | 9 | #include <linux/export.h> |
10 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
11 | #include <linux/vmalloc.h> | 11 | #include <linux/vmalloc.h> |
12 | #include "internal.h" | 12 | #include "internal.h" |
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | |||
40 | static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | 40 | static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; |
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | int page_to_nid(struct page *page) | 43 | int page_to_nid(const struct page *page) |
44 | { | 44 | { |
45 | return section_to_node_table[page_to_section(page)]; | 45 | return section_to_node_table[page_to_section(page)]; |
46 | } | 46 | } |
@@ -21,7 +21,7 @@ | |||
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/pagevec.h> | 22 | #include <linux/pagevec.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | 24 | #include <linux/export.h> |
25 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ | 26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ |
27 | #include <linux/percpu_counter.h> | 27 | #include <linux/percpu_counter.h> |
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) | |||
78 | { | 78 | { |
79 | if (unlikely(PageTail(page))) { | 79 | if (unlikely(PageTail(page))) { |
80 | /* __split_huge_page_refcount can run under us */ | 80 | /* __split_huge_page_refcount can run under us */ |
81 | struct page *page_head = page->first_page; | 81 | struct page *page_head = compound_trans_head(page); |
82 | smp_rmb(); | 82 | |
83 | /* | 83 | if (likely(page != page_head && |
84 | * If PageTail is still set after smp_rmb() we can be sure | 84 | get_page_unless_zero(page_head))) { |
85 | * that the page->first_page we read wasn't a dangling pointer. | ||
86 | * See __split_huge_page_refcount() smp_wmb(). | ||
87 | */ | ||
88 | if (likely(PageTail(page) && get_page_unless_zero(page_head))) { | ||
89 | unsigned long flags; | 85 | unsigned long flags; |
90 | /* | 86 | /* |
91 | * Verify that our page_head wasn't converted | 87 | * page_head wasn't a dangling pointer but it |
92 | * to a a regular page before we got a | 88 | * may not be a head page anymore by the time |
93 | * reference on it. | 89 | * we obtain the lock. That is ok as long as it |
90 | * can't be freed from under us. | ||
94 | */ | 91 | */ |
95 | if (unlikely(!PageHead(page_head))) { | ||
96 | /* PageHead is cleared after PageTail */ | ||
97 | smp_rmb(); | ||
98 | VM_BUG_ON(PageTail(page)); | ||
99 | goto out_put_head; | ||
100 | } | ||
101 | /* | ||
102 | * Only run compound_lock on a valid PageHead, | ||
103 | * after having it pinned with | ||
104 | * get_page_unless_zero() above. | ||
105 | */ | ||
106 | smp_mb(); | ||
107 | /* page_head wasn't a dangling pointer */ | ||
108 | flags = compound_lock_irqsave(page_head); | 92 | flags = compound_lock_irqsave(page_head); |
109 | if (unlikely(!PageTail(page))) { | 93 | if (unlikely(!PageTail(page))) { |
110 | /* __split_huge_page_refcount run before us */ | 94 | /* __split_huge_page_refcount run before us */ |
111 | compound_unlock_irqrestore(page_head, flags); | 95 | compound_unlock_irqrestore(page_head, flags); |
112 | VM_BUG_ON(PageHead(page_head)); | 96 | VM_BUG_ON(PageHead(page_head)); |
113 | out_put_head: | ||
114 | if (put_page_testzero(page_head)) | 97 | if (put_page_testzero(page_head)) |
115 | __put_single_page(page_head); | 98 | __put_single_page(page_head); |
116 | out_put_single: | 99 | out_put_single: |
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) | |||
121 | VM_BUG_ON(page_head != page->first_page); | 104 | VM_BUG_ON(page_head != page->first_page); |
122 | /* | 105 | /* |
123 | * We can release the refcount taken by | 106 | * We can release the refcount taken by |
124 | * get_page_unless_zero now that | 107 | * get_page_unless_zero() now that |
125 | * split_huge_page_refcount is blocked on the | 108 | * __split_huge_page_refcount() is blocked on |
126 | * compound_lock. | 109 | * the compound_lock. |
127 | */ | 110 | */ |
128 | if (put_page_testzero(page_head)) | 111 | if (put_page_testzero(page_head)) |
129 | VM_BUG_ON(1); | 112 | VM_BUG_ON(1); |
130 | /* __split_huge_page_refcount will wait now */ | 113 | /* __split_huge_page_refcount will wait now */ |
131 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | 114 | VM_BUG_ON(page_mapcount(page) <= 0); |
132 | atomic_dec(&page->_count); | 115 | atomic_dec(&page->_mapcount); |
133 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | 116 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); |
117 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
134 | compound_unlock_irqrestore(page_head, flags); | 118 | compound_unlock_irqrestore(page_head, flags); |
135 | if (put_page_testzero(page_head)) { | 119 | if (put_page_testzero(page_head)) { |
136 | if (PageHead(page_head)) | 120 | if (PageHead(page_head)) |
@@ -160,6 +144,45 @@ void put_page(struct page *page) | |||
160 | } | 144 | } |
161 | EXPORT_SYMBOL(put_page); | 145 | EXPORT_SYMBOL(put_page); |
162 | 146 | ||
147 | /* | ||
148 | * This function is exported but must not be called by anything other | ||
149 | * than get_page(). It implements the slow path of get_page(). | ||
150 | */ | ||
151 | bool __get_page_tail(struct page *page) | ||
152 | { | ||
153 | /* | ||
154 | * This takes care of get_page() if run on a tail page | ||
155 | * returned by one of the get_user_pages/follow_page variants. | ||
156 | * get_user_pages/follow_page itself doesn't need the compound | ||
157 | * lock because it runs __get_page_tail_foll() under the | ||
158 | * proper PT lock that already serializes against | ||
159 | * split_huge_page(). | ||
160 | */ | ||
161 | unsigned long flags; | ||
162 | bool got = false; | ||
163 | struct page *page_head = compound_trans_head(page); | ||
164 | |||
165 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
166 | /* | ||
167 | * page_head wasn't a dangling pointer but it | ||
168 | * may not be a head page anymore by the time | ||
169 | * we obtain the lock. That is ok as long as it | ||
170 | * can't be freed from under us. | ||
171 | */ | ||
172 | flags = compound_lock_irqsave(page_head); | ||
173 | /* here __split_huge_page_refcount won't run anymore */ | ||
174 | if (likely(PageTail(page))) { | ||
175 | __get_page_tail_foll(page, false); | ||
176 | got = true; | ||
177 | } | ||
178 | compound_unlock_irqrestore(page_head, flags); | ||
179 | if (unlikely(!got)) | ||
180 | put_page(page_head); | ||
181 | } | ||
182 | return got; | ||
183 | } | ||
184 | EXPORT_SYMBOL(__get_page_tail); | ||
185 | |||
163 | /** | 186 | /** |
164 | * put_pages_list() - release a list of pages | 187 | * put_pages_list() - release a list of pages |
165 | * @pages: list of pages threaded on page->lru | 188 | * @pages: list of pages threaded on page->lru |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 46680461785b..78cc4d1f6cce 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -6,7 +6,6 @@ | |||
6 | * | 6 | * |
7 | * Rewritten to use page cache, (C) 1998 Stephen Tweedie | 7 | * Rewritten to use page cache, (C) 1998 Stephen Tweedie |
8 | */ | 8 | */ |
9 | #include <linux/module.h> | ||
10 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
11 | #include <linux/gfp.h> | 10 | #include <linux/gfp.h> |
12 | #include <linux/kernel_stat.h> | 11 | #include <linux/kernel_stat.h> |
diff --git a/mm/swapfile.c b/mm/swapfile.c index ff8dc1a18cb4..b1cd12060723 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -21,7 +21,6 @@ | |||
21 | #include <linux/proc_fs.h> | 21 | #include <linux/proc_fs.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | ||
25 | #include <linux/ksm.h> | 24 | #include <linux/ksm.h> |
26 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
27 | #include <linux/security.h> | 26 | #include <linux/security.h> |
@@ -1617,7 +1616,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1617 | 1616 | ||
1618 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1617 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
1619 | err = try_to_unuse(type); | 1618 | err = try_to_unuse(type); |
1620 | test_set_oom_score_adj(oom_score_adj); | 1619 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); |
1621 | 1620 | ||
1622 | if (err) { | 1621 | if (err) { |
1623 | /* | 1622 | /* |
@@ -1681,19 +1680,14 @@ out: | |||
1681 | } | 1680 | } |
1682 | 1681 | ||
1683 | #ifdef CONFIG_PROC_FS | 1682 | #ifdef CONFIG_PROC_FS |
1684 | struct proc_swaps { | ||
1685 | struct seq_file seq; | ||
1686 | int event; | ||
1687 | }; | ||
1688 | |||
1689 | static unsigned swaps_poll(struct file *file, poll_table *wait) | 1683 | static unsigned swaps_poll(struct file *file, poll_table *wait) |
1690 | { | 1684 | { |
1691 | struct proc_swaps *s = file->private_data; | 1685 | struct seq_file *seq = file->private_data; |
1692 | 1686 | ||
1693 | poll_wait(file, &proc_poll_wait, wait); | 1687 | poll_wait(file, &proc_poll_wait, wait); |
1694 | 1688 | ||
1695 | if (s->event != atomic_read(&proc_poll_event)) { | 1689 | if (seq->poll_event != atomic_read(&proc_poll_event)) { |
1696 | s->event = atomic_read(&proc_poll_event); | 1690 | seq->poll_event = atomic_read(&proc_poll_event); |
1697 | return POLLIN | POLLRDNORM | POLLERR | POLLPRI; | 1691 | return POLLIN | POLLRDNORM | POLLERR | POLLPRI; |
1698 | } | 1692 | } |
1699 | 1693 | ||
@@ -1783,24 +1777,16 @@ static const struct seq_operations swaps_op = { | |||
1783 | 1777 | ||
1784 | static int swaps_open(struct inode *inode, struct file *file) | 1778 | static int swaps_open(struct inode *inode, struct file *file) |
1785 | { | 1779 | { |
1786 | struct proc_swaps *s; | 1780 | struct seq_file *seq; |
1787 | int ret; | 1781 | int ret; |
1788 | 1782 | ||
1789 | s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); | ||
1790 | if (!s) | ||
1791 | return -ENOMEM; | ||
1792 | |||
1793 | file->private_data = s; | ||
1794 | |||
1795 | ret = seq_open(file, &swaps_op); | 1783 | ret = seq_open(file, &swaps_op); |
1796 | if (ret) { | 1784 | if (ret) |
1797 | kfree(s); | ||
1798 | return ret; | 1785 | return ret; |
1799 | } | ||
1800 | 1786 | ||
1801 | s->seq.private = s; | 1787 | seq = file->private_data; |
1802 | s->event = atomic_read(&proc_poll_event); | 1788 | seq->poll_event = atomic_read(&proc_poll_event); |
1803 | return ret; | 1789 | return 0; |
1804 | } | 1790 | } |
1805 | 1791 | ||
1806 | static const struct file_operations proc_swaps_operations = { | 1792 | static const struct file_operations proc_swaps_operations = { |
@@ -1937,20 +1923,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1937 | 1923 | ||
1938 | /* | 1924 | /* |
1939 | * Find out how many pages are allowed for a single swap | 1925 | * Find out how many pages are allowed for a single swap |
1940 | * device. There are two limiting factors: 1) the number of | 1926 | * device. There are three limiting factors: 1) the number |
1941 | * bits for the swap offset in the swp_entry_t type and | 1927 | * of bits for the swap offset in the swp_entry_t type, and |
1942 | * 2) the number of bits in the a swap pte as defined by | 1928 | * 2) the number of bits in the swap pte as defined by the |
1943 | * the different architectures. In order to find the | 1929 | * the different architectures, and 3) the number of free bits |
1944 | * largest possible bit mask a swap entry with swap type 0 | 1930 | * in an exceptional radix_tree entry. In order to find the |
1931 | * largest possible bit mask, a swap entry with swap type 0 | ||
1945 | * and swap offset ~0UL is created, encoded to a swap pte, | 1932 | * and swap offset ~0UL is created, encoded to a swap pte, |
1946 | * decoded to a swp_entry_t again and finally the swap | 1933 | * decoded to a swp_entry_t again, and finally the swap |
1947 | * offset is extracted. This will mask all the bits from | 1934 | * offset is extracted. This will mask all the bits from |
1948 | * the initial ~0UL mask that can't be encoded in either | 1935 | * the initial ~0UL mask that can't be encoded in either |
1949 | * the swp_entry_t or the architecture definition of a | 1936 | * the swp_entry_t or the architecture definition of a |
1950 | * swap pte. | 1937 | * swap pte. Then the same is done for a radix_tree entry. |
1951 | */ | 1938 | */ |
1952 | maxpages = swp_offset(pte_to_swp_entry( | 1939 | maxpages = swp_offset(pte_to_swp_entry( |
1953 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; | 1940 | swp_entry_to_pte(swp_entry(0, ~0UL)))); |
1941 | maxpages = swp_offset(radix_to_swp_entry( | ||
1942 | swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; | ||
1943 | |||
1954 | if (maxpages > swap_header->info.last_page) { | 1944 | if (maxpages > swap_header->info.last_page) { |
1955 | maxpages = swap_header->info.last_page + 1; | 1945 | maxpages = swap_header->info.last_page + 1; |
1956 | /* p->max is an unsigned int: don't overflow it */ | 1946 | /* p->max is an unsigned int: don't overflow it */ |
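
The comment above describes maxpages as the result of encoding an all-ones offset and decoding it through each layer, so that every layer's width limit is applied as a mask. The standalone sketch below models that idea with made-up field widths (37 pte bits, 58 radix-tree bits) and assumes a 64-bit unsigned long; it is not the kernel's real encoding.

#include <stdio.h>

#define PTE_OFFSET_BITS    37UL   /* hypothetical arch swap-pte limit */
#define RADIX_OFFSET_BITS  58UL   /* hypothetical radix-tree entry limit */

/* Each "round trip" silently drops the offset bits that do not fit. */
static unsigned long pte_round_trip(unsigned long offset)
{
	return offset & ((1UL << PTE_OFFSET_BITS) - 1);
}

static unsigned long radix_round_trip(unsigned long offset)
{
	return offset & ((1UL << RADIX_OFFSET_BITS) - 1);
}

int main(void)
{
	unsigned long maxpages;

	maxpages = pte_round_trip(~0UL);           /* mask by the swap pte */
	maxpages = radix_round_trip(maxpages) + 1; /* then by the radix entry */
	printf("max pages per swap device: %lu\n", maxpages);
	return 0;
}
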
diff --git a/mm/thrash.c b/mm/thrash.c index fabf2d0f5169..57ad495dbd54 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * Released under the GPL, see the file COPYING for details. | 6 | * Released under the GPL, see the file COPYING for details. |
7 | * | 7 | * |
8 | * Simple token based thrashing protection, using the algorithm | 8 | * Simple token based thrashing protection, using the algorithm |
9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf | 9 | * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html |
10 | * | 10 | * |
11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> | 11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> |
12 | * Improved algorithm to pass token: | 12 | * Improved algorithm to pass token: |
@@ -29,9 +29,7 @@ | |||
29 | 29 | ||
30 | static DEFINE_SPINLOCK(swap_token_lock); | 30 | static DEFINE_SPINLOCK(swap_token_lock); |
31 | struct mm_struct *swap_token_mm; | 31 | struct mm_struct *swap_token_mm; |
32 | struct mem_cgroup *swap_token_memcg; | 32 | static struct mem_cgroup *swap_token_memcg; |
33 | static unsigned int global_faults; | ||
34 | static unsigned int last_aging; | ||
35 | 33 | ||
36 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 34 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
37 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | 35 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) |
@@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm) | |||
55 | { | 53 | { |
56 | int current_interval; | 54 | int current_interval; |
57 | unsigned int old_prio = mm->token_priority; | 55 | unsigned int old_prio = mm->token_priority; |
56 | static unsigned int global_faults; | ||
57 | static unsigned int last_aging; | ||
58 | 58 | ||
59 | global_faults++; | 59 | global_faults++; |
60 | 60 | ||
@@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm) | |||
67 | if (!swap_token_mm) | 67 | if (!swap_token_mm) |
68 | goto replace_token; | 68 | goto replace_token; |
69 | 69 | ||
70 | /* | ||
71 | * Usually, we don't need priority aging because long interval faults | ||
72 | * make the priority decrease quickly. But there is one exception: if the | ||
73 | * token owner task is sleeping, it never makes long interval faults. | ||
74 | * Thus, we need a priority aging mechanism instead. The requirements | ||
75 | * of priority aging are: | ||
76 | * 1) The aging interval must be reasonably long. Too short an aging | ||
77 | * interval loses the swap token too quickly and decreases performance. | ||
78 | * 2) The swap token owner task has to get priority aging even if | ||
79 | * it is asleep. | ||
80 | */ | ||
70 | if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { | 81 | if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { |
71 | swap_token_mm->token_priority /= 2; | 82 | swap_token_mm->token_priority /= 2; |
72 | last_aging = global_faults; | 83 | last_aging = global_faults; |
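
The new comment explains why aging is driven by the global fault count rather than by the owner's own faults. A minimal userspace model follows; TOKEN_AGING_INTERVAL and the starting priority are assumed values chosen only to make the output readable, and the kernel defines its own.

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 1000   /* assumed for the example */

int main(void)
{
	unsigned int global_faults = 0, last_aging = 0;
	unsigned int owner_priority = 64;   /* a sleeping token owner */

	for (int fault = 0; fault < 5000; fault++) {
		global_faults++;            /* faults from other, active tasks */
		if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
			owner_priority /= 2;    /* aged even though it never faulted */
			last_aging = global_faults;
			printf("fault %u: aged owner priority to %u\n",
			       global_faults, owner_priority);
		}
	}
	return 0;
}
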
diff --git a/mm/truncate.c b/mm/truncate.c index e13f22efaad7..632b15e29f74 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/gfp.h> | 12 | #include <linux/gfp.h> |
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/pagemap.h> | 16 | #include <linux/pagemap.h> |
17 | #include <linux/highmem.h> | 17 | #include <linux/highmem.h> |
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
@@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page) | |||
199 | * The first pass will remove most pages, so the search cost of the second pass | 199 | * The first pass will remove most pages, so the search cost of the second pass |
200 | * is low. | 200 | * is low. |
201 | * | 201 | * |
202 | * When looking at page->index outside the page lock we need to be careful to | ||
203 | * copy it into a local to avoid races (it could change at any time). | ||
204 | * | ||
205 | * We pass down the cache-hot hint to the page freeing code. Even if the | 202 | * We pass down the cache-hot hint to the page freeing code. Even if the |
206 | * mapping is large, it is probably the case that the final pages are the most | 203 | * mapping is large, it is probably the case that the final pages are the most |
207 | * recently touched, and freeing happens in ascending file offset order. | 204 | * recently touched, and freeing happens in ascending file offset order. |
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
210 | loff_t lstart, loff_t lend) | 207 | loff_t lstart, loff_t lend) |
211 | { | 208 | { |
212 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 209 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
213 | pgoff_t end; | ||
214 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 210 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
215 | struct pagevec pvec; | 211 | struct pagevec pvec; |
216 | pgoff_t next; | 212 | pgoff_t index; |
213 | pgoff_t end; | ||
217 | int i; | 214 | int i; |
218 | 215 | ||
219 | cleancache_flush_inode(mapping); | 216 | cleancache_flush_inode(mapping); |
@@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
224 | end = (lend >> PAGE_CACHE_SHIFT); | 221 | end = (lend >> PAGE_CACHE_SHIFT); |
225 | 222 | ||
226 | pagevec_init(&pvec, 0); | 223 | pagevec_init(&pvec, 0); |
227 | next = start; | 224 | index = start; |
228 | while (next <= end && | 225 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
229 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 226 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
230 | mem_cgroup_uncharge_start(); | 227 | mem_cgroup_uncharge_start(); |
231 | for (i = 0; i < pagevec_count(&pvec); i++) { | 228 | for (i = 0; i < pagevec_count(&pvec); i++) { |
232 | struct page *page = pvec.pages[i]; | 229 | struct page *page = pvec.pages[i]; |
233 | pgoff_t page_index = page->index; | ||
234 | 230 | ||
235 | if (page_index > end) { | 231 | /* We rely upon deletion not changing page->index */ |
236 | next = page_index; | 232 | index = page->index; |
233 | if (index > end) | ||
237 | break; | 234 | break; |
238 | } | ||
239 | 235 | ||
240 | if (page_index > next) | ||
241 | next = page_index; | ||
242 | next++; | ||
243 | if (!trylock_page(page)) | 236 | if (!trylock_page(page)) |
244 | continue; | 237 | continue; |
238 | WARN_ON(page->index != index); | ||
245 | if (PageWriteback(page)) { | 239 | if (PageWriteback(page)) { |
246 | unlock_page(page); | 240 | unlock_page(page); |
247 | continue; | 241 | continue; |
@@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
252 | pagevec_release(&pvec); | 246 | pagevec_release(&pvec); |
253 | mem_cgroup_uncharge_end(); | 247 | mem_cgroup_uncharge_end(); |
254 | cond_resched(); | 248 | cond_resched(); |
249 | index++; | ||
255 | } | 250 | } |
256 | 251 | ||
257 | if (partial) { | 252 | if (partial) { |
@@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
264 | } | 259 | } |
265 | } | 260 | } |
266 | 261 | ||
267 | next = start; | 262 | index = start; |
268 | for ( ; ; ) { | 263 | for ( ; ; ) { |
269 | cond_resched(); | 264 | cond_resched(); |
270 | if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 265 | if (!pagevec_lookup(&pvec, mapping, index, |
271 | if (next == start) | 266 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
267 | if (index == start) | ||
272 | break; | 268 | break; |
273 | next = start; | 269 | index = start; |
274 | continue; | 270 | continue; |
275 | } | 271 | } |
276 | if (pvec.pages[0]->index > end) { | 272 | if (index == start && pvec.pages[0]->index > end) { |
277 | pagevec_release(&pvec); | 273 | pagevec_release(&pvec); |
278 | break; | 274 | break; |
279 | } | 275 | } |
@@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
281 | for (i = 0; i < pagevec_count(&pvec); i++) { | 277 | for (i = 0; i < pagevec_count(&pvec); i++) { |
282 | struct page *page = pvec.pages[i]; | 278 | struct page *page = pvec.pages[i]; |
283 | 279 | ||
284 | if (page->index > end) | 280 | /* We rely upon deletion not changing page->index */ |
281 | index = page->index; | ||
282 | if (index > end) | ||
285 | break; | 283 | break; |
284 | |||
286 | lock_page(page); | 285 | lock_page(page); |
286 | WARN_ON(page->index != index); | ||
287 | wait_on_page_writeback(page); | 287 | wait_on_page_writeback(page); |
288 | truncate_inode_page(mapping, page); | 288 | truncate_inode_page(mapping, page); |
289 | if (page->index > next) | ||
290 | next = page->index; | ||
291 | next++; | ||
292 | unlock_page(page); | 289 | unlock_page(page); |
293 | } | 290 | } |
294 | pagevec_release(&pvec); | 291 | pagevec_release(&pvec); |
295 | mem_cgroup_uncharge_end(); | 292 | mem_cgroup_uncharge_end(); |
293 | index++; | ||
296 | } | 294 | } |
297 | cleancache_flush_inode(mapping); | 295 | cleancache_flush_inode(mapping); |
298 | } | 296 | } |
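
The rewritten loops above all use the same lookup bound, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, which caps a batch at PAGEVEC_SIZE pages while never requesting more pages than could still lie within [index, end]. The short arithmetic check below makes the two limits explicit; PAGEVEC_SIZE is assumed to be 14 here purely for the example.

#include <stdio.h>

#define PAGEVEC_SIZE 14UL   /* assumed batch size; the kernel defines its own */

/* Number of pages to ask pagevec_lookup() for on this iteration. */
static unsigned long nr_to_lookup(unsigned long index, unsigned long end)
{
	unsigned long span = end - index;

	return (span < PAGEVEC_SIZE - 1 ? span : PAGEVEC_SIZE - 1) + 1;
}

int main(void)
{
	printf("%lu\n", nr_to_lookup(0, 100));   /* 14: a full batch */
	printf("%lu\n", nr_to_lookup(95, 100));  /* 6: only indices 95..100 remain */
	printf("%lu\n", nr_to_lookup(100, 100)); /* 1: just the last index */
	return 0;
}
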
@@ -333,35 +331,34 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
333 | pgoff_t start, pgoff_t end) | 331 | pgoff_t start, pgoff_t end) |
334 | { | 332 | { |
335 | struct pagevec pvec; | 333 | struct pagevec pvec; |
336 | pgoff_t next = start; | 334 | pgoff_t index = start; |
337 | unsigned long ret; | 335 | unsigned long ret; |
338 | unsigned long count = 0; | 336 | unsigned long count = 0; |
339 | int i; | 337 | int i; |
340 | 338 | ||
339 | /* | ||
340 | * Note: this function may get called on a shmem/tmpfs mapping: | ||
341 | * pagevec_lookup() might then return 0 prematurely (because it | ||
342 | * got a gangful of swap entries); but it's hardly worth worrying | ||
343 | * about - it can rarely have anything to free from such a mapping | ||
344 | * (most pages are dirty), and already skips over any difficulties. | ||
345 | */ | ||
346 | |||
341 | pagevec_init(&pvec, 0); | 347 | pagevec_init(&pvec, 0); |
342 | while (next <= end && | 348 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
343 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 349 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
344 | mem_cgroup_uncharge_start(); | 350 | mem_cgroup_uncharge_start(); |
345 | for (i = 0; i < pagevec_count(&pvec); i++) { | 351 | for (i = 0; i < pagevec_count(&pvec); i++) { |
346 | struct page *page = pvec.pages[i]; | 352 | struct page *page = pvec.pages[i]; |
347 | pgoff_t index; | ||
348 | int lock_failed; | ||
349 | 353 | ||
350 | lock_failed = !trylock_page(page); | 354 | /* We rely upon deletion not changing page->index */ |
351 | |||
352 | /* | ||
353 | * We really shouldn't be looking at the ->index of an | ||
354 | * unlocked page. But we're not allowed to lock these | ||
355 | * pages. So we rely upon nobody altering the ->index | ||
356 | * of this (pinned-by-us) page. | ||
357 | */ | ||
358 | index = page->index; | 355 | index = page->index; |
359 | if (index > next) | 356 | if (index > end) |
360 | next = index; | 357 | break; |
361 | next++; | ||
362 | if (lock_failed) | ||
363 | continue; | ||
364 | 358 | ||
359 | if (!trylock_page(page)) | ||
360 | continue; | ||
361 | WARN_ON(page->index != index); | ||
365 | ret = invalidate_inode_page(page); | 362 | ret = invalidate_inode_page(page); |
366 | unlock_page(page); | 363 | unlock_page(page); |
367 | /* | 364 | /* |
@@ -371,12 +368,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
371 | if (!ret) | 368 | if (!ret) |
372 | deactivate_page(page); | 369 | deactivate_page(page); |
373 | count += ret; | 370 | count += ret; |
374 | if (next > end) | ||
375 | break; | ||
376 | } | 371 | } |
377 | pagevec_release(&pvec); | 372 | pagevec_release(&pvec); |
378 | mem_cgroup_uncharge_end(); | 373 | mem_cgroup_uncharge_end(); |
379 | cond_resched(); | 374 | cond_resched(); |
375 | index++; | ||
380 | } | 376 | } |
381 | return count; | 377 | return count; |
382 | } | 378 | } |
@@ -442,37 +438,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
442 | pgoff_t start, pgoff_t end) | 438 | pgoff_t start, pgoff_t end) |
443 | { | 439 | { |
444 | struct pagevec pvec; | 440 | struct pagevec pvec; |
445 | pgoff_t next; | 441 | pgoff_t index; |
446 | int i; | 442 | int i; |
447 | int ret = 0; | 443 | int ret = 0; |
448 | int ret2 = 0; | 444 | int ret2 = 0; |
449 | int did_range_unmap = 0; | 445 | int did_range_unmap = 0; |
450 | int wrapped = 0; | ||
451 | 446 | ||
452 | cleancache_flush_inode(mapping); | 447 | cleancache_flush_inode(mapping); |
453 | pagevec_init(&pvec, 0); | 448 | pagevec_init(&pvec, 0); |
454 | next = start; | 449 | index = start; |
455 | while (next <= end && !wrapped && | 450 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
456 | pagevec_lookup(&pvec, mapping, next, | 451 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
457 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | ||
458 | mem_cgroup_uncharge_start(); | 452 | mem_cgroup_uncharge_start(); |
459 | for (i = 0; i < pagevec_count(&pvec); i++) { | 453 | for (i = 0; i < pagevec_count(&pvec); i++) { |
460 | struct page *page = pvec.pages[i]; | 454 | struct page *page = pvec.pages[i]; |
461 | pgoff_t page_index; | 455 | |
456 | /* We rely upon deletion not changing page->index */ | ||
457 | index = page->index; | ||
458 | if (index > end) | ||
459 | break; | ||
462 | 460 | ||
463 | lock_page(page); | 461 | lock_page(page); |
462 | WARN_ON(page->index != index); | ||
464 | if (page->mapping != mapping) { | 463 | if (page->mapping != mapping) { |
465 | unlock_page(page); | 464 | unlock_page(page); |
466 | continue; | 465 | continue; |
467 | } | 466 | } |
468 | page_index = page->index; | ||
469 | next = page_index + 1; | ||
470 | if (next == 0) | ||
471 | wrapped = 1; | ||
472 | if (page_index > end) { | ||
473 | unlock_page(page); | ||
474 | break; | ||
475 | } | ||
476 | wait_on_page_writeback(page); | 467 | wait_on_page_writeback(page); |
477 | if (page_mapped(page)) { | 468 | if (page_mapped(page)) { |
478 | if (!did_range_unmap) { | 469 | if (!did_range_unmap) { |
@@ -480,9 +471,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
480 | * Zap the rest of the file in one hit. | 471 | * Zap the rest of the file in one hit. |
481 | */ | 472 | */ |
482 | unmap_mapping_range(mapping, | 473 | unmap_mapping_range(mapping, |
483 | (loff_t)page_index<<PAGE_CACHE_SHIFT, | 474 | (loff_t)index << PAGE_CACHE_SHIFT, |
484 | (loff_t)(end - page_index + 1) | 475 | (loff_t)(1 + end - index) |
485 | << PAGE_CACHE_SHIFT, | 476 | << PAGE_CACHE_SHIFT, |
486 | 0); | 477 | 0); |
487 | did_range_unmap = 1; | 478 | did_range_unmap = 1; |
488 | } else { | 479 | } else { |
@@ -490,8 +481,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
490 | * Just zap this page | 481 | * Just zap this page |
491 | */ | 482 | */ |
492 | unmap_mapping_range(mapping, | 483 | unmap_mapping_range(mapping, |
493 | (loff_t)page_index<<PAGE_CACHE_SHIFT, | 484 | (loff_t)index << PAGE_CACHE_SHIFT, |
494 | PAGE_CACHE_SIZE, 0); | 485 | PAGE_CACHE_SIZE, 0); |
495 | } | 486 | } |
496 | } | 487 | } |
497 | BUG_ON(page_mapped(page)); | 488 | BUG_ON(page_mapped(page)); |
@@ -507,6 +498,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
507 | pagevec_release(&pvec); | 498 | pagevec_release(&pvec); |
508 | mem_cgroup_uncharge_end(); | 499 | mem_cgroup_uncharge_end(); |
509 | cond_resched(); | 500 | cond_resched(); |
501 | index++; | ||
510 | } | 502 | } |
511 | cleancache_flush_inode(mapping); | 503 | cleancache_flush_inode(mapping); |
512 | return ret; | 504 | return ret; |
@@ -531,8 +523,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
531 | /** | 523 | /** |
532 | * truncate_pagecache - unmap and remove pagecache that has been truncated | 524 | * truncate_pagecache - unmap and remove pagecache that has been truncated |
533 | * @inode: inode | 525 | * @inode: inode |
534 | * @old: old file offset | 526 | * @oldsize: old file size |
535 | * @new: new file offset | 527 | * @newsize: new file size |
536 | * | 528 | * |
537 | * inode's new i_size must already be written before truncate_pagecache | 529 | * inode's new i_size must already be written before truncate_pagecache |
538 | * is called. | 530 | * is called. |
@@ -544,9 +536,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
544 | * situations such as writepage being called for a page that has already | 536 | * situations such as writepage being called for a page that has already |
545 | * had its underlying blocks deallocated. | 537 | * had its underlying blocks deallocated. |
546 | */ | 538 | */ |
547 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | 539 | void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) |
548 | { | 540 | { |
549 | struct address_space *mapping = inode->i_mapping; | 541 | struct address_space *mapping = inode->i_mapping; |
542 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | ||
550 | 543 | ||
551 | /* | 544 | /* |
552 | * unmap_mapping_range is called twice, first simply for | 545 | * unmap_mapping_range is called twice, first simply for |
@@ -557,9 +550,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | |||
557 | * truncate_inode_pages finishes, hence the second | 550 | * truncate_inode_pages finishes, hence the second |
558 | * unmap_mapping_range call must be made for correctness. | 551 | * unmap_mapping_range call must be made for correctness. |
559 | */ | 552 | */ |
560 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | 553 | unmap_mapping_range(mapping, holebegin, 0, 1); |
561 | truncate_inode_pages(mapping, new); | 554 | truncate_inode_pages(mapping, newsize); |
562 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | 555 | unmap_mapping_range(mapping, holebegin, 0, 1); |
563 | } | 556 | } |
564 | EXPORT_SYMBOL(truncate_pagecache); | 557 | EXPORT_SYMBOL(truncate_pagecache); |
565 | 558 | ||
@@ -589,29 +582,31 @@ EXPORT_SYMBOL(truncate_setsize); | |||
589 | /** | 582 | /** |
590 | * vmtruncate - unmap mappings "freed" by truncate() syscall | 583 | * vmtruncate - unmap mappings "freed" by truncate() syscall |
591 | * @inode: inode of the file used | 584 | * @inode: inode of the file used |
592 | * @offset: file offset to start truncating | 585 | * @newsize: file offset to start truncating |
593 | * | 586 | * |
594 | * This function is deprecated and truncate_setsize or truncate_pagecache | 587 | * This function is deprecated and truncate_setsize or truncate_pagecache |
595 | * should be used instead, together with filesystem specific block truncation. | 588 | * should be used instead, together with filesystem specific block truncation. |
596 | */ | 589 | */ |
597 | int vmtruncate(struct inode *inode, loff_t offset) | 590 | int vmtruncate(struct inode *inode, loff_t newsize) |
598 | { | 591 | { |
599 | int error; | 592 | int error; |
600 | 593 | ||
601 | error = inode_newsize_ok(inode, offset); | 594 | error = inode_newsize_ok(inode, newsize); |
602 | if (error) | 595 | if (error) |
603 | return error; | 596 | return error; |
604 | 597 | ||
605 | truncate_setsize(inode, offset); | 598 | truncate_setsize(inode, newsize); |
606 | if (inode->i_op->truncate) | 599 | if (inode->i_op->truncate) |
607 | inode->i_op->truncate(inode); | 600 | inode->i_op->truncate(inode); |
608 | return 0; | 601 | return 0; |
609 | } | 602 | } |
610 | EXPORT_SYMBOL(vmtruncate); | 603 | EXPORT_SYMBOL(vmtruncate); |
611 | 604 | ||
612 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | 605 | int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) |
613 | { | 606 | { |
614 | struct address_space *mapping = inode->i_mapping; | 607 | struct address_space *mapping = inode->i_mapping; |
608 | loff_t holebegin = round_up(lstart, PAGE_SIZE); | ||
609 | loff_t holelen = 1 + lend - holebegin; | ||
615 | 610 | ||
616 | /* | 611 | /* |
617 | * If the underlying filesystem is not going to provide | 612 | * If the underlying filesystem is not going to provide |
@@ -622,12 +617,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
622 | return -ENOSYS; | 617 | return -ENOSYS; |
623 | 618 | ||
624 | mutex_lock(&inode->i_mutex); | 619 | mutex_lock(&inode->i_mutex); |
625 | down_write(&inode->i_alloc_sem); | 620 | inode_dio_wait(inode); |
626 | unmap_mapping_range(mapping, offset, (end - offset), 1); | 621 | unmap_mapping_range(mapping, holebegin, holelen, 1); |
627 | inode->i_op->truncate_range(inode, offset, end); | 622 | inode->i_op->truncate_range(inode, lstart, lend); |
628 | /* unmap again to remove racily COWed private pages */ | 623 | /* unmap again to remove racily COWed private pages */ |
629 | unmap_mapping_range(mapping, offset, (end - offset), 1); | 624 | unmap_mapping_range(mapping, holebegin, holelen, 1); |
630 | up_write(&inode->i_alloc_sem); | ||
631 | mutex_unlock(&inode->i_mutex); | 625 | mutex_unlock(&inode->i_mutex); |
632 | 626 | ||
633 | return 0; | 627 | return 0; |
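vmtruncate_range() above now waits for in-flight direct I/O with inode_dio_wait() instead of taking i_alloc_sem, and computes the unmap window once: lstart..lend is an inclusive byte range, hence the "1 + lend - holebegin" length. A small sketch of that arithmetic with invented offsets:

    /* sketch: hole arithmetic as in the reworked vmtruncate_range() */
    #include <stdio.h>

    #define PAGE_SIZE 4096LL

    int main(void)
    {
        long long lstart = 5000, lend = 20479;   /* hypothetical punch range */
        long long holebegin = ((lstart + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
        long long holelen = 1 + lend - holebegin;

        /* holebegin = 8192, holelen = 12288: only whole pages inside the
         * inclusive range are unmapped before the filesystem punches the
         * hole, and again afterwards for racily COWed private pages. */
        printf("holebegin=%lld holelen=%lld\n", holebegin, holelen);
        return 0;
    }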
@@ -1,7 +1,7 @@ | |||
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
3 | #include <linux/string.h> | 3 | #include <linux/string.h> |
4 | #include <linux/module.h> | 4 | #include <linux/export.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1d34d75366a7..3231bf332878 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -26,7 +26,7 @@ | |||
26 | #include <linux/rcupdate.h> | 26 | #include <linux/rcupdate.h> |
27 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
28 | #include <linux/kmemleak.h> | 28 | #include <linux/kmemleak.h> |
29 | #include <asm/atomic.h> | 29 | #include <linux/atomic.h> |
30 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
31 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
32 | #include <asm/shmparam.h> | 32 | #include <asm/shmparam.h> |
@@ -452,13 +452,6 @@ overflow: | |||
452 | return ERR_PTR(-EBUSY); | 452 | return ERR_PTR(-EBUSY); |
453 | } | 453 | } |
454 | 454 | ||
455 | static void rcu_free_va(struct rcu_head *head) | ||
456 | { | ||
457 | struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); | ||
458 | |||
459 | kfree(va); | ||
460 | } | ||
461 | |||
462 | static void __free_vmap_area(struct vmap_area *va) | 455 | static void __free_vmap_area(struct vmap_area *va) |
463 | { | 456 | { |
464 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); | 457 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); |
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va) | |||
491 | if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) | 484 | if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) |
492 | vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); | 485 | vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); |
493 | 486 | ||
494 | call_rcu(&va->rcu_head, rcu_free_va); | 487 | kfree_rcu(va, rcu_head); |
495 | } | 488 | } |
496 | 489 | ||
497 | /* | 490 | /* |
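The rcu_free_va() callback removed above is folded into kfree_rcu(va, rcu_head); the vmap_block conversion a few hunks below is the same change. kfree_rcu() only needs the name of the embedded rcu_head because the start of the object can be recovered from the member's address. A userspace sketch of that recovery, using a simplified stand-in struct rather than the kernel's types:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct rcu_head { void *next; void (*func)(struct rcu_head *); };

    struct vmap_area_sketch {
        unsigned long va_start, va_end;
        struct rcu_head rcu_head;            /* embedded, as in the kernel */
    };

    /* recover the enclosing object from a pointer to one of its members */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    int main(void)
    {
        struct vmap_area_sketch *va = malloc(sizeof(*va));
        struct rcu_head *head = &va->rcu_head;

        /* what a kfree_rcu()-style helper does after the grace period */
        struct vmap_area_sketch *back =
            container_of(head, struct vmap_area_sketch, rcu_head);

        printf("round trip ok: %d\n", back == va);
        free(back);
        return 0;
    }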
@@ -732,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr) | |||
732 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | 725 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) |
733 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | 726 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ |
734 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | 727 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ |
735 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | 728 | #define VMAP_BBMAP_BITS \ |
736 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | 729 | VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ |
737 | VMALLOC_PAGES / NR_CPUS / 16)) | 730 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ |
731 | VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) | ||
738 | 732 | ||
739 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | 733 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) |
740 | 734 | ||
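The reworked VMAP_BBMAP_BITS above divides by roundup_pow_of_two(NR_CPUS) rather than the raw CPU count before clamping between the MIN and MAX bounds. A standalone sketch of the resulting value for a hypothetical 6-CPU machine; the bounds and page count below are invented:

    #include <stdio.h>

    /* smallest power of two >= x, for x >= 1 (the shape of the kernel helper) */
    static unsigned long roundup_pow_of_two(unsigned long x)
    {
        unsigned long r = 1;
        while (r < x)
            r <<= 1;
        return r;
    }

    #define VMAP_MAX_ALLOC       64UL           /* hypothetical */
    #define VMAP_BBMAP_BITS_MAX  1024UL
    #define VMAP_BBMAP_BITS_MIN  (VMAP_MAX_ALLOC * 2)
    #define MIN(x, y)            ((x) < (y) ? (x) : (y))
    #define MAX(x, y)            ((x) > (y) ? (x) : (y))

    int main(void)
    {
        unsigned long vmalloc_pages = 32768, nr_cpus = 6;   /* hypothetical */
        unsigned long bits = MIN(VMAP_BBMAP_BITS_MAX,
                                 MAX(VMAP_BBMAP_BITS_MIN,
                                     vmalloc_pages / roundup_pow_of_two(nr_cpus) / 16));

        /* 6 CPUs round up to 8, so 32768 / 8 / 16 = 256 bits per vmap
         * block, clamped into the [128, 1024] window. */
        printf("VMAP_BBMAP_BITS ~ %lu\n", bits);
        return 0;
    }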
@@ -837,13 +831,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
837 | return vb; | 831 | return vb; |
838 | } | 832 | } |
839 | 833 | ||
840 | static void rcu_free_vb(struct rcu_head *head) | ||
841 | { | ||
842 | struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); | ||
843 | |||
844 | kfree(vb); | ||
845 | } | ||
846 | |||
847 | static void free_vmap_block(struct vmap_block *vb) | 834 | static void free_vmap_block(struct vmap_block *vb) |
848 | { | 835 | { |
849 | struct vmap_block *tmp; | 836 | struct vmap_block *tmp; |
@@ -856,7 +843,7 @@ static void free_vmap_block(struct vmap_block *vb) | |||
856 | BUG_ON(tmp != vb); | 843 | BUG_ON(tmp != vb); |
857 | 844 | ||
858 | free_vmap_area_noflush(vb->va); | 845 | free_vmap_area_noflush(vb->va); |
859 | call_rcu(&vb->rcu_head, rcu_free_vb); | 846 | kfree_rcu(vb, rcu_head); |
860 | } | 847 | } |
861 | 848 | ||
862 | static void purge_fragmented_blocks(int cpu) | 849 | static void purge_fragmented_blocks(int cpu) |
@@ -1266,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area); | |||
1266 | DEFINE_RWLOCK(vmlist_lock); | 1253 | DEFINE_RWLOCK(vmlist_lock); |
1267 | struct vm_struct *vmlist; | 1254 | struct vm_struct *vmlist; |
1268 | 1255 | ||
1269 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1256 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1270 | unsigned long flags, void *caller) | 1257 | unsigned long flags, void *caller) |
1271 | { | 1258 | { |
1272 | struct vm_struct *tmp, **p; | ||
1273 | |||
1274 | vm->flags = flags; | 1259 | vm->flags = flags; |
1275 | vm->addr = (void *)va->va_start; | 1260 | vm->addr = (void *)va->va_start; |
1276 | vm->size = va->va_end - va->va_start; | 1261 | vm->size = va->va_end - va->va_start; |
1277 | vm->caller = caller; | 1262 | vm->caller = caller; |
1278 | va->private = vm; | 1263 | va->private = vm; |
1279 | va->flags |= VM_VM_AREA; | 1264 | va->flags |= VM_VM_AREA; |
1265 | } | ||
1266 | |||
1267 | static void insert_vmalloc_vmlist(struct vm_struct *vm) | ||
1268 | { | ||
1269 | struct vm_struct *tmp, **p; | ||
1280 | 1270 | ||
1271 | vm->flags &= ~VM_UNLIST; | ||
1281 | write_lock(&vmlist_lock); | 1272 | write_lock(&vmlist_lock); |
1282 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | 1273 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { |
1283 | if (tmp->addr >= vm->addr) | 1274 | if (tmp->addr >= vm->addr) |
@@ -1288,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1288 | write_unlock(&vmlist_lock); | 1279 | write_unlock(&vmlist_lock); |
1289 | } | 1280 | } |
1290 | 1281 | ||
1282 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | ||
1283 | unsigned long flags, void *caller) | ||
1284 | { | ||
1285 | setup_vmalloc_vm(vm, va, flags, caller); | ||
1286 | insert_vmalloc_vmlist(vm); | ||
1287 | } | ||
1288 | |||
1291 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1289 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1292 | unsigned long align, unsigned long flags, unsigned long start, | 1290 | unsigned long align, unsigned long flags, unsigned long start, |
1293 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 1291 | unsigned long end, int node, gfp_t gfp_mask, void *caller) |
@@ -1326,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1326 | return NULL; | 1324 | return NULL; |
1327 | } | 1325 | } |
1328 | 1326 | ||
1329 | insert_vmalloc_vm(area, va, flags, caller); | 1327 | /* |
1328 | * When this function is called from __vmalloc_node_range, | ||
1329 | * we do not add vm_struct to vmlist here to avoid | ||
1330 | * accessing uninitialized members of vm_struct such as | ||
1331 | * pages and nr_pages fields. They will be set later. | ||
1332 | * To distinguish it from others, we use a VM_UNLIST flag. | ||
1333 | */ | ||
1334 | if (flags & VM_UNLIST) | ||
1335 | setup_vmalloc_vm(area, va, flags, caller); | ||
1336 | else | ||
1337 | insert_vmalloc_vm(area, va, flags, caller); | ||
1338 | |||
1330 | return area; | 1339 | return area; |
1331 | } | 1340 | } |
1332 | 1341 | ||
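The hunk above defers publication: with VM_UNLIST set, __get_vm_area_node() only fills in the vm_struct via setup_vmalloc_vm(), and __vmalloc_node_range() links it onto vmlist later, once pages and nr_pages are valid (the insert_vmalloc_vmlist() call appears further below). A userspace sketch of that two-phase pattern, with simplified names and a plain singly linked list standing in for vmlist:

    #include <stdio.h>
    #include <stdlib.h>

    #define VM_UNLIST 0x1

    struct area {
        unsigned long flags;
        void *addr;
        int nr_pages;                /* filled in later, like the kernel's */
        struct area *next;
    };

    static struct area *vmlist;      /* shared list of published areas */

    static void setup_area(struct area *a, void *addr, unsigned long flags)
    {
        a->flags = flags;            /* object exists but is not listed yet */
        a->addr = addr;
        a->next = NULL;
    }

    static void publish_area(struct area *a)
    {
        a->flags &= ~VM_UNLIST;      /* now safe for list walkers to see */
        a->next = vmlist;
        vmlist = a;
    }

    int main(void)
    {
        struct area *a = calloc(1, sizeof(*a));

        setup_area(a, (void *)0x1000, VM_UNLIST);
        a->nr_pages = 4;             /* the "later" initialisation */
        publish_area(a);

        printf("published: addr=%p nr_pages=%d\n", vmlist->addr, vmlist->nr_pages);
        free(a);
        return 0;
    }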
@@ -1394,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1394 | va = find_vmap_area((unsigned long)addr); | 1403 | va = find_vmap_area((unsigned long)addr); |
1395 | if (va && va->flags & VM_VM_AREA) { | 1404 | if (va && va->flags & VM_VM_AREA) { |
1396 | struct vm_struct *vm = va->private; | 1405 | struct vm_struct *vm = va->private; |
1397 | struct vm_struct *tmp, **p; | 1406 | |
1398 | /* | 1407 | if (!(vm->flags & VM_UNLIST)) { |
1399 | * remove from list and disallow access to this vm_struct | 1408 | struct vm_struct *tmp, **p; |
1400 | * before unmap. (address range confliction is maintained by | 1409 | /* |
1401 | * vmap.) | 1410 | * remove from list and disallow access to |
1402 | */ | 1411 | * this vm_struct before unmap. (address range |
1403 | write_lock(&vmlist_lock); | 1412 | * confliction is maintained by vmap.) |
1404 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | 1413 | */ |
1405 | ; | 1414 | write_lock(&vmlist_lock); |
1406 | *p = tmp->next; | 1415 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) |
1407 | write_unlock(&vmlist_lock); | 1416 | ; |
1417 | *p = tmp->next; | ||
1418 | write_unlock(&vmlist_lock); | ||
1419 | } | ||
1408 | 1420 | ||
1409 | vmap_debug_free_range(va->va_start, va->va_end); | 1421 | vmap_debug_free_range(va->va_start, va->va_end); |
1410 | free_unmap_vmap_area(va); | 1422 | free_unmap_vmap_area(va); |
@@ -1581,8 +1593,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1581 | return area->addr; | 1593 | return area->addr; |
1582 | 1594 | ||
1583 | fail: | 1595 | fail: |
1584 | warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " | 1596 | warn_alloc_failed(gfp_mask, order, |
1585 | "allocated %ld of %ld bytes\n", | 1597 | "vmalloc: allocation failure, allocated %ld of %ld bytes\n", |
1586 | (area->nr_pages*PAGE_SIZE), area->size); | 1598 | (area->nr_pages*PAGE_SIZE), area->size); |
1587 | vfree(area->addr); | 1599 | vfree(area->addr); |
1588 | return NULL; | 1600 | return NULL; |
@@ -1613,17 +1625,22 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1613 | 1625 | ||
1614 | size = PAGE_ALIGN(size); | 1626 | size = PAGE_ALIGN(size); |
1615 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1627 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1616 | return NULL; | 1628 | goto fail; |
1617 | |||
1618 | area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, | ||
1619 | gfp_mask, caller); | ||
1620 | 1629 | ||
1630 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, | ||
1631 | start, end, node, gfp_mask, caller); | ||
1621 | if (!area) | 1632 | if (!area) |
1622 | return NULL; | 1633 | goto fail; |
1623 | 1634 | ||
1624 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1635 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); |
1625 | 1636 | ||
1626 | /* | 1637 | /* |
1638 | * In this function, newly allocated vm_struct is not added | ||
1639 | * to vmlist at __get_vm_area_node(). so, it is added here. | ||
1640 | */ | ||
1641 | insert_vmalloc_vmlist(area); | ||
1642 | |||
1643 | /* | ||
1627 | * A ref_count = 3 is needed because the vm_struct and vmap_area | 1644 | * A ref_count = 3 is needed because the vm_struct and vmap_area |
1628 | * structures allocated in the __get_vm_area_node() function contain | 1645 | * structures allocated in the __get_vm_area_node() function contain |
1629 | * references to the virtual address of the vmalloc'ed block. | 1646 | * references to the virtual address of the vmalloc'ed block. |
@@ -1631,6 +1648,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1631 | kmemleak_alloc(addr, real_size, 3, gfp_mask); | 1648 | kmemleak_alloc(addr, real_size, 3, gfp_mask); |
1632 | 1649 | ||
1633 | return addr; | 1650 | return addr; |
1651 | |||
1652 | fail: | ||
1653 | warn_alloc_failed(gfp_mask, 0, | ||
1654 | "vmalloc: allocation failure: %lu bytes\n", | ||
1655 | real_size); | ||
1656 | return NULL; | ||
1634 | } | 1657 | } |
1635 | 1658 | ||
1636 | /** | 1659 | /** |
@@ -2118,23 +2141,30 @@ void __attribute__((weak)) vmalloc_sync_all(void) | |||
2118 | 2141 | ||
2119 | static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) | 2142 | static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) |
2120 | { | 2143 | { |
2121 | /* apply_to_page_range() does all the hard work. */ | 2144 | pte_t ***p = data; |
2145 | |||
2146 | if (p) { | ||
2147 | *(*p) = pte; | ||
2148 | (*p)++; | ||
2149 | } | ||
2122 | return 0; | 2150 | return 0; |
2123 | } | 2151 | } |
2124 | 2152 | ||
2125 | /** | 2153 | /** |
2126 | * alloc_vm_area - allocate a range of kernel address space | 2154 | * alloc_vm_area - allocate a range of kernel address space |
2127 | * @size: size of the area | 2155 | * @size: size of the area |
2156 | * @ptes: returns the PTEs for the address space | ||
2128 | * | 2157 | * |
2129 | * Returns: NULL on failure, vm_struct on success | 2158 | * Returns: NULL on failure, vm_struct on success |
2130 | * | 2159 | * |
2131 | * This function reserves a range of kernel address space, and | 2160 | * This function reserves a range of kernel address space, and |
2132 | * allocates pagetables to map that range. No actual mappings | 2161 | * allocates pagetables to map that range. No actual mappings |
2133 | * are created. If the kernel address space is not shared | 2162 | * are created. |
2134 | * between processes, it syncs the pagetable across all | 2163 | * |
2135 | * processes. | 2164 | * If @ptes is non-NULL, pointers to the PTEs (in init_mm) |
2165 | * allocated for the VM area are returned. | ||
2136 | */ | 2166 | */ |
2137 | struct vm_struct *alloc_vm_area(size_t size) | 2167 | struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) |
2138 | { | 2168 | { |
2139 | struct vm_struct *area; | 2169 | struct vm_struct *area; |
2140 | 2170 | ||
@@ -2148,7 +2178,7 @@ struct vm_struct *alloc_vm_area(size_t size) | |||
2148 | * of kernel virtual address space and mapped into init_mm. | 2178 | * of kernel virtual address space and mapped into init_mm. |
2149 | */ | 2179 | */ |
2150 | if (apply_to_page_range(&init_mm, (unsigned long)area->addr, | 2180 | if (apply_to_page_range(&init_mm, (unsigned long)area->addr, |
2151 | area->size, f, NULL)) { | 2181 | size, f, ptes ? &ptes : NULL)) { |
2152 | free_vm_area(area); | 2182 | free_vm_area(area); |
2153 | return NULL; | 2183 | return NULL; |
2154 | } | 2184 | } |
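The new ptes argument of alloc_vm_area() is threaded through apply_to_page_range() as a pointer to the caller's output cursor: each time f() runs it stores the PTE pointer and advances that cursor. A userspace sketch of the triple-pointer cursor pattern, with plain ints standing in for page-table entries:

    #include <stdio.h>

    typedef int pte;

    /* walk n entries, handing each one to the callback */
    static int apply_to_range(pte *table, int n,
                              int (*fn)(pte *entry, void *data), void *data)
    {
        for (int i = 0; i < n; i++) {
            int err = fn(&table[i], data);
            if (err)
                return err;
        }
        return 0;
    }

    /* the callback: if a cursor was supplied, record the entry and advance */
    static int collect(pte *entry, void *data)
    {
        pte ***p = data;

        if (p) {
            *(*p) = entry;
            (*p)++;
        }
        return 0;
    }

    int main(void)
    {
        pte table[4] = {0};
        pte *ptes[4];
        pte **cursor = ptes;

        apply_to_range(table, 4, collect, &cursor);

        /* ptes[] now points at every entry, the way alloc_vm_area()
         * hands back the PTEs of the reserved range when @ptes != NULL */
        printf("collected %ld entries\n", (long)(cursor - ptes));
        return 0;
    }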
diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ed24b94c5e6..a1893c050795 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -95,8 +95,6 @@ struct scan_control { | |||
95 | /* Can pages be swapped as part of reclaim? */ | 95 | /* Can pages be swapped as part of reclaim? */ |
96 | int may_swap; | 96 | int may_swap; |
97 | 97 | ||
98 | int swappiness; | ||
99 | |||
100 | int order; | 98 | int order; |
101 | 99 | ||
102 | /* | 100 | /* |
@@ -173,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, | |||
173 | struct scan_control *sc, enum lru_list lru) | 171 | struct scan_control *sc, enum lru_list lru) |
174 | { | 172 | { |
175 | if (!scanning_global_lru(sc)) | 173 | if (!scanning_global_lru(sc)) |
176 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); | 174 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, |
175 | zone_to_nid(zone), zone_idx(zone), BIT(lru)); | ||
177 | 176 | ||
178 | return zone_page_state(zone, NR_LRU_BASE + lru); | 177 | return zone_page_state(zone, NR_LRU_BASE + lru); |
179 | } | 178 | } |
@@ -250,49 +249,90 @@ unsigned long shrink_slab(struct shrink_control *shrink, | |||
250 | unsigned long long delta; | 249 | unsigned long long delta; |
251 | unsigned long total_scan; | 250 | unsigned long total_scan; |
252 | unsigned long max_pass; | 251 | unsigned long max_pass; |
252 | int shrink_ret = 0; | ||
253 | long nr; | ||
254 | long new_nr; | ||
255 | long batch_size = shrinker->batch ? shrinker->batch | ||
256 | : SHRINK_BATCH; | ||
257 | |||
258 | /* | ||
259 | * copy the current shrinker scan count into a local variable | ||
260 | * and zero it so that other concurrent shrinker invocations | ||
261 | * don't also do this scanning work. | ||
262 | */ | ||
263 | do { | ||
264 | nr = shrinker->nr; | ||
265 | } while (cmpxchg(&shrinker->nr, nr, 0) != nr); | ||
253 | 266 | ||
267 | total_scan = nr; | ||
254 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); | 268 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); |
255 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 269 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
256 | delta *= max_pass; | 270 | delta *= max_pass; |
257 | do_div(delta, lru_pages + 1); | 271 | do_div(delta, lru_pages + 1); |
258 | shrinker->nr += delta; | 272 | total_scan += delta; |
259 | if (shrinker->nr < 0) { | 273 | if (total_scan < 0) { |
260 | printk(KERN_ERR "shrink_slab: %pF negative objects to " | 274 | printk(KERN_ERR "shrink_slab: %pF negative objects to " |
261 | "delete nr=%ld\n", | 275 | "delete nr=%ld\n", |
262 | shrinker->shrink, shrinker->nr); | 276 | shrinker->shrink, total_scan); |
263 | shrinker->nr = max_pass; | 277 | total_scan = max_pass; |
264 | } | 278 | } |
265 | 279 | ||
266 | /* | 280 | /* |
281 | * We need to avoid excessive windup on filesystem shrinkers | ||
282 | * due to large numbers of GFP_NOFS allocations causing the | ||
283 | * shrinkers to return -1 all the time. This results in a large | ||
284 | * nr being built up so when a shrink that can do some work | ||
285 | * comes along it empties the entire cache due to nr >>> | ||
286 | * max_pass. This is bad for sustaining a working set in | ||
287 | * memory. | ||
288 | * | ||
289 | * Hence only allow the shrinker to scan the entire cache when | ||
290 | * a large delta change is calculated directly. | ||
291 | */ | ||
292 | if (delta < max_pass / 4) | ||
293 | total_scan = min(total_scan, max_pass / 2); | ||
294 | |||
295 | /* | ||
267 | * Avoid risking looping forever due to too large nr value: | 296 | * Avoid risking looping forever due to too large nr value: |
268 | * never try to free more than twice the estimated number of | 297 | * never try to free more than twice the estimated number of |
269 | * freeable entries. | 298 | * freeable entries. |
270 | */ | 299 | */ |
271 | if (shrinker->nr > max_pass * 2) | 300 | if (total_scan > max_pass * 2) |
272 | shrinker->nr = max_pass * 2; | 301 | total_scan = max_pass * 2; |
273 | 302 | ||
274 | total_scan = shrinker->nr; | 303 | trace_mm_shrink_slab_start(shrinker, shrink, nr, |
275 | shrinker->nr = 0; | 304 | nr_pages_scanned, lru_pages, |
305 | max_pass, delta, total_scan); | ||
276 | 306 | ||
277 | while (total_scan >= SHRINK_BATCH) { | 307 | while (total_scan >= batch_size) { |
278 | long this_scan = SHRINK_BATCH; | ||
279 | int shrink_ret; | ||
280 | int nr_before; | 308 | int nr_before; |
281 | 309 | ||
282 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); | 310 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); |
283 | shrink_ret = do_shrinker_shrink(shrinker, shrink, | 311 | shrink_ret = do_shrinker_shrink(shrinker, shrink, |
284 | this_scan); | 312 | batch_size); |
285 | if (shrink_ret == -1) | 313 | if (shrink_ret == -1) |
286 | break; | 314 | break; |
287 | if (shrink_ret < nr_before) | 315 | if (shrink_ret < nr_before) |
288 | ret += nr_before - shrink_ret; | 316 | ret += nr_before - shrink_ret; |
289 | count_vm_events(SLABS_SCANNED, this_scan); | 317 | count_vm_events(SLABS_SCANNED, batch_size); |
290 | total_scan -= this_scan; | 318 | total_scan -= batch_size; |
291 | 319 | ||
292 | cond_resched(); | 320 | cond_resched(); |
293 | } | 321 | } |
294 | 322 | ||
295 | shrinker->nr += total_scan; | 323 | /* |
324 | * move the unused scan count back into the shrinker in a | ||
325 | * manner that handles concurrent updates. If we exhausted the | ||
326 | * scan, there is no need to do an update. | ||
327 | */ | ||
328 | do { | ||
329 | nr = shrinker->nr; | ||
330 | new_nr = total_scan + nr; | ||
331 | if (total_scan <= 0) | ||
332 | break; | ||
333 | } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); | ||
334 | |||
335 | trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); | ||
296 | } | 336 | } |
297 | up_read(&shrinker_rwsem); | 337 | up_read(&shrinker_rwsem); |
298 | out: | 338 | out: |
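shrink_slab() above now claims the shrinker's deferred count atomically (swapping in 0 with cmpxchg), scans in shrinker->batch sized chunks, and puts any unused work back the same way, so concurrent callers neither repeat nor lose scanning. A userspace sketch of that claim/return pattern using C11 atomics in place of the kernel's cmpxchg(); the counts are invented:

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic long deferred_nr = 500;       /* stand-in for shrinker->nr */

    static long claim_all(void)
    {
        long nr;

        /* retry until we swap the current value for 0 */
        do {
            nr = atomic_load(&deferred_nr);
        } while (!atomic_compare_exchange_weak(&deferred_nr, &nr, 0));
        return nr;
    }

    static void return_unused(long unused)
    {
        long nr, new_nr;

        if (unused <= 0)
            return;
        do {
            nr = atomic_load(&deferred_nr);
            new_nr = nr + unused;
        } while (!atomic_compare_exchange_weak(&deferred_nr, &nr, new_nr));
    }

    int main(void)
    {
        long total_scan = claim_all();           /* this caller owns all 500 */
        long batch = 128, done = 0;

        while (total_scan - done >= batch)       /* scan in batch-sized chunks */
            done += batch;

        return_unused(total_scan - done);        /* the remaining 116 is deferred again */
        printf("scanned %ld, still deferred %ld\n",
               done, atomic_load(&deferred_nr));
        return 0;
    }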
@@ -455,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
455 | return PAGE_ACTIVATE; | 495 | return PAGE_ACTIVATE; |
456 | } | 496 | } |
457 | 497 | ||
458 | /* | ||
459 | * Wait on writeback if requested to. This happens when | ||
460 | * direct reclaiming a large contiguous area and the | ||
461 | * first attempt to free a range of pages fails. | ||
462 | */ | ||
463 | if (PageWriteback(page) && | ||
464 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) | ||
465 | wait_on_page_writeback(page); | ||
466 | |||
467 | if (!PageWriteback(page)) { | 498 | if (!PageWriteback(page)) { |
468 | /* synchronous write or broken a_ops? */ | 499 | /* synchronous write or broken a_ops? */ |
469 | ClearPageReclaim(page); | 500 | ClearPageReclaim(page); |
@@ -602,13 +633,14 @@ redo: | |||
602 | lru = LRU_UNEVICTABLE; | 633 | lru = LRU_UNEVICTABLE; |
603 | add_page_to_unevictable_list(page); | 634 | add_page_to_unevictable_list(page); |
604 | /* | 635 | /* |
605 | * When racing with an mlock clearing (page is | 636 | * When racing with an mlock or AS_UNEVICTABLE clearing |
606 | * unlocked), make sure that if the other thread does | 637 | * (page is unlocked) make sure that if the other thread |
607 | * not observe our setting of PG_lru and fails | 638 | * does not observe our setting of PG_lru and fails |
608 | * isolation, we see PG_mlocked cleared below and move | 639 | * isolation/check_move_unevictable_page, |
640 | * we see PG_mlocked/AS_UNEVICTABLE cleared below and move | ||
609 | * the page back to the evictable list. | 641 | * the page back to the evictable list. |
610 | * | 642 | * |
611 | * The other side is TestClearPageMlocked(). | 643 | * The other side is TestClearPageMlocked() or shmem_lock(). |
612 | */ | 644 | */ |
613 | smp_mb(); | 645 | smp_mb(); |
614 | } | 646 | } |
@@ -719,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) | |||
719 | */ | 751 | */ |
720 | static unsigned long shrink_page_list(struct list_head *page_list, | 752 | static unsigned long shrink_page_list(struct list_head *page_list, |
721 | struct zone *zone, | 753 | struct zone *zone, |
722 | struct scan_control *sc) | 754 | struct scan_control *sc, |
755 | int priority, | ||
756 | unsigned long *ret_nr_dirty, | ||
757 | unsigned long *ret_nr_writeback) | ||
723 | { | 758 | { |
724 | LIST_HEAD(ret_pages); | 759 | LIST_HEAD(ret_pages); |
725 | LIST_HEAD(free_pages); | 760 | LIST_HEAD(free_pages); |
@@ -727,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
727 | unsigned long nr_dirty = 0; | 762 | unsigned long nr_dirty = 0; |
728 | unsigned long nr_congested = 0; | 763 | unsigned long nr_congested = 0; |
729 | unsigned long nr_reclaimed = 0; | 764 | unsigned long nr_reclaimed = 0; |
765 | unsigned long nr_writeback = 0; | ||
730 | 766 | ||
731 | cond_resched(); | 767 | cond_resched(); |
732 | 768 | ||
@@ -763,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
763 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 799 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
764 | 800 | ||
765 | if (PageWriteback(page)) { | 801 | if (PageWriteback(page)) { |
802 | nr_writeback++; | ||
766 | /* | 803 | /* |
767 | * Synchronous reclaim is performed in two passes, | 804 | * Synchronous reclaim cannot queue pages for |
768 | * first an asynchronous pass over the list to | 805 | * writeback due to the possibility of stack overflow |
769 | * start parallel writeback, and a second synchronous | 806 | * but if it encounters a page under writeback, wait |
770 | * pass to wait for the IO to complete. Wait here | 807 | * for the IO to complete. |
771 | * for any page for which writeback has already | ||
772 | * started. | ||
773 | */ | 808 | */ |
774 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && | 809 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
775 | may_enter_fs) | 810 | may_enter_fs) |
@@ -825,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
825 | if (PageDirty(page)) { | 860 | if (PageDirty(page)) { |
826 | nr_dirty++; | 861 | nr_dirty++; |
827 | 862 | ||
863 | /* | ||
864 | * Only kswapd can writeback filesystem pages to | ||
865 | * avoid risk of stack overflow but do not writeback | ||
866 | * unless under significant pressure. | ||
867 | */ | ||
868 | if (page_is_file_cache(page) && | ||
869 | (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { | ||
870 | /* | ||
871 | * Immediately reclaim when written back. | ||
872 | * Similar in principle to deactivate_page() | ||
873 | * except we already have the page isolated | ||
874 | * and know it's dirty | ||
875 | */ | ||
876 | inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); | ||
877 | SetPageReclaim(page); | ||
878 | |||
879 | goto keep_locked; | ||
880 | } | ||
881 | |||
828 | if (references == PAGEREF_RECLAIM_CLEAN) | 882 | if (references == PAGEREF_RECLAIM_CLEAN) |
829 | goto keep_locked; | 883 | goto keep_locked; |
830 | if (!may_enter_fs) | 884 | if (!may_enter_fs) |
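The check added above means reclaim writes back a dirty file page only from kswapd, and only once the scan priority has dropped below DEF_PRIORITY - 2; every other caller marks the page for immediate reclaim and leaves the I/O to the flusher threads. A tiny sketch of that predicate, taking DEF_PRIORITY as 12 (the kernel default):

    #include <stdbool.h>
    #include <stdio.h>

    #define DEF_PRIORITY 12

    static bool may_writeback_file_page(bool is_kswapd, int priority)
    {
        /* mirrors: if (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)
         *              goto keep_locked;   i.e. defer to the flushers */
        return is_kswapd && priority < DEF_PRIORITY - 2;
    }

    int main(void)
    {
        printf("direct reclaim, prio 12: %d\n", may_writeback_file_page(false, 12));
        printf("kswapd, prio 11:         %d\n", may_writeback_file_page(true, 11));
        printf("kswapd, prio 9:          %d\n", may_writeback_file_page(true, 9));
        return 0;
    }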
@@ -959,6 +1013,8 @@ keep_lumpy: | |||
959 | 1013 | ||
960 | list_splice(&ret_pages, page_list); | 1014 | list_splice(&ret_pages, page_list); |
961 | count_vm_events(PGACTIVATE, pgactivate); | 1015 | count_vm_events(PGACTIVATE, pgactivate); |
1016 | *ret_nr_dirty += nr_dirty; | ||
1017 | *ret_nr_writeback += nr_writeback; | ||
962 | return nr_reclaimed; | 1018 | return nr_reclaimed; |
963 | } | 1019 | } |
964 | 1020 | ||
@@ -972,23 +1028,27 @@ keep_lumpy: | |||
972 | * | 1028 | * |
973 | * returns 0 on success, -ve errno on failure. | 1029 | * returns 0 on success, -ve errno on failure. |
974 | */ | 1030 | */ |
975 | int __isolate_lru_page(struct page *page, int mode, int file) | 1031 | int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) |
976 | { | 1032 | { |
1033 | bool all_lru_mode; | ||
977 | int ret = -EINVAL; | 1034 | int ret = -EINVAL; |
978 | 1035 | ||
979 | /* Only take pages on the LRU. */ | 1036 | /* Only take pages on the LRU. */ |
980 | if (!PageLRU(page)) | 1037 | if (!PageLRU(page)) |
981 | return ret; | 1038 | return ret; |
982 | 1039 | ||
1040 | all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == | ||
1041 | (ISOLATE_ACTIVE|ISOLATE_INACTIVE); | ||
1042 | |||
983 | /* | 1043 | /* |
984 | * When checking the active state, we need to be sure we are | 1044 | * When checking the active state, we need to be sure we are |
985 | * dealing with comparible boolean values. Take the logical not | 1045 | * dealing with comparible boolean values. Take the logical not |
986 | * of each. | 1046 | * of each. |
987 | */ | 1047 | */ |
988 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) | 1048 | if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) |
989 | return ret; | 1049 | return ret; |
990 | 1050 | ||
991 | if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) | 1051 | if (!all_lru_mode && !!page_is_file_cache(page) != file) |
992 | return ret; | 1052 | return ret; |
993 | 1053 | ||
994 | /* | 1054 | /* |
@@ -1001,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1001 | 1061 | ||
1002 | ret = -EBUSY; | 1062 | ret = -EBUSY; |
1003 | 1063 | ||
1064 | if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) | ||
1065 | return ret; | ||
1066 | |||
1067 | if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) | ||
1068 | return ret; | ||
1069 | |||
1004 | if (likely(get_page_unless_zero(page))) { | 1070 | if (likely(get_page_unless_zero(page))) { |
1005 | /* | 1071 | /* |
1006 | * Be careful not to clear PageLRU until after we're | 1072 | * Be careful not to clear PageLRU until after we're |
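__isolate_lru_page() now takes an isolate_mode_t bitmask, so callers can combine active/inactive selection with the new ISOLATE_CLEAN and ISOLATE_UNMAPPED filters (set from !sc->may_writepage and !sc->may_unmap in the later hunks). A userspace sketch of the flag checks; the flag values and the page fields are illustrative, not the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int isolate_mode_t;

    #define ISOLATE_INACTIVE  0x1    /* take inactive pages */
    #define ISOLATE_ACTIVE    0x2    /* take active pages */
    #define ISOLATE_CLEAN     0x4    /* skip dirty/writeback pages */
    #define ISOLATE_UNMAPPED  0x8    /* skip pages mapped into page tables */

    struct page_sketch { bool active, dirty, mapped; };

    static bool isolate_ok(const struct page_sketch *p, isolate_mode_t mode)
    {
        bool all_lru = (mode & (ISOLATE_ACTIVE | ISOLATE_INACTIVE)) ==
                       (ISOLATE_ACTIVE | ISOLATE_INACTIVE);

        if (!all_lru && p->active != !!(mode & ISOLATE_ACTIVE))
            return false;
        if ((mode & ISOLATE_CLEAN) && p->dirty)
            return false;
        if ((mode & ISOLATE_UNMAPPED) && p->mapped)
            return false;
        return true;
    }

    int main(void)
    {
        struct page_sketch dirty_inactive = { .active = false, .dirty = true };
        isolate_mode_t mode = ISOLATE_INACTIVE | ISOLATE_CLEAN;

        /* a dirty page is refused when the caller asked for clean pages
         * only, mirroring the !sc->may_writepage => ISOLATE_CLEAN path */
        printf("isolate dirty inactive page: %d\n",
               isolate_ok(&dirty_inactive, mode));
        return 0;
    }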
@@ -1036,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1036 | */ | 1102 | */ |
1037 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1103 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1038 | struct list_head *src, struct list_head *dst, | 1104 | struct list_head *src, struct list_head *dst, |
1039 | unsigned long *scanned, int order, int mode, int file) | 1105 | unsigned long *scanned, int order, isolate_mode_t mode, |
1106 | int file) | ||
1040 | { | 1107 | { |
1041 | unsigned long nr_taken = 0; | 1108 | unsigned long nr_taken = 0; |
1042 | unsigned long nr_lumpy_taken = 0; | 1109 | unsigned long nr_lumpy_taken = 0; |
@@ -1161,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1161 | static unsigned long isolate_pages_global(unsigned long nr, | 1228 | static unsigned long isolate_pages_global(unsigned long nr, |
1162 | struct list_head *dst, | 1229 | struct list_head *dst, |
1163 | unsigned long *scanned, int order, | 1230 | unsigned long *scanned, int order, |
1164 | int mode, struct zone *z, | 1231 | isolate_mode_t mode, |
1165 | int active, int file) | 1232 | struct zone *z, int active, int file) |
1166 | { | 1233 | { |
1167 | int lru = LRU_BASE; | 1234 | int lru = LRU_BASE; |
1168 | if (active) | 1235 | if (active) |
@@ -1354,7 +1421,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, | |||
1354 | } | 1421 | } |
1355 | 1422 | ||
1356 | /* | 1423 | /* |
1357 | * Returns true if the caller should wait to clean dirty/writeback pages. | 1424 | * Returns true if a direct reclaim should wait on pages under writeback. |
1358 | * | 1425 | * |
1359 | * If we are direct reclaiming for contiguous pages and we do not reclaim | 1426 | * If we are direct reclaiming for contiguous pages and we do not reclaim |
1360 | * everything in the list, try again and wait for writeback IO to complete. | 1427 | * everything in the list, try again and wait for writeback IO to complete. |
@@ -1376,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
1376 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) | 1443 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) |
1377 | return false; | 1444 | return false; |
1378 | 1445 | ||
1379 | /* If we have relaimed everything on the isolated list, no stall */ | 1446 | /* If we have reclaimed everything on the isolated list, no stall */ |
1380 | if (nr_freed == nr_taken) | 1447 | if (nr_freed == nr_taken) |
1381 | return false; | 1448 | return false; |
1382 | 1449 | ||
@@ -1408,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1408 | unsigned long nr_taken; | 1475 | unsigned long nr_taken; |
1409 | unsigned long nr_anon; | 1476 | unsigned long nr_anon; |
1410 | unsigned long nr_file; | 1477 | unsigned long nr_file; |
1478 | unsigned long nr_dirty = 0; | ||
1479 | unsigned long nr_writeback = 0; | ||
1480 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; | ||
1411 | 1481 | ||
1412 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1482 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1413 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1483 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1418,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1418 | } | 1488 | } |
1419 | 1489 | ||
1420 | set_reclaim_mode(priority, sc, false); | 1490 | set_reclaim_mode(priority, sc, false); |
1491 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
1492 | reclaim_mode |= ISOLATE_ACTIVE; | ||
1493 | |||
1421 | lru_add_drain(); | 1494 | lru_add_drain(); |
1495 | |||
1496 | if (!sc->may_unmap) | ||
1497 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1498 | if (!sc->may_writepage) | ||
1499 | reclaim_mode |= ISOLATE_CLEAN; | ||
1500 | |||
1422 | spin_lock_irq(&zone->lru_lock); | 1501 | spin_lock_irq(&zone->lru_lock); |
1423 | 1502 | ||
1424 | if (scanning_global_lru(sc)) { | 1503 | if (scanning_global_lru(sc)) { |
1425 | nr_taken = isolate_pages_global(nr_to_scan, | 1504 | nr_taken = isolate_pages_global(nr_to_scan, &page_list, |
1426 | &page_list, &nr_scanned, sc->order, | 1505 | &nr_scanned, sc->order, reclaim_mode, zone, 0, file); |
1427 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | ||
1428 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1429 | zone, 0, file); | ||
1430 | zone->pages_scanned += nr_scanned; | 1506 | zone->pages_scanned += nr_scanned; |
1431 | if (current_is_kswapd()) | 1507 | if (current_is_kswapd()) |
1432 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | 1508 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, |
@@ -1435,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1435 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1511 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
1436 | nr_scanned); | 1512 | nr_scanned); |
1437 | } else { | 1513 | } else { |
1438 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1514 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, |
1439 | &page_list, &nr_scanned, sc->order, | 1515 | &nr_scanned, sc->order, reclaim_mode, zone, |
1440 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | 1516 | sc->mem_cgroup, 0, file); |
1441 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1442 | zone, sc->mem_cgroup, | ||
1443 | 0, file); | ||
1444 | /* | 1517 | /* |
1445 | * mem_cgroup_isolate_pages() keeps track of | 1518 | * mem_cgroup_isolate_pages() keeps track of |
1446 | * scanned pages on its own. | 1519 | * scanned pages on its own. |
@@ -1456,12 +1529,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1456 | 1529 | ||
1457 | spin_unlock_irq(&zone->lru_lock); | 1530 | spin_unlock_irq(&zone->lru_lock); |
1458 | 1531 | ||
1459 | nr_reclaimed = shrink_page_list(&page_list, zone, sc); | 1532 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, |
1533 | &nr_dirty, &nr_writeback); | ||
1460 | 1534 | ||
1461 | /* Check if we should synchronously wait for writeback */ | 1535 | /* Check if we should synchronously wait for writeback */ |
1462 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1536 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1463 | set_reclaim_mode(priority, sc, true); | 1537 | set_reclaim_mode(priority, sc, true); |
1464 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1538 | nr_reclaimed += shrink_page_list(&page_list, zone, sc, |
1539 | priority, &nr_dirty, &nr_writeback); | ||
1465 | } | 1540 | } |
1466 | 1541 | ||
1467 | local_irq_disable(); | 1542 | local_irq_disable(); |
@@ -1471,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1471 | 1546 | ||
1472 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); | 1547 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); |
1473 | 1548 | ||
1549 | /* | ||
1550 | * If reclaim is isolating dirty pages under writeback, it implies | ||
1551 | * that the long-lived page allocation rate is exceeding the page | ||
1552 | * laundering rate. Either the global limits are not being effective | ||
1553 | * at throttling processes due to the page distribution throughout | ||
1554 | * zones or there is heavy usage of a slow backing device. The | ||
1555 | * only option is to throttle from reclaim context which is not ideal | ||
1556 | * as there is no guarantee the dirtying process is throttled in the | ||
1557 | * same way balance_dirty_pages() manages. | ||
1558 | * | ||
1559 | * This scales the number of dirty pages that must be under writeback | ||
1560 | * before throttling depending on priority. It is a simple backoff | ||
1561 | * function that has the most effect in the range DEF_PRIORITY to | ||
1562 | * DEF_PRIORITY-2, which is the range of priorities where | ||
1563 | * reclaim is considered to be in trouble. | ||
1564 | * | ||
1565 | * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle | ||
1566 | * DEF_PRIORITY-1 50% must be PageWriteback | ||
1567 | * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble | ||
1568 | * ... | ||
1569 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any | ||
1570 | * isolated page is PageWriteback | ||
1571 | */ | ||
1572 | if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) | ||
1573 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | ||
1574 | |||
1474 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1575 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
1475 | zone_idx(zone), | 1576 | zone_idx(zone), |
1476 | nr_scanned, nr_reclaimed, | 1577 | nr_scanned, nr_reclaimed, |
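The throttle added above sleeps in wait_iff_congested() once enough of the isolated pages are under writeback, and the required share halves with every priority drop. A worked example of the nr_taken >> (DEF_PRIORITY - priority) threshold, assuming DEF_PRIORITY = 12 and a SWAP_CLUSTER_MAX sized batch of 32 pages:

    #include <stdio.h>

    #define DEF_PRIORITY 12

    int main(void)
    {
        unsigned long nr_taken = 32;     /* SWAP_CLUSTER_MAX sized batch */

        for (int priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--) {
            unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);

            /* priority 12 -> 32 (100%), 11 -> 16 (50%), 10 -> 8 (25%), ...
             * by priority 6 the shift reaches 0, so a single page under
             * writeback is enough to throttle. */
            printf("priority %2d: throttle once nr_writeback >= %lu\n",
                   priority, threshold ? threshold : 1);
        }
        return 0;
    }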
@@ -1542,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1542 | struct page *page; | 1643 | struct page *page; |
1543 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1644 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1544 | unsigned long nr_rotated = 0; | 1645 | unsigned long nr_rotated = 0; |
1646 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; | ||
1545 | 1647 | ||
1546 | lru_add_drain(); | 1648 | lru_add_drain(); |
1649 | |||
1650 | if (!sc->may_unmap) | ||
1651 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1652 | if (!sc->may_writepage) | ||
1653 | reclaim_mode |= ISOLATE_CLEAN; | ||
1654 | |||
1547 | spin_lock_irq(&zone->lru_lock); | 1655 | spin_lock_irq(&zone->lru_lock); |
1548 | if (scanning_global_lru(sc)) { | 1656 | if (scanning_global_lru(sc)) { |
1549 | nr_taken = isolate_pages_global(nr_pages, &l_hold, | 1657 | nr_taken = isolate_pages_global(nr_pages, &l_hold, |
1550 | &pgscanned, sc->order, | 1658 | &pgscanned, sc->order, |
1551 | ISOLATE_ACTIVE, zone, | 1659 | reclaim_mode, zone, |
1552 | 1, file); | 1660 | 1, file); |
1553 | zone->pages_scanned += pgscanned; | 1661 | zone->pages_scanned += pgscanned; |
1554 | } else { | 1662 | } else { |
1555 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, | 1663 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, |
1556 | &pgscanned, sc->order, | 1664 | &pgscanned, sc->order, |
1557 | ISOLATE_ACTIVE, zone, | 1665 | reclaim_mode, zone, |
1558 | sc->mem_cgroup, 1, file); | 1666 | sc->mem_cgroup, 1, file); |
1559 | /* | 1667 | /* |
1560 | * mem_cgroup_isolate_pages() keeps track of | 1668 | * mem_cgroup_isolate_pages() keeps track of |
@@ -1659,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
1659 | if (scanning_global_lru(sc)) | 1767 | if (scanning_global_lru(sc)) |
1660 | low = inactive_anon_is_low_global(zone); | 1768 | low = inactive_anon_is_low_global(zone); |
1661 | else | 1769 | else |
1662 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); | 1770 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); |
1663 | return low; | 1771 | return low; |
1664 | } | 1772 | } |
1665 | #else | 1773 | #else |
@@ -1702,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | |||
1702 | if (scanning_global_lru(sc)) | 1810 | if (scanning_global_lru(sc)) |
1703 | low = inactive_file_is_low_global(zone); | 1811 | low = inactive_file_is_low_global(zone); |
1704 | else | 1812 | else |
1705 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); | 1813 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); |
1706 | return low; | 1814 | return low; |
1707 | } | 1815 | } |
1708 | 1816 | ||
@@ -1729,6 +1837,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
1729 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1837 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
1730 | } | 1838 | } |
1731 | 1839 | ||
1840 | static int vmscan_swappiness(struct scan_control *sc) | ||
1841 | { | ||
1842 | if (scanning_global_lru(sc)) | ||
1843 | return vm_swappiness; | ||
1844 | return mem_cgroup_swappiness(sc->mem_cgroup); | ||
1845 | } | ||
1846 | |||
1732 | /* | 1847 | /* |
1733 | * Determine how aggressively the anon and file LRU lists should be | 1848 | * Determine how aggressively the anon and file LRU lists should be |
1734 | * scanned. The relative value of each set of LRU lists is determined | 1849 | * scanned. The relative value of each set of LRU lists is determined |
@@ -1747,22 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1747 | u64 fraction[2], denominator; | 1862 | u64 fraction[2], denominator; |
1748 | enum lru_list l; | 1863 | enum lru_list l; |
1749 | int noswap = 0; | 1864 | int noswap = 0; |
1750 | int force_scan = 0; | 1865 | bool force_scan = false; |
1751 | |||
1752 | |||
1753 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1754 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1755 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1756 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1757 | 1866 | ||
1758 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | 1867 | /* |
1759 | /* kswapd does zone balancing and need to scan this zone */ | 1868 | * If the zone or memcg is small, nr[l] can be 0. This |
1760 | if (scanning_global_lru(sc) && current_is_kswapd()) | 1869 | * results in no scanning on this priority and a potential |
1761 | force_scan = 1; | 1870 | * priority drop. Global direct reclaim can go to the next |
1762 | /* memcg may have small limit and need to avoid priority drop */ | 1871 | * zone and tends to have no problems. Global kswapd is for |
1763 | if (!scanning_global_lru(sc)) | 1872 | * zone balancing and it needs to scan a minimum amount. When |
1764 | force_scan = 1; | 1873 | * reclaiming for a memcg, a priority drop can cause high |
1765 | } | 1874 | * latencies, so it's better to scan a minimum amount there as |
1875 | * well. | ||
1876 | */ | ||
1877 | if (scanning_global_lru(sc) && current_is_kswapd()) | ||
1878 | force_scan = true; | ||
1879 | if (!scanning_global_lru(sc)) | ||
1880 | force_scan = true; | ||
1766 | 1881 | ||
1767 | /* If we have no swap space, do not bother scanning anon pages. */ | 1882 | /* If we have no swap space, do not bother scanning anon pages. */ |
1768 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1883 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
@@ -1773,6 +1888,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1773 | goto out; | 1888 | goto out; |
1774 | } | 1889 | } |
1775 | 1890 | ||
1891 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1892 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1893 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1894 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1895 | |||
1776 | if (scanning_global_lru(sc)) { | 1896 | if (scanning_global_lru(sc)) { |
1777 | free = zone_page_state(zone, NR_FREE_PAGES); | 1897 | free = zone_page_state(zone, NR_FREE_PAGES); |
1778 | /* If we have very few page cache pages, | 1898 | /* If we have very few page cache pages, |
@@ -1789,8 +1909,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1789 | * With swappiness at 100, anonymous and file have the same priority. | 1909 | * With swappiness at 100, anonymous and file have the same priority. |
1790 | * This scanning priority is essentially the inverse of IO cost. | 1910 | * This scanning priority is essentially the inverse of IO cost. |
1791 | */ | 1911 | */ |
1792 | anon_prio = sc->swappiness; | 1912 | anon_prio = vmscan_swappiness(sc); |
1793 | file_prio = 200 - sc->swappiness; | 1913 | file_prio = 200 - vmscan_swappiness(sc); |
1794 | 1914 | ||
1795 | /* | 1915 | /* |
1796 | * OK, so we have swap space and a fair amount of page cache | 1916 | * OK, so we have swap space and a fair amount of page cache |
@@ -1837,23 +1957,9 @@ out: | |||
1837 | scan = zone_nr_lru_pages(zone, sc, l); | 1957 | scan = zone_nr_lru_pages(zone, sc, l); |
1838 | if (priority || noswap) { | 1958 | if (priority || noswap) { |
1839 | scan >>= priority; | 1959 | scan >>= priority; |
1840 | scan = div64_u64(scan * fraction[file], denominator); | 1960 | if (!scan && force_scan) |
1841 | } | ||
1842 | |||
1843 | /* | ||
1844 | * If zone is small or memcg is small, nr[l] can be 0. | ||
1845 | * This results no-scan on this priority and priority drop down. | ||
1846 | * For global direct reclaim, it can visit next zone and tend | ||
1847 | * not to have problems. For global kswapd, it's for zone | ||
1848 | * balancing and it need to scan a small amounts. When using | ||
1849 | * memcg, priority drop can cause big latency. So, it's better | ||
1850 | * to scan small amount. See may_noscan above. | ||
1851 | */ | ||
1852 | if (!scan && force_scan) { | ||
1853 | if (file) | ||
1854 | scan = SWAP_CLUSTER_MAX; | ||
1855 | else if (!noswap) | ||
1856 | scan = SWAP_CLUSTER_MAX; | 1961 | scan = SWAP_CLUSTER_MAX; |
1962 | scan = div64_u64(scan * fraction[file], denominator); | ||
1857 | } | 1963 | } |
1858 | nr[l] = scan; | 1964 | nr[l] = scan; |
1859 | } | 1965 | } |
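The simplified out: loop above applies the force_scan floor before the proportional anon/file split, so a small memcg or zone still scans SWAP_CLUSTER_MAX pages instead of stalling at zero. A sketch of that arithmetic with invented fraction/denominator values (in the kernel they come from the recent_scanned/recent_rotated ratios computed earlier in get_scan_count()):

    #include <stdbool.h>
    #include <stdio.h>

    #define SWAP_CLUSTER_MAX 32ULL

    static unsigned long long scan_for_lru(unsigned long long lru_pages,
                                           int priority, bool force_scan,
                                           unsigned long long frac,
                                           unsigned long long denom)
    {
        unsigned long long scan = lru_pages >> priority;

        if (!scan && force_scan)
            scan = SWAP_CLUSTER_MAX;         /* the new minimum */
        return scan * frac / denom;          /* div64_u64() in the kernel */
    }

    int main(void)
    {
        /* a tiny memcg LRU: 2000 pages shifted by priority 12 is 0, so
         * without force_scan nothing would be scanned at this priority */
        printf("no force_scan: %llu\n", scan_for_lru(2000, 12, false, 60, 200));
        printf("force_scan:    %llu\n", scan_for_lru(2000, 12, true, 60, 200));
        return 0;
    }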
@@ -1933,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1933 | enum lru_list l; | 2039 | enum lru_list l; |
1934 | unsigned long nr_reclaimed, nr_scanned; | 2040 | unsigned long nr_reclaimed, nr_scanned; |
1935 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 2041 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
2042 | struct blk_plug plug; | ||
1936 | 2043 | ||
1937 | restart: | 2044 | restart: |
1938 | nr_reclaimed = 0; | 2045 | nr_reclaimed = 0; |
1939 | nr_scanned = sc->nr_scanned; | 2046 | nr_scanned = sc->nr_scanned; |
1940 | get_scan_count(zone, sc, nr, priority); | 2047 | get_scan_count(zone, sc, nr, priority); |
1941 | 2048 | ||
2049 | blk_start_plug(&plug); | ||
1942 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 2050 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1943 | nr[LRU_INACTIVE_FILE]) { | 2051 | nr[LRU_INACTIVE_FILE]) { |
1944 | for_each_evictable_lru(l) { | 2052 | for_each_evictable_lru(l) { |
@@ -1962,6 +2070,7 @@ restart: | |||
1962 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 2070 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1963 | break; | 2071 | break; |
1964 | } | 2072 | } |
2073 | blk_finish_plug(&plug); | ||
1965 | sc->nr_reclaimed += nr_reclaimed; | 2074 | sc->nr_reclaimed += nr_reclaimed; |
1966 | 2075 | ||
1967 | /* | 2076 | /* |
@@ -1994,14 +2103,19 @@ restart: | |||
1994 | * | 2103 | * |
1995 | * If a zone is deemed to be full of pinned pages then just give it a light | 2104 | * If a zone is deemed to be full of pinned pages then just give it a light |
1996 | * scan then give up on it. | 2105 | * scan then give up on it. |
2106 | * | ||
2107 | * This function returns true if a zone is being reclaimed for a costly | ||
2108 | * high-order allocation and compaction is either ready to begin or deferred. | ||
2109 | * This indicates to the caller that it should retry the allocation or fail. | ||
1997 | */ | 2110 | */ |
1998 | static void shrink_zones(int priority, struct zonelist *zonelist, | 2111 | static bool shrink_zones(int priority, struct zonelist *zonelist, |
1999 | struct scan_control *sc) | 2112 | struct scan_control *sc) |
2000 | { | 2113 | { |
2001 | struct zoneref *z; | 2114 | struct zoneref *z; |
2002 | struct zone *zone; | 2115 | struct zone *zone; |
2003 | unsigned long nr_soft_reclaimed; | 2116 | unsigned long nr_soft_reclaimed; |
2004 | unsigned long nr_soft_scanned; | 2117 | unsigned long nr_soft_scanned; |
2118 | bool should_abort_reclaim = false; | ||
2005 | 2119 | ||
2006 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2120 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2007 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2121 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2016,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2016 | continue; | 2130 | continue; |
2017 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2131 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2018 | continue; /* Let kswapd poll it */ | 2132 | continue; /* Let kswapd poll it */ |
2133 | if (COMPACTION_BUILD) { | ||
2134 | /* | ||
2135 | * If we already have plenty of memory free for | ||
2136 | * compaction in this zone, don't free any more. | ||
2137 | * Even though compaction is invoked for any | ||
2138 | * non-zero order, only frequent costly order | ||
2139 | * reclamation is disruptive enough to become a | ||
2140 | * noticeable problem, like transparent huge page | ||
2141 | * allocations. | ||
2142 | */ | ||
2143 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER && | ||
2144 | (compaction_suitable(zone, sc->order) || | ||
2145 | compaction_deferred(zone))) { | ||
2146 | should_abort_reclaim = true; | ||
2147 | continue; | ||
2148 | } | ||
2149 | } | ||
2019 | /* | 2150 | /* |
2020 | * This steals pages from memory cgroups over softlimit | 2151 | * This steals pages from memory cgroups over softlimit |
2021 | * and returns the number of reclaimed pages and | 2152 | * and returns the number of reclaimed pages and |
@@ -2033,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2033 | 2164 | ||
2034 | shrink_zone(priority, zone, sc); | 2165 | shrink_zone(priority, zone, sc); |
2035 | } | 2166 | } |
2167 | |||
2168 | return should_abort_reclaim; | ||
2036 | } | 2169 | } |
2037 | 2170 | ||
2038 | static bool zone_reclaimable(struct zone *zone) | 2171 | static bool zone_reclaimable(struct zone *zone) |
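shrink_zones() above now reports when reclaim can stop early: for an order above PAGE_ALLOC_COSTLY_ORDER, a zone where compaction is already suitable (or has been deferred) is skipped and the whole pass is aborted, which the next hunk wires into do_try_to_free_pages(). A sketch of that decision; the two compaction booleans are placeholders for compaction_suitable()/compaction_deferred():

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_ALLOC_COSTLY_ORDER 3

    static bool should_abort_reclaim(int order, bool compaction_ready,
                                     bool compaction_was_deferred)
    {
        if (order <= PAGE_ALLOC_COSTLY_ORDER)
            return false;                    /* cheap orders: keep reclaiming */
        return compaction_ready || compaction_was_deferred;
    }

    int main(void)
    {
        /* a THP-sized (order-9) request with compaction ready to run */
        printf("order 9, compaction ready: abort=%d\n",
               should_abort_reclaim(9, true, false));
        printf("order 2, compaction ready: abort=%d\n",
               should_abort_reclaim(2, true, false));
        return 0;
    }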
@@ -2097,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2097 | sc->nr_scanned = 0; | 2230 | sc->nr_scanned = 0; |
2098 | if (!priority) | 2231 | if (!priority) |
2099 | disable_swap_token(sc->mem_cgroup); | 2232 | disable_swap_token(sc->mem_cgroup); |
2100 | shrink_zones(priority, zonelist, sc); | 2233 | if (shrink_zones(priority, zonelist, sc)) |
2234 | break; | ||
2235 | |||
2101 | /* | 2236 | /* |
2102 | * Don't shrink slabs when reclaiming memory from | 2237 | * Don't shrink slabs when reclaiming memory from |
2103 | * over limit cgroups | 2238 | * over limit cgroups |
@@ -2131,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2131 | */ | 2266 | */ |
2132 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; | 2267 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
2133 | if (total_scanned > writeback_threshold) { | 2268 | if (total_scanned > writeback_threshold) { |
2134 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 2269 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, |
2270 | WB_REASON_TRY_TO_FREE_PAGES); | ||
2135 | sc->may_writepage = 1; | 2271 | sc->may_writepage = 1; |
2136 | } | 2272 | } |
2137 | 2273 | ||
@@ -2179,7 +2315,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2179 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2315 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2180 | .may_unmap = 1, | 2316 | .may_unmap = 1, |
2181 | .may_swap = 1, | 2317 | .may_swap = 1, |
2182 | .swappiness = vm_swappiness, | ||
2183 | .order = order, | 2318 | .order = order, |
2184 | .mem_cgroup = NULL, | 2319 | .mem_cgroup = NULL, |
2185 | .nodemask = nodemask, | 2320 | .nodemask = nodemask, |
@@ -2203,7 +2338,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2203 | 2338 | ||
2204 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2339 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
2205 | gfp_t gfp_mask, bool noswap, | 2340 | gfp_t gfp_mask, bool noswap, |
2206 | unsigned int swappiness, | ||
2207 | struct zone *zone, | 2341 | struct zone *zone, |
2208 | unsigned long *nr_scanned) | 2342 | unsigned long *nr_scanned) |
2209 | { | 2343 | { |
@@ -2213,7 +2347,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2213 | .may_writepage = !laptop_mode, | 2347 | .may_writepage = !laptop_mode, |
2214 | .may_unmap = 1, | 2348 | .may_unmap = 1, |
2215 | .may_swap = !noswap, | 2349 | .may_swap = !noswap, |
2216 | .swappiness = swappiness, | ||
2217 | .order = 0, | 2350 | .order = 0, |
2218 | .mem_cgroup = mem, | 2351 | .mem_cgroup = mem, |
2219 | }; | 2352 | }; |
@@ -2242,8 +2375,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2242 | 2375 | ||
2243 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 2376 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
2244 | gfp_t gfp_mask, | 2377 | gfp_t gfp_mask, |
2245 | bool noswap, | 2378 | bool noswap) |
2246 | unsigned int swappiness) | ||
2247 | { | 2379 | { |
2248 | struct zonelist *zonelist; | 2380 | struct zonelist *zonelist; |
2249 | unsigned long nr_reclaimed; | 2381 | unsigned long nr_reclaimed; |
@@ -2253,7 +2385,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2253 | .may_unmap = 1, | 2385 | .may_unmap = 1, |
2254 | .may_swap = !noswap, | 2386 | .may_swap = !noswap, |
2255 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2387 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2256 | .swappiness = swappiness, | ||
2257 | .order = 0, | 2388 | .order = 0, |
2258 | .mem_cgroup = mem_cont, | 2389 | .mem_cgroup = mem_cont, |
2259 | .nodemask = NULL, /* we don't care the placement */ | 2390 | .nodemask = NULL, /* we don't care the placement */ |
@@ -2310,7 +2441,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2310 | for (i = 0; i <= classzone_idx; i++) | 2441 | for (i = 0; i <= classzone_idx; i++) |
2311 | present_pages += pgdat->node_zones[i].present_pages; | 2442 | present_pages += pgdat->node_zones[i].present_pages; |
2312 | 2443 | ||
2313 | return balanced_pages > (present_pages >> 2); | 2444 | /* A special case here: if zone has no page, we think it's balanced */ |
2445 | return balanced_pages >= (present_pages >> 2); | ||
2314 | } | 2446 | } |
2315 | 2447 | ||
2316 | /* is kswapd sleeping prematurely? */ | 2448 | /* is kswapd sleeping prematurely? */ |
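pgdat_balanced() above switches from '>' to '>=' so a node whose relevant zones hold no pages counts as balanced, while keeping the rule that at least a quarter of the pages up to classzone_idx must sit in balanced zones. A sketch with invented page counts:

    #include <stdbool.h>
    #include <stdio.h>

    static bool pgdat_balanced(unsigned long balanced_pages,
                               unsigned long present_pages)
    {
        return balanced_pages >= (present_pages >> 2);
    }

    int main(void)
    {
        printf("260000 of 1000000 balanced: %d\n", pgdat_balanced(260000, 1000000));
        printf("empty node:                 %d\n", pgdat_balanced(0, 0));
        return 0;
    }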
@@ -2403,7 +2535,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2403 | * we want to put equal scanning pressure on each zone. | 2535 | * we want to put equal scanning pressure on each zone. |
2404 | */ | 2536 | */ |
2405 | .nr_to_reclaim = ULONG_MAX, | 2537 | .nr_to_reclaim = ULONG_MAX, |
2406 | .swappiness = vm_swappiness, | ||
2407 | .order = order, | 2538 | .order = order, |
2408 | .mem_cgroup = NULL, | 2539 | .mem_cgroup = NULL, |
2409 | }; | 2540 | }; |
@@ -2452,6 +2583,9 @@ loop_again: | |||
2452 | high_wmark_pages(zone), 0, 0)) { | 2583 | high_wmark_pages(zone), 0, 0)) { |
2453 | end_zone = i; | 2584 | end_zone = i; |
2454 | break; | 2585 | break; |
2586 | } else { | ||
2587 | /* If balanced, clear the congested flag */ | ||
2588 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2455 | } | 2589 | } |
2456 | } | 2590 | } |
2457 | if (i < 0) | 2591 | if (i < 0) |
@@ -2642,6 +2776,8 @@ out: | |||
2642 | 2776 | ||
2643 | /* If balanced, clear the congested flag */ | 2777 | /* If balanced, clear the congested flag */ |
2644 | zone_clear_flag(zone, ZONE_CONGESTED); | 2778 | zone_clear_flag(zone, ZONE_CONGESTED); |
2779 | if (i <= *classzone_idx) | ||
2780 | balanced += zone->present_pages; | ||
2645 | } | 2781 | } |
2646 | } | 2782 | } |
2647 | 2783 | ||
@@ -2715,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2715 | static int kswapd(void *p) | 2851 | static int kswapd(void *p) |
2716 | { | 2852 | { |
2717 | unsigned long order, new_order; | 2853 | unsigned long order, new_order; |
2854 | unsigned balanced_order; | ||
2718 | int classzone_idx, new_classzone_idx; | 2855 | int classzone_idx, new_classzone_idx; |
2856 | int balanced_classzone_idx; | ||
2719 | pg_data_t *pgdat = (pg_data_t*)p; | 2857 | pg_data_t *pgdat = (pg_data_t*)p; |
2720 | struct task_struct *tsk = current; | 2858 | struct task_struct *tsk = current; |
2721 | 2859 | ||
@@ -2746,7 +2884,9 @@ static int kswapd(void *p) | |||
2746 | set_freezable(); | 2884 | set_freezable(); |
2747 | 2885 | ||
2748 | order = new_order = 0; | 2886 | order = new_order = 0; |
2887 | balanced_order = 0; | ||
2749 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 2888 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2889 | balanced_classzone_idx = classzone_idx; | ||
2750 | for ( ; ; ) { | 2890 | for ( ; ; ) { |
2751 | int ret; | 2891 | int ret; |
2752 | 2892 | ||
@@ -2755,7 +2895,8 @@ static int kswapd(void *p) | |||
2755 | * new request of a similar or harder type will succeed soon | 2895 | * new request of a similar or harder type will succeed soon |
2756 | * so consider going to sleep on the basis we reclaimed at | 2896 | * so consider going to sleep on the basis we reclaimed at |
2757 | */ | 2897 | */ |
2758 | if (classzone_idx >= new_classzone_idx && order == new_order) { | 2898 | if (balanced_classzone_idx >= new_classzone_idx && |
2899 | balanced_order == new_order) { | ||
2759 | new_order = pgdat->kswapd_max_order; | 2900 | new_order = pgdat->kswapd_max_order; |
2760 | new_classzone_idx = pgdat->classzone_idx; | 2901 | new_classzone_idx = pgdat->classzone_idx; |
2761 | pgdat->kswapd_max_order = 0; | 2902 | pgdat->kswapd_max_order = 0; |
@@ -2770,9 +2911,12 @@ static int kswapd(void *p) | |||
2770 | order = new_order; | 2911 | order = new_order; |
2771 | classzone_idx = new_classzone_idx; | 2912 | classzone_idx = new_classzone_idx; |
2772 | } else { | 2913 | } else { |
2773 | kswapd_try_to_sleep(pgdat, order, classzone_idx); | 2914 | kswapd_try_to_sleep(pgdat, balanced_order, |
2915 | balanced_classzone_idx); | ||
2774 | order = pgdat->kswapd_max_order; | 2916 | order = pgdat->kswapd_max_order; |
2775 | classzone_idx = pgdat->classzone_idx; | 2917 | classzone_idx = pgdat->classzone_idx; |
2918 | new_order = order; | ||
2919 | new_classzone_idx = classzone_idx; | ||
2776 | pgdat->kswapd_max_order = 0; | 2920 | pgdat->kswapd_max_order = 0; |
2777 | pgdat->classzone_idx = pgdat->nr_zones - 1; | 2921 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
2778 | } | 2922 | } |
@@ -2787,7 +2931,9 @@ static int kswapd(void *p) | |||
2787 | */ | 2931 | */ |
2788 | if (!ret) { | 2932 | if (!ret) { |
2789 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2933 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2790 | order = balance_pgdat(pgdat, order, &classzone_idx); | 2934 | balanced_classzone_idx = classzone_idx; |
2935 | balanced_order = balance_pgdat(pgdat, order, | ||
2936 | &balanced_classzone_idx); | ||
2791 | } | 2937 | } |
2792 | } | 2938 | } |
2793 | return 0; | 2939 | return 0; |
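
Taken together, the kswapd() hunks above keep what was requested (order, classzone_idx) separate from what the last balance_pgdat() pass actually achieved (balanced_order, balanced_classzone_idx), and both the pick-up-a-new-request test and the call into kswapd_try_to_sleep() now use the achieved values. A loose userspace sketch of that bookkeeping pattern, with all identifiers invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct reclaim_state {
    int requested_order;   /* latest wakeup request */
    int achieved_order;    /* what the last pass actually balanced for */
};

/* Decide against the achieved value, not the request: comparing against
 * the request alone could put the thread to sleep right after a pass
 * that fell short of it. */
static bool may_sleep(const struct reclaim_state *st)
{
    return st->achieved_order >= st->requested_order;
}

/* Stand-in for a balancing pass: pretend high-order balancing fails
 * and the pass falls back to order-0 reclaim. */
static int balance_pass(int order)
{
    return order > 0 ? 0 : order;
}

int main(void)
{
    struct reclaim_state st = { .requested_order = 2, .achieved_order = 0 };

    st.achieved_order = balance_pass(st.requested_order);
    printf("requested order %d, achieved order %d, may sleep: %d\n",
           st.requested_order, st.achieved_order, may_sleep(&st));
    return 0;
}
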
@@ -2873,7 +3019,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
2873 | .may_writepage = 1, | 3019 | .may_writepage = 1, |
2874 | .nr_to_reclaim = nr_to_reclaim, | 3020 | .nr_to_reclaim = nr_to_reclaim, |
2875 | .hibernation_mode = 1, | 3021 | .hibernation_mode = 1, |
2876 | .swappiness = vm_swappiness, | ||
2877 | .order = 0, | 3022 | .order = 0, |
2878 | }; | 3023 | }; |
2879 | struct shrink_control shrink = { | 3024 | struct shrink_control shrink = { |
@@ -3060,7 +3205,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3060 | .nr_to_reclaim = max_t(unsigned long, nr_pages, | 3205 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
3061 | SWAP_CLUSTER_MAX), | 3206 | SWAP_CLUSTER_MAX), |
3062 | .gfp_mask = gfp_mask, | 3207 | .gfp_mask = gfp_mask, |
3063 | .swappiness = vm_swappiness, | ||
3064 | .order = order, | 3208 | .order = order, |
3065 | }; | 3209 | }; |
3066 | struct shrink_control shrink = { | 3210 | struct shrink_control shrink = { |
@@ -3301,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) | |||
3301 | 3445 | ||
3302 | } | 3446 | } |
3303 | 3447 | ||
3304 | /** | 3448 | static void warn_scan_unevictable_pages(void) |
3305 | * scan_zone_unevictable_pages - check unevictable list for evictable pages | ||
3306 | * @zone - zone of which to scan the unevictable list | ||
3307 | * | ||
3308 | * Scan @zone's unevictable LRU lists to check for pages that have become | ||
3309 | * evictable. Move those that have to @zone's inactive list where they | ||
3310 | * become candidates for reclaim, unless shrink_inactive_zone() decides | ||
3311 | * to reactivate them. Pages that are still unevictable are rotated | ||
3312 | * back onto @zone's unevictable list. | ||
3313 | */ | ||
3314 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ | ||
3315 | static void scan_zone_unevictable_pages(struct zone *zone) | ||
3316 | { | 3449 | { |
3317 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; | 3450 | printk_once(KERN_WARNING |
3318 | unsigned long scan; | 3451 | "The scan_unevictable_pages sysctl/node-interface has been " |
3319 | unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); | 3452 | "disabled for lack of a legitimate use case. If you have " |
3320 | 3453 | "one, please send an email to linux-mm@kvack.org.\n"); | |
3321 | while (nr_to_scan > 0) { | ||
3322 | unsigned long batch_size = min(nr_to_scan, | ||
3323 | SCAN_UNEVICTABLE_BATCH_SIZE); | ||
3324 | |||
3325 | spin_lock_irq(&zone->lru_lock); | ||
3326 | for (scan = 0; scan < batch_size; scan++) { | ||
3327 | struct page *page = lru_to_page(l_unevictable); | ||
3328 | |||
3329 | if (!trylock_page(page)) | ||
3330 | continue; | ||
3331 | |||
3332 | prefetchw_prev_lru_page(page, l_unevictable, flags); | ||
3333 | |||
3334 | if (likely(PageLRU(page) && PageUnevictable(page))) | ||
3335 | check_move_unevictable_page(page, zone); | ||
3336 | |||
3337 | unlock_page(page); | ||
3338 | } | ||
3339 | spin_unlock_irq(&zone->lru_lock); | ||
3340 | |||
3341 | nr_to_scan -= batch_size; | ||
3342 | } | ||
3343 | } | ||
3344 | |||
3345 | |||
3346 | /** | ||
3347 | * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages | ||
3348 | * | ||
3349 | * A really big hammer: scan all zones' unevictable LRU lists to check for | ||
3350 | * pages that have become evictable. Move those back to the zones' | ||
3351 | * inactive list where they become candidates for reclaim. | ||
3352 | * This occurs when, e.g., we have unswappable pages on the unevictable lists, | ||
3353 | * and we add swap to the system. As such, it runs in the context of a task | ||
3354 | * that has possibly/probably made some previously unevictable pages | ||
3355 | * evictable. | ||
3356 | */ | ||
3357 | static void scan_all_zones_unevictable_pages(void) | ||
3358 | { | ||
3359 | struct zone *zone; | ||
3360 | |||
3361 | for_each_zone(zone) { | ||
3362 | scan_zone_unevictable_pages(zone); | ||
3363 | } | ||
3364 | } | 3454 | } |
3365 | 3455 | ||
3366 | /* | 3456 | /* |
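
The hunk above removes the per-zone unevictable rescan machinery and replaces it with a single printk_once() warning, so the sysctl and the per-node attribute become no-ops that only tell the user the interface is dead. The once-only reporting idiom is easy to mirror in plain C; a minimal sketch, assuming a static flag is an acceptable guard:

#include <stdbool.h>
#include <stdio.h>

/* Emit a deprecation warning the first time the legacy knob is poked,
 * then stay quiet, mirroring the printk_once() usage above. */
static void warn_legacy_knob(void)
{
    static bool warned;

    if (!warned) {
        warned = true;
        fprintf(stderr,
            "the scan_unevictable_pages interface is disabled; "
            "writes are ignored\n");
    }
}

/* The handler still accepts the write so old scripts do not break,
 * but it no longer does any scanning. */
static int legacy_knob_write(long value)
{
    (void)value;
    warn_legacy_knob();
    return 0;
}

int main(void)
{
    legacy_knob_write(1);
    legacy_knob_write(1);   /* second write: no further warning */
    return 0;
}

The flag in this sketch is not synchronized; the worst a racing second caller can do is print the warning one extra time.
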
@@ -3373,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write, | |||
3373 | void __user *buffer, | 3463 | void __user *buffer, |
3374 | size_t *length, loff_t *ppos) | 3464 | size_t *length, loff_t *ppos) |
3375 | { | 3465 | { |
3466 | warn_scan_unevictable_pages(); | ||
3376 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 3467 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
3377 | |||
3378 | if (write && *(unsigned long *)table->data) | ||
3379 | scan_all_zones_unevictable_pages(); | ||
3380 | |||
3381 | scan_unevictable_pages = 0; | 3468 | scan_unevictable_pages = 0; |
3382 | return 0; | 3469 | return 0; |
3383 | } | 3470 | } |
@@ -3392,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev, | |||
3392 | struct sysdev_attribute *attr, | 3479 | struct sysdev_attribute *attr, |
3393 | char *buf) | 3480 | char *buf) |
3394 | { | 3481 | { |
3482 | warn_scan_unevictable_pages(); | ||
3395 | return sprintf(buf, "0\n"); /* always zero; should fit... */ | 3483 | return sprintf(buf, "0\n"); /* always zero; should fit... */ |
3396 | } | 3484 | } |
3397 | 3485 | ||
@@ -3399,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev, | |||
3399 | struct sysdev_attribute *attr, | 3487 | struct sysdev_attribute *attr, |
3400 | const char *buf, size_t count) | 3488 | const char *buf, size_t count) |
3401 | { | 3489 | { |
3402 | struct zone *node_zones = NODE_DATA(dev->id)->node_zones; | 3490 | warn_scan_unevictable_pages(); |
3403 | struct zone *zone; | ||
3404 | unsigned long res; | ||
3405 | unsigned long req = strict_strtoul(buf, 10, &res); | ||
3406 | |||
3407 | if (!req) | ||
3408 | return 1; /* zero is no-op */ | ||
3409 | |||
3410 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
3411 | if (!populated_zone(zone)) | ||
3412 | continue; | ||
3413 | scan_zone_unevictable_pages(zone); | ||
3414 | } | ||
3415 | return 1; | 3491 | return 1; |
3416 | } | 3492 | } |
3417 | 3493 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c18b7694b2..8fd603b1665e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu) | |||
78 | * | 78 | * |
79 | * vm_stat contains the global counters | 79 | * vm_stat contains the global counters |
80 | */ | 80 | */ |
81 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | 81 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; |
82 | EXPORT_SYMBOL(vm_stat); | 82 | EXPORT_SYMBOL(vm_stat); |
83 | 83 | ||
84 | #ifdef CONFIG_SMP | 84 | #ifdef CONFIG_SMP |
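
Tagging vm_stat with __cacheline_aligned_in_smp puts the start of this hot counter array on a cache-line boundary, so writes to it do not share a line with whatever global data would otherwise be placed just before it. The same idea in standalone C11, with a hard-coded 64-byte line size that is purely an assumption of the sketch:

#include <stdalign.h>
#include <stdio.h>

#define CACHELINE 64   /* assumed line size; the kernel derives this per arch */

enum { NR_COUNTERS = 8 };

/* Hot, frequently written counters: align the start of the array to a
 * cache-line boundary so updates do not drag in the globals before it. */
static alignas(CACHELINE) long hot_counters[NR_COUNTERS];

/* Cold data that would otherwise risk landing on the same line. */
static long rarely_touched_config[4];

int main(void)
{
    hot_counters[0]++;
    rarely_touched_config[0] = 42;
    printf("hot_counters at %p (start aligned to %zu bytes)\n",
           (void *)hot_counters, (size_t)CACHELINE);
    printf("config at       %p\n", (void *)rarely_touched_config);
    return 0;
}

Alignment only fixes where the array starts; the kernel macro typically also groups such objects into a dedicated data section, which keeps unrelated cold data off the tail end as well.
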
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
659 | } | 659 | } |
660 | #endif | 660 | #endif |
661 | 661 | ||
662 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) | 662 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) |
663 | #ifdef CONFIG_ZONE_DMA | 663 | #ifdef CONFIG_ZONE_DMA |
664 | #define TEXT_FOR_DMA(xx) xx "_dma", | 664 | #define TEXT_FOR_DMA(xx) xx "_dma", |
665 | #else | 665 | #else |
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = { | |||
702 | "nr_unstable", | 702 | "nr_unstable", |
703 | "nr_bounce", | 703 | "nr_bounce", |
704 | "nr_vmscan_write", | 704 | "nr_vmscan_write", |
705 | "nr_vmscan_immediate_reclaim", | ||
705 | "nr_writeback_temp", | 706 | "nr_writeback_temp", |
706 | "nr_isolated_anon", | 707 | "nr_isolated_anon", |
707 | "nr_isolated_file", | 708 | "nr_isolated_file", |
@@ -788,7 +789,7 @@ const char * const vmstat_text[] = { | |||
788 | 789 | ||
789 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 790 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
790 | }; | 791 | }; |
791 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ | 792 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
792 | 793 | ||
793 | 794 | ||
794 | #ifdef CONFIG_PROC_FS | 795 | #ifdef CONFIG_PROC_FS |