author     Luciano Coelho <coelho@ti.com>    2011-12-01 05:14:48 -0500
committer  Luciano Coelho <coelho@ti.com>    2011-12-01 05:14:48 -0500
commit     e4da3fbfbd1de56d2367653e3823e6445e49f8a9 (patch)
tree       f69f424f731b89a75f881967903ff2f38f4b6a92 /mm
parent     b693289406f0b8ca70ab77e745be6196d5740eb0 (diff)
parent     ba5736a5e9ac20c378ae4179e8a0ed3cc4b44351 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next into wl12xx-next
Diffstat (limited to 'mm')
59 files changed, 3934 insertions, 2910 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f2f1ca19ed53..011b110365c8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -131,6 +131,9 @@ config SPARSEMEM_VMEMMAP | |||
131 | config HAVE_MEMBLOCK | 131 | config HAVE_MEMBLOCK |
132 | boolean | 132 | boolean |
133 | 133 | ||
134 | config NO_BOOTMEM | ||
135 | boolean | ||
136 | |||
134 | # eventually, we can have this option just 'select SPARSEMEM' | 137 | # eventually, we can have this option just 'select SPARSEMEM' |
135 | config MEMORY_HOTPLUG | 138 | config MEMORY_HOTPLUG |
136 | bool "Allow for memory hot-add" | 139 | bool "Allow for memory hot-add" |
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1bf..50ec00ef2a0e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,8 @@ | |||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o \ |
9 | process_vm_access.o | ||
9 | 10 | ||
10 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | 11 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | maccess.o page_alloc.o page-writeback.o \ | 12 | maccess.o page_alloc.o page-writeback.o \ |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d6edf8d14f9c..a0860640378d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -97,6 +97,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
97 | "BdiDirtyThresh: %10lu kB\n" | 97 | "BdiDirtyThresh: %10lu kB\n" |
98 | "DirtyThresh: %10lu kB\n" | 98 | "DirtyThresh: %10lu kB\n" |
99 | "BackgroundThresh: %10lu kB\n" | 99 | "BackgroundThresh: %10lu kB\n" |
100 | "BdiDirtied: %10lu kB\n" | ||
100 | "BdiWritten: %10lu kB\n" | 101 | "BdiWritten: %10lu kB\n" |
101 | "BdiWriteBandwidth: %10lu kBps\n" | 102 | "BdiWriteBandwidth: %10lu kBps\n" |
102 | "b_dirty: %10lu\n" | 103 | "b_dirty: %10lu\n" |
@@ -109,6 +110,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
109 | K(bdi_thresh), | 110 | K(bdi_thresh), |
110 | K(dirty_thresh), | 111 | K(dirty_thresh), |
111 | K(background_thresh), | 112 | K(background_thresh), |
113 | (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), | ||
112 | (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), | 114 | (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), |
113 | (unsigned long) K(bdi->write_bandwidth), | 115 | (unsigned long) K(bdi->write_bandwidth), |
114 | nr_dirty, | 116 | nr_dirty, |
@@ -359,6 +361,17 @@ static unsigned long bdi_longest_inactive(void) | |||
359 | return max(5UL * 60 * HZ, interval); | 361 | return max(5UL * 60 * HZ, interval); |
360 | } | 362 | } |
361 | 363 | ||
364 | /* | ||
365 | * Clear pending bit and wakeup anybody waiting for flusher thread creation or | ||
366 | * shutdown | ||
367 | */ | ||
368 | static void bdi_clear_pending(struct backing_dev_info *bdi) | ||
369 | { | ||
370 | clear_bit(BDI_pending, &bdi->state); | ||
371 | smp_mb__after_clear_bit(); | ||
372 | wake_up_bit(&bdi->state, BDI_pending); | ||
373 | } | ||
374 | |||
362 | static int bdi_forker_thread(void *ptr) | 375 | static int bdi_forker_thread(void *ptr) |
363 | { | 376 | { |
364 | struct bdi_writeback *me = ptr; | 377 | struct bdi_writeback *me = ptr; |
@@ -390,6 +403,12 @@ static int bdi_forker_thread(void *ptr) | |||
390 | } | 403 | } |
391 | 404 | ||
392 | spin_lock_bh(&bdi_lock); | 405 | spin_lock_bh(&bdi_lock); |
406 | /* | ||
407 | * In the following loop we are going to check whether we have | ||
408 | * some work to do without any synchronization with tasks | ||
409 | * waking us up to do work for them. Set the task state here | ||
410 | * so that we don't miss wakeups after verifying conditions. | ||
411 | */ | ||
393 | set_current_state(TASK_INTERRUPTIBLE); | 412 | set_current_state(TASK_INTERRUPTIBLE); |
394 | 413 | ||
395 | list_for_each_entry(bdi, &bdi_list, bdi_list) { | 414 | list_for_each_entry(bdi, &bdi_list, bdi_list) { |
@@ -456,7 +475,8 @@ static int bdi_forker_thread(void *ptr) | |||
456 | * the bdi from the thread. Hopefully 1024 is | 475 | * the bdi from the thread. Hopefully 1024 is |
457 | * large enough for efficient IO. | 476 | * large enough for efficient IO. |
458 | */ | 477 | */ |
459 | writeback_inodes_wb(&bdi->wb, 1024); | 478 | writeback_inodes_wb(&bdi->wb, 1024, |
479 | WB_REASON_FORKER_THREAD); | ||
460 | } else { | 480 | } else { |
461 | /* | 481 | /* |
462 | * The spinlock makes sure we do not lose | 482 | * The spinlock makes sure we do not lose |
@@ -469,11 +489,13 @@ static int bdi_forker_thread(void *ptr) | |||
469 | spin_unlock_bh(&bdi->wb_lock); | 489 | spin_unlock_bh(&bdi->wb_lock); |
470 | wake_up_process(task); | 490 | wake_up_process(task); |
471 | } | 491 | } |
492 | bdi_clear_pending(bdi); | ||
472 | break; | 493 | break; |
473 | 494 | ||
474 | case KILL_THREAD: | 495 | case KILL_THREAD: |
475 | __set_current_state(TASK_RUNNING); | 496 | __set_current_state(TASK_RUNNING); |
476 | kthread_stop(task); | 497 | kthread_stop(task); |
498 | bdi_clear_pending(bdi); | ||
477 | break; | 499 | break; |
478 | 500 | ||
479 | case NO_ACTION: | 501 | case NO_ACTION: |
@@ -489,16 +511,8 @@ static int bdi_forker_thread(void *ptr) | |||
489 | else | 511 | else |
490 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); | 512 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); |
491 | try_to_freeze(); | 513 | try_to_freeze(); |
492 | /* Back to the main loop */ | 514 | break; |
493 | continue; | ||
494 | } | 515 | } |
495 | |||
496 | /* | ||
497 | * Clear pending bit and wakeup anybody waiting to tear us down. | ||
498 | */ | ||
499 | clear_bit(BDI_pending, &bdi->state); | ||
500 | smp_mb__after_clear_bit(); | ||
501 | wake_up_bit(&bdi->state, BDI_pending); | ||
502 | } | 516 | } |
503 | 517 | ||
504 | return 0; | 518 | return 0; |
@@ -672,6 +686,8 @@ int bdi_init(struct backing_dev_info *bdi) | |||
672 | bdi->bw_time_stamp = jiffies; | 686 | bdi->bw_time_stamp = jiffies; |
673 | bdi->written_stamp = 0; | 687 | bdi->written_stamp = 0; |
674 | 688 | ||
689 | bdi->balanced_dirty_ratelimit = INIT_BW; | ||
690 | bdi->dirty_ratelimit = INIT_BW; | ||
675 | bdi->write_bandwidth = INIT_BW; | 691 | bdi->write_bandwidth = INIT_BW; |
676 | bdi->avg_write_bandwidth = INIT_BW; | 692 | bdi->avg_write_bandwidth = INIT_BW; |
677 | 693 | ||
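Note: the bdi_clear_pending() helper factored out above follows the usual kernel wake-up discipline (clear the flag, memory barrier, then wake anyone sleeping on it), and the forker thread now sets its task state before re-checking for work so a wake-up cannot be lost in between. Below is a rough userspace analogue of that ordering using a pthread mutex and condition variable instead of the kernel's wait_on_bit/wake_up_bit machinery; all names in the sketch are invented for illustration.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool pending = true;            /* plays the role of the BDI_pending bit */

/* Analogue of bdi_clear_pending(): clear the flag, then wake any waiter. */
static void clear_pending_and_wake(void)
{
        pthread_mutex_lock(&lock);
        pending = false;
        pthread_mutex_unlock(&lock);   /* the mutex provides the ordering here */
        pthread_cond_broadcast(&cond); /* plays the role of wake_up_bit() */
}

/* A waiter re-checks the predicate after every wakeup, just as the forker
 * thread sets its task state before re-checking its work conditions. */
static void *waiter(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (pending)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        printf("pending cleared, safe to proceed\n");
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);
        clear_pending_and_wake();
        pthread_join(t, NULL);
        return 0;                      /* build with: cc -pthread */
}
```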
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 01d5a4b3dd0c..1a77012ecdb3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/pfn.h> | 12 | #include <linux/pfn.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/bootmem.h> | 14 | #include <linux/bootmem.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
17 | #include <linux/range.h> | 17 | #include <linux/range.h> |
18 | #include <linux/memblock.h> | 18 | #include <linux/memblock.h> |
diff --git a/mm/bounce.c b/mm/bounce.c
index 1481de68184b..4e9ae722af83 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -4,7 +4,7 @@ | |||
4 | */ | 4 | */ |
5 | 5 | ||
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/module.h> | 7 | #include <linux/export.h> |
8 | #include <linux/swap.h> | 8 | #include <linux/swap.h> |
9 | #include <linux/gfp.h> | 9 | #include <linux/gfp.h> |
10 | #include <linux/bio.h> | 10 | #include <linux/bio.h> |
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/hash.h> | 15 | #include <linux/hash.h> |
16 | #include <linux/highmem.h> | 16 | #include <linux/highmem.h> |
17 | #include <linux/bootmem.h> | ||
17 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
18 | 19 | ||
19 | #include <trace/events/block.h> | 20 | #include <trace/events/block.h> |
@@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool; | |||
26 | #ifdef CONFIG_HIGHMEM | 27 | #ifdef CONFIG_HIGHMEM |
27 | static __init int init_emergency_pool(void) | 28 | static __init int init_emergency_pool(void) |
28 | { | 29 | { |
29 | struct sysinfo i; | 30 | #ifndef CONFIG_MEMORY_HOTPLUG |
30 | si_meminfo(&i); | 31 | if (max_pfn <= max_low_pfn) |
31 | si_swapinfo(&i); | ||
32 | |||
33 | if (!i.totalhigh) | ||
34 | return 0; | 32 | return 0; |
33 | #endif | ||
35 | 34 | ||
36 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); | 35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); |
37 | BUG_ON(!page_pool); | 36 | BUG_ON(!page_pool); |
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc604bd5649..899d95638586 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control { | |||
35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
36 | bool sync; /* Synchronous migration */ | 36 | bool sync; /* Synchronous migration */ |
37 | 37 | ||
38 | /* Account for isolated anon and file pages */ | ||
39 | unsigned long nr_anon; | ||
40 | unsigned long nr_file; | ||
41 | |||
42 | unsigned int order; /* order a direct compactor needs */ | 38 | unsigned int order; /* order a direct compactor needs */ |
43 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
44 | struct zone *zone; | 40 | struct zone *zone; |
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone, | |||
223 | static void acct_isolated(struct zone *zone, struct compact_control *cc) | 219 | static void acct_isolated(struct zone *zone, struct compact_control *cc) |
224 | { | 220 | { |
225 | struct page *page; | 221 | struct page *page; |
226 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 222 | unsigned int count[2] = { 0, }; |
227 | 223 | ||
228 | list_for_each_entry(page, &cc->migratepages, lru) { | 224 | list_for_each_entry(page, &cc->migratepages, lru) |
229 | int lru = page_lru_base_type(page); | 225 | count[!!page_is_file_cache(page)]++; |
230 | count[lru]++; | ||
231 | } | ||
232 | 226 | ||
233 | cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | 227 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); |
234 | cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | 228 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); |
235 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); | ||
236 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); | ||
237 | } | 229 | } |
238 | 230 | ||
239 | /* Similar to reclaim, but different enough that they don't share logic */ | 231 | /* Similar to reclaim, but different enough that they don't share logic */ |
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
269 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 261 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
270 | unsigned long nr_scanned = 0, nr_isolated = 0; | 262 | unsigned long nr_scanned = 0, nr_isolated = 0; |
271 | struct list_head *migratelist = &cc->migratepages; | 263 | struct list_head *migratelist = &cc->migratepages; |
264 | isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; | ||
272 | 265 | ||
273 | /* Do not scan outside zone boundaries */ | 266 | /* Do not scan outside zone boundaries */ |
274 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 267 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); |
@@ -356,8 +349,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
356 | continue; | 349 | continue; |
357 | } | 350 | } |
358 | 351 | ||
352 | if (!cc->sync) | ||
353 | mode |= ISOLATE_CLEAN; | ||
354 | |||
359 | /* Try isolate the page */ | 355 | /* Try isolate the page */ |
360 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) | 356 | if (__isolate_lru_page(page, mode, 0) != 0) |
361 | continue; | 357 | continue; |
362 | 358 | ||
363 | VM_BUG_ON(PageTransCompound(page)); | 359 | VM_BUG_ON(PageTransCompound(page)); |
@@ -586,7 +582,7 @@ out: | |||
586 | return ret; | 582 | return ret; |
587 | } | 583 | } |
588 | 584 | ||
589 | unsigned long compact_zone_order(struct zone *zone, | 585 | static unsigned long compact_zone_order(struct zone *zone, |
590 | int order, gfp_t gfp_mask, | 586 | int order, gfp_t gfp_mask, |
591 | bool sync) | 587 | bool sync) |
592 | { | 588 | { |
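Note: the reworked acct_isolated() above replaces the per-LRU counters with a two-slot array indexed by `!!page_is_file_cache(page)`, so one pass over the isolated pages yields both the anon and the file totals. A minimal standalone sketch of that counting idiom, with the item type and predicate made up for the example:

```c
#include <stddef.h>
#include <stdio.h>

struct item { int is_file_backed; };    /* invented stand-in for struct page */

int main(void)
{
        struct item items[] = { {1}, {0}, {1}, {1}, {0} };
        unsigned int count[2] = { 0, };  /* count[0] = anon-like, count[1] = file-like */

        for (size_t i = 0; i < sizeof(items) / sizeof(items[0]); i++)
                count[!!items[i].is_file_backed]++;  /* predicate collapsed to 0 or 1 */

        printf("anon: %u, file: %u\n", count[0], count[1]);
        return 0;
}
```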
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index a1e3324de2b5..7cea557407f4 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -1,7 +1,10 @@ | |||
1 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
2 | #include <linux/string.h> | ||
2 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
4 | #include <linux/highmem.h> | ||
3 | #include <linux/page-debug-flags.h> | 5 | #include <linux/page-debug-flags.h> |
4 | #include <linux/poison.h> | 6 | #include <linux/poison.h> |
7 | #include <linux/ratelimit.h> | ||
5 | 8 | ||
6 | static inline void set_page_poison(struct page *page) | 9 | static inline void set_page_poison(struct page *page) |
7 | { | 10 | { |
@@ -18,28 +21,13 @@ static inline bool page_poison(struct page *page) | |||
18 | return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); | 21 | return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); |
19 | } | 22 | } |
20 | 23 | ||
21 | static void poison_highpage(struct page *page) | ||
22 | { | ||
23 | /* | ||
24 | * Page poisoning for highmem pages is not implemented. | ||
25 | * | ||
26 | * This can be called from interrupt contexts. | ||
27 | * So we need to create a new kmap_atomic slot for this | ||
28 | * application and it will need interrupt protection. | ||
29 | */ | ||
30 | } | ||
31 | |||
32 | static void poison_page(struct page *page) | 24 | static void poison_page(struct page *page) |
33 | { | 25 | { |
34 | void *addr; | 26 | void *addr = kmap_atomic(page); |
35 | 27 | ||
36 | if (PageHighMem(page)) { | ||
37 | poison_highpage(page); | ||
38 | return; | ||
39 | } | ||
40 | set_page_poison(page); | 28 | set_page_poison(page); |
41 | addr = page_address(page); | ||
42 | memset(addr, PAGE_POISON, PAGE_SIZE); | 29 | memset(addr, PAGE_POISON, PAGE_SIZE); |
30 | kunmap_atomic(addr); | ||
43 | } | 31 | } |
44 | 32 | ||
45 | static void poison_pages(struct page *page, int n) | 33 | static void poison_pages(struct page *page, int n) |
@@ -59,14 +47,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b) | |||
59 | 47 | ||
60 | static void check_poison_mem(unsigned char *mem, size_t bytes) | 48 | static void check_poison_mem(unsigned char *mem, size_t bytes) |
61 | { | 49 | { |
50 | static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); | ||
62 | unsigned char *start; | 51 | unsigned char *start; |
63 | unsigned char *end; | 52 | unsigned char *end; |
64 | 53 | ||
65 | for (start = mem; start < mem + bytes; start++) { | 54 | start = memchr_inv(mem, PAGE_POISON, bytes); |
66 | if (*start != PAGE_POISON) | 55 | if (!start) |
67 | break; | ||
68 | } | ||
69 | if (start == mem + bytes) | ||
70 | return; | 56 | return; |
71 | 57 | ||
72 | for (end = mem + bytes - 1; end > start; end--) { | 58 | for (end = mem + bytes - 1; end > start; end--) { |
@@ -74,7 +60,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) | |||
74 | break; | 60 | break; |
75 | } | 61 | } |
76 | 62 | ||
77 | if (!printk_ratelimit()) | 63 | if (!__ratelimit(&ratelimit)) |
78 | return; | 64 | return; |
79 | else if (start == end && single_bit_flip(*start, PAGE_POISON)) | 65 | else if (start == end && single_bit_flip(*start, PAGE_POISON)) |
80 | printk(KERN_ERR "pagealloc: single bit error\n"); | 66 | printk(KERN_ERR "pagealloc: single bit error\n"); |
@@ -86,27 +72,17 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) | |||
86 | dump_stack(); | 72 | dump_stack(); |
87 | } | 73 | } |
88 | 74 | ||
89 | static void unpoison_highpage(struct page *page) | ||
90 | { | ||
91 | /* | ||
92 | * See comment in poison_highpage(). | ||
93 | * Highmem pages should not be poisoned for now | ||
94 | */ | ||
95 | BUG_ON(page_poison(page)); | ||
96 | } | ||
97 | |||
98 | static void unpoison_page(struct page *page) | 75 | static void unpoison_page(struct page *page) |
99 | { | 76 | { |
100 | if (PageHighMem(page)) { | 77 | void *addr; |
101 | unpoison_highpage(page); | 78 | |
79 | if (!page_poison(page)) | ||
102 | return; | 80 | return; |
103 | } | ||
104 | if (page_poison(page)) { | ||
105 | void *addr = page_address(page); | ||
106 | 81 | ||
107 | check_poison_mem(addr, PAGE_SIZE); | 82 | addr = kmap_atomic(page); |
108 | clear_page_poison(page); | 83 | check_poison_mem(addr, PAGE_SIZE); |
109 | } | 84 | clear_page_poison(page); |
85 | kunmap_atomic(addr); | ||
110 | } | 86 | } |
111 | 87 | ||
112 | static void unpoison_pages(struct page *page, int n) | 88 | static void unpoison_pages(struct page *page, int n) |
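Note: check_poison_mem() now uses memchr_inv() to find the first byte that no longer holds the poison pattern, and keeps the xor trick for spotting a single flipped bit. The userspace sketch below mimics that logic; memchr_inv() has no libc counterpart, so a naive scan stands in for it, and POISON_BYTE is only assumed to correspond to PAGE_POISON.

```c
#include <stddef.h>
#include <stdio.h>

#define POISON_BYTE 0xaa   /* assumed stand-in for the kernel's PAGE_POISON */

/* Naive stand-in for memchr_inv(): first byte differing from c, or NULL. */
static const unsigned char *scan_for_mismatch(const unsigned char *mem,
                                              size_t bytes, unsigned char c)
{
        for (size_t i = 0; i < bytes; i++)
                if (mem[i] != c)
                        return mem + i;
        return NULL;
}

/* Same xor trick as single_bit_flip(): true if a and b differ in one bit. */
static int single_bit_flip(unsigned char a, unsigned char b)
{
        unsigned char error = a ^ b;

        return error && !(error & (error - 1));
}

int main(void)
{
        unsigned char buf[16];

        for (size_t i = 0; i < sizeof(buf); i++)
                buf[i] = POISON_BYTE;
        buf[7] ^= 0x08;                 /* inject a single-bit error */

        const unsigned char *bad = scan_for_mismatch(buf, sizeof(buf), POISON_BYTE);
        if (bad)
                printf("corruption at offset %zu, single bit flip: %s\n",
                       (size_t)(bad - buf),
                       single_bit_flip(*bad, POISON_BYTE) ? "yes" : "no");
        return 0;
}
```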
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fbb58e346888..c5ab33bca0a8 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -27,11 +27,12 @@ | |||
27 | #include <linux/dmapool.h> | 27 | #include <linux/dmapool.h> |
28 | #include <linux/kernel.h> | 28 | #include <linux/kernel.h> |
29 | #include <linux/list.h> | 29 | #include <linux/list.h> |
30 | #include <linux/module.h> | 30 | #include <linux/export.h> |
31 | #include <linux/mutex.h> | 31 | #include <linux/mutex.h> |
32 | #include <linux/poison.h> | 32 | #include <linux/poison.h> |
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
35 | #include <linux/stat.h> | ||
35 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
36 | #include <linux/string.h> | 37 | #include <linux/string.h> |
37 | #include <linux/types.h> | 38 | #include <linux/types.h> |
diff --git a/mm/failslab.c b/mm/failslab.c
index 1ce58c201dca..0dd7b8fec71c 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -34,23 +34,23 @@ __setup("failslab=", setup_failslab); | |||
34 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 34 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
35 | static int __init failslab_debugfs_init(void) | 35 | static int __init failslab_debugfs_init(void) |
36 | { | 36 | { |
37 | struct dentry *dir; | ||
37 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 38 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
38 | int err; | ||
39 | 39 | ||
40 | err = init_fault_attr_dentries(&failslab.attr, "failslab"); | 40 | dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); |
41 | if (err) | 41 | if (IS_ERR(dir)) |
42 | return err; | 42 | return PTR_ERR(dir); |
43 | 43 | ||
44 | if (!debugfs_create_bool("ignore-gfp-wait", mode, failslab.attr.dir, | 44 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, |
45 | &failslab.ignore_gfp_wait)) | 45 | &failslab.ignore_gfp_wait)) |
46 | goto fail; | 46 | goto fail; |
47 | if (!debugfs_create_bool("cache-filter", mode, failslab.attr.dir, | 47 | if (!debugfs_create_bool("cache-filter", mode, dir, |
48 | &failslab.cache_filter)) | 48 | &failslab.cache_filter)) |
49 | goto fail; | 49 | goto fail; |
50 | 50 | ||
51 | return 0; | 51 | return 0; |
52 | fail: | 52 | fail: |
53 | cleanup_fault_attr_dentries(&failslab.attr); | 53 | debugfs_remove_recursive(dir); |
54 | 54 | ||
55 | return -ENOMEM; | 55 | return -ENOMEM; |
56 | } | 56 | } |
diff --git a/mm/filemap.c b/mm/filemap.c
index 867d40222ec7..c0018f2d50e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,7 @@ | |||
9 | * most "normal" filesystems (but you don't /have/ to use this: | 9 | * most "normal" filesystems (but you don't /have/ to use this: |
10 | * the NFS filesystem used to do this differently, for example) | 10 | * the NFS filesystem used to do this differently, for example) |
11 | */ | 11 | */ |
12 | #include <linux/module.h> | 12 | #include <linux/export.h> |
13 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> |
14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
15 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
@@ -33,7 +33,6 @@ | |||
33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 35 | #include <linux/memcontrol.h> |
36 | #include <linux/mm_inline.h> /* for page_is_file_cache() */ | ||
37 | #include <linux/cleancache.h> | 36 | #include <linux/cleancache.h> |
38 | #include "internal.h" | 37 | #include "internal.h" |
39 | 38 | ||
@@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
462 | int error; | 461 | int error; |
463 | 462 | ||
464 | VM_BUG_ON(!PageLocked(page)); | 463 | VM_BUG_ON(!PageLocked(page)); |
464 | VM_BUG_ON(PageSwapBacked(page)); | ||
465 | 465 | ||
466 | error = mem_cgroup_cache_charge(page, current->mm, | 466 | error = mem_cgroup_cache_charge(page, current->mm, |
467 | gfp_mask & GFP_RECLAIM_MASK); | 467 | gfp_mask & GFP_RECLAIM_MASK); |
@@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
479 | if (likely(!error)) { | 479 | if (likely(!error)) { |
480 | mapping->nrpages++; | 480 | mapping->nrpages++; |
481 | __inc_zone_page_state(page, NR_FILE_PAGES); | 481 | __inc_zone_page_state(page, NR_FILE_PAGES); |
482 | if (PageSwapBacked(page)) | ||
483 | __inc_zone_page_state(page, NR_SHMEM); | ||
484 | spin_unlock_irq(&mapping->tree_lock); | 482 | spin_unlock_irq(&mapping->tree_lock); |
485 | } else { | 483 | } else { |
486 | page->mapping = NULL; | 484 | page->mapping = NULL; |
@@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
502 | { | 500 | { |
503 | int ret; | 501 | int ret; |
504 | 502 | ||
505 | /* | ||
506 | * Splice_read and readahead add shmem/tmpfs pages into the page cache | ||
507 | * before shmem_readpage has a chance to mark them as SwapBacked: they | ||
508 | * need to go on the anon lru below, and mem_cgroup_cache_charge | ||
509 | * (called in add_to_page_cache) needs to know where they're going too. | ||
510 | */ | ||
511 | if (mapping_cap_swap_backed(mapping)) | ||
512 | SetPageSwapBacked(page); | ||
513 | |||
514 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); | 503 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); |
515 | if (ret == 0) { | 504 | if (ret == 0) |
516 | if (page_is_file_cache(page)) | 505 | lru_cache_add_file(page); |
517 | lru_cache_add_file(page); | ||
518 | else | ||
519 | lru_cache_add_anon(page); | ||
520 | } | ||
521 | return ret; | 506 | return ret; |
522 | } | 507 | } |
523 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); | 508 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); |
@@ -714,9 +699,16 @@ repeat: | |||
714 | page = radix_tree_deref_slot(pagep); | 699 | page = radix_tree_deref_slot(pagep); |
715 | if (unlikely(!page)) | 700 | if (unlikely(!page)) |
716 | goto out; | 701 | goto out; |
717 | if (radix_tree_deref_retry(page)) | 702 | if (radix_tree_exception(page)) { |
718 | goto repeat; | 703 | if (radix_tree_deref_retry(page)) |
719 | 704 | goto repeat; | |
705 | /* | ||
706 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
707 | * here as an exceptional entry: so return it without | ||
708 | * attempting to raise page count. | ||
709 | */ | ||
710 | goto out; | ||
711 | } | ||
720 | if (!page_cache_get_speculative(page)) | 712 | if (!page_cache_get_speculative(page)) |
721 | goto repeat; | 713 | goto repeat; |
722 | 714 | ||
@@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) | |||
753 | 745 | ||
754 | repeat: | 746 | repeat: |
755 | page = find_get_page(mapping, offset); | 747 | page = find_get_page(mapping, offset); |
756 | if (page) { | 748 | if (page && !radix_tree_exception(page)) { |
757 | lock_page(page); | 749 | lock_page(page); |
758 | /* Has the page been truncated? */ | 750 | /* Has the page been truncated? */ |
759 | if (unlikely(page->mapping != mapping)) { | 751 | if (unlikely(page->mapping != mapping)) { |
@@ -835,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
835 | { | 827 | { |
836 | unsigned int i; | 828 | unsigned int i; |
837 | unsigned int ret; | 829 | unsigned int ret; |
838 | unsigned int nr_found; | 830 | unsigned int nr_found, nr_skip; |
839 | 831 | ||
840 | rcu_read_lock(); | 832 | rcu_read_lock(); |
841 | restart: | 833 | restart: |
842 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 834 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
843 | (void ***)pages, start, nr_pages); | 835 | (void ***)pages, NULL, start, nr_pages); |
844 | ret = 0; | 836 | ret = 0; |
837 | nr_skip = 0; | ||
845 | for (i = 0; i < nr_found; i++) { | 838 | for (i = 0; i < nr_found; i++) { |
846 | struct page *page; | 839 | struct page *page; |
847 | repeat: | 840 | repeat: |
@@ -849,13 +842,23 @@ repeat: | |||
849 | if (unlikely(!page)) | 842 | if (unlikely(!page)) |
850 | continue; | 843 | continue; |
851 | 844 | ||
852 | /* | 845 | if (radix_tree_exception(page)) { |
853 | * This can only trigger when the entry at index 0 moves out | 846 | if (radix_tree_deref_retry(page)) { |
854 | * of or back to the root: none yet gotten, safe to restart. | 847 | /* |
855 | */ | 848 | * Transient condition which can only trigger |
856 | if (radix_tree_deref_retry(page)) { | 849 | * when entry at index 0 moves out of or back |
857 | WARN_ON(start | i); | 850 | * to root: none yet gotten, safe to restart. |
858 | goto restart; | 851 | */ |
852 | WARN_ON(start | i); | ||
853 | goto restart; | ||
854 | } | ||
855 | /* | ||
856 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
857 | * here as an exceptional entry: so skip over it - | ||
858 | * we only reach this from invalidate_mapping_pages(). | ||
859 | */ | ||
860 | nr_skip++; | ||
861 | continue; | ||
859 | } | 862 | } |
860 | 863 | ||
861 | if (!page_cache_get_speculative(page)) | 864 | if (!page_cache_get_speculative(page)) |
@@ -875,7 +878,7 @@ repeat: | |||
875 | * If all entries were removed before we could secure them, | 878 | * If all entries were removed before we could secure them, |
876 | * try again, because callers stop trying once 0 is returned. | 879 | * try again, because callers stop trying once 0 is returned. |
877 | */ | 880 | */ |
878 | if (unlikely(!ret && nr_found)) | 881 | if (unlikely(!ret && nr_found > nr_skip)) |
879 | goto restart; | 882 | goto restart; |
880 | rcu_read_unlock(); | 883 | rcu_read_unlock(); |
881 | return ret; | 884 | return ret; |
@@ -903,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
903 | rcu_read_lock(); | 906 | rcu_read_lock(); |
904 | restart: | 907 | restart: |
905 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 908 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
906 | (void ***)pages, index, nr_pages); | 909 | (void ***)pages, NULL, index, nr_pages); |
907 | ret = 0; | 910 | ret = 0; |
908 | for (i = 0; i < nr_found; i++) { | 911 | for (i = 0; i < nr_found; i++) { |
909 | struct page *page; | 912 | struct page *page; |
@@ -912,12 +915,22 @@ repeat: | |||
912 | if (unlikely(!page)) | 915 | if (unlikely(!page)) |
913 | continue; | 916 | continue; |
914 | 917 | ||
915 | /* | 918 | if (radix_tree_exception(page)) { |
916 | * This can only trigger when the entry at index 0 moves out | 919 | if (radix_tree_deref_retry(page)) { |
917 | * of or back to the root: none yet gotten, safe to restart. | 920 | /* |
918 | */ | 921 | * Transient condition which can only trigger |
919 | if (radix_tree_deref_retry(page)) | 922 | * when entry at index 0 moves out of or back |
920 | goto restart; | 923 | * to root: none yet gotten, safe to restart. |
924 | */ | ||
925 | goto restart; | ||
926 | } | ||
927 | /* | ||
928 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
929 | * here as an exceptional entry: so stop looking for | ||
930 | * contiguous pages. | ||
931 | */ | ||
932 | break; | ||
933 | } | ||
921 | 934 | ||
922 | if (!page_cache_get_speculative(page)) | 935 | if (!page_cache_get_speculative(page)) |
923 | goto repeat; | 936 | goto repeat; |
@@ -977,12 +990,21 @@ repeat: | |||
977 | if (unlikely(!page)) | 990 | if (unlikely(!page)) |
978 | continue; | 991 | continue; |
979 | 992 | ||
980 | /* | 993 | if (radix_tree_exception(page)) { |
981 | * This can only trigger when the entry at index 0 moves out | 994 | if (radix_tree_deref_retry(page)) { |
982 | * of or back to the root: none yet gotten, safe to restart. | 995 | /* |
983 | */ | 996 | * Transient condition which can only trigger |
984 | if (radix_tree_deref_retry(page)) | 997 | * when entry at index 0 moves out of or back |
985 | goto restart; | 998 | * to root: none yet gotten, safe to restart. |
999 | */ | ||
1000 | goto restart; | ||
1001 | } | ||
1002 | /* | ||
1003 | * This function is never used on a shmem/tmpfs | ||
1004 | * mapping, so a swap entry won't be found here. | ||
1005 | */ | ||
1006 | BUG(); | ||
1007 | } | ||
986 | 1008 | ||
987 | if (!page_cache_get_speculative(page)) | 1009 | if (!page_cache_get_speculative(page)) |
988 | goto repeat; | 1010 | goto repeat; |
@@ -2093,6 +2115,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
2093 | } else { | 2115 | } else { |
2094 | const struct iovec *iov = i->iov; | 2116 | const struct iovec *iov = i->iov; |
2095 | size_t base = i->iov_offset; | 2117 | size_t base = i->iov_offset; |
2118 | unsigned long nr_segs = i->nr_segs; | ||
2096 | 2119 | ||
2097 | /* | 2120 | /* |
2098 | * The !iov->iov_len check ensures we skip over unlikely | 2121 | * The !iov->iov_len check ensures we skip over unlikely |
@@ -2108,11 +2131,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
2108 | base += copy; | 2131 | base += copy; |
2109 | if (iov->iov_len == base) { | 2132 | if (iov->iov_len == base) { |
2110 | iov++; | 2133 | iov++; |
2134 | nr_segs--; | ||
2111 | base = 0; | 2135 | base = 0; |
2112 | } | 2136 | } |
2113 | } | 2137 | } |
2114 | i->iov = iov; | 2138 | i->iov = iov; |
2115 | i->iov_offset = base; | 2139 | i->iov_offset = base; |
2140 | i->nr_segs = nr_segs; | ||
2116 | } | 2141 | } |
2117 | } | 2142 | } |
2118 | EXPORT_SYMBOL(iov_iter_advance); | 2143 | EXPORT_SYMBOL(iov_iter_advance); |
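Note: several of the filemap.c lookups above now have to distinguish ordinary page pointers from the exceptional entries that shmem/tmpfs stores in the same radix-tree slots (swap entries encoded into the slot value). The general trick is that pointers to aligned objects keep their low bits clear, so a set low bit can tag a slot as carrying an encoded value instead of a pointer. The sketch below illustrates that encoding in plain userspace C; it is not the kernel's radix-tree API, and the names are invented.

```c
#include <stdint.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT 0x2UL   /* invented name; any otherwise-unused low bit works */

static void *encode_exceptional(unsigned long value)
{
        return (void *)((value << 2) | EXCEPTIONAL_BIT);
}

static int entry_is_exceptional(const void *entry)
{
        return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long decode_exceptional(const void *entry)
{
        return (uintptr_t)entry >> 2;
}

int main(void)
{
        int page = 42;          /* aligned object standing in for a struct page */
        void *slots[2] = { &page, encode_exceptional(12345) };

        for (int i = 0; i < 2; i++) {
                if (entry_is_exceptional(slots[i]))
                        printf("slot %d: exceptional entry, value %lu\n",
                               i, decode_exceptional(slots[i]));
                else
                        printf("slot %d: ordinary pointer %p\n", i, slots[i]);
        }
        return 0;
}
```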
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 93356cd12828..f91b2f687343 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -10,7 +10,7 @@ | |||
10 | 10 | ||
11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
12 | #include <linux/pagemap.h> | 12 | #include <linux/pagemap.h> |
13 | #include <linux/module.h> | 13 | #include <linux/export.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/mmu_notifier.h> | 16 | #include <linux/mmu_notifier.h> |
diff --git a/mm/fremap.c b/mm/fremap.c
index b8e0e2d468af..9ed4fd432467 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/swapops.h> | 14 | #include <linux/swapops.h> |
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/module.h> | ||
17 | #include <linux/syscalls.h> | 16 | #include <linux/syscalls.h> |
18 | #include <linux/mmu_notifier.h> | 17 | #include <linux/mmu_notifier.h> |
19 | 18 | ||
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2ed..57d82c6250c3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -17,7 +17,7 @@ | |||
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
20 | #include <linux/module.h> | 20 | #include <linux/export.h> |
21 | #include <linux/swap.h> | 21 | #include <linux/swap.h> |
22 | #include <linux/bio.h> | 22 | #include <linux/bio.h> |
23 | #include <linux/pagemap.h> | 23 | #include <linux/pagemap.h> |
@@ -250,7 +250,7 @@ void *kmap_high_get(struct page *page) | |||
250 | #endif | 250 | #endif |
251 | 251 | ||
252 | /** | 252 | /** |
253 | * kunmap_high - map a highmem page into memory | 253 | * kunmap_high - unmap a highmem page into memory |
254 | * @page: &struct page to unmap | 254 | * @page: &struct page to unmap |
255 | * | 255 | * |
256 | * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called | 256 | * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called |
@@ -326,7 +326,7 @@ static struct page_address_slot { | |||
326 | spinlock_t lock; /* Protect this bucket's list */ | 326 | spinlock_t lock; /* Protect this bucket's list */ |
327 | } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; | 327 | } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; |
328 | 328 | ||
329 | static struct page_address_slot *page_slot(struct page *page) | 329 | static struct page_address_slot *page_slot(const struct page *page) |
330 | { | 330 | { |
331 | return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; | 331 | return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; |
332 | } | 332 | } |
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page) | |||
337 | * | 337 | * |
338 | * Returns the page's virtual address. | 338 | * Returns the page's virtual address. |
339 | */ | 339 | */ |
340 | void *page_address(struct page *page) | 340 | void *page_address(const struct page *page) |
341 | { | 341 | { |
342 | unsigned long flags; | 342 | unsigned long flags; |
343 | void *ret; | 343 | void *ret; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2d1587be269..4298abaae153 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,7 +89,8 @@ struct khugepaged_scan { | |||
89 | struct list_head mm_head; | 89 | struct list_head mm_head; |
90 | struct mm_slot *mm_slot; | 90 | struct mm_slot *mm_slot; |
91 | unsigned long address; | 91 | unsigned long address; |
92 | } khugepaged_scan = { | 92 | }; |
93 | static struct khugepaged_scan khugepaged_scan = { | ||
93 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | 94 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), |
94 | }; | 95 | }; |
95 | 96 | ||
@@ -829,7 +830,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
829 | 830 | ||
830 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 831 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
831 | copy_user_highpage(pages[i], page + i, | 832 | copy_user_highpage(pages[i], page + i, |
832 | haddr + PAGE_SHIFT*i, vma); | 833 | haddr + PAGE_SIZE * i, vma); |
833 | __SetPageUptodate(pages[i]); | 834 | __SetPageUptodate(pages[i]); |
834 | cond_resched(); | 835 | cond_resched(); |
835 | } | 836 | } |
@@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
989 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 990 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
990 | VM_BUG_ON(!PageCompound(page)); | 991 | VM_BUG_ON(!PageCompound(page)); |
991 | if (flags & FOLL_GET) | 992 | if (flags & FOLL_GET) |
992 | get_page(page); | 993 | get_page_foll(page); |
993 | 994 | ||
994 | out: | 995 | out: |
995 | return page; | 996 | return page; |
@@ -1052,6 +1053,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1052 | return ret; | 1053 | return ret; |
1053 | } | 1054 | } |
1054 | 1055 | ||
1056 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | ||
1057 | unsigned long old_addr, | ||
1058 | unsigned long new_addr, unsigned long old_end, | ||
1059 | pmd_t *old_pmd, pmd_t *new_pmd) | ||
1060 | { | ||
1061 | int ret = 0; | ||
1062 | pmd_t pmd; | ||
1063 | |||
1064 | struct mm_struct *mm = vma->vm_mm; | ||
1065 | |||
1066 | if ((old_addr & ~HPAGE_PMD_MASK) || | ||
1067 | (new_addr & ~HPAGE_PMD_MASK) || | ||
1068 | old_end - old_addr < HPAGE_PMD_SIZE || | ||
1069 | (new_vma->vm_flags & VM_NOHUGEPAGE)) | ||
1070 | goto out; | ||
1071 | |||
1072 | /* | ||
1073 | * The destination pmd shouldn't be established, free_pgtables() | ||
1074 | * should have release it. | ||
1075 | */ | ||
1076 | if (WARN_ON(!pmd_none(*new_pmd))) { | ||
1077 | VM_BUG_ON(pmd_trans_huge(*new_pmd)); | ||
1078 | goto out; | ||
1079 | } | ||
1080 | |||
1081 | spin_lock(&mm->page_table_lock); | ||
1082 | if (likely(pmd_trans_huge(*old_pmd))) { | ||
1083 | if (pmd_trans_splitting(*old_pmd)) { | ||
1084 | spin_unlock(&mm->page_table_lock); | ||
1085 | wait_split_huge_page(vma->anon_vma, old_pmd); | ||
1086 | ret = -1; | ||
1087 | } else { | ||
1088 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | ||
1089 | VM_BUG_ON(!pmd_none(*new_pmd)); | ||
1090 | set_pmd_at(mm, new_addr, new_pmd, pmd); | ||
1091 | spin_unlock(&mm->page_table_lock); | ||
1092 | ret = 1; | ||
1093 | } | ||
1094 | } else { | ||
1095 | spin_unlock(&mm->page_table_lock); | ||
1096 | } | ||
1097 | out: | ||
1098 | return ret; | ||
1099 | } | ||
1100 | |||
1055 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1101 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1056 | unsigned long addr, pgprot_t newprot) | 1102 | unsigned long addr, pgprot_t newprot) |
1057 | { | 1103 | { |
@@ -1156,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1156 | unsigned long head_index = page->index; | 1202 | unsigned long head_index = page->index; |
1157 | struct zone *zone = page_zone(page); | 1203 | struct zone *zone = page_zone(page); |
1158 | int zonestat; | 1204 | int zonestat; |
1205 | int tail_count = 0; | ||
1159 | 1206 | ||
1160 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 1207 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
1161 | spin_lock_irq(&zone->lru_lock); | 1208 | spin_lock_irq(&zone->lru_lock); |
@@ -1164,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page) | |||
1164 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 1211 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
1165 | struct page *page_tail = page + i; | 1212 | struct page *page_tail = page + i; |
1166 | 1213 | ||
1167 | /* tail_page->_count cannot change */ | 1214 | /* tail_page->_mapcount cannot change */ |
1168 | atomic_sub(atomic_read(&page_tail->_count), &page->_count); | 1215 | BUG_ON(page_mapcount(page_tail) < 0); |
1169 | BUG_ON(page_count(page) <= 0); | 1216 | tail_count += page_mapcount(page_tail); |
1170 | atomic_add(page_mapcount(page) + 1, &page_tail->_count); | 1217 | /* check for overflow */ |
1171 | BUG_ON(atomic_read(&page_tail->_count) <= 0); | 1218 | BUG_ON(tail_count < 0); |
1219 | BUG_ON(atomic_read(&page_tail->_count) != 0); | ||
1220 | /* | ||
1221 | * tail_page->_count is zero and not changing from | ||
1222 | * under us. But get_page_unless_zero() may be running | ||
1223 | * from under us on the tail_page. If we used | ||
1224 | * atomic_set() below instead of atomic_add(), we | ||
1225 | * would then run atomic_set() concurrently with | ||
1226 | * get_page_unless_zero(), and atomic_set() is | ||
1227 | * implemented in C not using locked ops. spin_unlock | ||
1228 | * on x86 sometime uses locked ops because of PPro | ||
1229 | * errata 66, 92, so unless somebody can guarantee | ||
1230 | * atomic_set() here would be safe on all archs (and | ||
1231 | * not only on x86), it's safer to use atomic_add(). | ||
1232 | */ | ||
1233 | atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, | ||
1234 | &page_tail->_count); | ||
1172 | 1235 | ||
1173 | /* after clearing PageTail the gup refcount can be released */ | 1236 | /* after clearing PageTail the gup refcount can be released */ |
1174 | smp_mb(); | 1237 | smp_mb(); |
@@ -1186,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1186 | (1L << PG_uptodate))); | 1249 | (1L << PG_uptodate))); |
1187 | page_tail->flags |= (1L << PG_dirty); | 1250 | page_tail->flags |= (1L << PG_dirty); |
1188 | 1251 | ||
1189 | /* | 1252 | /* clear PageTail before overwriting first_page */ |
1190 | * 1) clear PageTail before overwriting first_page | ||
1191 | * 2) clear PageTail before clearing PageHead for VM_BUG_ON | ||
1192 | */ | ||
1193 | smp_wmb(); | 1253 | smp_wmb(); |
1194 | 1254 | ||
1195 | /* | 1255 | /* |
@@ -1206,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page) | |||
1206 | * status is achieved setting a reserved bit in the | 1266 | * status is achieved setting a reserved bit in the |
1207 | * pmd, not by clearing the present bit. | 1267 | * pmd, not by clearing the present bit. |
1208 | */ | 1268 | */ |
1209 | BUG_ON(page_mapcount(page_tail)); | ||
1210 | page_tail->_mapcount = page->_mapcount; | 1269 | page_tail->_mapcount = page->_mapcount; |
1211 | 1270 | ||
1212 | BUG_ON(page_tail->mapping); | 1271 | BUG_ON(page_tail->mapping); |
@@ -1223,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page) | |||
1223 | 1282 | ||
1224 | lru_add_page_tail(zone, page, page_tail); | 1283 | lru_add_page_tail(zone, page, page_tail); |
1225 | } | 1284 | } |
1285 | atomic_sub(tail_count, &page->_count); | ||
1286 | BUG_ON(atomic_read(&page->_count) <= 0); | ||
1226 | 1287 | ||
1227 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1288 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1228 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | 1289 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); |
@@ -1906,7 +1967,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1906 | BUG_ON(!pmd_none(*pmd)); | 1967 | BUG_ON(!pmd_none(*pmd)); |
1907 | page_add_new_anon_rmap(new_page, vma, address); | 1968 | page_add_new_anon_rmap(new_page, vma, address); |
1908 | set_pmd_at(mm, address, pmd, _pmd); | 1969 | set_pmd_at(mm, address, pmd, _pmd); |
1909 | update_mmu_cache(vma, address, entry); | 1970 | update_mmu_cache(vma, address, _pmd); |
1910 | prepare_pmd_huge_pte(pgtable, mm); | 1971 | prepare_pmd_huge_pte(pgtable, mm); |
1911 | mm->nr_ptes--; | 1972 | mm->nr_ptes--; |
1912 | spin_unlock(&mm->page_table_lock); | 1973 | spin_unlock(&mm->page_table_lock); |
@@ -2024,6 +2085,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |||
2024 | 2085 | ||
2025 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | 2086 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, |
2026 | struct page **hpage) | 2087 | struct page **hpage) |
2088 | __releases(&khugepaged_mm_lock) | ||
2089 | __acquires(&khugepaged_mm_lock) | ||
2027 | { | 2090 | { |
2028 | struct mm_slot *mm_slot; | 2091 | struct mm_slot *mm_slot; |
2029 | struct mm_struct *mm; | 2092 | struct mm_struct *mm; |
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb49..2189af491783 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) | |||
37 | atomic_dec(&page->_count); | 37 | atomic_dec(&page->_count); |
38 | } | 38 | } |
39 | 39 | ||
40 | static inline void __get_page_tail_foll(struct page *page, | ||
41 | bool get_page_head) | ||
42 | { | ||
43 | /* | ||
44 | * If we're getting a tail page, the elevated page->_count is | ||
45 | * required only in the head page and we will elevate the head | ||
46 | * page->_count and tail page->_mapcount. | ||
47 | * | ||
48 | * We elevate page_tail->_mapcount for tail pages to force | ||
49 | * page_tail->_count to be zero at all times to avoid getting | ||
50 | * false positives from get_page_unless_zero() with | ||
51 | * speculative page access (like in | ||
52 | * page_cache_get_speculative()) on tail pages. | ||
53 | */ | ||
54 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); | ||
55 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
56 | VM_BUG_ON(page_mapcount(page) < 0); | ||
57 | if (get_page_head) | ||
58 | atomic_inc(&page->first_page->_count); | ||
59 | atomic_inc(&page->_mapcount); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * This is meant to be called as the FOLL_GET operation of | ||
64 | * follow_page() and it must be called while holding the proper PT | ||
65 | * lock while the pte (or pmd_trans_huge) is still mapping the page. | ||
66 | */ | ||
67 | static inline void get_page_foll(struct page *page) | ||
68 | { | ||
69 | if (unlikely(PageTail(page))) | ||
70 | /* | ||
71 | * This is safe only because | ||
72 | * __split_huge_page_refcount() can't run under | ||
73 | * get_page_foll() because we hold the proper PT lock. | ||
74 | */ | ||
75 | __get_page_tail_foll(page, true); | ||
76 | else { | ||
77 | /* | ||
78 | * Getting a normal page or the head of a compound page | ||
79 | * requires to already have an elevated page->_count. | ||
80 | */ | ||
81 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | ||
82 | atomic_inc(&page->_count); | ||
83 | } | ||
84 | } | ||
85 | |||
40 | extern unsigned long highest_memmap_pfn; | 86 | extern unsigned long highest_memmap_pfn; |
41 | 87 | ||
42 | /* | 88 | /* |
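Note: the new get_page_foll()/__get_page_tail_foll() helpers above encode the THP refcounting rule that a tail page's _count stays at zero (so speculative get_page_unless_zero() callers never see a transient non-zero count) while the reference is recorded in the head page's _count plus the tail's _mapcount. The mock below restates that rule as compilable userspace C with C11 atomics standing in for atomic_t; the struct is simplified and none of this is kernel code.

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct page; only the fields the sketch needs. */
struct mock_page {
        atomic_int _count;
        atomic_int _mapcount;         /* the kernel stores mapcount - 1; simplified here */
        bool tail;
        struct mock_page *first_page; /* head page, if this is a tail page */
};

/* Mirrors the rule in get_page_foll()/__get_page_tail_foll() above. */
static void get_page_foll(struct mock_page *page)
{
        if (page->tail) {
                /* tail page: pin the head's _count, record the ref in _mapcount */
                assert(atomic_load(&page->first_page->_count) > 0);
                assert(atomic_load(&page->_count) == 0);
                atomic_fetch_add(&page->first_page->_count, 1);
                atomic_fetch_add(&page->_mapcount, 1);
        } else {
                /* normal or head page: it must already hold a reference */
                assert(atomic_load(&page->_count) > 0);
                atomic_fetch_add(&page->_count, 1);
        }
}

int main(void)
{
        struct mock_page head = { 1, 0, false, NULL };
        struct mock_page tail = { 0, 0, true,  &head };

        get_page_foll(&tail);
        printf("head _count=%d, tail _count=%d, tail _mapcount=%d\n",
               atomic_load(&head._count), atomic_load(&tail._count),
               atomic_load(&tail._mapcount));
        return 0;
}
```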
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d6880f542f95..f3b2a00fe9c1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -69,7 +69,7 @@ | |||
69 | #include <linux/sched.h> | 69 | #include <linux/sched.h> |
70 | #include <linux/jiffies.h> | 70 | #include <linux/jiffies.h> |
71 | #include <linux/delay.h> | 71 | #include <linux/delay.h> |
72 | #include <linux/module.h> | 72 | #include <linux/export.h> |
73 | #include <linux/kthread.h> | 73 | #include <linux/kthread.h> |
74 | #include <linux/prio_tree.h> | 74 | #include <linux/prio_tree.h> |
75 | #include <linux/fs.h> | 75 | #include <linux/fs.h> |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1905,7 +1905,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1905 | 1905 | ||
1906 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1906 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
1907 | err = unmerge_and_remove_all_rmap_items(); | 1907 | err = unmerge_and_remove_all_rmap_items(); |
1908 | test_set_oom_score_adj(oom_score_adj); | 1908 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, |
1909 | oom_score_adj); | ||
1909 | if (err) { | 1910 | if (err) { |
1910 | ksm_run = KSM_RUN_STOP; | 1911 | ksm_run = KSM_RUN_STOP; |
1911 | count = err; | 1912 | count = err; |
diff --git a/mm/maccess.c b/mm/maccess.c
index 4cee182ab5f3..d53adf9ba84b 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Access kernel memory without faulting. | 2 | * Access kernel memory without faulting. |
3 | */ | 3 | */ |
4 | #include <linux/module.h> | 4 | #include <linux/export.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/uaccess.h> | 6 | #include <linux/uaccess.h> |
7 | 7 | ||
diff --git a/mm/memblock.c b/mm/memblock.c
index ccbf97339592..84bec4969ed5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -58,7 +58,8 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p | |||
58 | return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); | 58 | return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); |
59 | } | 59 | } |
60 | 60 | ||
61 | long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) | 61 | static long __init_memblock memblock_overlaps_region(struct memblock_type *type, |
62 | phys_addr_t base, phys_addr_t size) | ||
62 | { | 63 | { |
63 | unsigned long i; | 64 | unsigned long i; |
64 | 65 | ||
@@ -267,7 +268,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
267 | return 0; | 268 | return 0; |
268 | } | 269 | } |
269 | 270 | ||
270 | extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, | 271 | int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, |
271 | phys_addr_t addr2, phys_addr_t size2) | 272 | phys_addr_t addr2, phys_addr_t size2) |
272 | { | 273 | { |
273 | return 1; | 274 | return 1; |
@@ -626,6 +627,12 @@ phys_addr_t __init memblock_phys_mem_size(void) | |||
626 | return memblock.memory_size; | 627 | return memblock.memory_size; |
627 | } | 628 | } |
628 | 629 | ||
630 | /* lowest address */ | ||
631 | phys_addr_t __init_memblock memblock_start_of_DRAM(void) | ||
632 | { | ||
633 | return memblock.memory.regions[0].base; | ||
634 | } | ||
635 | |||
629 | phys_addr_t __init_memblock memblock_end_of_DRAM(void) | 636 | phys_addr_t __init_memblock memblock_end_of_DRAM(void) |
630 | { | 637 | { |
631 | int idx = memblock.memory.cnt - 1; | 638 | int idx = memblock.memory.cnt - 1; |
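Note: the memblock context above includes the half-open interval test used by memblock_addrs_overlap(): two ranges [base, base + size) overlap exactly when each starts before the other ends. The same test as a standalone function, with types simplified to unsigned long for the sketch:

```c
#include <stdio.h>

/* Half-open ranges [base, base + size) overlap iff each starts before the
 * other one ends. */
static int ranges_overlap(unsigned long base1, unsigned long size1,
                          unsigned long base2, unsigned long size2)
{
        return (base1 < (base2 + size2)) && (base2 < (base1 + size1));
}

int main(void)
{
        printf("%d\n", ranges_overlap(0x1000, 0x1000, 0x1800, 0x1000)); /* 1: overlap */
        printf("%d\n", ranges_overlap(0x1000, 0x1000, 0x2000, 0x1000)); /* 0: just adjacent */
        return 0;
}
```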
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5f84d2351ddb..6aff93c98aca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,9 +33,9 @@ | |||
33 | #include <linux/bit_spinlock.h> | 33 | #include <linux/bit_spinlock.h> |
34 | #include <linux/rcupdate.h> | 34 | #include <linux/rcupdate.h> |
35 | #include <linux/limits.h> | 35 | #include <linux/limits.h> |
36 | #include <linux/export.h> | ||
36 | #include <linux/mutex.h> | 37 | #include <linux/mutex.h> |
37 | #include <linux/rbtree.h> | 38 | #include <linux/rbtree.h> |
38 | #include <linux/shmem_fs.h> | ||
39 | #include <linux/slab.h> | 39 | #include <linux/slab.h> |
40 | #include <linux/swap.h> | 40 | #include <linux/swap.h> |
41 | #include <linux/swapops.h> | 41 | #include <linux/swapops.h> |
@@ -202,52 +202,8 @@ struct mem_cgroup_eventfd_list { | |||
202 | struct eventfd_ctx *eventfd; | 202 | struct eventfd_ctx *eventfd; |
203 | }; | 203 | }; |
204 | 204 | ||
205 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 205 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); |
206 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | 206 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
207 | |||
208 | enum { | ||
209 | SCAN_BY_LIMIT, | ||
210 | SCAN_BY_SYSTEM, | ||
211 | NR_SCAN_CONTEXT, | ||
212 | SCAN_BY_SHRINK, /* not recorded now */ | ||
213 | }; | ||
214 | |||
215 | enum { | ||
216 | SCAN, | ||
217 | SCAN_ANON, | ||
218 | SCAN_FILE, | ||
219 | ROTATE, | ||
220 | ROTATE_ANON, | ||
221 | ROTATE_FILE, | ||
222 | FREED, | ||
223 | FREED_ANON, | ||
224 | FREED_FILE, | ||
225 | ELAPSED, | ||
226 | NR_SCANSTATS, | ||
227 | }; | ||
228 | |||
229 | struct scanstat { | ||
230 | spinlock_t lock; | ||
231 | unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; | ||
232 | unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; | ||
233 | }; | ||
234 | |||
235 | const char *scanstat_string[NR_SCANSTATS] = { | ||
236 | "scanned_pages", | ||
237 | "scanned_anon_pages", | ||
238 | "scanned_file_pages", | ||
239 | "rotated_pages", | ||
240 | "rotated_anon_pages", | ||
241 | "rotated_file_pages", | ||
242 | "freed_pages", | ||
243 | "freed_anon_pages", | ||
244 | "freed_file_pages", | ||
245 | "elapsed_ns", | ||
246 | }; | ||
247 | #define SCANSTAT_WORD_LIMIT "_by_limit" | ||
248 | #define SCANSTAT_WORD_SYSTEM "_by_system" | ||
249 | #define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" | ||
250 | |||
251 | 207 | ||
252 | /* | 208 | /* |
253 | * The memory controller data structure. The memory controller controls both | 209 | * The memory controller data structure. The memory controller controls both |
@@ -314,8 +270,7 @@ struct mem_cgroup { | |||
314 | 270 | ||
315 | /* For oom notifier event fd */ | 271 | /* For oom notifier event fd */ |
316 | struct list_head oom_notify; | 272 | struct list_head oom_notify; |
317 | /* For recording LRU-scan statistics */ | 273 | |
318 | struct scanstat scanstat; | ||
319 | /* | 274 | /* |
320 | * Should we move charges of a task when a task is moved into this | 275 | * Should we move charges of a task when a task is moved into this |
321 | * mem_cgroup ? And what type of charges should we move ? | 276 | * mem_cgroup ? And what type of charges should we move ? |
@@ -408,29 +363,29 @@ enum charge_type { | |||
408 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | 363 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 |
409 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | 364 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) |
410 | 365 | ||
411 | static void mem_cgroup_get(struct mem_cgroup *mem); | 366 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
412 | static void mem_cgroup_put(struct mem_cgroup *mem); | 367 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
413 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 368 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); |
414 | static void drain_all_stock_async(struct mem_cgroup *mem); | 369 | static void drain_all_stock_async(struct mem_cgroup *memcg); |
415 | 370 | ||
416 | static struct mem_cgroup_per_zone * | 371 | static struct mem_cgroup_per_zone * |
417 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 372 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) |
418 | { | 373 | { |
419 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 374 | return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; |
420 | } | 375 | } |
421 | 376 | ||
422 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | 377 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) |
423 | { | 378 | { |
424 | return &mem->css; | 379 | return &memcg->css; |
425 | } | 380 | } |
426 | 381 | ||
427 | static struct mem_cgroup_per_zone * | 382 | static struct mem_cgroup_per_zone * |
428 | page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) | 383 | page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) |
429 | { | 384 | { |
430 | int nid = page_to_nid(page); | 385 | int nid = page_to_nid(page); |
431 | int zid = page_zonenum(page); | 386 | int zid = page_zonenum(page); |
432 | 387 | ||
433 | return mem_cgroup_zoneinfo(mem, nid, zid); | 388 | return mem_cgroup_zoneinfo(memcg, nid, zid); |
434 | } | 389 | } |
435 | 390 | ||
436 | static struct mem_cgroup_tree_per_zone * | 391 | static struct mem_cgroup_tree_per_zone * |
@@ -449,7 +404,7 @@ soft_limit_tree_from_page(struct page *page) | |||
449 | } | 404 | } |
450 | 405 | ||
451 | static void | 406 | static void |
452 | __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | 407 | __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, |
453 | struct mem_cgroup_per_zone *mz, | 408 | struct mem_cgroup_per_zone *mz, |
454 | struct mem_cgroup_tree_per_zone *mctz, | 409 | struct mem_cgroup_tree_per_zone *mctz, |
455 | unsigned long long new_usage_in_excess) | 410 | unsigned long long new_usage_in_excess) |
@@ -483,7 +438,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | |||
483 | } | 438 | } |
484 | 439 | ||
485 | static void | 440 | static void |
486 | __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | 441 | __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, |
487 | struct mem_cgroup_per_zone *mz, | 442 | struct mem_cgroup_per_zone *mz, |
488 | struct mem_cgroup_tree_per_zone *mctz) | 443 | struct mem_cgroup_tree_per_zone *mctz) |
489 | { | 444 | { |
@@ -494,17 +449,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | |||
494 | } | 449 | } |
495 | 450 | ||
496 | static void | 451 | static void |
497 | mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | 452 | mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, |
498 | struct mem_cgroup_per_zone *mz, | 453 | struct mem_cgroup_per_zone *mz, |
499 | struct mem_cgroup_tree_per_zone *mctz) | 454 | struct mem_cgroup_tree_per_zone *mctz) |
500 | { | 455 | { |
501 | spin_lock(&mctz->lock); | 456 | spin_lock(&mctz->lock); |
502 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | 457 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); |
503 | spin_unlock(&mctz->lock); | 458 | spin_unlock(&mctz->lock); |
504 | } | 459 | } |
505 | 460 | ||
506 | 461 | ||
507 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 462 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) |
508 | { | 463 | { |
509 | unsigned long long excess; | 464 | unsigned long long excess; |
510 | struct mem_cgroup_per_zone *mz; | 465 | struct mem_cgroup_per_zone *mz; |
@@ -517,9 +472,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | |||
517 | * Necessary to update all ancestors when hierarchy is used. | 472 | * Necessary to update all ancestors when hierarchy is used. |
518 | * because their event counter is not touched. | 473 | * because their event counter is not touched. |
519 | */ | 474 | */ |
520 | for (; mem; mem = parent_mem_cgroup(mem)) { | 475 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { |
521 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 476 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
522 | excess = res_counter_soft_limit_excess(&mem->res); | 477 | excess = res_counter_soft_limit_excess(&memcg->res); |
523 | /* | 478 | /* |
524 | * We have to update the tree if mz is on RB-tree or | 479 | * We have to update the tree if mz is on RB-tree or |
525 | * mem is over its softlimit. | 480 | * mem is over its softlimit. |
@@ -528,18 +483,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | |||
528 | spin_lock(&mctz->lock); | 483 | spin_lock(&mctz->lock); |
529 | /* if on-tree, remove it */ | 484 | /* if on-tree, remove it */ |
530 | if (mz->on_tree) | 485 | if (mz->on_tree) |
531 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | 486 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); |
532 | /* | 487 | /* |
533 | * Insert again. mz->usage_in_excess will be updated. | 488 | * Insert again. mz->usage_in_excess will be updated. |
534 | * If excess is 0, no tree ops. | 489 | * If excess is 0, no tree ops. |
535 | */ | 490 | */ |
536 | __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); | 491 | __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); |
537 | spin_unlock(&mctz->lock); | 492 | spin_unlock(&mctz->lock); |
538 | } | 493 | } |
539 | } | 494 | } |
540 | } | 495 | } |
541 | 496 | ||
542 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | 497 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) |
543 | { | 498 | { |
544 | int node, zone; | 499 | int node, zone; |
545 | struct mem_cgroup_per_zone *mz; | 500 | struct mem_cgroup_per_zone *mz; |
@@ -547,9 +502,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | |||
547 | 502 | ||
548 | for_each_node_state(node, N_POSSIBLE) { | 503 | for_each_node_state(node, N_POSSIBLE) { |
549 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 504 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
550 | mz = mem_cgroup_zoneinfo(mem, node, zone); | 505 | mz = mem_cgroup_zoneinfo(memcg, node, zone); |
551 | mctz = soft_limit_tree_node_zone(node, zone); | 506 | mctz = soft_limit_tree_node_zone(node, zone); |
552 | mem_cgroup_remove_exceeded(mem, mz, mctz); | 507 | mem_cgroup_remove_exceeded(memcg, mz, mctz); |
553 | } | 508 | } |
554 | } | 509 | } |
555 | } | 510 | } |
@@ -610,7 +565,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
610 | * common workload, threshold and synchronization as vmstat[] should be | 565 | * common workload, threshold and synchronization as vmstat[] should be
611 | * implemented. | 566 | * implemented. |
612 | */ | 567 | */ |
613 | static long mem_cgroup_read_stat(struct mem_cgroup *mem, | 568 | static long mem_cgroup_read_stat(struct mem_cgroup *memcg, |
614 | enum mem_cgroup_stat_index idx) | 569 | enum mem_cgroup_stat_index idx) |
615 | { | 570 | { |
616 | long val = 0; | 571 | long val = 0; |
@@ -618,81 +573,83 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem, | |||
618 | 573 | ||
619 | get_online_cpus(); | 574 | get_online_cpus(); |
620 | for_each_online_cpu(cpu) | 575 | for_each_online_cpu(cpu) |
621 | val += per_cpu(mem->stat->count[idx], cpu); | 576 | val += per_cpu(memcg->stat->count[idx], cpu); |
622 | #ifdef CONFIG_HOTPLUG_CPU | 577 | #ifdef CONFIG_HOTPLUG_CPU |
623 | spin_lock(&mem->pcp_counter_lock); | 578 | spin_lock(&memcg->pcp_counter_lock); |
624 | val += mem->nocpu_base.count[idx]; | 579 | val += memcg->nocpu_base.count[idx]; |
625 | spin_unlock(&mem->pcp_counter_lock); | 580 | spin_unlock(&memcg->pcp_counter_lock); |
626 | #endif | 581 | #endif |
627 | put_online_cpus(); | 582 | put_online_cpus(); |
628 | return val; | 583 | return val; |
629 | } | 584 | } |
630 | 585 | ||
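
The mem_cgroup_read_stat() hunk above sums one counter slot per online CPU and then folds in the nocpu_base remainder parked there when a CPU went offline. Below is a minimal userspace sketch of that aggregation pattern; the struct and fixed-size array stand in for real per-CPU storage and hotplug locking and are invented for illustration, not taken from the kernel.

#include <stdio.h>

#define NR_CPUS 4

/* stand-in for per-CPU counters plus the "offline remainder" bucket */
struct counter {
    long percpu[NR_CPUS];   /* one slot per online CPU */
    long nocpu_base;        /* counts folded in when a CPU went offline */
};

/* sum the per-CPU slots, then add whatever was parked in nocpu_base */
static long counter_read(const struct counter *c)
{
    long val = 0;
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        val += c->percpu[cpu];
    return val + c->nocpu_base;
}

int main(void)
{
    struct counter c = { .percpu = {3, 1, 4, 1}, .nocpu_base = 5 };
    printf("total = %ld\n", counter_read(&c)); /* prints 14 */
    return 0;
}
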
631 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 586 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, |
632 | bool charge) | 587 | bool charge) |
633 | { | 588 | { |
634 | int val = (charge) ? 1 : -1; | 589 | int val = (charge) ? 1 : -1; |
635 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 590 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
636 | } | 591 | } |
637 | 592 | ||
638 | void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) | 593 | void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) |
639 | { | 594 | { |
640 | this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); | 595 | this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); |
641 | } | 596 | } |
642 | 597 | ||
643 | void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) | 598 | void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) |
644 | { | 599 | { |
645 | this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); | 600 | this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); |
646 | } | 601 | } |
647 | 602 | ||
648 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, | 603 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
649 | enum mem_cgroup_events_index idx) | 604 | enum mem_cgroup_events_index idx) |
650 | { | 605 | { |
651 | unsigned long val = 0; | 606 | unsigned long val = 0; |
652 | int cpu; | 607 | int cpu; |
653 | 608 | ||
654 | for_each_online_cpu(cpu) | 609 | for_each_online_cpu(cpu) |
655 | val += per_cpu(mem->stat->events[idx], cpu); | 610 | val += per_cpu(memcg->stat->events[idx], cpu); |
656 | #ifdef CONFIG_HOTPLUG_CPU | 611 | #ifdef CONFIG_HOTPLUG_CPU |
657 | spin_lock(&mem->pcp_counter_lock); | 612 | spin_lock(&memcg->pcp_counter_lock); |
658 | val += mem->nocpu_base.events[idx]; | 613 | val += memcg->nocpu_base.events[idx]; |
659 | spin_unlock(&mem->pcp_counter_lock); | 614 | spin_unlock(&memcg->pcp_counter_lock); |
660 | #endif | 615 | #endif |
661 | return val; | 616 | return val; |
662 | } | 617 | } |
663 | 618 | ||
664 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 619 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
665 | bool file, int nr_pages) | 620 | bool file, int nr_pages) |
666 | { | 621 | { |
667 | preempt_disable(); | 622 | preempt_disable(); |
668 | 623 | ||
669 | if (file) | 624 | if (file) |
670 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); | 625 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], |
626 | nr_pages); | ||
671 | else | 627 | else |
672 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); | 628 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], |
629 | nr_pages); | ||
673 | 630 | ||
674 | /* pagein of a big page is an event. So, ignore page size */ | 631 | /* pagein of a big page is an event. So, ignore page size */ |
675 | if (nr_pages > 0) | 632 | if (nr_pages > 0) |
676 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); | 633 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); |
677 | else { | 634 | else { |
678 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); | 635 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); |
679 | nr_pages = -nr_pages; /* for event */ | 636 | nr_pages = -nr_pages; /* for event */ |
680 | } | 637 | } |
681 | 638 | ||
682 | __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); | 639 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); |
683 | 640 | ||
684 | preempt_enable(); | 641 | preempt_enable(); |
685 | } | 642 | } |
686 | 643 | ||
687 | unsigned long | 644 | unsigned long |
688 | mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, | 645 | mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, |
689 | unsigned int lru_mask) | 646 | unsigned int lru_mask) |
690 | { | 647 | { |
691 | struct mem_cgroup_per_zone *mz; | 648 | struct mem_cgroup_per_zone *mz; |
692 | enum lru_list l; | 649 | enum lru_list l; |
693 | unsigned long ret = 0; | 650 | unsigned long ret = 0; |
694 | 651 | ||
695 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 652 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
696 | 653 | ||
697 | for_each_lru(l) { | 654 | for_each_lru(l) { |
698 | if (BIT(l) & lru_mask) | 655 | if (BIT(l) & lru_mask) |
@@ -702,44 +659,45 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, | |||
702 | } | 659 | } |
703 | 660 | ||
704 | static unsigned long | 661 | static unsigned long |
705 | mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, | 662 | mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, |
706 | int nid, unsigned int lru_mask) | 663 | int nid, unsigned int lru_mask) |
707 | { | 664 | { |
708 | u64 total = 0; | 665 | u64 total = 0; |
709 | int zid; | 666 | int zid; |
710 | 667 | ||
711 | for (zid = 0; zid < MAX_NR_ZONES; zid++) | 668 | for (zid = 0; zid < MAX_NR_ZONES; zid++) |
712 | total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); | 669 | total += mem_cgroup_zone_nr_lru_pages(memcg, |
670 | nid, zid, lru_mask); | ||
713 | 671 | ||
714 | return total; | 672 | return total; |
715 | } | 673 | } |
716 | 674 | ||
717 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, | 675 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, |
718 | unsigned int lru_mask) | 676 | unsigned int lru_mask) |
719 | { | 677 | { |
720 | int nid; | 678 | int nid; |
721 | u64 total = 0; | 679 | u64 total = 0; |
722 | 680 | ||
723 | for_each_node_state(nid, N_HIGH_MEMORY) | 681 | for_each_node_state(nid, N_HIGH_MEMORY) |
724 | total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); | 682 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); |
725 | return total; | 683 | return total; |
726 | } | 684 | } |
727 | 685 | ||
728 | static bool __memcg_event_check(struct mem_cgroup *mem, int target) | 686 | static bool __memcg_event_check(struct mem_cgroup *memcg, int target) |
729 | { | 687 | { |
730 | unsigned long val, next; | 688 | unsigned long val, next; |
731 | 689 | ||
732 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 690 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
733 | next = this_cpu_read(mem->stat->targets[target]); | 691 | next = __this_cpu_read(memcg->stat->targets[target]); |
734 | /* from time_after() in jiffies.h */ | 692 | /* from time_after() in jiffies.h */ |
735 | return ((long)next - (long)val < 0); | 693 | return ((long)next - (long)val < 0); |
736 | } | 694 | } |
737 | 695 | ||
738 | static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) | 696 | static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) |
739 | { | 697 | { |
740 | unsigned long val, next; | 698 | unsigned long val, next; |
741 | 699 | ||
742 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 700 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
743 | 701 | ||
744 | switch (target) { | 702 | switch (target) { |
745 | case MEM_CGROUP_TARGET_THRESH: | 703 | case MEM_CGROUP_TARGET_THRESH: |
@@ -755,34 +713,36 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) | |||
755 | return; | 713 | return; |
756 | } | 714 | } |
757 | 715 | ||
758 | this_cpu_write(mem->stat->targets[target], next); | 716 | __this_cpu_write(memcg->stat->targets[target], next); |
759 | } | 717 | } |
760 | 718 | ||
761 | /* | 719 | /* |
762 | * Check events in order. | 720 | * Check events in order. |
763 | * | 721 | * |
764 | */ | 722 | */ |
765 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | 723 | static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) |
766 | { | 724 | { |
725 | preempt_disable(); | ||
767 | /* threshold event is triggered in finer grain than soft limit */ | 726 | /* threshold event is triggered in finer grain than soft limit */ |
768 | if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { | 727 | if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { |
769 | mem_cgroup_threshold(mem); | 728 | mem_cgroup_threshold(memcg); |
770 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); | 729 | __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); |
771 | if (unlikely(__memcg_event_check(mem, | 730 | if (unlikely(__memcg_event_check(memcg, |
772 | MEM_CGROUP_TARGET_SOFTLIMIT))) { | 731 | MEM_CGROUP_TARGET_SOFTLIMIT))) { |
773 | mem_cgroup_update_tree(mem, page); | 732 | mem_cgroup_update_tree(memcg, page); |
774 | __mem_cgroup_target_update(mem, | 733 | __mem_cgroup_target_update(memcg, |
775 | MEM_CGROUP_TARGET_SOFTLIMIT); | 734 | MEM_CGROUP_TARGET_SOFTLIMIT); |
776 | } | 735 | } |
777 | #if MAX_NUMNODES > 1 | 736 | #if MAX_NUMNODES > 1 |
778 | if (unlikely(__memcg_event_check(mem, | 737 | if (unlikely(__memcg_event_check(memcg, |
779 | MEM_CGROUP_TARGET_NUMAINFO))) { | 738 | MEM_CGROUP_TARGET_NUMAINFO))) { |
780 | atomic_inc(&mem->numainfo_events); | 739 | atomic_inc(&memcg->numainfo_events); |
781 | __mem_cgroup_target_update(mem, | 740 | __mem_cgroup_target_update(memcg, |
782 | MEM_CGROUP_TARGET_NUMAINFO); | 741 | MEM_CGROUP_TARGET_NUMAINFO); |
783 | } | 742 | } |
784 | #endif | 743 | #endif |
785 | } | 744 | } |
745 | preempt_enable(); | ||
786 | } | 746 | } |
787 | 747 | ||
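
The __memcg_event_check()/__mem_cgroup_target_update() pair above compares a running event counter against a per-target threshold with the same signed-difference trick as time_after(), so the test keeps working across counter wraparound; memcg_check_events() now wraps the sequence in preempt_disable() because the __this_cpu_* forms assume preemption is already off. A small standalone C sketch of the wraparound-safe comparison and the re-arming step (names invented for illustration):

#include <stdio.h>

/* returns nonzero once the counter has passed the target,
 * even if the unsigned counter has wrapped around */
static int passed_target(unsigned long counter, unsigned long target)
{
    return (long)(target - counter) < 0;   /* same trick as time_after() */
}

int main(void)
{
    unsigned long counter = (unsigned long)-5; /* close to wraparound */
    unsigned long target  = counter + 128;     /* next threshold, wraps */

    for (int i = 0; i < 200; i++) {
        counter++;                              /* one "event" */
        if (passed_target(counter, target)) {
            printf("threshold crossed at step %d\n", i);
            target = counter + 128;             /* re-arm the next target */
        }
    }
    return 0;
}
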
788 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 748 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
@@ -808,7 +768,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
808 | 768 | ||
809 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 769 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
810 | { | 770 | { |
811 | struct mem_cgroup *mem = NULL; | 771 | struct mem_cgroup *memcg = NULL; |
812 | 772 | ||
813 | if (!mm) | 773 | if (!mm) |
814 | return NULL; | 774 | return NULL; |
@@ -819,25 +779,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
819 | */ | 779 | */ |
820 | rcu_read_lock(); | 780 | rcu_read_lock(); |
821 | do { | 781 | do { |
822 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 782 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
823 | if (unlikely(!mem)) | 783 | if (unlikely(!memcg)) |
824 | break; | 784 | break; |
825 | } while (!css_tryget(&mem->css)); | 785 | } while (!css_tryget(&memcg->css)); |
826 | rcu_read_unlock(); | 786 | rcu_read_unlock(); |
827 | return mem; | 787 | return memcg; |
828 | } | 788 | } |
829 | 789 | ||
830 | /* The caller has to guarantee "mem" exists before calling this */ | 790 | /* The caller has to guarantee "mem" exists before calling this */ |
831 | static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) | 791 | static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) |
832 | { | 792 | { |
833 | struct cgroup_subsys_state *css; | 793 | struct cgroup_subsys_state *css; |
834 | int found; | 794 | int found; |
835 | 795 | ||
836 | if (!mem) /* ROOT cgroup has the smallest ID */ | 796 | if (!memcg) /* ROOT cgroup has the smallest ID */ |
837 | return root_mem_cgroup; /*css_put/get against root is ignored*/ | 797 | return root_mem_cgroup; /*css_put/get against root is ignored*/ |
838 | if (!mem->use_hierarchy) { | 798 | if (!memcg->use_hierarchy) { |
839 | if (css_tryget(&mem->css)) | 799 | if (css_tryget(&memcg->css)) |
840 | return mem; | 800 | return memcg; |
841 | return NULL; | 801 | return NULL; |
842 | } | 802 | } |
843 | rcu_read_lock(); | 803 | rcu_read_lock(); |
@@ -845,13 +805,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) | |||
845 | * searching a memory cgroup which has the smallest ID under given | 805 | * searching a memory cgroup which has the smallest ID under given |
846 | * ROOT cgroup. (ID >= 1) | 806 | * ROOT cgroup. (ID >= 1) |
847 | */ | 807 | */ |
848 | css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); | 808 | css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); |
849 | if (css && css_tryget(css)) | 809 | if (css && css_tryget(css)) |
850 | mem = container_of(css, struct mem_cgroup, css); | 810 | memcg = container_of(css, struct mem_cgroup, css); |
851 | else | 811 | else |
852 | mem = NULL; | 812 | memcg = NULL; |
853 | rcu_read_unlock(); | 813 | rcu_read_unlock(); |
854 | return mem; | 814 | return memcg; |
855 | } | 815 | } |
856 | 816 | ||
857 | static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, | 817 | static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, |
@@ -905,29 +865,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, | |||
905 | for_each_mem_cgroup_tree_cond(iter, NULL, true) | 865 | for_each_mem_cgroup_tree_cond(iter, NULL, true) |
906 | 866 | ||
907 | 867 | ||
908 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | 868 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
909 | { | 869 | { |
910 | return (mem == root_mem_cgroup); | 870 | return (memcg == root_mem_cgroup); |
911 | } | 871 | } |
912 | 872 | ||
913 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 873 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
914 | { | 874 | { |
915 | struct mem_cgroup *mem; | 875 | struct mem_cgroup *memcg; |
916 | 876 | ||
917 | if (!mm) | 877 | if (!mm) |
918 | return; | 878 | return; |
919 | 879 | ||
920 | rcu_read_lock(); | 880 | rcu_read_lock(); |
921 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 881 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
922 | if (unlikely(!mem)) | 882 | if (unlikely(!memcg)) |
923 | goto out; | 883 | goto out; |
924 | 884 | ||
925 | switch (idx) { | 885 | switch (idx) { |
926 | case PGMAJFAULT: | 886 | case PGMAJFAULT: |
927 | mem_cgroup_pgmajfault(mem, 1); | 887 | mem_cgroup_pgmajfault(memcg, 1); |
928 | break; | 888 | break; |
929 | case PGFAULT: | 889 | case PGFAULT: |
930 | mem_cgroup_pgfault(mem, 1); | 890 | mem_cgroup_pgfault(memcg, 1); |
931 | break; | 891 | break; |
932 | default: | 892 | default: |
933 | BUG(); | 893 | BUG(); |
@@ -1036,6 +996,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
1036 | return; | 996 | return; |
1037 | pc = lookup_page_cgroup(page); | 997 | pc = lookup_page_cgroup(page); |
1038 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | 998 | VM_BUG_ON(PageCgroupAcctLRU(pc)); |
999 | /* | ||
1000 | * putback: charge: | ||
1001 | * SetPageLRU SetPageCgroupUsed | ||
1002 | * smp_mb smp_mb | ||
1003 | * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU | ||
1004 | * | ||
1005 | * Ensure that one of the two sides adds the page to the memcg | ||
1006 | * LRU during a race. | ||
1007 | */ | ||
1008 | smp_mb(); | ||
1039 | if (!PageCgroupUsed(pc)) | 1009 | if (!PageCgroupUsed(pc)) |
1040 | return; | 1010 | return; |
1041 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 1011 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
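
The comment block added in the hunks above documents a barrier pairing: the putback path sets PageLRU, the charge path sets PageCgroupUsed, and each side issues smp_mb() before testing the other side's flag, so at least one of the two is guaranteed to observe the other and link the page to the memcg LRU. The following standalone C11/pthreads sketch models that store-fence-load reasoning with plain atomics; it illustrates the argument, it is not the kernel code (compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* one flag per side of the race, as in the comment above */
static atomic_bool page_lru;     /* "SetPageLRU" on the putback side    */
static atomic_bool cgroup_used;  /* "SetPageCgroupUsed" on charge side  */
static atomic_int  linked;       /* how many sides added the page       */

/* putback: set PageLRU, full barrier, then test the charge side's flag */
static void *putback(void *arg)
{
    (void)arg;
    atomic_store_explicit(&page_lru, true, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst);            /* smp_mb() */
    if (atomic_load_explicit(&cgroup_used, memory_order_relaxed))
        atomic_fetch_add(&linked, 1);
    return NULL;
}

/* charge: set PageCgroupUsed, full barrier, then test the LRU flag */
static void *charge(void *arg)
{
    (void)arg;
    atomic_store_explicit(&cgroup_used, true, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst);            /* smp_mb() */
    if (atomic_load_explicit(&page_lru, memory_order_relaxed))
        atomic_fetch_add(&linked, 1);
    return NULL;
}

int main(void)
{
    pthread_t a, b;
    pthread_create(&a, NULL, putback, NULL);
    pthread_create(&b, NULL, charge, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    /* with both fences in place, at least one side must see the other */
    printf("linked by %d side(s)\n", atomic_load(&linked));
    return 0;
}
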
@@ -1087,7 +1057,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page) | |||
1087 | unsigned long flags; | 1057 | unsigned long flags; |
1088 | struct zone *zone = page_zone(page); | 1058 | struct zone *zone = page_zone(page); |
1089 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1059 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1090 | 1060 | /* | |
1061 | * putback: charge: | ||
1062 | * SetPageLRU SetPageCgroupUsed | ||
1063 | * smp_mb smp_mb | ||
1064 | * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU | ||
1065 | * | ||
1066 | * Ensure that one of the two sides adds the page to the memcg | ||
1067 | * LRU during a race. | ||
1068 | */ | ||
1069 | smp_mb(); | ||
1091 | /* taking care of that the page is added to LRU while we commit it */ | 1070 | /* taking care of that the page is added to LRU while we commit it */ |
1092 | if (likely(!PageLRU(page))) | 1071 | if (likely(!PageLRU(page))) |
1093 | return; | 1072 | return; |
@@ -1109,21 +1088,21 @@ void mem_cgroup_move_lists(struct page *page, | |||
1109 | } | 1088 | } |
1110 | 1089 | ||
1111 | /* | 1090 | /* |
1112 | * Checks whether the given mem is the same as or within the root_mem's | 1091 | * Checks whether the given mem is the same as or within the root_mem_cgroup's
1113 | * hierarchy subtree | 1092 | * hierarchy subtree |
1114 | */ | 1093 | */ |
1115 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, | 1094 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, |
1116 | struct mem_cgroup *mem) | 1095 | struct mem_cgroup *memcg) |
1117 | { | 1096 | { |
1118 | if (root_mem != mem) { | 1097 | if (root_memcg != memcg) { |
1119 | return (root_mem->use_hierarchy && | 1098 | return (root_memcg->use_hierarchy && |
1120 | css_is_ancestor(&mem->css, &root_mem->css)); | 1099 | css_is_ancestor(&memcg->css, &root_memcg->css)); |
1121 | } | 1100 | } |
1122 | 1101 | ||
1123 | return true; | 1102 | return true; |
1124 | } | 1103 | } |
1125 | 1104 | ||
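
mem_cgroup_same_or_subtree() above answers "is memcg the root itself, or a descendant of it when the root uses hierarchy?". A toy parent-pointer version of that test, using a made-up struct rather than the real cgroup types:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* toy stand-in for a memory cgroup: just a parent link and a flag */
struct group {
    struct group *parent;
    bool use_hierarchy;
};

/* true if 'g' is 'root' itself, or a descendant when hierarchy is on */
static bool same_or_subtree(const struct group *root, const struct group *g)
{
    if (root == g)
        return true;
    if (!root->use_hierarchy)
        return false;
    for (g = g->parent; g; g = g->parent)
        if (g == root)
            return true;
    return false;
}

int main(void)
{
    struct group root  = { .parent = NULL,  .use_hierarchy = true };
    struct group child = { .parent = &root, .use_hierarchy = true };

    printf("%d %d\n", same_or_subtree(&root, &child),  /* 1 */
                      same_or_subtree(&child, &root)); /* 0 */
    return 0;
}
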
1126 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 1105 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) |
1127 | { | 1106 | { |
1128 | int ret; | 1107 | int ret; |
1129 | struct mem_cgroup *curr = NULL; | 1108 | struct mem_cgroup *curr = NULL; |
@@ -1137,25 +1116,29 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
1137 | if (!curr) | 1116 | if (!curr) |
1138 | return 0; | 1117 | return 0; |
1139 | /* | 1118 | /* |
1140 | * We should check use_hierarchy of "mem", not "curr", because checking | 1119 | * We should check use_hierarchy of "memcg", not "curr", because checking
1141 | * use_hierarchy of "curr" here makes this function return true if hierarchy is | 1120 | * use_hierarchy of "curr" here makes this function return true if hierarchy is
1142 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | 1121 | * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
1143 | * hierarchy (even if use_hierarchy is disabled in "mem"). | 1122 | * hierarchy (even if use_hierarchy is disabled in "memcg").
1144 | */ | 1123 | */ |
1145 | ret = mem_cgroup_same_or_subtree(mem, curr); | 1124 | ret = mem_cgroup_same_or_subtree(memcg, curr); |
1146 | css_put(&curr->css); | 1125 | css_put(&curr->css); |
1147 | return ret; | 1126 | return ret; |
1148 | } | 1127 | } |
1149 | 1128 | ||
1150 | static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) | 1129 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) |
1151 | { | 1130 | { |
1152 | unsigned long active; | 1131 | unsigned long inactive_ratio; |
1132 | int nid = zone_to_nid(zone); | ||
1133 | int zid = zone_idx(zone); | ||
1153 | unsigned long inactive; | 1134 | unsigned long inactive; |
1135 | unsigned long active; | ||
1154 | unsigned long gb; | 1136 | unsigned long gb; |
1155 | unsigned long inactive_ratio; | ||
1156 | 1137 | ||
1157 | inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); | 1138 | inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, |
1158 | active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); | 1139 | BIT(LRU_INACTIVE_ANON)); |
1140 | active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | ||
1141 | BIT(LRU_ACTIVE_ANON)); | ||
1159 | 1142 | ||
1160 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 1143 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
1161 | if (gb) | 1144 | if (gb) |
@@ -1163,39 +1146,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ | |||
1163 | else | 1146 | else |
1164 | inactive_ratio = 1; | 1147 | inactive_ratio = 1; |
1165 | 1148 | ||
1166 | if (present_pages) { | 1149 | return inactive * inactive_ratio < active; |
1167 | present_pages[0] = inactive; | ||
1168 | present_pages[1] = active; | ||
1169 | } | ||
1170 | |||
1171 | return inactive_ratio; | ||
1172 | } | ||
1173 | |||
1174 | int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) | ||
1175 | { | ||
1176 | unsigned long active; | ||
1177 | unsigned long inactive; | ||
1178 | unsigned long present_pages[2]; | ||
1179 | unsigned long inactive_ratio; | ||
1180 | |||
1181 | inactive_ratio = calc_inactive_ratio(memcg, present_pages); | ||
1182 | |||
1183 | inactive = present_pages[0]; | ||
1184 | active = present_pages[1]; | ||
1185 | |||
1186 | if (inactive * inactive_ratio < active) | ||
1187 | return 1; | ||
1188 | |||
1189 | return 0; | ||
1190 | } | 1150 | } |
1191 | 1151 | ||
1192 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | 1152 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) |
1193 | { | 1153 | { |
1194 | unsigned long active; | 1154 | unsigned long active; |
1195 | unsigned long inactive; | 1155 | unsigned long inactive; |
1156 | int zid = zone_idx(zone); | ||
1157 | int nid = zone_to_nid(zone); | ||
1196 | 1158 | ||
1197 | inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); | 1159 | inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, |
1198 | active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); | 1160 | BIT(LRU_INACTIVE_FILE)); |
1161 | active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, | ||
1162 | BIT(LRU_ACTIVE_FILE)); | ||
1199 | 1163 | ||
1200 | return (active > inactive); | 1164 | return (active > inactive); |
1201 | } | 1165 | } |
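
The reworked mem_cgroup_inactive_anon_is_low() above now measures a single zone of the memcg and keeps the check "inactive * inactive_ratio < active". The line that derives inactive_ratio from gb falls between the two hunks and is not shown here, so the sqrt-based formula in the sketch below is an assumption modelled on the zone-level heuristic of that era; the rest mirrors the visible arithmetic (build with -lm):

#include <stdio.h>
#include <math.h>

#define PAGE_SHIFT 12   /* 4 KiB pages, assumed for the example */

/* Model of the check "inactive * inactive_ratio < active".
 * The ratio formula (sqrt of 10 * size-in-GB) is an assumption,
 * not taken from the visible hunk. */
static int inactive_anon_is_low(unsigned long inactive, unsigned long active)
{
    unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);
    unsigned long inactive_ratio = gb ? (unsigned long)sqrt(10.0 * gb) : 1;

    return inactive * inactive_ratio < active;
}

int main(void)
{
    /* 3 GiB active anon vs 0.5 GiB inactive anon, 4 KiB pages */
    unsigned long active   = (3UL << 30) >> PAGE_SHIFT;
    unsigned long inactive = (1UL << 29) >> PAGE_SHIFT;

    printf("inactive list low? %s\n",
           inactive_anon_is_low(inactive, active) ? "yes" : "no");
    return 0;
}
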
@@ -1231,7 +1195,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) | |||
1231 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 1195 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
1232 | struct list_head *dst, | 1196 | struct list_head *dst, |
1233 | unsigned long *scanned, int order, | 1197 | unsigned long *scanned, int order, |
1234 | int mode, struct zone *z, | 1198 | isolate_mode_t mode, |
1199 | struct zone *z, | ||
1235 | struct mem_cgroup *mem_cont, | 1200 | struct mem_cgroup *mem_cont, |
1236 | int active, int file) | 1201 | int active, int file) |
1237 | { | 1202 | { |
@@ -1299,13 +1264,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
1299 | * Returns the maximum amount of memory @mem can be charged with, in | 1264 | * Returns the maximum amount of memory @mem can be charged with, in |
1300 | * pages. | 1265 | * pages. |
1301 | */ | 1266 | */ |
1302 | static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) | 1267 | static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) |
1303 | { | 1268 | { |
1304 | unsigned long long margin; | 1269 | unsigned long long margin; |
1305 | 1270 | ||
1306 | margin = res_counter_margin(&mem->res); | 1271 | margin = res_counter_margin(&memcg->res); |
1307 | if (do_swap_account) | 1272 | if (do_swap_account) |
1308 | margin = min(margin, res_counter_margin(&mem->memsw)); | 1273 | margin = min(margin, res_counter_margin(&memcg->memsw)); |
1309 | return margin >> PAGE_SHIFT; | 1274 | return margin >> PAGE_SHIFT; |
1310 | } | 1275 | } |
1311 | 1276 | ||
@@ -1320,33 +1285,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) | |||
1320 | return memcg->swappiness; | 1285 | return memcg->swappiness; |
1321 | } | 1286 | } |
1322 | 1287 | ||
1323 | static void mem_cgroup_start_move(struct mem_cgroup *mem) | 1288 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) |
1324 | { | 1289 | { |
1325 | int cpu; | 1290 | int cpu; |
1326 | 1291 | ||
1327 | get_online_cpus(); | 1292 | get_online_cpus(); |
1328 | spin_lock(&mem->pcp_counter_lock); | 1293 | spin_lock(&memcg->pcp_counter_lock); |
1329 | for_each_online_cpu(cpu) | 1294 | for_each_online_cpu(cpu) |
1330 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; | 1295 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; |
1331 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; | 1296 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; |
1332 | spin_unlock(&mem->pcp_counter_lock); | 1297 | spin_unlock(&memcg->pcp_counter_lock); |
1333 | put_online_cpus(); | 1298 | put_online_cpus(); |
1334 | 1299 | ||
1335 | synchronize_rcu(); | 1300 | synchronize_rcu(); |
1336 | } | 1301 | } |
1337 | 1302 | ||
1338 | static void mem_cgroup_end_move(struct mem_cgroup *mem) | 1303 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) |
1339 | { | 1304 | { |
1340 | int cpu; | 1305 | int cpu; |
1341 | 1306 | ||
1342 | if (!mem) | 1307 | if (!memcg) |
1343 | return; | 1308 | return; |
1344 | get_online_cpus(); | 1309 | get_online_cpus(); |
1345 | spin_lock(&mem->pcp_counter_lock); | 1310 | spin_lock(&memcg->pcp_counter_lock); |
1346 | for_each_online_cpu(cpu) | 1311 | for_each_online_cpu(cpu) |
1347 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; | 1312 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; |
1348 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; | 1313 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; |
1349 | spin_unlock(&mem->pcp_counter_lock); | 1314 | spin_unlock(&memcg->pcp_counter_lock); |
1350 | put_online_cpus(); | 1315 | put_online_cpus(); |
1351 | } | 1316 | } |
1352 | /* | 1317 | /* |
@@ -1361,13 +1326,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem) | |||
1361 | * waiting at high-memory pressure caused by "move". | 1326 | * waiting at high-memory pressure caused by "move".
1362 | */ | 1327 | */ |
1363 | 1328 | ||
1364 | static bool mem_cgroup_stealed(struct mem_cgroup *mem) | 1329 | static bool mem_cgroup_stealed(struct mem_cgroup *memcg) |
1365 | { | 1330 | { |
1366 | VM_BUG_ON(!rcu_read_lock_held()); | 1331 | VM_BUG_ON(!rcu_read_lock_held()); |
1367 | return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; | 1332 | return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; |
1368 | } | 1333 | } |
1369 | 1334 | ||
1370 | static bool mem_cgroup_under_move(struct mem_cgroup *mem) | 1335 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) |
1371 | { | 1336 | { |
1372 | struct mem_cgroup *from; | 1337 | struct mem_cgroup *from; |
1373 | struct mem_cgroup *to; | 1338 | struct mem_cgroup *to; |
@@ -1382,17 +1347,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) | |||
1382 | if (!from) | 1347 | if (!from) |
1383 | goto unlock; | 1348 | goto unlock; |
1384 | 1349 | ||
1385 | ret = mem_cgroup_same_or_subtree(mem, from) | 1350 | ret = mem_cgroup_same_or_subtree(memcg, from) |
1386 | || mem_cgroup_same_or_subtree(mem, to); | 1351 | || mem_cgroup_same_or_subtree(memcg, to); |
1387 | unlock: | 1352 | unlock: |
1388 | spin_unlock(&mc.lock); | 1353 | spin_unlock(&mc.lock); |
1389 | return ret; | 1354 | return ret; |
1390 | } | 1355 | } |
1391 | 1356 | ||
1392 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) | 1357 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) |
1393 | { | 1358 | { |
1394 | if (mc.moving_task && current != mc.moving_task) { | 1359 | if (mc.moving_task && current != mc.moving_task) { |
1395 | if (mem_cgroup_under_move(mem)) { | 1360 | if (mem_cgroup_under_move(memcg)) { |
1396 | DEFINE_WAIT(wait); | 1361 | DEFINE_WAIT(wait); |
1397 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); | 1362 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); |
1398 | /* moving charge context might have finished. */ | 1363 | /* moving charge context might have finished. */ |
@@ -1476,12 +1441,12 @@ done: | |||
1476 | * This function returns the number of memcgs under the hierarchy tree. Returns | 1441 | * This function returns the number of memcgs under the hierarchy tree. Returns
1477 | * 1 (self count) if there are no children. | 1442 | * 1 (self count) if there are no children.
1478 | */ | 1443 | */ |
1479 | static int mem_cgroup_count_children(struct mem_cgroup *mem) | 1444 | static int mem_cgroup_count_children(struct mem_cgroup *memcg) |
1480 | { | 1445 | { |
1481 | int num = 0; | 1446 | int num = 0; |
1482 | struct mem_cgroup *iter; | 1447 | struct mem_cgroup *iter; |
1483 | 1448 | ||
1484 | for_each_mem_cgroup_tree(iter, mem) | 1449 | for_each_mem_cgroup_tree(iter, memcg) |
1485 | num++; | 1450 | num++; |
1486 | return num; | 1451 | return num; |
1487 | } | 1452 | } |
@@ -1511,21 +1476,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1511 | * that to reclaim free pages from. | 1476 | * that to reclaim free pages from. |
1512 | */ | 1477 | */ |
1513 | static struct mem_cgroup * | 1478 | static struct mem_cgroup * |
1514 | mem_cgroup_select_victim(struct mem_cgroup *root_mem) | 1479 | mem_cgroup_select_victim(struct mem_cgroup *root_memcg) |
1515 | { | 1480 | { |
1516 | struct mem_cgroup *ret = NULL; | 1481 | struct mem_cgroup *ret = NULL; |
1517 | struct cgroup_subsys_state *css; | 1482 | struct cgroup_subsys_state *css; |
1518 | int nextid, found; | 1483 | int nextid, found; |
1519 | 1484 | ||
1520 | if (!root_mem->use_hierarchy) { | 1485 | if (!root_memcg->use_hierarchy) { |
1521 | css_get(&root_mem->css); | 1486 | css_get(&root_memcg->css); |
1522 | ret = root_mem; | 1487 | ret = root_memcg; |
1523 | } | 1488 | } |
1524 | 1489 | ||
1525 | while (!ret) { | 1490 | while (!ret) { |
1526 | rcu_read_lock(); | 1491 | rcu_read_lock(); |
1527 | nextid = root_mem->last_scanned_child + 1; | 1492 | nextid = root_memcg->last_scanned_child + 1; |
1528 | css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, | 1493 | css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, |
1529 | &found); | 1494 | &found); |
1530 | if (css && css_tryget(css)) | 1495 | if (css && css_tryget(css)) |
1531 | ret = container_of(css, struct mem_cgroup, css); | 1496 | ret = container_of(css, struct mem_cgroup, css); |
@@ -1534,9 +1499,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1534 | /* Updates scanning parameter */ | 1499 | /* Updates scanning parameter */ |
1535 | if (!css) { | 1500 | if (!css) { |
1536 | /* this means start scan from ID:1 */ | 1501 | /* this means start scan from ID:1 */ |
1537 | root_mem->last_scanned_child = 0; | 1502 | root_memcg->last_scanned_child = 0; |
1538 | } else | 1503 | } else |
1539 | root_mem->last_scanned_child = found; | 1504 | root_memcg->last_scanned_child = found; |
1540 | } | 1505 | } |
1541 | 1506 | ||
1542 | return ret; | 1507 | return ret; |
@@ -1552,14 +1517,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1552 | * reclaimable pages on a node. Returns true if there are any reclaimable | 1517 | * reclaimable pages on a node. Returns true if there are any reclaimable |
1553 | * pages in the node. | 1518 | * pages in the node. |
1554 | */ | 1519 | */ |
1555 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, | 1520 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, |
1556 | int nid, bool noswap) | 1521 | int nid, bool noswap) |
1557 | { | 1522 | { |
1558 | if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) | 1523 | if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) |
1559 | return true; | 1524 | return true; |
1560 | if (noswap || !total_swap_pages) | 1525 | if (noswap || !total_swap_pages) |
1561 | return false; | 1526 | return false; |
1562 | if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) | 1527 | if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) |
1563 | return true; | 1528 | return true; |
1564 | return false; | 1529 | return false; |
1565 | 1530 | ||
@@ -1572,29 +1537,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, | |||
1572 | * nodes based on the zonelist. So update the list loosely once per 10 secs. | 1537 | * nodes based on the zonelist. So update the list loosely once per 10 secs. |
1573 | * | 1538 | * |
1574 | */ | 1539 | */ |
1575 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) | 1540 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) |
1576 | { | 1541 | { |
1577 | int nid; | 1542 | int nid; |
1578 | /* | 1543 | /* |
1579 | * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET | 1544 | * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET |
1580 | * pagein/pageout changes since the last update. | 1545 | * pagein/pageout changes since the last update. |
1581 | */ | 1546 | */ |
1582 | if (!atomic_read(&mem->numainfo_events)) | 1547 | if (!atomic_read(&memcg->numainfo_events)) |
1583 | return; | 1548 | return; |
1584 | if (atomic_inc_return(&mem->numainfo_updating) > 1) | 1549 | if (atomic_inc_return(&memcg->numainfo_updating) > 1) |
1585 | return; | 1550 | return; |
1586 | 1551 | ||
1587 | /* make a nodemask where this memcg uses memory from */ | 1552 | /* make a nodemask where this memcg uses memory from */ |
1588 | mem->scan_nodes = node_states[N_HIGH_MEMORY]; | 1553 | memcg->scan_nodes = node_states[N_HIGH_MEMORY]; |
1589 | 1554 | ||
1590 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | 1555 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { |
1591 | 1556 | ||
1592 | if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) | 1557 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) |
1593 | node_clear(nid, mem->scan_nodes); | 1558 | node_clear(nid, memcg->scan_nodes); |
1594 | } | 1559 | } |
1595 | 1560 | ||
1596 | atomic_set(&mem->numainfo_events, 0); | 1561 | atomic_set(&memcg->numainfo_events, 0); |
1597 | atomic_set(&mem->numainfo_updating, 0); | 1562 | atomic_set(&memcg->numainfo_updating, 0); |
1598 | } | 1563 | } |
1599 | 1564 | ||
1600 | /* | 1565 | /* |
@@ -1609,16 +1574,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) | |||
1609 | * | 1574 | * |
1610 | * Now, we use round-robin. Better algorithm is welcomed. | 1575 | * Now, we use round-robin. Better algorithm is welcomed. |
1611 | */ | 1576 | */ |
1612 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | 1577 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1613 | { | 1578 | { |
1614 | int node; | 1579 | int node; |
1615 | 1580 | ||
1616 | mem_cgroup_may_update_nodemask(mem); | 1581 | mem_cgroup_may_update_nodemask(memcg); |
1617 | node = mem->last_scanned_node; | 1582 | node = memcg->last_scanned_node; |
1618 | 1583 | ||
1619 | node = next_node(node, mem->scan_nodes); | 1584 | node = next_node(node, memcg->scan_nodes); |
1620 | if (node == MAX_NUMNODES) | 1585 | if (node == MAX_NUMNODES) |
1621 | node = first_node(mem->scan_nodes); | 1586 | node = first_node(memcg->scan_nodes); |
1622 | /* | 1587 | /* |
1623 | * We call this when we hit limit, not when pages are added to LRU. | 1588 | * We call this when we hit limit, not when pages are added to LRU. |
1624 | * No LRU may hold pages because all pages are UNEVICTABLE or | 1589 | * No LRU may hold pages because all pages are UNEVICTABLE or |
@@ -1628,7 +1593,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | |||
1628 | if (unlikely(node == MAX_NUMNODES)) | 1593 | if (unlikely(node == MAX_NUMNODES)) |
1629 | node = numa_node_id(); | 1594 | node = numa_node_id(); |
1630 | 1595 | ||
1631 | mem->last_scanned_node = node; | 1596 | memcg->last_scanned_node = node; |
1632 | return node; | 1597 | return node; |
1633 | } | 1598 | } |
1634 | 1599 | ||
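
mem_cgroup_select_victim_node() above rotates through scan_nodes, resuming after last_scanned_node, wrapping with first_node(), and falling back to the local node when the mask turns out to be empty. A userspace sketch of that round-robin over a plain bitmask, with next_node()/first_node() reimplemented just for the example:

#include <stdio.h>

#define MAX_NODES 8

/* find the next set bit strictly after 'node'; MAX_NODES means "none" */
static int next_node(int node, unsigned int mask)
{
    for (int n = node + 1; n < MAX_NODES; n++)
        if (mask & (1u << n))
            return n;
    return MAX_NODES;
}

/* first set bit in the mask, or MAX_NODES if the mask is empty */
static int first_node(unsigned int mask)
{
    return next_node(-1, mask);
}

/* round-robin pick: continue after the last node, wrap, fall back to 0 */
static int select_victim_node(int *last_scanned, unsigned int scan_nodes)
{
    int node = next_node(*last_scanned, scan_nodes);
    if (node == MAX_NODES)
        node = first_node(scan_nodes);
    if (node == MAX_NODES)      /* every node empty or unreclaimable */
        node = 0;               /* stand-in for numa_node_id() */
    *last_scanned = node;
    return node;
}

int main(void)
{
    unsigned int scan_nodes = 0x2d;   /* nodes 0, 2, 3, 5 have memory */
    int last = 0;

    for (int i = 0; i < 6; i++)
        printf("%d ", select_victim_node(&last, scan_nodes));
    printf("\n");                     /* prints: 2 3 5 0 2 3 */
    return 0;
}
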
@@ -1638,7 +1603,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | |||
1638 | * unused nodes. But scan_nodes is lazily updated and may not contain | 1603 | * unused nodes. But scan_nodes is lazily updated and may not contain
1639 | * enough new information. We need to do double check. | 1604 | * enough new information. We need to do double check. |
1640 | */ | 1605 | */ |
1641 | bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | 1606 | bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1642 | { | 1607 | { |
1643 | int nid; | 1608 | int nid; |
1644 | 1609 | ||
@@ -1646,12 +1611,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | |||
1646 | * quick check...making use of scan_node. | 1611 | * quick check...making use of scan_node. |
1647 | * We can skip unused nodes. | 1612 | * We can skip unused nodes. |
1648 | */ | 1613 | */ |
1649 | if (!nodes_empty(mem->scan_nodes)) { | 1614 | if (!nodes_empty(memcg->scan_nodes)) { |
1650 | for (nid = first_node(mem->scan_nodes); | 1615 | for (nid = first_node(memcg->scan_nodes); |
1651 | nid < MAX_NUMNODES; | 1616 | nid < MAX_NUMNODES; |
1652 | nid = next_node(nid, mem->scan_nodes)) { | 1617 | nid = next_node(nid, memcg->scan_nodes)) { |
1653 | 1618 | ||
1654 | if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) | 1619 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
1655 | return true; | 1620 | return true; |
1656 | } | 1621 | } |
1657 | } | 1622 | } |
@@ -1659,77 +1624,39 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | |||
1659 | * Check rest of nodes. | 1624 | * Check rest of nodes. |
1660 | */ | 1625 | */ |
1661 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1626 | for_each_node_state(nid, N_HIGH_MEMORY) { |
1662 | if (node_isset(nid, mem->scan_nodes)) | 1627 | if (node_isset(nid, memcg->scan_nodes)) |
1663 | continue; | 1628 | continue; |
1664 | if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) | 1629 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
1665 | return true; | 1630 | return true; |
1666 | } | 1631 | } |
1667 | return false; | 1632 | return false; |
1668 | } | 1633 | } |
1669 | 1634 | ||
1670 | #else | 1635 | #else |
1671 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | 1636 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1672 | { | 1637 | { |
1673 | return 0; | 1638 | return 0; |
1674 | } | 1639 | } |
1675 | 1640 | ||
1676 | bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | 1641 | bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1677 | { | 1642 | { |
1678 | return test_mem_cgroup_node_reclaimable(mem, 0, noswap); | 1643 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); |
1679 | } | 1644 | } |
1680 | #endif | 1645 | #endif |
1681 | 1646 | ||
1682 | static void __mem_cgroup_record_scanstat(unsigned long *stats, | ||
1683 | struct memcg_scanrecord *rec) | ||
1684 | { | ||
1685 | |||
1686 | stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; | ||
1687 | stats[SCAN_ANON] += rec->nr_scanned[0]; | ||
1688 | stats[SCAN_FILE] += rec->nr_scanned[1]; | ||
1689 | |||
1690 | stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; | ||
1691 | stats[ROTATE_ANON] += rec->nr_rotated[0]; | ||
1692 | stats[ROTATE_FILE] += rec->nr_rotated[1]; | ||
1693 | |||
1694 | stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; | ||
1695 | stats[FREED_ANON] += rec->nr_freed[0]; | ||
1696 | stats[FREED_FILE] += rec->nr_freed[1]; | ||
1697 | |||
1698 | stats[ELAPSED] += rec->elapsed; | ||
1699 | } | ||
1700 | |||
1701 | static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) | ||
1702 | { | ||
1703 | struct mem_cgroup *mem; | ||
1704 | int context = rec->context; | ||
1705 | |||
1706 | if (context >= NR_SCAN_CONTEXT) | ||
1707 | return; | ||
1708 | |||
1709 | mem = rec->mem; | ||
1710 | spin_lock(&mem->scanstat.lock); | ||
1711 | __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); | ||
1712 | spin_unlock(&mem->scanstat.lock); | ||
1713 | |||
1714 | mem = rec->root; | ||
1715 | spin_lock(&mem->scanstat.lock); | ||
1716 | __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); | ||
1717 | spin_unlock(&mem->scanstat.lock); | ||
1718 | } | ||
1719 | |||
1720 | /* | 1647 | /* |
1721 | * Scan the hierarchy if needed to reclaim memory. We remember the last child | 1648 | * Scan the hierarchy if needed to reclaim memory. We remember the last child |
1722 | * we reclaimed from, so that we don't end up penalizing one child extensively | 1649 | * we reclaimed from, so that we don't end up penalizing one child extensively |
1723 | * based on its position in the children list. | 1650 | * based on its position in the children list. |
1724 | * | 1651 | * |
1725 | * root_mem is the original ancestor that we've been reclaiming from. | 1652 | * root_memcg is the original ancestor that we've been reclaiming from.
1726 | * | 1653 | * |
1727 | * We give up and return to the caller when we visit root_mem twice. | 1654 | * We give up and return to the caller when we visit root_memcg twice. |
1728 | * (other groups can be removed while we're walking....) | 1655 | * (other groups can be removed while we're walking....) |
1729 | * | 1656 | * |
1730 | * If shrink==true, to avoid freeing too much, this returns immediately. | 1657 | * If shrink==true, to avoid freeing too much, this returns immediately.
1731 | */ | 1658 | */ |
1732 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | 1659 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, |
1733 | struct zone *zone, | 1660 | struct zone *zone, |
1734 | gfp_t gfp_mask, | 1661 | gfp_t gfp_mask, |
1735 | unsigned long reclaim_options, | 1662 | unsigned long reclaim_options, |
@@ -1741,28 +1668,18 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1741 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | 1668 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; |
1742 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | 1669 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; |
1743 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | 1670 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; |
1744 | struct memcg_scanrecord rec; | ||
1745 | unsigned long excess; | 1671 | unsigned long excess; |
1746 | unsigned long scanned; | 1672 | unsigned long nr_scanned; |
1747 | 1673 | ||
1748 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1674 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; |
1749 | 1675 | ||
1750 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1676 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
1751 | if (!check_soft && !shrink && root_mem->memsw_is_minimum) | 1677 | if (!check_soft && !shrink && root_memcg->memsw_is_minimum) |
1752 | noswap = true; | 1678 | noswap = true; |
1753 | 1679 | ||
1754 | if (shrink) | ||
1755 | rec.context = SCAN_BY_SHRINK; | ||
1756 | else if (check_soft) | ||
1757 | rec.context = SCAN_BY_SYSTEM; | ||
1758 | else | ||
1759 | rec.context = SCAN_BY_LIMIT; | ||
1760 | |||
1761 | rec.root = root_mem; | ||
1762 | |||
1763 | while (1) { | 1680 | while (1) { |
1764 | victim = mem_cgroup_select_victim(root_mem); | 1681 | victim = mem_cgroup_select_victim(root_memcg); |
1765 | if (victim == root_mem) { | 1682 | if (victim == root_memcg) { |
1766 | loop++; | 1683 | loop++; |
1767 | /* | 1684 | /* |
1768 | * We are not draining per cpu cached charges during | 1685 | * We are not draining per cpu cached charges during |
@@ -1771,7 +1688,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1771 | * charges will not give any. | 1688 | * charges will not give any. |
1772 | */ | 1689 | */ |
1773 | if (!check_soft && loop >= 1) | 1690 | if (!check_soft && loop >= 1) |
1774 | drain_all_stock_async(root_mem); | 1691 | drain_all_stock_async(root_memcg); |
1775 | if (loop >= 2) { | 1692 | if (loop >= 2) { |
1776 | /* | 1693 | /* |
1777 | * If we have not been able to reclaim | 1694 | * If we have not been able to reclaim |
@@ -1800,23 +1717,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1800 | css_put(&victim->css); | 1717 | css_put(&victim->css); |
1801 | continue; | 1718 | continue; |
1802 | } | 1719 | } |
1803 | rec.mem = victim; | ||
1804 | rec.nr_scanned[0] = 0; | ||
1805 | rec.nr_scanned[1] = 0; | ||
1806 | rec.nr_rotated[0] = 0; | ||
1807 | rec.nr_rotated[1] = 0; | ||
1808 | rec.nr_freed[0] = 0; | ||
1809 | rec.nr_freed[1] = 0; | ||
1810 | rec.elapsed = 0; | ||
1811 | /* we use swappiness of local cgroup */ | 1720 | /* we use swappiness of local cgroup */ |
1812 | if (check_soft) { | 1721 | if (check_soft) { |
1813 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1722 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1814 | noswap, zone, &rec, &scanned); | 1723 | noswap, zone, &nr_scanned); |
1815 | *total_scanned += scanned; | 1724 | *total_scanned += nr_scanned; |
1816 | } else | 1725 | } else |
1817 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1726 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, |
1818 | noswap, &rec); | 1727 | noswap); |
1819 | mem_cgroup_record_scanstat(&rec); | ||
1820 | css_put(&victim->css); | 1728 | css_put(&victim->css); |
1821 | /* | 1729 | /* |
1822 | * At shrinking usage, we can't check we should stop here or | 1730 | * At shrinking usage, we can't check we should stop here or |
@@ -1827,9 +1735,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1827 | return ret; | 1735 | return ret; |
1828 | total += ret; | 1736 | total += ret; |
1829 | if (check_soft) { | 1737 | if (check_soft) { |
1830 | if (!res_counter_soft_limit_excess(&root_mem->res)) | 1738 | if (!res_counter_soft_limit_excess(&root_memcg->res)) |
1831 | return total; | 1739 | return total; |
1832 | } else if (mem_cgroup_margin(root_mem)) | 1740 | } else if (mem_cgroup_margin(root_memcg)) |
1833 | return total; | 1741 | return total; |
1834 | } | 1742 | } |
1835 | return total; | 1743 | return total; |
@@ -1840,69 +1748,62 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1840 | * If someone is running, return false. | 1748 | * If someone is running, return false. |
1841 | * Has to be called with memcg_oom_lock | 1749 | * Has to be called with memcg_oom_lock |
1842 | */ | 1750 | */ |
1843 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | 1751 | static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) |
1844 | { | 1752 | { |
1845 | int lock_count = -1; | ||
1846 | struct mem_cgroup *iter, *failed = NULL; | 1753 | struct mem_cgroup *iter, *failed = NULL; |
1847 | bool cond = true; | 1754 | bool cond = true; |
1848 | 1755 | ||
1849 | for_each_mem_cgroup_tree_cond(iter, mem, cond) { | 1756 | for_each_mem_cgroup_tree_cond(iter, memcg, cond) { |
1850 | bool locked = iter->oom_lock; | 1757 | if (iter->oom_lock) { |
1851 | |||
1852 | iter->oom_lock = true; | ||
1853 | if (lock_count == -1) | ||
1854 | lock_count = iter->oom_lock; | ||
1855 | else if (lock_count != locked) { | ||
1856 | /* | 1758 | /* |
1857 | * this subtree of our hierarchy is already locked | 1759 | * this subtree of our hierarchy is already locked |
1858 | * so we cannot give a lock. | 1760 | * so we cannot give a lock. |
1859 | */ | 1761 | */ |
1860 | lock_count = 0; | ||
1861 | failed = iter; | 1762 | failed = iter; |
1862 | cond = false; | 1763 | cond = false; |
1863 | } | 1764 | } else |
1765 | iter->oom_lock = true; | ||
1864 | } | 1766 | } |
1865 | 1767 | ||
1866 | if (!failed) | 1768 | if (!failed) |
1867 | goto done; | 1769 | return true; |
1868 | 1770 | ||
1869 | /* | 1771 | /* |
1870 | * OK, we failed to lock the whole subtree so we have to clean up | 1772 | * OK, we failed to lock the whole subtree so we have to clean up |
1871 | * what we set up to the failing subtree | 1773 | * what we set up to the failing subtree |
1872 | */ | 1774 | */ |
1873 | cond = true; | 1775 | cond = true; |
1874 | for_each_mem_cgroup_tree_cond(iter, mem, cond) { | 1776 | for_each_mem_cgroup_tree_cond(iter, memcg, cond) { |
1875 | if (iter == failed) { | 1777 | if (iter == failed) { |
1876 | cond = false; | 1778 | cond = false; |
1877 | continue; | 1779 | continue; |
1878 | } | 1780 | } |
1879 | iter->oom_lock = false; | 1781 | iter->oom_lock = false; |
1880 | } | 1782 | } |
1881 | done: | 1783 | return false; |
1882 | return lock_count; | ||
1883 | } | 1784 | } |
1884 | 1785 | ||
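
The rewritten mem_cgroup_oom_lock() above stops at the first group whose oom_lock is already set, then walks the hierarchy a second time and clears only the flags it managed to set before the failing entry. A compact model of that all-or-nothing locking with rollback, using an array in place of the hierarchy walk and no real locking:

#include <stdbool.h>
#include <stdio.h>

#define NGROUPS 5

/* try to mark every group; on the first one already marked, undo the
 * marks we made so far and report failure (all-or-nothing semantics) */
static bool oom_lock_all(bool oom_lock[], int n)
{
    int failed = -1;

    for (int i = 0; i < n; i++) {
        if (oom_lock[i]) {      /* part of the subtree is already locked */
            failed = i;
            break;
        }
        oom_lock[i] = true;
    }
    if (failed < 0)
        return true;

    for (int i = 0; i < failed; i++)   /* roll back only what we set */
        oom_lock[i] = false;
    return false;
}

int main(void)
{
    bool locks[NGROUPS] = { false, false, true, false, false };

    printf("first try: %s\n", oom_lock_all(locks, NGROUPS) ? "locked" : "busy");
    locks[2] = false;                  /* the holder released it */
    printf("second try: %s\n", oom_lock_all(locks, NGROUPS) ? "locked" : "busy");
    return 0;
}
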
1885 | /* | 1786 | /* |
1886 | * Has to be called with memcg_oom_lock | 1787 | * Has to be called with memcg_oom_lock |
1887 | */ | 1788 | */ |
1888 | static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) | 1789 | static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) |
1889 | { | 1790 | { |
1890 | struct mem_cgroup *iter; | 1791 | struct mem_cgroup *iter; |
1891 | 1792 | ||
1892 | for_each_mem_cgroup_tree(iter, mem) | 1793 | for_each_mem_cgroup_tree(iter, memcg) |
1893 | iter->oom_lock = false; | 1794 | iter->oom_lock = false; |
1894 | return 0; | 1795 | return 0; |
1895 | } | 1796 | } |
1896 | 1797 | ||
1897 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) | 1798 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) |
1898 | { | 1799 | { |
1899 | struct mem_cgroup *iter; | 1800 | struct mem_cgroup *iter; |
1900 | 1801 | ||
1901 | for_each_mem_cgroup_tree(iter, mem) | 1802 | for_each_mem_cgroup_tree(iter, memcg) |
1902 | atomic_inc(&iter->under_oom); | 1803 | atomic_inc(&iter->under_oom); |
1903 | } | 1804 | } |
1904 | 1805 | ||
1905 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) | 1806 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) |
1906 | { | 1807 | { |
1907 | struct mem_cgroup *iter; | 1808 | struct mem_cgroup *iter; |
1908 | 1809 | ||
@@ -1911,7 +1812,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) | |||
1911 | * mem_cgroup_oom_lock() may not be called. We have to use | 1812 | * mem_cgroup_oom_lock() may not be called. We have to use |
1912 | * atomic_add_unless() here. | 1813 | * atomic_add_unless() here. |
1913 | */ | 1814 | */ |
1914 | for_each_mem_cgroup_tree(iter, mem) | 1815 | for_each_mem_cgroup_tree(iter, memcg) |
1915 | atomic_add_unless(&iter->under_oom, -1, 0); | 1816 | atomic_add_unless(&iter->under_oom, -1, 0); |
1916 | } | 1817 | } |
1917 | 1818 | ||
@@ -1926,85 +1827,85 @@ struct oom_wait_info { | |||
1926 | static int memcg_oom_wake_function(wait_queue_t *wait, | 1827 | static int memcg_oom_wake_function(wait_queue_t *wait, |
1927 | unsigned mode, int sync, void *arg) | 1828 | unsigned mode, int sync, void *arg) |
1928 | { | 1829 | { |
1929 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, | 1830 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, |
1930 | *oom_wait_mem; | 1831 | *oom_wait_memcg; |
1931 | struct oom_wait_info *oom_wait_info; | 1832 | struct oom_wait_info *oom_wait_info; |
1932 | 1833 | ||
1933 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1834 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
1934 | oom_wait_mem = oom_wait_info->mem; | 1835 | oom_wait_memcg = oom_wait_info->mem; |
1935 | 1836 | ||
1936 | /* | 1837 | /* |
1937 | * Both of oom_wait_info->mem and wake_mem are stable under us. | 1838 | * Both of oom_wait_info->mem and wake_mem are stable under us. |
1938 | * Then we can use css_is_ancestor without taking care of RCU. | 1839 | * Then we can use css_is_ancestor without taking care of RCU. |
1939 | */ | 1840 | */ |
1940 | if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) | 1841 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) |
1941 | && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) | 1842 | && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) |
1942 | return 0; | 1843 | return 0; |
1943 | return autoremove_wake_function(wait, mode, sync, arg); | 1844 | return autoremove_wake_function(wait, mode, sync, arg); |
1944 | } | 1845 | } |
1945 | 1846 | ||
1946 | static void memcg_wakeup_oom(struct mem_cgroup *mem) | 1847 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) |
1947 | { | 1848 | { |
1948 | /* for filtering, pass "mem" as argument. */ | 1849 | /* for filtering, pass "memcg" as argument. */ |
1949 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); | 1850 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); |
1950 | } | 1851 | } |
1951 | 1852 | ||
1952 | static void memcg_oom_recover(struct mem_cgroup *mem) | 1853 | static void memcg_oom_recover(struct mem_cgroup *memcg) |
1953 | { | 1854 | { |
1954 | if (mem && atomic_read(&mem->under_oom)) | 1855 | if (memcg && atomic_read(&memcg->under_oom)) |
1955 | memcg_wakeup_oom(mem); | 1856 | memcg_wakeup_oom(memcg); |
1956 | } | 1857 | } |
1957 | 1858 | ||
1958 | /* | 1859 | /* |
1959 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1860 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1960 | */ | 1861 | */ |
1961 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | 1862 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) |
1962 | { | 1863 | { |
1963 | struct oom_wait_info owait; | 1864 | struct oom_wait_info owait; |
1964 | bool locked, need_to_kill; | 1865 | bool locked, need_to_kill; |
1965 | 1866 | ||
1966 | owait.mem = mem; | 1867 | owait.mem = memcg; |
1967 | owait.wait.flags = 0; | 1868 | owait.wait.flags = 0; |
1968 | owait.wait.func = memcg_oom_wake_function; | 1869 | owait.wait.func = memcg_oom_wake_function; |
1969 | owait.wait.private = current; | 1870 | owait.wait.private = current; |
1970 | INIT_LIST_HEAD(&owait.wait.task_list); | 1871 | INIT_LIST_HEAD(&owait.wait.task_list); |
1971 | need_to_kill = true; | 1872 | need_to_kill = true; |
1972 | mem_cgroup_mark_under_oom(mem); | 1873 | mem_cgroup_mark_under_oom(memcg); |
1973 | 1874 | ||
1974 | /* At first, try to OOM lock hierarchy under mem.*/ | 1875 | /* At first, try to OOM lock hierarchy under memcg.*/ |
1975 | spin_lock(&memcg_oom_lock); | 1876 | spin_lock(&memcg_oom_lock); |
1976 | locked = mem_cgroup_oom_lock(mem); | 1877 | locked = mem_cgroup_oom_lock(memcg); |
1977 | /* | 1878 | /* |
1978 | * Even if signal_pending(), we can't quit charge() loop without | 1879 | * Even if signal_pending(), we can't quit charge() loop without |
1979 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 1880 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL |
1980 | * under OOM is always welcomed, use TASK_KILLABLE here. | 1881 | * under OOM is always welcomed, use TASK_KILLABLE here. |
1981 | */ | 1882 | */ |
1982 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 1883 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
1983 | if (!locked || mem->oom_kill_disable) | 1884 | if (!locked || memcg->oom_kill_disable) |
1984 | need_to_kill = false; | 1885 | need_to_kill = false; |
1985 | if (locked) | 1886 | if (locked) |
1986 | mem_cgroup_oom_notify(mem); | 1887 | mem_cgroup_oom_notify(memcg); |
1987 | spin_unlock(&memcg_oom_lock); | 1888 | spin_unlock(&memcg_oom_lock); |
1988 | 1889 | ||
1989 | if (need_to_kill) { | 1890 | if (need_to_kill) { |
1990 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1891 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1991 | mem_cgroup_out_of_memory(mem, mask); | 1892 | mem_cgroup_out_of_memory(memcg, mask); |
1992 | } else { | 1893 | } else { |
1993 | schedule(); | 1894 | schedule(); |
1994 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1895 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1995 | } | 1896 | } |
1996 | spin_lock(&memcg_oom_lock); | 1897 | spin_lock(&memcg_oom_lock); |
1997 | if (locked) | 1898 | if (locked) |
1998 | mem_cgroup_oom_unlock(mem); | 1899 | mem_cgroup_oom_unlock(memcg); |
1999 | memcg_wakeup_oom(mem); | 1900 | memcg_wakeup_oom(memcg); |
2000 | spin_unlock(&memcg_oom_lock); | 1901 | spin_unlock(&memcg_oom_lock); |
2001 | 1902 | ||
2002 | mem_cgroup_unmark_under_oom(mem); | 1903 | mem_cgroup_unmark_under_oom(memcg); |
2003 | 1904 | ||
2004 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 1905 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) |
2005 | return false; | 1906 | return false; |
2006 | /* Give chance to dying process */ | 1907 | /* Give chance to dying process */ |
2007 | schedule_timeout(1); | 1908 | schedule_timeout_uninterruptible(1); |
2008 | return true; | 1909 | return true; |
2009 | } | 1910 | } |
2010 | 1911 | ||
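
The hunk above is mostly the mem -> memcg rename plus the switch to schedule_timeout_uninterruptible(), but the control flow of mem_cgroup_handle_oom() is easy to lose in the noise: one task wins the per-hierarchy OOM lock and performs the kill/notification, everyone else sleeps killably on memcg_oom_waitq until the winner wakes them. Below is a minimal userspace sketch of that "one worker, others wait" shape, with pthreads standing in for the kernel's spinlock and wait queue; every name in it is illustrative, not a kernel API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static bool oom_in_progress;

static void do_oom_work(void)
{
        printf("winner: running the OOM handling\n");
}

static void handle_oom(void)
{
        pthread_mutex_lock(&lock);
        if (!oom_in_progress) {
                oom_in_progress = true;      /* we "won" the OOM lock */
                pthread_mutex_unlock(&lock);
                do_oom_work();               /* expensive part, lock dropped */
                pthread_mutex_lock(&lock);
                oom_in_progress = false;
                pthread_cond_broadcast(&cv); /* memcg_wakeup_oom() analog */
        } else {
                while (oom_in_progress)      /* losers sleep until woken */
                        pthread_cond_wait(&cv, &lock);
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        handle_oom();
        return 0;
}

The kernel version additionally filters wakeups by cgroup hierarchy (memcg_oom_wake_function) and re-checks fatal signals before retrying; the sketch leaves that out.
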
@@ -2035,7 +1936,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
2035 | void mem_cgroup_update_page_stat(struct page *page, | 1936 | void mem_cgroup_update_page_stat(struct page *page, |
2036 | enum mem_cgroup_page_stat_item idx, int val) | 1937 | enum mem_cgroup_page_stat_item idx, int val) |
2037 | { | 1938 | { |
2038 | struct mem_cgroup *mem; | 1939 | struct mem_cgroup *memcg; |
2039 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1940 | struct page_cgroup *pc = lookup_page_cgroup(page); |
2040 | bool need_unlock = false; | 1941 | bool need_unlock = false; |
2041 | unsigned long uninitialized_var(flags); | 1942 | unsigned long uninitialized_var(flags); |
@@ -2044,16 +1945,16 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
2044 | return; | 1945 | return; |
2045 | 1946 | ||
2046 | rcu_read_lock(); | 1947 | rcu_read_lock(); |
2047 | mem = pc->mem_cgroup; | 1948 | memcg = pc->mem_cgroup; |
2048 | if (unlikely(!mem || !PageCgroupUsed(pc))) | 1949 | if (unlikely(!memcg || !PageCgroupUsed(pc))) |
2049 | goto out; | 1950 | goto out; |
2050 | /* pc->mem_cgroup is unstable ? */ | 1951 | /* pc->mem_cgroup is unstable ? */ |
2051 | if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { | 1952 | if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) { |
2052 | /* take a lock to guard access to pc->mem_cgroup */ | 1953 | /* take a lock to guard access to pc->mem_cgroup */ |
2053 | move_lock_page_cgroup(pc, &flags); | 1954 | move_lock_page_cgroup(pc, &flags); |
2054 | need_unlock = true; | 1955 | need_unlock = true; |
2055 | mem = pc->mem_cgroup; | 1956 | memcg = pc->mem_cgroup; |
2056 | if (!mem || !PageCgroupUsed(pc)) | 1957 | if (!memcg || !PageCgroupUsed(pc)) |
2057 | goto out; | 1958 | goto out; |
2058 | } | 1959 | } |
2059 | 1960 | ||
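
Underneath the rename, mem_cgroup_update_page_stat() follows an optimistic pattern: read pc->mem_cgroup under RCU, and only when the owner might be changing (a charge move is in flight, or the page is huge) take move_lock_page_cgroup() and re-read the pointer under the lock before updating the statistic. A hedged userspace analogy of that double-check pattern follows; struct page_info, the moving flag and page_stat are invented for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct page_info {
        pthread_mutex_t lock;
        int owner;
        bool moving;                        /* mem_cgroup_stealed() analog */
};

static long page_stat[4];

static void update_page_stat(struct page_info *pi, int val)
{
        int owner = pi->owner;              /* optimistic read */
        bool locked = false;

        if (pi->moving) {                   /* owner may change under us */
                pthread_mutex_lock(&pi->lock);
                locked = true;
                owner = pi->owner;          /* re-read under the lock */
        }
        page_stat[owner] += val;
        if (locked)
                pthread_mutex_unlock(&pi->lock);
}

int main(void)
{
        struct page_info pi = { .owner = 2, .moving = false };

        pthread_mutex_init(&pi.lock, NULL);
        update_page_stat(&pi, 1);
        printf("page_stat[2] = %ld\n", page_stat[2]);
        return 0;
}
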
@@ -2069,7 +1970,7 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
2069 | BUG(); | 1970 | BUG(); |
2070 | } | 1971 | } |
2071 | 1972 | ||
2072 | this_cpu_add(mem->stat->count[idx], val); | 1973 | this_cpu_add(memcg->stat->count[idx], val); |
2073 | 1974 | ||
2074 | out: | 1975 | out: |
2075 | if (unlikely(need_unlock)) | 1976 | if (unlikely(need_unlock)) |
@@ -2092,6 +1993,7 @@ struct memcg_stock_pcp { | |||
2092 | #define FLUSHING_CACHED_CHARGE (0) | 1993 | #define FLUSHING_CACHED_CHARGE (0) |
2093 | }; | 1994 | }; |
2094 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 1995 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
1996 | static DEFINE_MUTEX(percpu_charge_mutex); | ||
2095 | 1997 | ||
2096 | /* | 1998 | /* |
2097 | * Try to consume stocked charge on this cpu. If success, one page is consumed | 1999 | * Try to consume stocked charge on this cpu. If success, one page is consumed |
@@ -2099,13 +2001,13 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | |||
2099 | * cgroup which is not current target, returns false. This stock will be | 2001 | * cgroup which is not current target, returns false. This stock will be |
2100 | * refilled. | 2002 | * refilled. |
2101 | */ | 2003 | */ |
2102 | static bool consume_stock(struct mem_cgroup *mem) | 2004 | static bool consume_stock(struct mem_cgroup *memcg) |
2103 | { | 2005 | { |
2104 | struct memcg_stock_pcp *stock; | 2006 | struct memcg_stock_pcp *stock; |
2105 | bool ret = true; | 2007 | bool ret = true; |
2106 | 2008 | ||
2107 | stock = &get_cpu_var(memcg_stock); | 2009 | stock = &get_cpu_var(memcg_stock); |
2108 | if (mem == stock->cached && stock->nr_pages) | 2010 | if (memcg == stock->cached && stock->nr_pages) |
2109 | stock->nr_pages--; | 2011 | stock->nr_pages--; |
2110 | else /* need to call res_counter_charge */ | 2012 | else /* need to call res_counter_charge */ |
2111 | ret = false; | 2013 | ret = false; |
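
consume_stock() above, together with refill_stock() in the next hunk, implements a per-CPU cache of pre-charged pages: the fast path decrements a local count and never touches the shared res_counter; only a miss charges a whole batch and parks the surplus locally. A hedged, thread-local C11 sketch of the same idea (all identifiers invented, and the per-memcg keying of the real stock is omitted):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long global_charge;            /* res_counter analog */
static _Thread_local long cached_pages;      /* memcg_stock analog */

static bool consume_stock(void)
{
        if (cached_pages > 0) {              /* fast path, no shared write */
                cached_pages--;
                return true;
        }
        return false;                        /* caller must charge globally */
}

static void refill_stock(long nr_pages)
{
        cached_pages += nr_pages;            /* keep the surplus locally */
}

static void charge_one_page(long batch)
{
        if (consume_stock())
                return;
        atomic_fetch_add(&global_charge, batch); /* charge a whole batch */
        refill_stock(batch - 1);                 /* one page used right now */
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                charge_one_page(32);
        printf("global charge: %ld pages\n", atomic_load(&global_charge));
        return 0;
}
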
@@ -2146,44 +2048,38 @@ static void drain_local_stock(struct work_struct *dummy) | |||
2146 | * Cache charges (val) taken from the res_counter in the local per-cpu area. | 2048 | * Cache charges (val) taken from the res_counter in the local per-cpu area. |
2147 | * They will be consumed later by the consume_stock() function. | 2049 | * They will be consumed later by the consume_stock() function. |
2148 | */ | 2050 | */ |
2149 | static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) | 2051 | static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
2150 | { | 2052 | { |
2151 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | 2053 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); |
2152 | 2054 | ||
2153 | if (stock->cached != mem) { /* reset if necessary */ | 2055 | if (stock->cached != memcg) { /* reset if necessary */ |
2154 | drain_stock(stock); | 2056 | drain_stock(stock); |
2155 | stock->cached = mem; | 2057 | stock->cached = memcg; |
2156 | } | 2058 | } |
2157 | stock->nr_pages += nr_pages; | 2059 | stock->nr_pages += nr_pages; |
2158 | put_cpu_var(memcg_stock); | 2060 | put_cpu_var(memcg_stock); |
2159 | } | 2061 | } |
2160 | 2062 | ||
2161 | /* | 2063 | /* |
2162 | * Drains all per-CPU charge caches for the given root_mem and the whole | 2064 | * Drains all per-CPU charge caches for the given root_memcg and the whole |
2163 | * subtree of the hierarchy under it. The sync flag says whether we should block | 2065 | * subtree of the hierarchy under it. The sync flag says whether we should block |
2164 | * until the work is done. | 2066 | * until the work is done. |
2165 | */ | 2067 | */ |
2166 | static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) | 2068 | static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) |
2167 | { | 2069 | { |
2168 | int cpu, curcpu; | 2070 | int cpu, curcpu; |
2169 | 2071 | ||
2170 | /* Notify other cpus that system-wide "drain" is running */ | 2072 | /* Notify other cpus that system-wide "drain" is running */ |
2171 | get_online_cpus(); | 2073 | get_online_cpus(); |
2172 | /* | 2074 | curcpu = get_cpu(); |
2173 | * Get a hint for avoiding draining charges on the current cpu, | ||
2174 | * which must be exhausted by our charging. It is not required that | ||
2175 | * this be a precise check, so we use raw_smp_processor_id() instead of | ||
2176 | * getcpu()/putcpu(). | ||
2177 | */ | ||
2178 | curcpu = raw_smp_processor_id(); | ||
2179 | for_each_online_cpu(cpu) { | 2075 | for_each_online_cpu(cpu) { |
2180 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2076 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2181 | struct mem_cgroup *mem; | 2077 | struct mem_cgroup *memcg; |
2182 | 2078 | ||
2183 | mem = stock->cached; | 2079 | memcg = stock->cached; |
2184 | if (!mem || !stock->nr_pages) | 2080 | if (!memcg || !stock->nr_pages) |
2185 | continue; | 2081 | continue; |
2186 | if (!mem_cgroup_same_or_subtree(root_mem, mem)) | 2082 | if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) |
2187 | continue; | 2083 | continue; |
2188 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { | 2084 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { |
2189 | if (cpu == curcpu) | 2085 | if (cpu == curcpu) |
@@ -2192,14 +2088,14 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) | |||
2192 | schedule_work_on(cpu, &stock->work); | 2088 | schedule_work_on(cpu, &stock->work); |
2193 | } | 2089 | } |
2194 | } | 2090 | } |
2091 | put_cpu(); | ||
2195 | 2092 | ||
2196 | if (!sync) | 2093 | if (!sync) |
2197 | goto out; | 2094 | goto out; |
2198 | 2095 | ||
2199 | for_each_online_cpu(cpu) { | 2096 | for_each_online_cpu(cpu) { |
2200 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2097 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2201 | if (mem_cgroup_same_or_subtree(root_mem, stock->cached) && | 2098 | if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) |
2202 | test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2203 | flush_work(&stock->work); | 2099 | flush_work(&stock->work); |
2204 | } | 2100 | } |
2205 | out: | 2101 | out: |
@@ -2212,51 +2108,59 @@ out: | |||
2212 | * expects some charges will be back to res_counter later but cannot wait for | 2108 | * expects some charges will be back to res_counter later but cannot wait for |
2213 | * it. | 2109 | * it. |
2214 | */ | 2110 | */ |
2215 | static void drain_all_stock_async(struct mem_cgroup *root_mem) | 2111 | static void drain_all_stock_async(struct mem_cgroup *root_memcg) |
2216 | { | 2112 | { |
2217 | drain_all_stock(root_mem, false); | 2113 | /* |
2114 | * If someone calls draining, avoid adding more kworker runs. | ||
2115 | */ | ||
2116 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2117 | return; | ||
2118 | drain_all_stock(root_memcg, false); | ||
2119 | mutex_unlock(&percpu_charge_mutex); | ||
2218 | } | 2120 | } |
2219 | 2121 | ||
2220 | /* This is a synchronous drain interface. */ | 2122 | /* This is a synchronous drain interface. */ |
2221 | static void drain_all_stock_sync(struct mem_cgroup *root_mem) | 2123 | static void drain_all_stock_sync(struct mem_cgroup *root_memcg) |
2222 | { | 2124 | { |
2223 | /* called when force_empty is called */ | 2125 | /* called when force_empty is called */ |
2224 | drain_all_stock(root_mem, true); | 2126 | mutex_lock(&percpu_charge_mutex); |
2127 | drain_all_stock(root_memcg, true); | ||
2128 | mutex_unlock(&percpu_charge_mutex); | ||
2225 | } | 2129 | } |
2226 | 2130 | ||
2227 | /* | 2131 | /* |
2228 | * This function drains the percpu counter values from a dead CPU and | 2132 | * This function drains the percpu counter values from a dead CPU and |
2229 | * moves them to the local CPU. Note that this function can be preempted. | 2133 | * moves them to the local CPU. Note that this function can be preempted. |
2230 | */ | 2134 | */ |
2231 | static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) | 2135 | static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) |
2232 | { | 2136 | { |
2233 | int i; | 2137 | int i; |
2234 | 2138 | ||
2235 | spin_lock(&mem->pcp_counter_lock); | 2139 | spin_lock(&memcg->pcp_counter_lock); |
2236 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { | 2140 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { |
2237 | long x = per_cpu(mem->stat->count[i], cpu); | 2141 | long x = per_cpu(memcg->stat->count[i], cpu); |
2238 | 2142 | ||
2239 | per_cpu(mem->stat->count[i], cpu) = 0; | 2143 | per_cpu(memcg->stat->count[i], cpu) = 0; |
2240 | mem->nocpu_base.count[i] += x; | 2144 | memcg->nocpu_base.count[i] += x; |
2241 | } | 2145 | } |
2242 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | 2146 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { |
2243 | unsigned long x = per_cpu(mem->stat->events[i], cpu); | 2147 | unsigned long x = per_cpu(memcg->stat->events[i], cpu); |
2244 | 2148 | ||
2245 | per_cpu(mem->stat->events[i], cpu) = 0; | 2149 | per_cpu(memcg->stat->events[i], cpu) = 0; |
2246 | mem->nocpu_base.events[i] += x; | 2150 | memcg->nocpu_base.events[i] += x; |
2247 | } | 2151 | } |
2248 | /* need to clear ON_MOVE value, works as a kind of lock. */ | 2152 | /* need to clear ON_MOVE value, works as a kind of lock. */ |
2249 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; | 2153 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; |
2250 | spin_unlock(&mem->pcp_counter_lock); | 2154 | spin_unlock(&memcg->pcp_counter_lock); |
2251 | } | 2155 | } |
2252 | 2156 | ||
2253 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) | 2157 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) |
2254 | { | 2158 | { |
2255 | int idx = MEM_CGROUP_ON_MOVE; | 2159 | int idx = MEM_CGROUP_ON_MOVE; |
2256 | 2160 | ||
2257 | spin_lock(&mem->pcp_counter_lock); | 2161 | spin_lock(&memcg->pcp_counter_lock); |
2258 | per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; | 2162 | per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; |
2259 | spin_unlock(&mem->pcp_counter_lock); | 2163 | spin_unlock(&memcg->pcp_counter_lock); |
2260 | } | 2164 | } |
2261 | 2165 | ||
2262 | static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | 2166 | static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, |
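
The new percpu_charge_mutex in the hunk above gives the two drain entry points different semantics: drain_all_stock_async() uses mutex_trylock(), so a drain already in flight simply absorbs the request instead of queueing more kworker runs, while drain_all_stock_sync() takes the mutex unconditionally and always performs a full drain. A small pthread sketch of that trylock-versus-lock split, with illustrative names only:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

static void do_drain(const char *who)
{
        printf("%s: draining caches\n", who);
}

static void drain_async(void)
{
        /* Someone is already draining: don't queue more work. */
        if (pthread_mutex_trylock(&drain_mutex) != 0)
                return;
        do_drain("async");
        pthread_mutex_unlock(&drain_mutex);
}

static void drain_sync(void)
{
        pthread_mutex_lock(&drain_mutex);   /* always performs a full drain */
        do_drain("sync");
        pthread_mutex_unlock(&drain_mutex);
}

int main(void)
{
        drain_async();
        drain_sync();
        return 0;
}
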
@@ -2294,7 +2198,7 @@ enum { | |||
2294 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ | 2198 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ |
2295 | }; | 2199 | }; |
2296 | 2200 | ||
2297 | static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | 2201 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
2298 | unsigned int nr_pages, bool oom_check) | 2202 | unsigned int nr_pages, bool oom_check) |
2299 | { | 2203 | { |
2300 | unsigned long csize = nr_pages * PAGE_SIZE; | 2204 | unsigned long csize = nr_pages * PAGE_SIZE; |
@@ -2303,16 +2207,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
2303 | unsigned long flags = 0; | 2207 | unsigned long flags = 0; |
2304 | int ret; | 2208 | int ret; |
2305 | 2209 | ||
2306 | ret = res_counter_charge(&mem->res, csize, &fail_res); | 2210 | ret = res_counter_charge(&memcg->res, csize, &fail_res); |
2307 | 2211 | ||
2308 | if (likely(!ret)) { | 2212 | if (likely(!ret)) { |
2309 | if (!do_swap_account) | 2213 | if (!do_swap_account) |
2310 | return CHARGE_OK; | 2214 | return CHARGE_OK; |
2311 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); | 2215 | ret = res_counter_charge(&memcg->memsw, csize, &fail_res); |
2312 | if (likely(!ret)) | 2216 | if (likely(!ret)) |
2313 | return CHARGE_OK; | 2217 | return CHARGE_OK; |
2314 | 2218 | ||
2315 | res_counter_uncharge(&mem->res, csize); | 2219 | res_counter_uncharge(&memcg->res, csize); |
2316 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 2220 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); |
2317 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 2221 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
2318 | } else | 2222 | } else |
@@ -2370,12 +2274,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
2370 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 2274 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
2371 | gfp_t gfp_mask, | 2275 | gfp_t gfp_mask, |
2372 | unsigned int nr_pages, | 2276 | unsigned int nr_pages, |
2373 | struct mem_cgroup **memcg, | 2277 | struct mem_cgroup **ptr, |
2374 | bool oom) | 2278 | bool oom) |
2375 | { | 2279 | { |
2376 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | 2280 | unsigned int batch = max(CHARGE_BATCH, nr_pages); |
2377 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2281 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2378 | struct mem_cgroup *mem = NULL; | 2282 | struct mem_cgroup *memcg = NULL; |
2379 | int ret; | 2283 | int ret; |
2380 | 2284 | ||
2381 | /* | 2285 | /* |
@@ -2393,17 +2297,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2393 | * thread group leader migrates. It's possible that mm is not | 2297 | * thread group leader migrates. It's possible that mm is not |
2394 | * set, if so charge the init_mm (happens for pagecache usage). | 2298 | * set, if so charge the init_mm (happens for pagecache usage). |
2395 | */ | 2299 | */ |
2396 | if (!*memcg && !mm) | 2300 | if (!*ptr && !mm) |
2397 | goto bypass; | 2301 | goto bypass; |
2398 | again: | 2302 | again: |
2399 | if (*memcg) { /* css should be a valid one */ | 2303 | if (*ptr) { /* css should be a valid one */ |
2400 | mem = *memcg; | 2304 | memcg = *ptr; |
2401 | VM_BUG_ON(css_is_removed(&mem->css)); | 2305 | VM_BUG_ON(css_is_removed(&memcg->css)); |
2402 | if (mem_cgroup_is_root(mem)) | 2306 | if (mem_cgroup_is_root(memcg)) |
2403 | goto done; | 2307 | goto done; |
2404 | if (nr_pages == 1 && consume_stock(mem)) | 2308 | if (nr_pages == 1 && consume_stock(memcg)) |
2405 | goto done; | 2309 | goto done; |
2406 | css_get(&mem->css); | 2310 | css_get(&memcg->css); |
2407 | } else { | 2311 | } else { |
2408 | struct task_struct *p; | 2312 | struct task_struct *p; |
2409 | 2313 | ||
@@ -2411,7 +2315,7 @@ again: | |||
2411 | p = rcu_dereference(mm->owner); | 2315 | p = rcu_dereference(mm->owner); |
2412 | /* | 2316 | /* |
2413 | * Because we don't have task_lock(), "p" can exit. | 2317 | * Because we don't have task_lock(), "p" can exit. |
2414 | * In that case, "mem" can point to root or p can be NULL with | 2318 | * In that case, "memcg" can point to root or p can be NULL with |
2415 | * race with swapoff. Then, we have a small risk of mis-accounting. | 2319 | * race with swapoff. Then, we have a small risk of mis-accounting. |
2416 | * But such kind of mis-account by race always happens because | 2320 | * But such kind of mis-account by race always happens because |
2417 | * we don't have cgroup_mutex(). It's overkill and we allow that | 2321 | * we don't have cgroup_mutex(). It's overkill and we allow that |
@@ -2419,12 +2323,12 @@ again: | |||
2419 | * (*) swapoff et al. will charge against the mm_struct, not against | 2323 | * (*) swapoff et al. will charge against the mm_struct, not against |
2420 | * the task_struct. So, mm->owner can be NULL. | 2324 | * the task_struct. So, mm->owner can be NULL. |
2421 | */ | 2325 | */ |
2422 | mem = mem_cgroup_from_task(p); | 2326 | memcg = mem_cgroup_from_task(p); |
2423 | if (!mem || mem_cgroup_is_root(mem)) { | 2327 | if (!memcg || mem_cgroup_is_root(memcg)) { |
2424 | rcu_read_unlock(); | 2328 | rcu_read_unlock(); |
2425 | goto done; | 2329 | goto done; |
2426 | } | 2330 | } |
2427 | if (nr_pages == 1 && consume_stock(mem)) { | 2331 | if (nr_pages == 1 && consume_stock(memcg)) { |
2428 | /* | 2332 | /* |
2429 | * It seems dangerous to access memcg without css_get(). | 2333 | * It seems dangerous to access memcg without css_get(). |
2430 | * But considering how consume_stock() works, it's not | 2334 | * But considering how consume_stock() works, it's not |
@@ -2437,7 +2341,7 @@ again: | |||
2437 | goto done; | 2341 | goto done; |
2438 | } | 2342 | } |
2439 | /* after here, we may be blocked. we need to get refcnt */ | 2343 | /* after here, we may be blocked. we need to get refcnt */ |
2440 | if (!css_tryget(&mem->css)) { | 2344 | if (!css_tryget(&memcg->css)) { |
2441 | rcu_read_unlock(); | 2345 | rcu_read_unlock(); |
2442 | goto again; | 2346 | goto again; |
2443 | } | 2347 | } |
@@ -2449,7 +2353,7 @@ again: | |||
2449 | 2353 | ||
2450 | /* If killed, bypass charge */ | 2354 | /* If killed, bypass charge */ |
2451 | if (fatal_signal_pending(current)) { | 2355 | if (fatal_signal_pending(current)) { |
2452 | css_put(&mem->css); | 2356 | css_put(&memcg->css); |
2453 | goto bypass; | 2357 | goto bypass; |
2454 | } | 2358 | } |
2455 | 2359 | ||
@@ -2459,43 +2363,43 @@ again: | |||
2459 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2363 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2460 | } | 2364 | } |
2461 | 2365 | ||
2462 | ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); | 2366 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); |
2463 | switch (ret) { | 2367 | switch (ret) { |
2464 | case CHARGE_OK: | 2368 | case CHARGE_OK: |
2465 | break; | 2369 | break; |
2466 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 2370 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
2467 | batch = nr_pages; | 2371 | batch = nr_pages; |
2468 | css_put(&mem->css); | 2372 | css_put(&memcg->css); |
2469 | mem = NULL; | 2373 | memcg = NULL; |
2470 | goto again; | 2374 | goto again; |
2471 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ | 2375 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ |
2472 | css_put(&mem->css); | 2376 | css_put(&memcg->css); |
2473 | goto nomem; | 2377 | goto nomem; |
2474 | case CHARGE_NOMEM: /* OOM routine works */ | 2378 | case CHARGE_NOMEM: /* OOM routine works */ |
2475 | if (!oom) { | 2379 | if (!oom) { |
2476 | css_put(&mem->css); | 2380 | css_put(&memcg->css); |
2477 | goto nomem; | 2381 | goto nomem; |
2478 | } | 2382 | } |
2479 | /* If oom, we never return -ENOMEM */ | 2383 | /* If oom, we never return -ENOMEM */ |
2480 | nr_oom_retries--; | 2384 | nr_oom_retries--; |
2481 | break; | 2385 | break; |
2482 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ | 2386 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ |
2483 | css_put(&mem->css); | 2387 | css_put(&memcg->css); |
2484 | goto bypass; | 2388 | goto bypass; |
2485 | } | 2389 | } |
2486 | } while (ret != CHARGE_OK); | 2390 | } while (ret != CHARGE_OK); |
2487 | 2391 | ||
2488 | if (batch > nr_pages) | 2392 | if (batch > nr_pages) |
2489 | refill_stock(mem, batch - nr_pages); | 2393 | refill_stock(memcg, batch - nr_pages); |
2490 | css_put(&mem->css); | 2394 | css_put(&memcg->css); |
2491 | done: | 2395 | done: |
2492 | *memcg = mem; | 2396 | *ptr = memcg; |
2493 | return 0; | 2397 | return 0; |
2494 | nomem: | 2398 | nomem: |
2495 | *memcg = NULL; | 2399 | *ptr = NULL; |
2496 | return -ENOMEM; | 2400 | return -ENOMEM; |
2497 | bypass: | 2401 | bypass: |
2498 | *memcg = NULL; | 2402 | *ptr = NULL; |
2499 | return 0; | 2403 | return 0; |
2500 | } | 2404 | } |
2501 | 2405 | ||
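
Underneath the mem/memcg and memcg/ptr renames, __mem_cgroup_try_charge() is a retry loop driven by the CHARGE_* results of mem_cgroup_do_charge(): success exits, transient failures loop, and the OOM path is bounded by nr_oom_retries before giving up or bypassing. A hedged standalone sketch of that loop shape; try_charge_once() and its behaviour are stand-ins, not kernel code.

#include <stdio.h>

enum charge_result { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM };

static enum charge_result try_charge_once(int attempt)
{
        return attempt < 2 ? CHARGE_RETRY : CHARGE_OK;  /* stand-in behaviour */
}

static int charge(void)
{
        int nr_oom_retries = 5;

        for (int attempt = 0; ; attempt++) {
                switch (try_charge_once(attempt)) {
                case CHARGE_OK:
                        return 0;
                case CHARGE_RETRY:          /* transient failure: just loop */
                        continue;
                case CHARGE_NOMEM:          /* OOM path: bounded retries */
                        if (--nr_oom_retries == 0)
                                return -1;
                        continue;
                }
        }
}

int main(void)
{
        printf("charge() -> %d\n", charge());
        return 0;
}
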
@@ -2504,15 +2408,15 @@ bypass: | |||
2504 | * This function does that: it uncharges and puts the css refcnt | 2408 | * This function does that: it uncharges and puts the css refcnt |
2505 | * taken by try_charge(). | 2409 | * taken by try_charge(). |
2506 | */ | 2410 | */ |
2507 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | 2411 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, |
2508 | unsigned int nr_pages) | 2412 | unsigned int nr_pages) |
2509 | { | 2413 | { |
2510 | if (!mem_cgroup_is_root(mem)) { | 2414 | if (!mem_cgroup_is_root(memcg)) { |
2511 | unsigned long bytes = nr_pages * PAGE_SIZE; | 2415 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2512 | 2416 | ||
2513 | res_counter_uncharge(&mem->res, bytes); | 2417 | res_counter_uncharge(&memcg->res, bytes); |
2514 | if (do_swap_account) | 2418 | if (do_swap_account) |
2515 | res_counter_uncharge(&mem->memsw, bytes); | 2419 | res_counter_uncharge(&memcg->memsw, bytes); |
2516 | } | 2420 | } |
2517 | } | 2421 | } |
2518 | 2422 | ||
@@ -2537,7 +2441,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
2537 | 2441 | ||
2538 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2442 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
2539 | { | 2443 | { |
2540 | struct mem_cgroup *mem = NULL; | 2444 | struct mem_cgroup *memcg = NULL; |
2541 | struct page_cgroup *pc; | 2445 | struct page_cgroup *pc; |
2542 | unsigned short id; | 2446 | unsigned short id; |
2543 | swp_entry_t ent; | 2447 | swp_entry_t ent; |
@@ -2547,23 +2451,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2547 | pc = lookup_page_cgroup(page); | 2451 | pc = lookup_page_cgroup(page); |
2548 | lock_page_cgroup(pc); | 2452 | lock_page_cgroup(pc); |
2549 | if (PageCgroupUsed(pc)) { | 2453 | if (PageCgroupUsed(pc)) { |
2550 | mem = pc->mem_cgroup; | 2454 | memcg = pc->mem_cgroup; |
2551 | if (mem && !css_tryget(&mem->css)) | 2455 | if (memcg && !css_tryget(&memcg->css)) |
2552 | mem = NULL; | 2456 | memcg = NULL; |
2553 | } else if (PageSwapCache(page)) { | 2457 | } else if (PageSwapCache(page)) { |
2554 | ent.val = page_private(page); | 2458 | ent.val = page_private(page); |
2555 | id = lookup_swap_cgroup(ent); | 2459 | id = lookup_swap_cgroup(ent); |
2556 | rcu_read_lock(); | 2460 | rcu_read_lock(); |
2557 | mem = mem_cgroup_lookup(id); | 2461 | memcg = mem_cgroup_lookup(id); |
2558 | if (mem && !css_tryget(&mem->css)) | 2462 | if (memcg && !css_tryget(&memcg->css)) |
2559 | mem = NULL; | 2463 | memcg = NULL; |
2560 | rcu_read_unlock(); | 2464 | rcu_read_unlock(); |
2561 | } | 2465 | } |
2562 | unlock_page_cgroup(pc); | 2466 | unlock_page_cgroup(pc); |
2563 | return mem; | 2467 | return memcg; |
2564 | } | 2468 | } |
2565 | 2469 | ||
2566 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 2470 | static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, |
2567 | struct page *page, | 2471 | struct page *page, |
2568 | unsigned int nr_pages, | 2472 | unsigned int nr_pages, |
2569 | struct page_cgroup *pc, | 2473 | struct page_cgroup *pc, |
@@ -2572,14 +2476,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2572 | lock_page_cgroup(pc); | 2476 | lock_page_cgroup(pc); |
2573 | if (unlikely(PageCgroupUsed(pc))) { | 2477 | if (unlikely(PageCgroupUsed(pc))) { |
2574 | unlock_page_cgroup(pc); | 2478 | unlock_page_cgroup(pc); |
2575 | __mem_cgroup_cancel_charge(mem, nr_pages); | 2479 | __mem_cgroup_cancel_charge(memcg, nr_pages); |
2576 | return; | 2480 | return; |
2577 | } | 2481 | } |
2578 | /* | 2482 | /* |
2579 | * we don't need page_cgroup_lock for tail pages, because they are not | 2483 | * we don't need page_cgroup_lock for tail pages, because they are not |
2580 | * accessed by any other context at this point. | 2484 | * accessed by any other context at this point. |
2581 | */ | 2485 | */ |
2582 | pc->mem_cgroup = mem; | 2486 | pc->mem_cgroup = memcg; |
2583 | /* | 2487 | /* |
2584 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | 2488 | * We access a page_cgroup asynchronously without lock_page_cgroup(). |
2585 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup | 2489 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup |
@@ -2602,14 +2506,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2602 | break; | 2506 | break; |
2603 | } | 2507 | } |
2604 | 2508 | ||
2605 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); | 2509 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); |
2606 | unlock_page_cgroup(pc); | 2510 | unlock_page_cgroup(pc); |
2607 | /* | 2511 | /* |
2608 | * "charge_statistics" updated event counter. Then, check it. | 2512 | * "charge_statistics" updated event counter. Then, check it. |
2609 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 2513 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
2610 | * if they exceeds softlimit. | 2514 | * if they exceeds softlimit. |
2611 | */ | 2515 | */ |
2612 | memcg_check_events(mem, page); | 2516 | memcg_check_events(memcg, page); |
2613 | } | 2517 | } |
2614 | 2518 | ||
2615 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2519 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
@@ -2796,7 +2700,7 @@ out: | |||
2796 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 2700 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, |
2797 | gfp_t gfp_mask, enum charge_type ctype) | 2701 | gfp_t gfp_mask, enum charge_type ctype) |
2798 | { | 2702 | { |
2799 | struct mem_cgroup *mem = NULL; | 2703 | struct mem_cgroup *memcg = NULL; |
2800 | unsigned int nr_pages = 1; | 2704 | unsigned int nr_pages = 1; |
2801 | struct page_cgroup *pc; | 2705 | struct page_cgroup *pc; |
2802 | bool oom = true; | 2706 | bool oom = true; |
@@ -2815,11 +2719,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2815 | pc = lookup_page_cgroup(page); | 2719 | pc = lookup_page_cgroup(page); |
2816 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ | 2720 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ |
2817 | 2721 | ||
2818 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); | 2722 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); |
2819 | if (ret || !mem) | 2723 | if (ret || !memcg) |
2820 | return ret; | 2724 | return ret; |
2821 | 2725 | ||
2822 | __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); | 2726 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); |
2823 | return 0; | 2727 | return 0; |
2824 | } | 2728 | } |
2825 | 2729 | ||
@@ -2848,7 +2752,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
2848 | enum charge_type ctype); | 2752 | enum charge_type ctype); |
2849 | 2753 | ||
2850 | static void | 2754 | static void |
2851 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | 2755 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, |
2852 | enum charge_type ctype) | 2756 | enum charge_type ctype) |
2853 | { | 2757 | { |
2854 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2758 | struct page_cgroup *pc = lookup_page_cgroup(page); |
@@ -2858,7 +2762,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | |||
2858 | * LRU. Take care of it. | 2762 | * LRU. Take care of it. |
2859 | */ | 2763 | */ |
2860 | mem_cgroup_lru_del_before_commit(page); | 2764 | mem_cgroup_lru_del_before_commit(page); |
2861 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); | 2765 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); |
2862 | mem_cgroup_lru_add_after_commit(page); | 2766 | mem_cgroup_lru_add_after_commit(page); |
2863 | return; | 2767 | return; |
2864 | } | 2768 | } |
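
__mem_cgroup_commit_charge_lrucare() brackets the ownership commit with an LRU removal and re-insertion, so no LRU walker can observe the page while pc->mem_cgroup is being switched. The same bracketing pattern in a tiny self-contained sketch; struct page, lru_del() and lru_add() here are toys, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

struct page { int owner; bool on_lru; };

static void lru_del(struct page *p) { p->on_lru = false; }
static void lru_add(struct page *p) { p->on_lru = true; }

static void commit_charge_lrucare(struct page *p, int new_owner)
{
        lru_del(p);             /* hide the page from LRU walkers */
        p->owner = new_owner;   /* commit while nobody can see it */
        lru_add(p);             /* visible again, under the new owner */
}

int main(void)
{
        struct page p = { .owner = 0, .on_lru = true };

        commit_charge_lrucare(&p, 3);
        printf("owner=%d on_lru=%d\n", p.owner, p.on_lru);
        return 0;
}
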
@@ -2866,44 +2770,20 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | |||
2866 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 2770 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2867 | gfp_t gfp_mask) | 2771 | gfp_t gfp_mask) |
2868 | { | 2772 | { |
2869 | struct mem_cgroup *mem = NULL; | 2773 | struct mem_cgroup *memcg = NULL; |
2870 | int ret; | 2774 | int ret; |
2871 | 2775 | ||
2872 | if (mem_cgroup_disabled()) | 2776 | if (mem_cgroup_disabled()) |
2873 | return 0; | 2777 | return 0; |
2874 | if (PageCompound(page)) | 2778 | if (PageCompound(page)) |
2875 | return 0; | 2779 | return 0; |
2876 | /* | ||
2877 | * Corner case handling. This is called from add_to_page_cache() | ||
2878 | * in usual. But some FS (shmem) precharges this page before calling it | ||
2879 | * and call add_to_page_cache() with GFP_NOWAIT. | ||
2880 | * | ||
2881 | * For GFP_NOWAIT case, the page may be pre-charged before calling | ||
2882 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call | ||
2883 | * charge twice. (It works but has to pay a bit larger cost.) | ||
2884 | * And when the page is SwapCache, it should take swap information | ||
2885 | * into account. This is under lock_page() now. | ||
2886 | */ | ||
2887 | if (!(gfp_mask & __GFP_WAIT)) { | ||
2888 | struct page_cgroup *pc; | ||
2889 | |||
2890 | pc = lookup_page_cgroup(page); | ||
2891 | if (!pc) | ||
2892 | return 0; | ||
2893 | lock_page_cgroup(pc); | ||
2894 | if (PageCgroupUsed(pc)) { | ||
2895 | unlock_page_cgroup(pc); | ||
2896 | return 0; | ||
2897 | } | ||
2898 | unlock_page_cgroup(pc); | ||
2899 | } | ||
2900 | 2780 | ||
2901 | if (unlikely(!mm)) | 2781 | if (unlikely(!mm)) |
2902 | mm = &init_mm; | 2782 | mm = &init_mm; |
2903 | 2783 | ||
2904 | if (page_is_file_cache(page)) { | 2784 | if (page_is_file_cache(page)) { |
2905 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); | 2785 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); |
2906 | if (ret || !mem) | 2786 | if (ret || !memcg) |
2907 | return ret; | 2787 | return ret; |
2908 | 2788 | ||
2909 | /* | 2789 | /* |
@@ -2911,15 +2791,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2911 | * put that would remove them from the LRU list, make | 2791 | * put that would remove them from the LRU list, make |
2912 | * sure that they get relinked properly. | 2792 | * sure that they get relinked properly. |
2913 | */ | 2793 | */ |
2914 | __mem_cgroup_commit_charge_lrucare(page, mem, | 2794 | __mem_cgroup_commit_charge_lrucare(page, memcg, |
2915 | MEM_CGROUP_CHARGE_TYPE_CACHE); | 2795 | MEM_CGROUP_CHARGE_TYPE_CACHE); |
2916 | return ret; | 2796 | return ret; |
2917 | } | 2797 | } |
2918 | /* shmem */ | 2798 | /* shmem */ |
2919 | if (PageSwapCache(page)) { | 2799 | if (PageSwapCache(page)) { |
2920 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | 2800 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); |
2921 | if (!ret) | 2801 | if (!ret) |
2922 | __mem_cgroup_commit_charge_swapin(page, mem, | 2802 | __mem_cgroup_commit_charge_swapin(page, memcg, |
2923 | MEM_CGROUP_CHARGE_TYPE_SHMEM); | 2803 | MEM_CGROUP_CHARGE_TYPE_SHMEM); |
2924 | } else | 2804 | } else |
2925 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | 2805 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, |
@@ -2938,7 +2818,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2938 | struct page *page, | 2818 | struct page *page, |
2939 | gfp_t mask, struct mem_cgroup **ptr) | 2819 | gfp_t mask, struct mem_cgroup **ptr) |
2940 | { | 2820 | { |
2941 | struct mem_cgroup *mem; | 2821 | struct mem_cgroup *memcg; |
2942 | int ret; | 2822 | int ret; |
2943 | 2823 | ||
2944 | *ptr = NULL; | 2824 | *ptr = NULL; |
@@ -2956,12 +2836,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2956 | */ | 2836 | */ |
2957 | if (!PageSwapCache(page)) | 2837 | if (!PageSwapCache(page)) |
2958 | goto charge_cur_mm; | 2838 | goto charge_cur_mm; |
2959 | mem = try_get_mem_cgroup_from_page(page); | 2839 | memcg = try_get_mem_cgroup_from_page(page); |
2960 | if (!mem) | 2840 | if (!memcg) |
2961 | goto charge_cur_mm; | 2841 | goto charge_cur_mm; |
2962 | *ptr = mem; | 2842 | *ptr = memcg; |
2963 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); | 2843 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); |
2964 | css_put(&mem->css); | 2844 | css_put(&memcg->css); |
2965 | return ret; | 2845 | return ret; |
2966 | charge_cur_mm: | 2846 | charge_cur_mm: |
2967 | if (unlikely(!mm)) | 2847 | if (unlikely(!mm)) |
@@ -3021,16 +2901,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | |||
3021 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2901 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
3022 | } | 2902 | } |
3023 | 2903 | ||
3024 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | 2904 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) |
3025 | { | 2905 | { |
3026 | if (mem_cgroup_disabled()) | 2906 | if (mem_cgroup_disabled()) |
3027 | return; | 2907 | return; |
3028 | if (!mem) | 2908 | if (!memcg) |
3029 | return; | 2909 | return; |
3030 | __mem_cgroup_cancel_charge(mem, 1); | 2910 | __mem_cgroup_cancel_charge(memcg, 1); |
3031 | } | 2911 | } |
3032 | 2912 | ||
3033 | static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, | 2913 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, |
3034 | unsigned int nr_pages, | 2914 | unsigned int nr_pages, |
3035 | const enum charge_type ctype) | 2915 | const enum charge_type ctype) |
3036 | { | 2916 | { |
@@ -3048,7 +2928,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, | |||
3048 | * uncharges. Then, it's ok to ignore memcg's refcnt. | 2928 | * uncharges. Then, it's ok to ignore memcg's refcnt. |
3049 | */ | 2929 | */ |
3050 | if (!batch->memcg) | 2930 | if (!batch->memcg) |
3051 | batch->memcg = mem; | 2931 | batch->memcg = memcg; |
3052 | /* | 2932 | /* |
3053 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | 2933 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. |
3054 | * In those cases, all pages freed continuously can be expected to be in | 2934 | * In those cases, all pages freed continuously can be expected to be in |
@@ -3068,7 +2948,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, | |||
3068 | * merge a series of uncharges to an uncharge of res_counter. | 2948 | * merge a series of uncharges to an uncharge of res_counter. |
3069 | * If not, we uncharge the res_counter one by one. | 2949 | * If not, we uncharge the res_counter one by one. |
3070 | */ | 2950 | */ |
3071 | if (batch->memcg != mem) | 2951 | if (batch->memcg != memcg) |
3072 | goto direct_uncharge; | 2952 | goto direct_uncharge; |
3073 | /* remember freed charge and uncharge it later */ | 2953 | /* remember freed charge and uncharge it later */ |
3074 | batch->nr_pages++; | 2954 | batch->nr_pages++; |
@@ -3076,11 +2956,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, | |||
3076 | batch->memsw_nr_pages++; | 2956 | batch->memsw_nr_pages++; |
3077 | return; | 2957 | return; |
3078 | direct_uncharge: | 2958 | direct_uncharge: |
3079 | res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); | 2959 | res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); |
3080 | if (uncharge_memsw) | 2960 | if (uncharge_memsw) |
3081 | res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); | 2961 | res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); |
3082 | if (unlikely(batch->memcg != mem)) | 2962 | if (unlikely(batch->memcg != memcg)) |
3083 | memcg_oom_recover(mem); | 2963 | memcg_oom_recover(memcg); |
3084 | return; | 2964 | return; |
3085 | } | 2965 | } |
3086 | 2966 | ||
@@ -3090,7 +2970,7 @@ direct_uncharge: | |||
3090 | static struct mem_cgroup * | 2970 | static struct mem_cgroup * |
3091 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2971 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
3092 | { | 2972 | { |
3093 | struct mem_cgroup *mem = NULL; | 2973 | struct mem_cgroup *memcg = NULL; |
3094 | unsigned int nr_pages = 1; | 2974 | unsigned int nr_pages = 1; |
3095 | struct page_cgroup *pc; | 2975 | struct page_cgroup *pc; |
3096 | 2976 | ||
@@ -3113,7 +2993,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
3113 | 2993 | ||
3114 | lock_page_cgroup(pc); | 2994 | lock_page_cgroup(pc); |
3115 | 2995 | ||
3116 | mem = pc->mem_cgroup; | 2996 | memcg = pc->mem_cgroup; |
3117 | 2997 | ||
3118 | if (!PageCgroupUsed(pc)) | 2998 | if (!PageCgroupUsed(pc)) |
3119 | goto unlock_out; | 2999 | goto unlock_out; |
@@ -3136,7 +3016,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
3136 | break; | 3016 | break; |
3137 | } | 3017 | } |
3138 | 3018 | ||
3139 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); | 3019 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); |
3140 | 3020 | ||
3141 | ClearPageCgroupUsed(pc); | 3021 | ClearPageCgroupUsed(pc); |
3142 | /* | 3022 | /* |
@@ -3148,18 +3028,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
3148 | 3028 | ||
3149 | unlock_page_cgroup(pc); | 3029 | unlock_page_cgroup(pc); |
3150 | /* | 3030 | /* |
3151 | * even after unlock, we have mem->res.usage here and this memcg | 3031 | * even after unlock, we have memcg->res.usage here and this memcg |
3152 | * will never be freed. | 3032 | * will never be freed. |
3153 | */ | 3033 | */ |
3154 | memcg_check_events(mem, page); | 3034 | memcg_check_events(memcg, page); |
3155 | if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { | 3035 | if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { |
3156 | mem_cgroup_swap_statistics(mem, true); | 3036 | mem_cgroup_swap_statistics(memcg, true); |
3157 | mem_cgroup_get(mem); | 3037 | mem_cgroup_get(memcg); |
3158 | } | 3038 | } |
3159 | if (!mem_cgroup_is_root(mem)) | 3039 | if (!mem_cgroup_is_root(memcg)) |
3160 | mem_cgroup_do_uncharge(mem, nr_pages, ctype); | 3040 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); |
3161 | 3041 | ||
3162 | return mem; | 3042 | return memcg; |
3163 | 3043 | ||
3164 | unlock_out: | 3044 | unlock_out: |
3165 | unlock_page_cgroup(pc); | 3045 | unlock_page_cgroup(pc); |
@@ -3349,7 +3229,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3349 | int mem_cgroup_prepare_migration(struct page *page, | 3229 | int mem_cgroup_prepare_migration(struct page *page, |
3350 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) | 3230 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) |
3351 | { | 3231 | { |
3352 | struct mem_cgroup *mem = NULL; | 3232 | struct mem_cgroup *memcg = NULL; |
3353 | struct page_cgroup *pc; | 3233 | struct page_cgroup *pc; |
3354 | enum charge_type ctype; | 3234 | enum charge_type ctype; |
3355 | int ret = 0; | 3235 | int ret = 0; |
@@ -3363,8 +3243,8 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3363 | pc = lookup_page_cgroup(page); | 3243 | pc = lookup_page_cgroup(page); |
3364 | lock_page_cgroup(pc); | 3244 | lock_page_cgroup(pc); |
3365 | if (PageCgroupUsed(pc)) { | 3245 | if (PageCgroupUsed(pc)) { |
3366 | mem = pc->mem_cgroup; | 3246 | memcg = pc->mem_cgroup; |
3367 | css_get(&mem->css); | 3247 | css_get(&memcg->css); |
3368 | /* | 3248 | /* |
3369 | * At migrating an anonymous page, its mapcount goes down | 3249 | * At migrating an anonymous page, its mapcount goes down |
3370 | * to 0 and uncharge() will be called. But, even if it's fully | 3250 | * to 0 and uncharge() will be called. But, even if it's fully |
@@ -3402,12 +3282,12 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3402 | * If the page is not charged at this point, | 3282 | * If the page is not charged at this point, |
3403 | * we return here. | 3283 | * we return here. |
3404 | */ | 3284 | */ |
3405 | if (!mem) | 3285 | if (!memcg) |
3406 | return 0; | 3286 | return 0; |
3407 | 3287 | ||
3408 | *ptr = mem; | 3288 | *ptr = memcg; |
3409 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); | 3289 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); |
3410 | css_put(&mem->css);/* drop extra refcnt */ | 3290 | css_put(&memcg->css);/* drop extra refcnt */ |
3411 | if (ret || *ptr == NULL) { | 3291 | if (ret || *ptr == NULL) { |
3412 | if (PageAnon(page)) { | 3292 | if (PageAnon(page)) { |
3413 | lock_page_cgroup(pc); | 3293 | lock_page_cgroup(pc); |
@@ -3433,21 +3313,21 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3433 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3313 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3434 | else | 3314 | else |
3435 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3315 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
3436 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); | 3316 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); |
3437 | return ret; | 3317 | return ret; |
3438 | } | 3318 | } |
3439 | 3319 | ||
3440 | /* remove redundant charge if migration failed*/ | 3320 | /* remove redundant charge if migration failed*/ |
3441 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 3321 | void mem_cgroup_end_migration(struct mem_cgroup *memcg, |
3442 | struct page *oldpage, struct page *newpage, bool migration_ok) | 3322 | struct page *oldpage, struct page *newpage, bool migration_ok) |
3443 | { | 3323 | { |
3444 | struct page *used, *unused; | 3324 | struct page *used, *unused; |
3445 | struct page_cgroup *pc; | 3325 | struct page_cgroup *pc; |
3446 | 3326 | ||
3447 | if (!mem) | 3327 | if (!memcg) |
3448 | return; | 3328 | return; |
3449 | /* blocks rmdir() */ | 3329 | /* blocks rmdir() */ |
3450 | cgroup_exclude_rmdir(&mem->css); | 3330 | cgroup_exclude_rmdir(&memcg->css); |
3451 | if (!migration_ok) { | 3331 | if (!migration_ok) { |
3452 | used = oldpage; | 3332 | used = oldpage; |
3453 | unused = newpage; | 3333 | unused = newpage; |
@@ -3483,32 +3363,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
3483 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 3363 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
3484 | * In that case, we need to call pre_destroy() again. check it here. | 3364 | * In that case, we need to call pre_destroy() again. check it here. |
3485 | */ | 3365 | */ |
3486 | cgroup_release_and_wakeup_rmdir(&mem->css); | 3366 | cgroup_release_and_wakeup_rmdir(&memcg->css); |
3487 | } | ||
3488 | |||
3489 | /* | ||
3490 | * A call to try to shrink memory usage on charge failure at shmem's swapin. | ||
3491 | * Calling hierarchical_reclaim is not enough because we should update | ||
3492 | * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. | ||
3493 | * Moreover considering hierarchy, we should reclaim from the mem_over_limit, | ||
3494 | * not from the memcg which this page would be charged to. | ||
3495 | * try_charge_swapin does all of these works properly. | ||
3496 | */ | ||
3497 | int mem_cgroup_shmem_charge_fallback(struct page *page, | ||
3498 | struct mm_struct *mm, | ||
3499 | gfp_t gfp_mask) | ||
3500 | { | ||
3501 | struct mem_cgroup *mem; | ||
3502 | int ret; | ||
3503 | |||
3504 | if (mem_cgroup_disabled()) | ||
3505 | return 0; | ||
3506 | |||
3507 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | ||
3508 | if (!ret) | ||
3509 | mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ | ||
3510 | |||
3511 | return ret; | ||
3512 | } | 3367 | } |
3513 | 3368 | ||
3514 | #ifdef CONFIG_DEBUG_VM | 3369 | #ifdef CONFIG_DEBUG_VM |
@@ -3587,7 +3442,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3587 | /* | 3442 | /* |
3588 | * Rather than hide all in some function, I do this in | 3443 | * Rather than hide all in some function, I do this in |
3589 | * open coded manner. You see what this really does. | 3444 | * open coded manner. You see what this really does. |
3590 | * We have to guarantee mem->res.limit < mem->memsw.limit. | 3445 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. |
3591 | */ | 3446 | */ |
3592 | mutex_lock(&set_limit_mutex); | 3447 | mutex_lock(&set_limit_mutex); |
3593 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3448 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
@@ -3649,7 +3504,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3649 | /* | 3504 | /* |
3651 | * Rather than hide all of this in some function, I do it here in an | 3506 | * Rather than hide all of this in some function, I do it here in an |
3652 | * open-coded manner so you can see what it really does. | 3507 | * open-coded manner so you can see what it really does. |
3652 | * We have to guarantee mem->res.limit < mem->memsw.limit. | 3507 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. |
3653 | */ | 3508 | */ |
3654 | mutex_lock(&set_limit_mutex); | 3509 | mutex_lock(&set_limit_mutex); |
3655 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3510 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
@@ -3787,7 +3642,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3787 | * This routine traverses the page_cgroups in the given list and drops them all. | 3642 | * This routine traverses the page_cgroups in the given list and drops them all. |
3788 | * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. | 3643 | * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. |
3789 | */ | 3644 | */ |
3790 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 3645 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3791 | int node, int zid, enum lru_list lru) | 3646 | int node, int zid, enum lru_list lru) |
3792 | { | 3647 | { |
3793 | struct zone *zone; | 3648 | struct zone *zone; |
@@ -3798,7 +3653,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3798 | int ret = 0; | 3653 | int ret = 0; |
3799 | 3654 | ||
3800 | zone = &NODE_DATA(node)->node_zones[zid]; | 3655 | zone = &NODE_DATA(node)->node_zones[zid]; |
3801 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 3656 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
3802 | list = &mz->lists[lru]; | 3657 | list = &mz->lists[lru]; |
3803 | 3658 | ||
3804 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 3659 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
@@ -3825,7 +3680,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3825 | 3680 | ||
3826 | page = lookup_cgroup_page(pc); | 3681 | page = lookup_cgroup_page(pc); |
3827 | 3682 | ||
3828 | ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); | 3683 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); |
3829 | if (ret == -ENOMEM) | 3684 | if (ret == -ENOMEM) |
3830 | break; | 3685 | break; |
3831 | 3686 | ||
@@ -3846,14 +3701,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3846 | * make the mem_cgroup's charge 0 if there is no task. | 3701 | * make the mem_cgroup's charge 0 if there is no task. |
3847 | * This enables deleting this mem_cgroup. | 3702 | * This enables deleting this mem_cgroup. |
3848 | */ | 3703 | */ |
3849 | static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) | 3704 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) |
3850 | { | 3705 | { |
3851 | int ret; | 3706 | int ret; |
3852 | int node, zid, shrink; | 3707 | int node, zid, shrink; |
3853 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 3708 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
3854 | struct cgroup *cgrp = mem->css.cgroup; | 3709 | struct cgroup *cgrp = memcg->css.cgroup; |
3855 | 3710 | ||
3856 | css_get(&mem->css); | 3711 | css_get(&memcg->css); |
3857 | 3712 | ||
3858 | shrink = 0; | 3713 | shrink = 0; |
3859 | /* should free all ? */ | 3714 | /* should free all ? */ |
@@ -3869,14 +3724,14 @@ move_account: | |||
3869 | goto out; | 3724 | goto out; |
3870 | /* This is for making all *used* pages to be on LRU. */ | 3725 | /* This is for making all *used* pages to be on LRU. */ |
3871 | lru_add_drain_all(); | 3726 | lru_add_drain_all(); |
3872 | drain_all_stock_sync(mem); | 3727 | drain_all_stock_sync(memcg); |
3873 | ret = 0; | 3728 | ret = 0; |
3874 | mem_cgroup_start_move(mem); | 3729 | mem_cgroup_start_move(memcg); |
3875 | for_each_node_state(node, N_HIGH_MEMORY) { | 3730 | for_each_node_state(node, N_HIGH_MEMORY) { |
3876 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 3731 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
3877 | enum lru_list l; | 3732 | enum lru_list l; |
3878 | for_each_lru(l) { | 3733 | for_each_lru(l) { |
3879 | ret = mem_cgroup_force_empty_list(mem, | 3734 | ret = mem_cgroup_force_empty_list(memcg, |
3880 | node, zid, l); | 3735 | node, zid, l); |
3881 | if (ret) | 3736 | if (ret) |
3882 | break; | 3737 | break; |
@@ -3885,16 +3740,16 @@ move_account: | |||
3885 | if (ret) | 3740 | if (ret) |
3886 | break; | 3741 | break; |
3887 | } | 3742 | } |
3888 | mem_cgroup_end_move(mem); | 3743 | mem_cgroup_end_move(memcg); |
3889 | memcg_oom_recover(mem); | 3744 | memcg_oom_recover(memcg); |
3890 | /* it seems parent cgroup doesn't have enough mem */ | 3745 | /* it seems parent cgroup doesn't have enough mem */ |
3891 | if (ret == -ENOMEM) | 3746 | if (ret == -ENOMEM) |
3892 | goto try_to_free; | 3747 | goto try_to_free; |
3893 | cond_resched(); | 3748 | cond_resched(); |
3894 | /* "ret" should also be checked to ensure all lists are empty. */ | 3749 | /* "ret" should also be checked to ensure all lists are empty. */ |
3895 | } while (mem->res.usage > 0 || ret); | 3750 | } while (memcg->res.usage > 0 || ret); |
3896 | out: | 3751 | out: |
3897 | css_put(&mem->css); | 3752 | css_put(&memcg->css); |
3898 | return ret; | 3753 | return ret; |
3899 | 3754 | ||
3900 | try_to_free: | 3755 | try_to_free: |
@@ -3907,19 +3762,15 @@ try_to_free: | |||
3907 | lru_add_drain_all(); | 3762 | lru_add_drain_all(); |
3908 | /* try to free all pages in this cgroup */ | 3763 | /* try to free all pages in this cgroup */ |
3909 | shrink = 1; | 3764 | shrink = 1; |
3910 | while (nr_retries && mem->res.usage > 0) { | 3765 | while (nr_retries && memcg->res.usage > 0) { |
3911 | struct memcg_scanrecord rec; | ||
3912 | int progress; | 3766 | int progress; |
3913 | 3767 | ||
3914 | if (signal_pending(current)) { | 3768 | if (signal_pending(current)) { |
3915 | ret = -EINTR; | 3769 | ret = -EINTR; |
3916 | goto out; | 3770 | goto out; |
3917 | } | 3771 | } |
3918 | rec.context = SCAN_BY_SHRINK; | 3772 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, |
3919 | rec.mem = mem; | 3773 | false); |
3920 | rec.root = mem; | ||
3921 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, | ||
3922 | false, &rec); | ||
3923 | if (!progress) { | 3774 | if (!progress) { |
3924 | nr_retries--; | 3775 | nr_retries--; |
3925 | /* maybe some writeback is necessary */ | 3776 | /* maybe some writeback is necessary */ |
@@ -3947,12 +3798,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3947 | u64 val) | 3798 | u64 val) |
3948 | { | 3799 | { |
3949 | int retval = 0; | 3800 | int retval = 0; |
3950 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 3801 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3951 | struct cgroup *parent = cont->parent; | 3802 | struct cgroup *parent = cont->parent; |
3952 | struct mem_cgroup *parent_mem = NULL; | 3803 | struct mem_cgroup *parent_memcg = NULL; |
3953 | 3804 | ||
3954 | if (parent) | 3805 | if (parent) |
3955 | parent_mem = mem_cgroup_from_cont(parent); | 3806 | parent_memcg = mem_cgroup_from_cont(parent); |
3956 | 3807 | ||
3957 | cgroup_lock(); | 3808 | cgroup_lock(); |
3958 | /* | 3809 | /* |
@@ -3963,10 +3814,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3963 | * For the root cgroup, parent_mem is NULL, we allow value to be | 3814 | * For the root cgroup, parent_mem is NULL, we allow value to be |
3964 | * set if there are no children. | 3815 | * set if there are no children. |
3965 | */ | 3816 | */ |
3966 | if ((!parent_mem || !parent_mem->use_hierarchy) && | 3817 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && |
3967 | (val == 1 || val == 0)) { | 3818 | (val == 1 || val == 0)) { |
3968 | if (list_empty(&cont->children)) | 3819 | if (list_empty(&cont->children)) |
3969 | mem->use_hierarchy = val; | 3820 | memcg->use_hierarchy = val; |
3970 | else | 3821 | else |
3971 | retval = -EBUSY; | 3822 | retval = -EBUSY; |
3972 | } else | 3823 | } else |
@@ -3977,14 +3828,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3977 | } | 3828 | } |
3978 | 3829 | ||
3979 | 3830 | ||
3980 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, | 3831 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, |
3981 | enum mem_cgroup_stat_index idx) | 3832 | enum mem_cgroup_stat_index idx) |
3982 | { | 3833 | { |
3983 | struct mem_cgroup *iter; | 3834 | struct mem_cgroup *iter; |
3984 | long val = 0; | 3835 | long val = 0; |
3985 | 3836 | ||
3986 | /* Per-cpu values can be negative, use a signed accumulator */ | 3837 | /* Per-cpu values can be negative, use a signed accumulator */ |
3987 | for_each_mem_cgroup_tree(iter, mem) | 3838 | for_each_mem_cgroup_tree(iter, memcg) |
3988 | val += mem_cgroup_read_stat(iter, idx); | 3839 | val += mem_cgroup_read_stat(iter, idx); |
3989 | 3840 | ||
3990 | if (val < 0) /* race ? */ | 3841 | if (val < 0) /* race ? */ |
@@ -3992,29 +3843,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, | |||
3992 | return val; | 3843 | return val; |
3993 | } | 3844 | } |
3994 | 3845 | ||
3995 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | 3846 | static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) |
3996 | { | 3847 | { |
3997 | u64 val; | 3848 | u64 val; |
3998 | 3849 | ||
3999 | if (!mem_cgroup_is_root(mem)) { | 3850 | if (!mem_cgroup_is_root(memcg)) { |
4000 | if (!swap) | 3851 | if (!swap) |
4001 | return res_counter_read_u64(&mem->res, RES_USAGE); | 3852 | return res_counter_read_u64(&memcg->res, RES_USAGE); |
4002 | else | 3853 | else |
4003 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | 3854 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); |
4004 | } | 3855 | } |
4005 | 3856 | ||
4006 | val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); | 3857 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); |
4007 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); | 3858 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); |
4008 | 3859 | ||
4009 | if (swap) | 3860 | if (swap) |
4010 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 3861 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); |
4011 | 3862 | ||
4012 | return val << PAGE_SHIFT; | 3863 | return val << PAGE_SHIFT; |
4013 | } | 3864 | } |
4014 | 3865 | ||
4015 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 3866 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
4016 | { | 3867 | { |
4017 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 3868 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4018 | u64 val; | 3869 | u64 val; |
4019 | int type, name; | 3870 | int type, name; |
4020 | 3871 | ||
@@ -4023,15 +3874,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
4023 | switch (type) { | 3874 | switch (type) { |
4024 | case _MEM: | 3875 | case _MEM: |
4025 | if (name == RES_USAGE) | 3876 | if (name == RES_USAGE) |
4026 | val = mem_cgroup_usage(mem, false); | 3877 | val = mem_cgroup_usage(memcg, false); |
4027 | else | 3878 | else |
4028 | val = res_counter_read_u64(&mem->res, name); | 3879 | val = res_counter_read_u64(&memcg->res, name); |
4029 | break; | 3880 | break; |
4030 | case _MEMSWAP: | 3881 | case _MEMSWAP: |
4031 | if (name == RES_USAGE) | 3882 | if (name == RES_USAGE) |
4032 | val = mem_cgroup_usage(mem, true); | 3883 | val = mem_cgroup_usage(memcg, true); |
4033 | else | 3884 | else |
4034 | val = res_counter_read_u64(&mem->memsw, name); | 3885 | val = res_counter_read_u64(&memcg->memsw, name); |
4035 | break; | 3886 | break; |
4036 | default: | 3887 | default: |
4037 | BUG(); | 3888 | BUG(); |
@@ -4119,24 +3970,24 @@ out: | |||
4119 | 3970 | ||
4120 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 3971 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
4121 | { | 3972 | { |
4122 | struct mem_cgroup *mem; | 3973 | struct mem_cgroup *memcg; |
4123 | int type, name; | 3974 | int type, name; |
4124 | 3975 | ||
4125 | mem = mem_cgroup_from_cont(cont); | 3976 | memcg = mem_cgroup_from_cont(cont); |
4126 | type = MEMFILE_TYPE(event); | 3977 | type = MEMFILE_TYPE(event); |
4127 | name = MEMFILE_ATTR(event); | 3978 | name = MEMFILE_ATTR(event); |
4128 | switch (name) { | 3979 | switch (name) { |
4129 | case RES_MAX_USAGE: | 3980 | case RES_MAX_USAGE: |
4130 | if (type == _MEM) | 3981 | if (type == _MEM) |
4131 | res_counter_reset_max(&mem->res); | 3982 | res_counter_reset_max(&memcg->res); |
4132 | else | 3983 | else |
4133 | res_counter_reset_max(&mem->memsw); | 3984 | res_counter_reset_max(&memcg->memsw); |
4134 | break; | 3985 | break; |
4135 | case RES_FAILCNT: | 3986 | case RES_FAILCNT: |
4136 | if (type == _MEM) | 3987 | if (type == _MEM) |
4137 | res_counter_reset_failcnt(&mem->res); | 3988 | res_counter_reset_failcnt(&memcg->res); |
4138 | else | 3989 | else |
4139 | res_counter_reset_failcnt(&mem->memsw); | 3990 | res_counter_reset_failcnt(&memcg->memsw); |
4140 | break; | 3991 | break; |
4141 | } | 3992 | } |
4142 | 3993 | ||
@@ -4153,7 +4004,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | |||
4153 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | 4004 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, |
4154 | struct cftype *cft, u64 val) | 4005 | struct cftype *cft, u64 val) |
4155 | { | 4006 | { |
4156 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4007 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4157 | 4008 | ||
4158 | if (val >= (1 << NR_MOVE_TYPE)) | 4009 | if (val >= (1 << NR_MOVE_TYPE)) |
4159 | return -EINVAL; | 4010 | return -EINVAL; |
@@ -4163,7 +4014,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
4163 | * inconsistent. | 4014 | * inconsistent. |
4164 | */ | 4015 | */ |
4165 | cgroup_lock(); | 4016 | cgroup_lock(); |
4166 | mem->move_charge_at_immigrate = val; | 4017 | memcg->move_charge_at_immigrate = val; |
4167 | cgroup_unlock(); | 4018 | cgroup_unlock(); |
4168 | 4019 | ||
4169 | return 0; | 4020 | return 0; |
@@ -4220,49 +4071,49 @@ struct { | |||
4220 | 4071 | ||
4221 | 4072 | ||
4222 | static void | 4073 | static void |
4223 | mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | 4074 | mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) |
4224 | { | 4075 | { |
4225 | s64 val; | 4076 | s64 val; |
4226 | 4077 | ||
4227 | /* per cpu stat */ | 4078 | /* per cpu stat */ |
4228 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | 4079 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); |
4229 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 4080 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
4230 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | 4081 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); |
4231 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 4082 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
4232 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); | 4083 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); |
4233 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 4084 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
4234 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); | 4085 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); |
4235 | s->stat[MCS_PGPGIN] += val; | 4086 | s->stat[MCS_PGPGIN] += val; |
4236 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); | 4087 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); |
4237 | s->stat[MCS_PGPGOUT] += val; | 4088 | s->stat[MCS_PGPGOUT] += val; |
4238 | if (do_swap_account) { | 4089 | if (do_swap_account) { |
4239 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 4090 | val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); |
4240 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 4091 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
4241 | } | 4092 | } |
4242 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); | 4093 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); |
4243 | s->stat[MCS_PGFAULT] += val; | 4094 | s->stat[MCS_PGFAULT] += val; |
4244 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); | 4095 | val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); |
4245 | s->stat[MCS_PGMAJFAULT] += val; | 4096 | s->stat[MCS_PGMAJFAULT] += val; |
4246 | 4097 | ||
4247 | /* per zone stat */ | 4098 | /* per zone stat */ |
4248 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); | 4099 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); |
4249 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; | 4100 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; |
4250 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); | 4101 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); |
4251 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; | 4102 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; |
4252 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); | 4103 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); |
4253 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; | 4104 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; |
4254 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); | 4105 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); |
4255 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; | 4106 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; |
4256 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); | 4107 | val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); |
4257 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; | 4108 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; |
4258 | } | 4109 | } |
4259 | 4110 | ||
4260 | static void | 4111 | static void |
4261 | mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | 4112 | mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) |
4262 | { | 4113 | { |
4263 | struct mem_cgroup *iter; | 4114 | struct mem_cgroup *iter; |
4264 | 4115 | ||
4265 | for_each_mem_cgroup_tree(iter, mem) | 4116 | for_each_mem_cgroup_tree(iter, memcg) |
4266 | mem_cgroup_get_local_stat(iter, s); | 4117 | mem_cgroup_get_local_stat(iter, s); |
4267 | } | 4118 | } |
4268 | 4119 | ||
@@ -4348,8 +4199,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4348 | } | 4199 | } |
4349 | 4200 | ||
4350 | #ifdef CONFIG_DEBUG_VM | 4201 | #ifdef CONFIG_DEBUG_VM |
4351 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | ||
4352 | |||
4353 | { | 4202 | { |
4354 | int nid, zid; | 4203 | int nid, zid; |
4355 | struct mem_cgroup_per_zone *mz; | 4204 | struct mem_cgroup_per_zone *mz; |
@@ -4486,20 +4335,20 @@ static int compare_thresholds(const void *a, const void *b) | |||
4486 | return _a->threshold - _b->threshold; | 4335 | return _a->threshold - _b->threshold; |
4487 | } | 4336 | } |
4488 | 4337 | ||
4489 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) | 4338 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) |
4490 | { | 4339 | { |
4491 | struct mem_cgroup_eventfd_list *ev; | 4340 | struct mem_cgroup_eventfd_list *ev; |
4492 | 4341 | ||
4493 | list_for_each_entry(ev, &mem->oom_notify, list) | 4342 | list_for_each_entry(ev, &memcg->oom_notify, list) |
4494 | eventfd_signal(ev->eventfd, 1); | 4343 | eventfd_signal(ev->eventfd, 1); |
4495 | return 0; | 4344 | return 0; |
4496 | } | 4345 | } |
4497 | 4346 | ||
4498 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | 4347 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) |
4499 | { | 4348 | { |
4500 | struct mem_cgroup *iter; | 4349 | struct mem_cgroup *iter; |
4501 | 4350 | ||
4502 | for_each_mem_cgroup_tree(iter, mem) | 4351 | for_each_mem_cgroup_tree(iter, memcg) |
4503 | mem_cgroup_oom_notify_cb(iter); | 4352 | mem_cgroup_oom_notify_cb(iter); |
4504 | } | 4353 | } |
4505 | 4354 | ||
@@ -4689,7 +4538,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | |||
4689 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | 4538 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, |
4690 | struct cftype *cft, struct eventfd_ctx *eventfd) | 4539 | struct cftype *cft, struct eventfd_ctx *eventfd) |
4691 | { | 4540 | { |
4692 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4541 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4693 | struct mem_cgroup_eventfd_list *ev, *tmp; | 4542 | struct mem_cgroup_eventfd_list *ev, *tmp; |
4694 | int type = MEMFILE_TYPE(cft->private); | 4543 | int type = MEMFILE_TYPE(cft->private); |
4695 | 4544 | ||
@@ -4697,7 +4546,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
4697 | 4546 | ||
4698 | spin_lock(&memcg_oom_lock); | 4547 | spin_lock(&memcg_oom_lock); |
4699 | 4548 | ||
4700 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | 4549 | list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { |
4701 | if (ev->eventfd == eventfd) { | 4550 | if (ev->eventfd == eventfd) { |
4702 | list_del(&ev->list); | 4551 | list_del(&ev->list); |
4703 | kfree(ev); | 4552 | kfree(ev); |
@@ -4710,11 +4559,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
4710 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | 4559 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, |
4711 | struct cftype *cft, struct cgroup_map_cb *cb) | 4560 | struct cftype *cft, struct cgroup_map_cb *cb) |
4712 | { | 4561 | { |
4713 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4562 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4714 | 4563 | ||
4715 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | 4564 | cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); |
4716 | 4565 | ||
4717 | if (atomic_read(&mem->under_oom)) | 4566 | if (atomic_read(&memcg->under_oom)) |
4718 | cb->fill(cb, "under_oom", 1); | 4567 | cb->fill(cb, "under_oom", 1); |
4719 | else | 4568 | else |
4720 | cb->fill(cb, "under_oom", 0); | 4569 | cb->fill(cb, "under_oom", 0); |
@@ -4724,7 +4573,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | |||
4724 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | 4573 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, |
4725 | struct cftype *cft, u64 val) | 4574 | struct cftype *cft, u64 val) |
4726 | { | 4575 | { |
4727 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | 4576 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4728 | struct mem_cgroup *parent; | 4577 | struct mem_cgroup *parent; |
4729 | 4578 | ||
4730 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | 4579 | /* cannot set to root cgroup and only 0 and 1 are allowed */ |
@@ -4736,13 +4585,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4736 | cgroup_lock(); | 4585 | cgroup_lock(); |
4737 | /* oom-kill-disable is a flag for subhierarchy. */ | 4586 | /* oom-kill-disable is a flag for subhierarchy. */ |
4738 | if ((parent->use_hierarchy) || | 4587 | if ((parent->use_hierarchy) || |
4739 | (mem->use_hierarchy && !list_empty(&cgrp->children))) { | 4588 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { |
4740 | cgroup_unlock(); | 4589 | cgroup_unlock(); |
4741 | return -EINVAL; | 4590 | return -EINVAL; |
4742 | } | 4591 | } |
4743 | mem->oom_kill_disable = val; | 4592 | memcg->oom_kill_disable = val; |
4744 | if (!val) | 4593 | if (!val) |
4745 | memcg_oom_recover(mem); | 4594 | memcg_oom_recover(memcg); |
4746 | cgroup_unlock(); | 4595 | cgroup_unlock(); |
4747 | return 0; | 4596 | return 0; |
4748 | } | 4597 | } |
@@ -4763,54 +4612,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) | |||
4763 | } | 4612 | } |
4764 | #endif /* CONFIG_NUMA */ | 4613 | #endif /* CONFIG_NUMA */ |
4765 | 4614 | ||
4766 | static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, | ||
4767 | struct cftype *cft, | ||
4768 | struct cgroup_map_cb *cb) | ||
4769 | { | ||
4770 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
4771 | char string[64]; | ||
4772 | int i; | ||
4773 | |||
4774 | for (i = 0; i < NR_SCANSTATS; i++) { | ||
4775 | strcpy(string, scanstat_string[i]); | ||
4776 | strcat(string, SCANSTAT_WORD_LIMIT); | ||
4777 | cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); | ||
4778 | } | ||
4779 | |||
4780 | for (i = 0; i < NR_SCANSTATS; i++) { | ||
4781 | strcpy(string, scanstat_string[i]); | ||
4782 | strcat(string, SCANSTAT_WORD_SYSTEM); | ||
4783 | cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); | ||
4784 | } | ||
4785 | |||
4786 | for (i = 0; i < NR_SCANSTATS; i++) { | ||
4787 | strcpy(string, scanstat_string[i]); | ||
4788 | strcat(string, SCANSTAT_WORD_LIMIT); | ||
4789 | strcat(string, SCANSTAT_WORD_HIERARCHY); | ||
4790 | cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); | ||
4791 | } | ||
4792 | for (i = 0; i < NR_SCANSTATS; i++) { | ||
4793 | strcpy(string, scanstat_string[i]); | ||
4794 | strcat(string, SCANSTAT_WORD_SYSTEM); | ||
4795 | strcat(string, SCANSTAT_WORD_HIERARCHY); | ||
4796 | cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); | ||
4797 | } | ||
4798 | return 0; | ||
4799 | } | ||
4800 | |||
4801 | static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, | ||
4802 | unsigned int event) | ||
4803 | { | ||
4804 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
4805 | |||
4806 | spin_lock(&mem->scanstat.lock); | ||
4807 | memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); | ||
4808 | memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); | ||
4809 | spin_unlock(&mem->scanstat.lock); | ||
4810 | return 0; | ||
4811 | } | ||
4812 | |||
4813 | |||
4814 | static struct cftype mem_cgroup_files[] = { | 4615 | static struct cftype mem_cgroup_files[] = { |
4815 | { | 4616 | { |
4816 | .name = "usage_in_bytes", | 4617 | .name = "usage_in_bytes", |
@@ -4881,11 +4682,6 @@ static struct cftype mem_cgroup_files[] = { | |||
4881 | .mode = S_IRUGO, | 4682 | .mode = S_IRUGO, |
4882 | }, | 4683 | }, |
4883 | #endif | 4684 | #endif |
4884 | { | ||
4885 | .name = "vmscan_stat", | ||
4886 | .read_map = mem_cgroup_vmscan_stat_read, | ||
4887 | .trigger = mem_cgroup_reset_vmscan_stat, | ||
4888 | }, | ||
4889 | }; | 4685 | }; |
4890 | 4686 | ||
4891 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4687 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -4931,7 +4727,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) | |||
4931 | } | 4727 | } |
4932 | #endif | 4728 | #endif |
4933 | 4729 | ||
4934 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 4730 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
4935 | { | 4731 | { |
4936 | struct mem_cgroup_per_node *pn; | 4732 | struct mem_cgroup_per_node *pn; |
4937 | struct mem_cgroup_per_zone *mz; | 4733 | struct mem_cgroup_per_zone *mz; |
@@ -4951,21 +4747,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
4951 | if (!pn) | 4747 | if (!pn) |
4952 | return 1; | 4748 | return 1; |
4953 | 4749 | ||
4954 | mem->info.nodeinfo[node] = pn; | ||
4955 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4750 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4956 | mz = &pn->zoneinfo[zone]; | 4751 | mz = &pn->zoneinfo[zone]; |
4957 | for_each_lru(l) | 4752 | for_each_lru(l) |
4958 | INIT_LIST_HEAD(&mz->lists[l]); | 4753 | INIT_LIST_HEAD(&mz->lists[l]); |
4959 | mz->usage_in_excess = 0; | 4754 | mz->usage_in_excess = 0; |
4960 | mz->on_tree = false; | 4755 | mz->on_tree = false; |
4961 | mz->mem = mem; | 4756 | mz->mem = memcg; |
4962 | } | 4757 | } |
4758 | memcg->info.nodeinfo[node] = pn; | ||
4963 | return 0; | 4759 | return 0; |
4964 | } | 4760 | } |
4965 | 4761 | ||
4966 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 4762 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
4967 | { | 4763 | { |
4968 | kfree(mem->info.nodeinfo[node]); | 4764 | kfree(memcg->info.nodeinfo[node]); |
4969 | } | 4765 | } |
4970 | 4766 | ||
4971 | static struct mem_cgroup *mem_cgroup_alloc(void) | 4767 | static struct mem_cgroup *mem_cgroup_alloc(void) |
@@ -5007,51 +4803,51 @@ out_free: | |||
5007 | * Removal of cgroup itself succeeds regardless of refs from swap. | 4803 | * Removal of cgroup itself succeeds regardless of refs from swap. |
5008 | */ | 4804 | */ |
5009 | 4805 | ||
5010 | static void __mem_cgroup_free(struct mem_cgroup *mem) | 4806 | static void __mem_cgroup_free(struct mem_cgroup *memcg) |
5011 | { | 4807 | { |
5012 | int node; | 4808 | int node; |
5013 | 4809 | ||
5014 | mem_cgroup_remove_from_trees(mem); | 4810 | mem_cgroup_remove_from_trees(memcg); |
5015 | free_css_id(&mem_cgroup_subsys, &mem->css); | 4811 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
5016 | 4812 | ||
5017 | for_each_node_state(node, N_POSSIBLE) | 4813 | for_each_node_state(node, N_POSSIBLE) |
5018 | free_mem_cgroup_per_zone_info(mem, node); | 4814 | free_mem_cgroup_per_zone_info(memcg, node); |
5019 | 4815 | ||
5020 | free_percpu(mem->stat); | 4816 | free_percpu(memcg->stat); |
5021 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | 4817 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) |
5022 | kfree(mem); | 4818 | kfree(memcg); |
5023 | else | 4819 | else |
5024 | vfree(mem); | 4820 | vfree(memcg); |
5025 | } | 4821 | } |
5026 | 4822 | ||
5027 | static void mem_cgroup_get(struct mem_cgroup *mem) | 4823 | static void mem_cgroup_get(struct mem_cgroup *memcg) |
5028 | { | 4824 | { |
5029 | atomic_inc(&mem->refcnt); | 4825 | atomic_inc(&memcg->refcnt); |
5030 | } | 4826 | } |
5031 | 4827 | ||
5032 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) | 4828 | static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) |
5033 | { | 4829 | { |
5034 | if (atomic_sub_and_test(count, &mem->refcnt)) { | 4830 | if (atomic_sub_and_test(count, &memcg->refcnt)) { |
5035 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 4831 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
5036 | __mem_cgroup_free(mem); | 4832 | __mem_cgroup_free(memcg); |
5037 | if (parent) | 4833 | if (parent) |
5038 | mem_cgroup_put(parent); | 4834 | mem_cgroup_put(parent); |
5039 | } | 4835 | } |
5040 | } | 4836 | } |
5041 | 4837 | ||
5042 | static void mem_cgroup_put(struct mem_cgroup *mem) | 4838 | static void mem_cgroup_put(struct mem_cgroup *memcg) |
5043 | { | 4839 | { |
5044 | __mem_cgroup_put(mem, 1); | 4840 | __mem_cgroup_put(memcg, 1); |
5045 | } | 4841 | } |
5046 | 4842 | ||
5047 | /* | 4843 | /* |
5048 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 4844 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
5049 | */ | 4845 | */ |
5050 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) | 4846 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) |
5051 | { | 4847 | { |
5052 | if (!mem->res.parent) | 4848 | if (!memcg->res.parent) |
5053 | return NULL; | 4849 | return NULL; |
5054 | return mem_cgroup_from_res_counter(mem->res.parent, res); | 4850 | return mem_cgroup_from_res_counter(memcg->res.parent, res); |
5055 | } | 4851 | } |
5056 | 4852 | ||
5057 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4853 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -5094,16 +4890,16 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
5094 | static struct cgroup_subsys_state * __ref | 4890 | static struct cgroup_subsys_state * __ref |
5095 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 4891 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
5096 | { | 4892 | { |
5097 | struct mem_cgroup *mem, *parent; | 4893 | struct mem_cgroup *memcg, *parent; |
5098 | long error = -ENOMEM; | 4894 | long error = -ENOMEM; |
5099 | int node; | 4895 | int node; |
5100 | 4896 | ||
5101 | mem = mem_cgroup_alloc(); | 4897 | memcg = mem_cgroup_alloc(); |
5102 | if (!mem) | 4898 | if (!memcg) |
5103 | return ERR_PTR(error); | 4899 | return ERR_PTR(error); |
5104 | 4900 | ||
5105 | for_each_node_state(node, N_POSSIBLE) | 4901 | for_each_node_state(node, N_POSSIBLE) |
5106 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 4902 | if (alloc_mem_cgroup_per_zone_info(memcg, node)) |
5107 | goto free_out; | 4903 | goto free_out; |
5108 | 4904 | ||
5109 | /* root ? */ | 4905 | /* root ? */ |
@@ -5111,7 +4907,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
5111 | int cpu; | 4907 | int cpu; |
5112 | enable_swap_cgroup(); | 4908 | enable_swap_cgroup(); |
5113 | parent = NULL; | 4909 | parent = NULL; |
5114 | root_mem_cgroup = mem; | 4910 | root_mem_cgroup = memcg; |
5115 | if (mem_cgroup_soft_limit_tree_init()) | 4911 | if (mem_cgroup_soft_limit_tree_init()) |
5116 | goto free_out; | 4912 | goto free_out; |
5117 | for_each_possible_cpu(cpu) { | 4913 | for_each_possible_cpu(cpu) { |
@@ -5122,13 +4918,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
5122 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 4918 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
5123 | } else { | 4919 | } else { |
5124 | parent = mem_cgroup_from_cont(cont->parent); | 4920 | parent = mem_cgroup_from_cont(cont->parent); |
5125 | mem->use_hierarchy = parent->use_hierarchy; | 4921 | memcg->use_hierarchy = parent->use_hierarchy; |
5126 | mem->oom_kill_disable = parent->oom_kill_disable; | 4922 | memcg->oom_kill_disable = parent->oom_kill_disable; |
5127 | } | 4923 | } |
5128 | 4924 | ||
5129 | if (parent && parent->use_hierarchy) { | 4925 | if (parent && parent->use_hierarchy) { |
5130 | res_counter_init(&mem->res, &parent->res); | 4926 | res_counter_init(&memcg->res, &parent->res); |
5131 | res_counter_init(&mem->memsw, &parent->memsw); | 4927 | res_counter_init(&memcg->memsw, &parent->memsw); |
5132 | /* | 4928 | /* |
5133 | * We increment refcnt of the parent to ensure that we can | 4929 | * We increment refcnt of the parent to ensure that we can |
5134 | * safely access it on res_counter_charge/uncharge. | 4930 | * safely access it on res_counter_charge/uncharge. |
@@ -5137,22 +4933,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
5137 | */ | 4933 | */ |
5138 | mem_cgroup_get(parent); | 4934 | mem_cgroup_get(parent); |
5139 | } else { | 4935 | } else { |
5140 | res_counter_init(&mem->res, NULL); | 4936 | res_counter_init(&memcg->res, NULL); |
5141 | res_counter_init(&mem->memsw, NULL); | 4937 | res_counter_init(&memcg->memsw, NULL); |
5142 | } | 4938 | } |
5143 | mem->last_scanned_child = 0; | 4939 | memcg->last_scanned_child = 0; |
5144 | mem->last_scanned_node = MAX_NUMNODES; | 4940 | memcg->last_scanned_node = MAX_NUMNODES; |
5145 | INIT_LIST_HEAD(&mem->oom_notify); | 4941 | INIT_LIST_HEAD(&memcg->oom_notify); |
5146 | 4942 | ||
5147 | if (parent) | 4943 | if (parent) |
5148 | mem->swappiness = mem_cgroup_swappiness(parent); | 4944 | memcg->swappiness = mem_cgroup_swappiness(parent); |
5149 | atomic_set(&mem->refcnt, 1); | 4945 | atomic_set(&memcg->refcnt, 1); |
5150 | mem->move_charge_at_immigrate = 0; | 4946 | memcg->move_charge_at_immigrate = 0; |
5151 | mutex_init(&mem->thresholds_lock); | 4947 | mutex_init(&memcg->thresholds_lock); |
5152 | spin_lock_init(&mem->scanstat.lock); | 4948 | return &memcg->css; |
5153 | return &mem->css; | ||
5154 | free_out: | 4949 | free_out: |
5155 | __mem_cgroup_free(mem); | 4950 | __mem_cgroup_free(memcg); |
5156 | root_mem_cgroup = NULL; | 4951 | root_mem_cgroup = NULL; |
5157 | return ERR_PTR(error); | 4952 | return ERR_PTR(error); |
5158 | } | 4953 | } |
@@ -5160,17 +4955,17 @@ free_out: | |||
5160 | static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | 4955 | static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, |
5161 | struct cgroup *cont) | 4956 | struct cgroup *cont) |
5162 | { | 4957 | { |
5163 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 4958 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5164 | 4959 | ||
5165 | return mem_cgroup_force_empty(mem, false); | 4960 | return mem_cgroup_force_empty(memcg, false); |
5166 | } | 4961 | } |
5167 | 4962 | ||
5168 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 4963 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
5169 | struct cgroup *cont) | 4964 | struct cgroup *cont) |
5170 | { | 4965 | { |
5171 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 4966 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5172 | 4967 | ||
5173 | mem_cgroup_put(mem); | 4968 | mem_cgroup_put(memcg); |
5174 | } | 4969 | } |
5175 | 4970 | ||
5176 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 4971 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
@@ -5193,9 +4988,9 @@ static int mem_cgroup_do_precharge(unsigned long count) | |||
5193 | { | 4988 | { |
5194 | int ret = 0; | 4989 | int ret = 0; |
5195 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | 4990 | int batch_count = PRECHARGE_COUNT_AT_ONCE; |
5196 | struct mem_cgroup *mem = mc.to; | 4991 | struct mem_cgroup *memcg = mc.to; |
5197 | 4992 | ||
5198 | if (mem_cgroup_is_root(mem)) { | 4993 | if (mem_cgroup_is_root(memcg)) { |
5199 | mc.precharge += count; | 4994 | mc.precharge += count; |
5200 | /* we don't need css_get for root */ | 4995 | /* we don't need css_get for root */ |
5201 | return ret; | 4996 | return ret; |
@@ -5204,16 +4999,16 @@ static int mem_cgroup_do_precharge(unsigned long count) | |||
5204 | if (count > 1) { | 4999 | if (count > 1) { |
5205 | struct res_counter *dummy; | 5000 | struct res_counter *dummy; |
5206 | /* | 5001 | /* |
5207 | * "mem" cannot be under rmdir() because we've already checked | 5002 | * "memcg" cannot be under rmdir() because we've already checked |
5208 | * by cgroup_lock_live_cgroup() that it is not removed and we | 5003 | * by cgroup_lock_live_cgroup() that it is not removed and we |
5209 | * are still under the same cgroup_mutex. So we can postpone | 5004 | * are still under the same cgroup_mutex. So we can postpone |
5210 | * css_get(). | 5005 | * css_get(). |
5211 | */ | 5006 | */ |
5212 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | 5007 | if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) |
5213 | goto one_by_one; | 5008 | goto one_by_one; |
5214 | if (do_swap_account && res_counter_charge(&mem->memsw, | 5009 | if (do_swap_account && res_counter_charge(&memcg->memsw, |
5215 | PAGE_SIZE * count, &dummy)) { | 5010 | PAGE_SIZE * count, &dummy)) { |
5216 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | 5011 | res_counter_uncharge(&memcg->res, PAGE_SIZE * count); |
5217 | goto one_by_one; | 5012 | goto one_by_one; |
5218 | } | 5013 | } |
5219 | mc.precharge += count; | 5014 | mc.precharge += count; |
@@ -5230,8 +5025,9 @@ one_by_one: | |||
5230 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 5025 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
5231 | cond_resched(); | 5026 | cond_resched(); |
5232 | } | 5027 | } |
5233 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); | 5028 | ret = __mem_cgroup_try_charge(NULL, |
5234 | if (ret || !mem) | 5029 | GFP_KERNEL, 1, &memcg, false); |
5030 | if (ret || !memcg) | ||
5235 | /* mem_cgroup_clear_mc() will do uncharge later */ | 5031 | /* mem_cgroup_clear_mc() will do uncharge later */ |
5236 | return -ENOMEM; | 5032 | return -ENOMEM; |
5237 | mc.precharge++; | 5033 | mc.precharge++; |
@@ -5330,15 +5126,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
5330 | pgoff = pte_to_pgoff(ptent); | 5126 | pgoff = pte_to_pgoff(ptent); |
5331 | 5127 | ||
5332 | /* page is moved even if it's not RSS of this task(page-faulted). */ | 5128 | /* page is moved even if it's not RSS of this task(page-faulted). */ |
5333 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ | 5129 | page = find_get_page(mapping, pgoff); |
5334 | page = find_get_page(mapping, pgoff); | 5130 | |
5335 | } else { /* shmem/tmpfs file. we should take account of swap too. */ | 5131 | #ifdef CONFIG_SWAP |
5336 | swp_entry_t ent; | 5132 | /* shmem/tmpfs may report page out on swap: account for that too. */ |
5337 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); | 5133 | if (radix_tree_exceptional_entry(page)) { |
5134 | swp_entry_t swap = radix_to_swp_entry(page); | ||
5338 | if (do_swap_account) | 5135 | if (do_swap_account) |
5339 | entry->val = ent.val; | 5136 | *entry = swap; |
5137 | page = find_get_page(&swapper_space, swap.val); | ||
5340 | } | 5138 | } |
5341 | 5139 | #endif | |
5342 | return page; | 5140 | return page; |
5343 | } | 5141 | } |
5344 | 5142 | ||
@@ -5503,13 +5301,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
5503 | struct task_struct *p) | 5301 | struct task_struct *p) |
5504 | { | 5302 | { |
5505 | int ret = 0; | 5303 | int ret = 0; |
5506 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | 5304 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); |
5507 | 5305 | ||
5508 | if (mem->move_charge_at_immigrate) { | 5306 | if (memcg->move_charge_at_immigrate) { |
5509 | struct mm_struct *mm; | 5307 | struct mm_struct *mm; |
5510 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 5308 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
5511 | 5309 | ||
5512 | VM_BUG_ON(from == mem); | 5310 | VM_BUG_ON(from == memcg); |
5513 | 5311 | ||
5514 | mm = get_task_mm(p); | 5312 | mm = get_task_mm(p); |
5515 | if (!mm) | 5313 | if (!mm) |
@@ -5524,7 +5322,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
5524 | mem_cgroup_start_move(from); | 5322 | mem_cgroup_start_move(from); |
5525 | spin_lock(&mc.lock); | 5323 | spin_lock(&mc.lock); |
5526 | mc.from = from; | 5324 | mc.from = from; |
5527 | mc.to = mem; | 5325 | mc.to = memcg; |
5528 | spin_unlock(&mc.lock); | 5326 | spin_unlock(&mc.lock); |
5529 | /* We set mc.moving_task later */ | 5327 | /* We set mc.moving_task later */ |
5530 | 5328 | ||
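Reviewer note, not part of the diff: besides the mechanical mem -> memcg rename, the memcontrol.c hunks drop the per-memcg vmscan_stat machinery and simplify the try_to_free_mem_cgroup_pages() call. One detail worth keeping in mind from the mem_cgroup_usage() hunk above is the unit conversion: for the root cgroup the usage is rebuilt from per-page statistics, so it must be shifted from pages to bytes. A minimal sketch of that arithmetic, assuming 4 KiB pages (PAGE_SHIFT == 12); the variable names are illustrative:

    /* cache + rss (+ swap when accounted) are page counts; usage is reported in bytes */
    u64 pages = cache_pages + rss_pages + swap_pages;
    u64 usage = pages << PAGE_SHIFT;	/* e.g. 300 pages << 12 == 1228800 bytes */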
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 740c4f52059c..06d3479513aa 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/sched.h> | 42 | #include <linux/sched.h> |
43 | #include <linux/ksm.h> | 43 | #include <linux/ksm.h> |
44 | #include <linux/rmap.h> | 44 | #include <linux/rmap.h> |
45 | #include <linux/export.h> | ||
45 | #include <linux/pagemap.h> | 46 | #include <linux/pagemap.h> |
46 | #include <linux/swap.h> | 47 | #include <linux/swap.h> |
47 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
@@ -53,6 +54,7 @@ | |||
53 | #include <linux/hugetlb.h> | 54 | #include <linux/hugetlb.h> |
54 | #include <linux/memory_hotplug.h> | 55 | #include <linux/memory_hotplug.h> |
55 | #include <linux/mm_inline.h> | 56 | #include <linux/mm_inline.h> |
57 | #include <linux/kfifo.h> | ||
56 | #include "internal.h" | 58 | #include "internal.h" |
57 | 59 | ||
58 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 60 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -1178,6 +1180,97 @@ void memory_failure(unsigned long pfn, int trapno) | |||
1178 | __memory_failure(pfn, trapno, 0); | 1180 | __memory_failure(pfn, trapno, 0); |
1179 | } | 1181 | } |
1180 | 1182 | ||
1183 | #define MEMORY_FAILURE_FIFO_ORDER 4 | ||
1184 | #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) | ||
1185 | |||
1186 | struct memory_failure_entry { | ||
1187 | unsigned long pfn; | ||
1188 | int trapno; | ||
1189 | int flags; | ||
1190 | }; | ||
1191 | |||
1192 | struct memory_failure_cpu { | ||
1193 | DECLARE_KFIFO(fifo, struct memory_failure_entry, | ||
1194 | MEMORY_FAILURE_FIFO_SIZE); | ||
1195 | spinlock_t lock; | ||
1196 | struct work_struct work; | ||
1197 | }; | ||
1198 | |||
1199 | static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); | ||
1200 | |||
1201 | /** | ||
1202 | * memory_failure_queue - Schedule handling memory failure of a page. | ||
1203 | * @pfn: Page Number of the corrupted page | ||
1204 | * @trapno: Trap number reported in the signal to user space. | ||
1205 | * @flags: Flags for memory failure handling | ||
1206 | * | ||
1207 | * This function is called by the low level hardware error handler | ||
1208 | * when it detects hardware memory corruption of a page. It schedules | ||
1209 | * the recovering of error page, including dropping pages, killing | ||
1210 | * processes etc. | ||
1211 | * | ||
1212 | * The function is primarily of use for corruptions that | ||
1213 | * happen outside the current execution context (e.g. when | ||
1214 | * detected by a background scrubber) | ||
1215 | * | ||
1216 | * Can run in IRQ context. | ||
1217 | */ | ||
1218 | void memory_failure_queue(unsigned long pfn, int trapno, int flags) | ||
1219 | { | ||
1220 | struct memory_failure_cpu *mf_cpu; | ||
1221 | unsigned long proc_flags; | ||
1222 | struct memory_failure_entry entry = { | ||
1223 | .pfn = pfn, | ||
1224 | .trapno = trapno, | ||
1225 | .flags = flags, | ||
1226 | }; | ||
1227 | |||
1228 | mf_cpu = &get_cpu_var(memory_failure_cpu); | ||
1229 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | ||
1230 | if (kfifo_put(&mf_cpu->fifo, &entry)) | ||
1231 | schedule_work_on(smp_processor_id(), &mf_cpu->work); | ||
1232 | else | ||
1233 | pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", ||
1234 | pfn); | ||
1235 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | ||
1236 | put_cpu_var(memory_failure_cpu); | ||
1237 | } | ||
1238 | EXPORT_SYMBOL_GPL(memory_failure_queue); | ||
1239 | |||
1240 | static void memory_failure_work_func(struct work_struct *work) | ||
1241 | { | ||
1242 | struct memory_failure_cpu *mf_cpu; | ||
1243 | struct memory_failure_entry entry = { 0, }; | ||
1244 | unsigned long proc_flags; | ||
1245 | int gotten; | ||
1246 | |||
1247 | mf_cpu = &__get_cpu_var(memory_failure_cpu); | ||
1248 | for (;;) { | ||
1249 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | ||
1250 | gotten = kfifo_get(&mf_cpu->fifo, &entry); | ||
1251 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | ||
1252 | if (!gotten) | ||
1253 | break; | ||
1254 | __memory_failure(entry.pfn, entry.trapno, entry.flags); | ||
1255 | } | ||
1256 | } | ||
1257 | |||
1258 | static int __init memory_failure_init(void) | ||
1259 | { | ||
1260 | struct memory_failure_cpu *mf_cpu; | ||
1261 | int cpu; | ||
1262 | |||
1263 | for_each_possible_cpu(cpu) { | ||
1264 | mf_cpu = &per_cpu(memory_failure_cpu, cpu); | ||
1265 | spin_lock_init(&mf_cpu->lock); | ||
1266 | INIT_KFIFO(mf_cpu->fifo); | ||
1267 | INIT_WORK(&mf_cpu->work, memory_failure_work_func); | ||
1268 | } | ||
1269 | |||
1270 | return 0; | ||
1271 | } | ||
1272 | core_initcall(memory_failure_init); | ||
1273 | |||
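Reviewer note, not part of the diff: memory_failure_queue() above gives hardware error handlers that run in IRQ-like context a way to defer the heavyweight recovery done by __memory_failure(); entries are pushed onto a per-cpu kfifo and drained later by memory_failure_work_func(). A minimal sketch of a caller, assuming a hypothetical scrubber or machine-check path that has already resolved the faulting physical address (the function name and the zero trapno/flags values are illustrative):

    /* illustrative caller in a hardware error handler, safe in IRQ context */
    static void example_report_bad_page(u64 phys_addr)
    {
    	unsigned long pfn = phys_addr >> PAGE_SHIFT;

    	/* recovery (dropping pages, killing processes) runs later from a workqueue */
    	memory_failure_queue(pfn, 0 /* trapno */, 0 /* flags */);
    }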
1181 | /** | 1274 | /** |
1182 | * unpoison_memory - Unpoison a previously poisoned page | 1275 | * unpoison_memory - Unpoison a previously poisoned page |
1183 | * @pfn: Page number of the to be unpoisoned page | 1276 | * @pfn: Page number of the to be unpoisoned page |
@@ -1218,7 +1311,7 @@ int unpoison_memory(unsigned long pfn) | |||
1218 | * to the end. | 1311 | * to the end. |
1219 | */ | 1312 | */ |
1220 | if (PageHuge(page)) { | 1313 | if (PageHuge(page)) { |
1221 | pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); | 1314 | pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); |
1222 | return 0; | 1315 | return 0; |
1223 | } | 1316 | } |
1224 | if (TestClearPageHWPoison(p)) | 1317 | if (TestClearPageHWPoison(p)) |
@@ -1327,7 +1420,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1327 | 1420 | ||
1328 | if (PageHWPoison(hpage)) { | 1421 | if (PageHWPoison(hpage)) { |
1329 | put_page(hpage); | 1422 | put_page(hpage); |
1330 | pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); | 1423 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); |
1331 | return -EBUSY; | 1424 | return -EBUSY; |
1332 | } | 1425 | } |
1333 | 1426 | ||
@@ -1341,8 +1434,8 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1341 | list_for_each_entry_safe(page1, page2, &pagelist, lru) | 1434 | list_for_each_entry_safe(page1, page2, &pagelist, lru) |
1342 | put_page(page1); | 1435 | put_page(page1); |
1343 | 1436 | ||
1344 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | 1437 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1345 | pfn, ret, page->flags); | 1438 | pfn, ret, page->flags); |
1346 | if (ret > 0) | 1439 | if (ret > 0) |
1347 | ret = -EIO; | 1440 | ret = -EIO; |
1348 | return ret; | 1441 | return ret; |
@@ -1413,7 +1506,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1413 | } | 1506 | } |
1414 | if (!PageLRU(page)) { | 1507 | if (!PageLRU(page)) { |
1415 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | 1508 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", |
1416 | pfn, page->flags); | 1509 | pfn, page->flags); |
1417 | return -EIO; | 1510 | return -EIO; |
1418 | } | 1511 | } |
1419 | 1512 | ||
@@ -1474,7 +1567,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1474 | } | 1567 | } |
1475 | } else { | 1568 | } else { |
1476 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1569 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", |
1477 | pfn, ret, page_count(page), page->flags); | 1570 | pfn, ret, page_count(page), page->flags); |
1478 | } | 1571 | } |
1479 | if (ret) | 1572 | if (ret) |
1480 | return ret; | 1573 | return ret; |
diff --git a/mm/memory.c b/mm/memory.c index a56e3ba816b2..829d43735402 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -47,7 +47,7 @@ | |||
47 | #include <linux/pagemap.h> | 47 | #include <linux/pagemap.h> |
48 | #include <linux/ksm.h> | 48 | #include <linux/ksm.h> |
49 | #include <linux/rmap.h> | 49 | #include <linux/rmap.h> |
50 | #include <linux/module.h> | 50 | #include <linux/export.h> |
51 | #include <linux/delayacct.h> | 51 | #include <linux/delayacct.h> |
52 | #include <linux/init.h> | 52 | #include <linux/init.h> |
53 | #include <linux/writeback.h> | 53 | #include <linux/writeback.h> |
@@ -1503,7 +1503,7 @@ split_fallthrough: | |||
1503 | } | 1503 | } |
1504 | 1504 | ||
1505 | if (flags & FOLL_GET) | 1505 | if (flags & FOLL_GET) |
1506 | get_page(page); | 1506 | get_page_foll(page); |
1507 | if (flags & FOLL_TOUCH) { | 1507 | if (flags & FOLL_TOUCH) { |
1508 | if ((flags & FOLL_WRITE) && | 1508 | if ((flags & FOLL_WRITE) && |
1509 | !pte_dirty(pte) && !PageDirty(page)) | 1509 | !pte_dirty(pte) && !PageDirty(page)) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6e7d8b21dbfa..2168489c0bc9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -11,7 +11,7 @@ | |||
11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
12 | #include <linux/bootmem.h> | 12 | #include <linux/bootmem.h> |
13 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> |
16 | #include <linux/writeback.h> | 16 | #include <linux/writeback.h> |
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8b57173c1dd5..adc395481813 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -75,7 +75,7 @@ | |||
75 | #include <linux/cpuset.h> | 75 | #include <linux/cpuset.h> |
76 | #include <linux/slab.h> | 76 | #include <linux/slab.h> |
77 | #include <linux/string.h> | 77 | #include <linux/string.h> |
78 | #include <linux/module.h> | 78 | #include <linux/export.h> |
79 | #include <linux/nsproxy.h> | 79 | #include <linux/nsproxy.h> |
80 | #include <linux/interrupt.h> | 80 | #include <linux/interrupt.h> |
81 | #include <linux/init.h> | 81 | #include <linux/init.h> |
@@ -111,7 +111,7 @@ enum zone_type policy_zone = 0; | |||
111 | /* | 111 | /* |
112 | * run-time system-wide default policy => local allocation | 112 | * run-time system-wide default policy => local allocation |
113 | */ | 113 | */ |
114 | struct mempolicy default_policy = { | 114 | static struct mempolicy default_policy = { |
115 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 115 | .refcnt = ATOMIC_INIT(1), /* never free it */ |
116 | .mode = MPOL_PREFERRED, | 116 | .mode = MPOL_PREFERRED, |
117 | .flags = MPOL_F_LOCAL, | 117 | .flags = MPOL_F_LOCAL, |
@@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
636 | struct vm_area_struct *prev; | 636 | struct vm_area_struct *prev; |
637 | struct vm_area_struct *vma; | 637 | struct vm_area_struct *vma; |
638 | int err = 0; | 638 | int err = 0; |
639 | pgoff_t pgoff; | ||
640 | unsigned long vmstart; | 639 | unsigned long vmstart; |
641 | unsigned long vmend; | 640 | unsigned long vmend; |
642 | 641 | ||
@@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
649 | vmstart = max(start, vma->vm_start); | 648 | vmstart = max(start, vma->vm_start); |
650 | vmend = min(end, vma->vm_end); | 649 | vmend = min(end, vma->vm_end); |
651 | 650 | ||
652 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | ||
653 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, | 651 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
654 | vma->anon_vma, vma->vm_file, pgoff, new_pol); | 652 | vma->anon_vma, vma->vm_file, vma->vm_pgoff, |
653 | new_pol); | ||
655 | if (prev) { | 654 | if (prev) { |
656 | vma = prev; | 655 | vma = prev; |
657 | next = vma->vm_next; | 656 | next = vma->vm_next; |
@@ -1412,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, | |||
1412 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); | 1411 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); |
1413 | 1412 | ||
1414 | if (!err && nmask) { | 1413 | if (!err && nmask) { |
1415 | err = copy_from_user(bm, nm, alloc_size); | 1414 | unsigned long copy_size; |
1415 | copy_size = min_t(unsigned long, sizeof(bm), alloc_size); | ||
1416 | err = copy_from_user(bm, nm, copy_size); | ||
1416 | /* ensure entire bitmap is zeroed */ | 1417 | /* ensure entire bitmap is zeroed */ |
1417 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); | 1418 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); |
1418 | err |= compat_put_bitmap(nmask, bm, nr_bits); | 1419 | err |= compat_put_bitmap(nmask, bm, nr_bits); |
diff --git a/mm/mempool.c b/mm/mempool.c index 1a3bc3d4d554..e73641b79bb5 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -10,7 +10,7 @@ | |||
10 | 10 | ||
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/module.h> | 13 | #include <linux/export.h> |
14 | #include <linux/mempool.h> | 14 | #include <linux/mempool.h> |
15 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
16 | #include <linux/writeback.h> | 16 | #include <linux/writeback.h> |
diff --git a/mm/migrate.c b/mm/migrate.c index 666e4e677414..578e29174fa6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -13,7 +13,7 @@ | |||
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/migrate.h> | 15 | #include <linux/migrate.h> |
16 | #include <linux/module.h> | 16 | #include <linux/export.h> |
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/swapops.h> | 18 | #include <linux/swapops.h> |
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
120 | 120 | ||
121 | ptep = pte_offset_map(pmd, addr); | 121 | ptep = pte_offset_map(pmd, addr); |
122 | 122 | ||
123 | if (!is_swap_pte(*ptep)) { | 123 | /* |
124 | pte_unmap(ptep); | 124 | * Peek to check is_swap_pte() before taking ptlock? No, we |
125 | goto out; | 125 | * can race mremap's move_ptes(), which skips anon_vma lock. |
126 | } | 126 | */ |
127 | 127 | ||
128 | ptl = pte_lockptr(mm, pmd); | 128 | ptl = pte_lockptr(mm, pmd); |
129 | } | 129 | } |
@@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
621 | return rc; | 621 | return rc; |
622 | } | 622 | } |
623 | 623 | ||
624 | /* | 624 | static int __unmap_and_move(struct page *page, struct page *newpage, |
625 | * Obtain the lock on page, remove all ptes and migrate the page | 625 | int force, bool offlining, bool sync) |
626 | * to the newly allocated page in newpage. | ||
627 | */ | ||
628 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
629 | struct page *page, int force, bool offlining, bool sync) | ||
630 | { | 626 | { |
631 | int rc = 0; | 627 | int rc = -EAGAIN; |
632 | int *result = NULL; | ||
633 | struct page *newpage = get_new_page(page, private, &result); | ||
634 | int remap_swapcache = 1; | 628 | int remap_swapcache = 1; |
635 | int charge = 0; | 629 | int charge = 0; |
636 | struct mem_cgroup *mem; | 630 | struct mem_cgroup *mem; |
637 | struct anon_vma *anon_vma = NULL; | 631 | struct anon_vma *anon_vma = NULL; |
638 | 632 | ||
639 | if (!newpage) | ||
640 | return -ENOMEM; | ||
641 | |||
642 | if (page_count(page) == 1) { | ||
643 | /* page was freed from under us. So we are done. */ | ||
644 | goto move_newpage; | ||
645 | } | ||
646 | if (unlikely(PageTransHuge(page))) | ||
647 | if (unlikely(split_huge_page(page))) | ||
648 | goto move_newpage; | ||
649 | |||
650 | /* prepare cgroup just returns 0 or -ENOMEM */ | ||
651 | rc = -EAGAIN; | ||
652 | |||
653 | if (!trylock_page(page)) { | 633 | if (!trylock_page(page)) { |
654 | if (!force || !sync) | 634 | if (!force || !sync) |
655 | goto move_newpage; | 635 | goto out; |
656 | 636 | ||
657 | /* | 637 | /* |
658 | * It's not safe for direct compaction to call lock_page. | 638 | * It's not safe for direct compaction to call lock_page. |
@@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
668 | * altogether. | 648 | * altogether. |
669 | */ | 649 | */ |
670 | if (current->flags & PF_MEMALLOC) | 650 | if (current->flags & PF_MEMALLOC) |
671 | goto move_newpage; | 651 | goto out; |
672 | 652 | ||
673 | lock_page(page); | 653 | lock_page(page); |
674 | } | 654 | } |
@@ -785,27 +765,52 @@ uncharge: | |||
785 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | 765 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
786 | unlock: | 766 | unlock: |
787 | unlock_page(page); | 767 | unlock_page(page); |
768 | out: | ||
769 | return rc; | ||
770 | } | ||
788 | 771 | ||
789 | move_newpage: | 772 | /* |
773 | * Obtain the lock on page, remove all ptes and migrate the page | ||
774 | * to the newly allocated page in newpage. | ||
775 | */ | ||
776 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
777 | struct page *page, int force, bool offlining, bool sync) | ||
778 | { | ||
779 | int rc = 0; | ||
780 | int *result = NULL; | ||
781 | struct page *newpage = get_new_page(page, private, &result); | ||
782 | |||
783 | if (!newpage) | ||
784 | return -ENOMEM; | ||
785 | |||
786 | if (page_count(page) == 1) { | ||
787 | /* page was freed from under us. So we are done. */ | ||
788 | goto out; | ||
789 | } | ||
790 | |||
791 | if (unlikely(PageTransHuge(page))) | ||
792 | if (unlikely(split_huge_page(page))) | ||
793 | goto out; | ||
794 | |||
795 | rc = __unmap_and_move(page, newpage, force, offlining, sync); | ||
796 | out: | ||
790 | if (rc != -EAGAIN) { | 797 | if (rc != -EAGAIN) { |
791 | /* | 798 | /* |
792 | * A page that has been migrated has all references | 799 | * A page that has been migrated has all references |
793 | * removed and will be freed. A page that has not been | 800 | * removed and will be freed. A page that has not been |
794 | * migrated will have kepts its references and be | 801 | * migrated will have kepts its references and be |
795 | * restored. | 802 | * restored. |
796 | */ | 803 | */ |
797 | list_del(&page->lru); | 804 | list_del(&page->lru); |
798 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 805 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
799 | page_is_file_cache(page)); | 806 | page_is_file_cache(page)); |
800 | putback_lru_page(page); | 807 | putback_lru_page(page); |
801 | } | 808 | } |
802 | |||
803 | /* | 809 | /* |
804 | * Move the new page to the LRU. If migration was not successful | 810 | * Move the new page to the LRU. If migration was not successful |
805 | * then this will free the page. | 811 | * then this will free the page. |
806 | */ | 812 | */ |
807 | putback_lru_page(newpage); | 813 | putback_lru_page(newpage); |
808 | |||
809 | if (result) { | 814 | if (result) { |
810 | if (rc) | 815 | if (rc) |
811 | *result = rc; | 816 | *result = rc; |
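Reviewer note, not part of the diff: the migrate.c refactoring only moves the page-locking and unmapping half into __unmap_and_move(); unmap_and_move() still allocates the destination through the caller-supplied new_page_t callback and still puts both pages back on the LRU afterwards. A minimal sketch of such an allocator callback, matching the get_new_page(page, private, &result) shape used above and assuming the caller packs a target node id into 'private' (the name and node-id convention are illustrative):

    static struct page *example_new_page(struct page *page, unsigned long private,
    					 int **result)
    {
    	int nid = (int)private;		/* target node chosen by the caller */

    	/* 'result' could be pointed at a per-page status slot; unused in this sketch */
    	return alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
    }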
diff --git a/mm/mincore.c b/mm/mincore.c index a4e6b9d75c76..636a86876ff2 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
69 | * file will not get a swp_entry_t in its pte, but rather it is like | 69 | * file will not get a swp_entry_t in its pte, but rather it is like |
70 | * any other file mapping (ie. marked !present and faulted in with | 70 | * any other file mapping (ie. marked !present and faulted in with |
71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. | 71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. |
72 | * | ||
73 | * However when tmpfs moves the page from pagecache and into swapcache, | ||
74 | * it is still in core, but the find_get_page below won't find it. | ||
75 | * No big deal, but make a note of it. | ||
76 | */ | 72 | */ |
77 | page = find_get_page(mapping, pgoff); | 73 | page = find_get_page(mapping, pgoff); |
74 | #ifdef CONFIG_SWAP | ||
75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ | ||
76 | if (radix_tree_exceptional_entry(page)) { | ||
77 | swp_entry_t swap = radix_to_swp_entry(page); | ||
78 | page = find_get_page(&swapper_space, swap.val); | ||
79 | } | ||
80 | #endif | ||
78 | if (page) { | 81 | if (page) { |
79 | present = PageUptodate(page); | 82 | present = PageUptodate(page); |
80 | page_cache_release(page); | 83 | page_cache_release(page); |
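Reviewer note, not part of the diff: mincore_page() here and mc_handle_file_pte() in the memcontrol.c hunks now share the same idiom for shmem/tmpfs pages that were pushed out to swap: find_get_page() can return an exceptional radix-tree entry encoding the swap slot rather than a struct page, and the swap cache is probed as a second step. A condensed sketch of the pattern (mapping and pgoff are whatever the caller already holds):

    page = find_get_page(mapping, pgoff);
    #ifdef CONFIG_SWAP
    if (radix_tree_exceptional_entry(page)) {
    	swp_entry_t swap = radix_to_swp_entry(page);

    	/* the data may still sit in the swap cache */
    	page = find_get_page(&swapper_space, swap.val);
    }
    #endif
    if (page) {
    	/* page is a real, referenced struct page here */
    	page_cache_release(page);
    }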
diff --git a/mm/mlock.c b/mm/mlock.c index 048260c4e02e..4f4f53bdc65d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -14,7 +14,7 @@ | |||
14 | #include <linux/mempolicy.h> | 14 | #include <linux/mempolicy.h> |
15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/module.h> | 17 | #include <linux/export.h> |
18 | #include <linux/rmap.h> | 18 | #include <linux/rmap.h> |
19 | #include <linux/mmzone.h> | 19 | #include <linux/mmzone.h> |
20 | #include <linux/hugetlb.h> | 20 | #include <linux/hugetlb.h> |
@@ -110,7 +110,15 @@ void munlock_vma_page(struct page *page) | |||
110 | if (TestClearPageMlocked(page)) { | 110 | if (TestClearPageMlocked(page)) { |
111 | dec_zone_page_state(page, NR_MLOCK); | 111 | dec_zone_page_state(page, NR_MLOCK); |
112 | if (!isolate_lru_page(page)) { | 112 | if (!isolate_lru_page(page)) { |
113 | int ret = try_to_munlock(page); | 113 | int ret = SWAP_AGAIN; |
114 | |||
115 | /* | ||
116 | * Optimization: if the page was mapped just once, | ||
117 | * that's our mapping and we don't need to check all the | ||
118 | * other vmas. | ||
119 | */ | ||
120 | if (page_mapcount(page) > 1) | ||
121 | ret = try_to_munlock(page); | ||
114 | /* | 122 | /* |
115 | * did try_to_munlock() succeed or punt? | 123 | * did try_to_munlock() succeed or punt? |
116 | */ | 124 | */ |
@@ -549,7 +557,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
549 | if (!can_do_mlock()) | 557 | if (!can_do_mlock()) |
550 | goto out; | 558 | goto out; |
551 | 559 | ||
552 | lru_add_drain_all(); /* flush pagevec */ | 560 | if (flags & MCL_CURRENT) |
561 | lru_add_drain_all(); /* flush pagevec */ | ||
553 | 562 | ||
554 | down_write(¤t->mm->mmap_sem); | 563 | down_write(¤t->mm->mmap_sem); |
555 | 564 | ||
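Reviewer note, not part of the diff: the mlockall() hunk now drains the per-cpu pagevecs only when MCL_CURRENT is requested, since nothing is mlocked immediately in the MCL_FUTURE-only case. The userspace side is unchanged; a minimal usage sketch:

    #include <sys/mman.h>
    #include <stdio.h>

    	/* MCL_CURRENT (alone or combined) still triggers lru_add_drain_all() */
    	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
    		perror("mlockall");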
diff --git a/mm/mm_init.c b/mm/mm_init.c index 4e0e26591dfa..1ffd97ae26d7 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -8,7 +8,7 @@ | |||
8 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
9 | #include <linux/init.h> | 9 | #include <linux/init.h> |
10 | #include <linux/kobject.h> | 10 | #include <linux/kobject.h> |
11 | #include <linux/module.h> | 11 | #include <linux/export.h> |
12 | #include "internal.h" | 12 | #include "internal.h" |
13 | 13 | ||
14 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 14 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
@@ -22,7 +22,7 @@ | |||
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/hugetlb.h> | 23 | #include <linux/hugetlb.h> |
24 | #include <linux/profile.h> | 24 | #include <linux/profile.h> |
25 | #include <linux/module.h> | 25 | #include <linux/export.h> |
26 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
27 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
28 | #include <linux/rmap.h> | 28 | #include <linux/rmap.h> |
@@ -2558,7 +2558,6 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
2558 | { | 2558 | { |
2559 | struct vm_area_struct *vma; | 2559 | struct vm_area_struct *vma; |
2560 | struct anon_vma_chain *avc; | 2560 | struct anon_vma_chain *avc; |
2561 | int ret = -EINTR; | ||
2562 | 2561 | ||
2563 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2562 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
2564 | 2563 | ||
@@ -2579,13 +2578,11 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
2579 | vm_lock_anon_vma(mm, avc->anon_vma); | 2578 | vm_lock_anon_vma(mm, avc->anon_vma); |
2580 | } | 2579 | } |
2581 | 2580 | ||
2582 | ret = 0; | 2581 | return 0; |
2583 | 2582 | ||
2584 | out_unlock: | 2583 | out_unlock: |
2585 | if (ret) | 2584 | mm_drop_all_locks(mm); |
2586 | mm_drop_all_locks(mm); | 2585 | return -EINTR; |
2587 | |||
2588 | return ret; | ||
2589 | } | 2586 | } |
2590 | 2587 | ||
2591 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | 2588 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) |
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 9e82e937000e..cf332bc0080a 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
@@ -5,7 +5,7 @@ | |||
5 | 5 | ||
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/mmu_context.h> | 7 | #include <linux/mmu_context.h> |
8 | #include <linux/module.h> | 8 | #include <linux/export.h> |
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | 10 | ||
11 | #include <asm/mmu_context.h> | 11 | #include <asm/mmu_context.h> |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8d032de4088e..9a611d3a1848 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -11,7 +11,7 @@ | |||
11 | 11 | ||
12 | #include <linux/rculist.h> | 12 | #include <linux/rculist.h> |
13 | #include <linux/mmu_notifier.h> | 13 | #include <linux/mmu_notifier.h> |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
17 | #include <linux/rcupdate.h> | 17 | #include <linux/rcupdate.h> |
diff --git a/mm/mmzone.c b/mm/mmzone.c index f5b7d1760213..7cf7b7ddc7c5 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/stddef.h> | 8 | #include <linux/stddef.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
11 | #include <linux/module.h> | ||
12 | 11 | ||
13 | struct pglist_data *first_online_pgdat(void) | 12 | struct pglist_data *first_online_pgdat(void) |
14 | { | 13 | { |
diff --git a/mm/mremap.c b/mm/mremap.c index 506fa44403df..d6959cb4df58 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) | |||
41 | return NULL; | 41 | return NULL; |
42 | 42 | ||
43 | pmd = pmd_offset(pud, addr); | 43 | pmd = pmd_offset(pud, addr); |
44 | split_huge_page_pmd(mm, pmd); | 44 | if (pmd_none(*pmd)) |
45 | if (pmd_none_or_clear_bad(pmd)) | ||
46 | return NULL; | 45 | return NULL; |
47 | 46 | ||
48 | return pmd; | 47 | return pmd; |
@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
65 | return NULL; | 64 | return NULL; |
66 | 65 | ||
67 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 66 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
68 | if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) | ||
69 | return NULL; | ||
70 | 67 | ||
71 | return pmd; | 68 | return pmd; |
72 | } | 69 | } |
@@ -80,11 +77,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
80 | struct mm_struct *mm = vma->vm_mm; | 77 | struct mm_struct *mm = vma->vm_mm; |
81 | pte_t *old_pte, *new_pte, pte; | 78 | pte_t *old_pte, *new_pte, pte; |
82 | spinlock_t *old_ptl, *new_ptl; | 79 | spinlock_t *old_ptl, *new_ptl; |
83 | unsigned long old_start; | ||
84 | 80 | ||
85 | old_start = old_addr; | ||
86 | mmu_notifier_invalidate_range_start(vma->vm_mm, | ||
87 | old_start, old_end); | ||
88 | if (vma->vm_file) { | 81 | if (vma->vm_file) { |
89 | /* | 82 | /* |
90 | * Subtle point from Rajesh Venkatasubramanian: before | 83 | * Subtle point from Rajesh Venkatasubramanian: before |
@@ -111,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
111 | new_pte++, new_addr += PAGE_SIZE) { | 104 | new_pte++, new_addr += PAGE_SIZE) { |
112 | if (pte_none(*old_pte)) | 105 | if (pte_none(*old_pte)) |
113 | continue; | 106 | continue; |
114 | pte = ptep_clear_flush(vma, old_addr, old_pte); | 107 | pte = ptep_get_and_clear(mm, old_addr, old_pte); |
115 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); | 108 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); |
116 | set_pte_at(mm, new_addr, new_pte, pte); | 109 | set_pte_at(mm, new_addr, new_pte, pte); |
117 | } | 110 | } |
@@ -123,7 +116,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
123 | pte_unmap_unlock(old_pte - 1, old_ptl); | 116 | pte_unmap_unlock(old_pte - 1, old_ptl); |
124 | if (mapping) | 117 | if (mapping) |
125 | mutex_unlock(&mapping->i_mmap_mutex); | 118 | mutex_unlock(&mapping->i_mmap_mutex); |
126 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); | ||
127 | } | 119 | } |
128 | 120 | ||
129 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | 121 | #define LATENCY_LIMIT (64 * PAGE_SIZE) |
@@ -134,22 +126,43 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
134 | { | 126 | { |
135 | unsigned long extent, next, old_end; | 127 | unsigned long extent, next, old_end; |
136 | pmd_t *old_pmd, *new_pmd; | 128 | pmd_t *old_pmd, *new_pmd; |
129 | bool need_flush = false; | ||
137 | 130 | ||
138 | old_end = old_addr + len; | 131 | old_end = old_addr + len; |
139 | flush_cache_range(vma, old_addr, old_end); | 132 | flush_cache_range(vma, old_addr, old_end); |
140 | 133 | ||
134 | mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); | ||
135 | |||
141 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { | 136 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { |
142 | cond_resched(); | 137 | cond_resched(); |
143 | next = (old_addr + PMD_SIZE) & PMD_MASK; | 138 | next = (old_addr + PMD_SIZE) & PMD_MASK; |
144 | if (next - 1 > old_end) | 139 | /* even if next overflowed, extent below will be ok */ |
145 | next = old_end; | ||
146 | extent = next - old_addr; | 140 | extent = next - old_addr; |
141 | if (extent > old_end - old_addr) | ||
142 | extent = old_end - old_addr; | ||
147 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); | 143 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); |
148 | if (!old_pmd) | 144 | if (!old_pmd) |
149 | continue; | 145 | continue; |
150 | new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); | 146 | new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); |
151 | if (!new_pmd) | 147 | if (!new_pmd) |
152 | break; | 148 | break; |
149 | if (pmd_trans_huge(*old_pmd)) { | ||
150 | int err = 0; | ||
151 | if (extent == HPAGE_PMD_SIZE) | ||
152 | err = move_huge_pmd(vma, new_vma, old_addr, | ||
153 | new_addr, old_end, | ||
154 | old_pmd, new_pmd); | ||
155 | if (err > 0) { | ||
156 | need_flush = true; | ||
157 | continue; | ||
158 | } else if (!err) { | ||
159 | split_huge_page_pmd(vma->vm_mm, old_pmd); | ||
160 | } | ||
161 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); | ||
162 | } | ||
163 | if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, | ||
164 | new_pmd, new_addr)) | ||
165 | break; | ||
153 | next = (new_addr + PMD_SIZE) & PMD_MASK; | 166 | next = (new_addr + PMD_SIZE) & PMD_MASK; |
154 | if (extent > next - new_addr) | 167 | if (extent > next - new_addr) |
155 | extent = next - new_addr; | 168 | extent = next - new_addr; |
@@ -157,7 +170,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
157 | extent = LATENCY_LIMIT; | 170 | extent = LATENCY_LIMIT; |
158 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, | 171 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, |
159 | new_vma, new_pmd, new_addr); | 172 | new_vma, new_pmd, new_addr); |
173 | need_flush = true; | ||
160 | } | 174 | } |
175 | if (likely(need_flush)) | ||
176 | flush_tlb_range(vma, old_end-len, old_addr); | ||
177 | |||
178 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); | ||
161 | 179 | ||
162 | return len + old_addr - old_end; /* how much done */ | 180 | return len + old_addr - old_end; /* how much done */ |
163 | } | 181 | } |
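
The move_page_tables() changes above compute the per-iteration extent so it stays correct even when the next PMD boundary wraps past the top of the address space, and handle transparent huge pages by either moving the whole PMD or splitting it first. A self-contained model of the overflow-safe extent calculation only (move_extent() is an invented name; 2 MiB PMDs are assumed purely for the example):

#include <stdio.h>

#define PMD_SIZE (2UL << 20)          /* assume 2 MiB PMDs for this example */
#define PMD_MASK (~(PMD_SIZE - 1))

/* Bytes to move this iteration: up to the next PMD boundary, clamped to
 * what is left, and still sane if old_addr + PMD_SIZE wraps past zero. */
static unsigned long move_extent(unsigned long old_addr, unsigned long old_end)
{
    unsigned long next = (old_addr + PMD_SIZE) & PMD_MASK;
    unsigned long extent = next - old_addr;

    if (extent > old_end - old_addr)
        extent = old_end - old_addr;
    return extent;
}

int main(void)
{
    /* 0x1ff000 is 4 KiB short of a PMD boundary: extent is that 4 KiB */
    printf("%lx\n", move_extent(0x1ff000, 0x400000));
    /* near the top of the address space: next wraps, extent still clamps */
    printf("%lx\n", move_extent(~0UL - 0x1000 + 1, ~0UL));
    return 0;
}
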
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 6e93dc7f2586..7fa41b4a07bf 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/pfn.h> | 12 | #include <linux/pfn.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/bootmem.h> | 14 | #include <linux/bootmem.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
17 | #include <linux/range.h> | 17 | #include <linux/range.h> |
18 | #include <linux/memblock.h> | 18 | #include <linux/memblock.h> |
diff --git a/mm/nommu.c b/mm/nommu.c index 4358032566e9..73419c55eda6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -13,7 +13,7 @@ | |||
13 | * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> | 13 | * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/module.h> | 16 | #include <linux/export.h> |
17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
18 | #include <linux/mman.h> | 18 | #include <linux/mman.h> |
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eafff89b3dd6..471dedb463ab 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -26,18 +26,38 @@ | |||
26 | #include <linux/timex.h> | 26 | #include <linux/timex.h> |
27 | #include <linux/jiffies.h> | 27 | #include <linux/jiffies.h> |
28 | #include <linux/cpuset.h> | 28 | #include <linux/cpuset.h> |
29 | #include <linux/module.h> | 29 | #include <linux/export.h> |
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/mempolicy.h> | 32 | #include <linux/mempolicy.h> |
33 | #include <linux/security.h> | 33 | #include <linux/security.h> |
34 | #include <linux/ptrace.h> | 34 | #include <linux/ptrace.h> |
35 | #include <linux/freezer.h> | ||
35 | 36 | ||
36 | int sysctl_panic_on_oom; | 37 | int sysctl_panic_on_oom; |
37 | int sysctl_oom_kill_allocating_task; | 38 | int sysctl_oom_kill_allocating_task; |
38 | int sysctl_oom_dump_tasks = 1; | 39 | int sysctl_oom_dump_tasks = 1; |
39 | static DEFINE_SPINLOCK(zone_scan_lock); | 40 | static DEFINE_SPINLOCK(zone_scan_lock); |
40 | 41 | ||
42 | /* | ||
43 | * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj | ||
44 | * @old_val: old oom_score_adj for compare | ||
45 | * @new_val: new oom_score_adj for swap | ||
46 | * | ||
47 | * Sets the oom_score_adj value for current to @new_val iff its present value is | ||
48 | * @old_val. Usually used to reinstate a previous value to prevent racing with | ||
49 | * userspace tuning the value in the interim. | ||
50 | */ | ||
51 | void compare_swap_oom_score_adj(int old_val, int new_val) | ||
52 | { | ||
53 | struct sighand_struct *sighand = current->sighand; | ||
54 | |||
55 | spin_lock_irq(&sighand->siglock); | ||
56 | if (current->signal->oom_score_adj == old_val) | ||
57 | current->signal->oom_score_adj = new_val; | ||
58 | spin_unlock_irq(&sighand->siglock); | ||
59 | } | ||
60 | |||
41 | /** | 61 | /** |
42 | * test_set_oom_score_adj() - set current's oom_score_adj and return old value | 62 | * test_set_oom_score_adj() - set current's oom_score_adj and return old value |
43 | * @new_val: new oom_score_adj value | 63 | * @new_val: new oom_score_adj value |
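
compare_swap_oom_score_adj() only restores the previous value if nothing changed it in the meantime, which pairs naturally with test_set_oom_score_adj() whose simplified body follows in the next hunk. A self-contained userspace model of that save/restore pattern (the mutex and the two helpers merely stand in for ->siglock and the real kernel functions; this is an illustration, not kernel code):

#include <pthread.h>
#include <stdio.h>

#define OOM_SCORE_ADJ_MIN (-1000)

/* One shared value plus a lock models current->signal->oom_score_adj. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int oom_score_adj;

static int test_set(int new_val)            /* models test_set_oom_score_adj() */
{
    pthread_mutex_lock(&lock);
    int old = oom_score_adj;
    oom_score_adj = new_val;
    pthread_mutex_unlock(&lock);
    return old;
}

static void compare_swap(int old_val, int new_val) /* models compare_swap_oom_score_adj() */
{
    pthread_mutex_lock(&lock);
    if (oom_score_adj == old_val)           /* only restore if nobody retuned it */
        oom_score_adj = new_val;
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    int saved = test_set(OOM_SCORE_ADJ_MIN); /* make the task unkillable */
    /* ... critical work; userspace may tune the value concurrently ... */
    compare_swap(OOM_SCORE_ADJ_MIN, saved);  /* restore unless it was retuned */
    printf("restored oom_score_adj = %d\n", oom_score_adj);
    return 0;
}
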
@@ -53,13 +73,7 @@ int test_set_oom_score_adj(int new_val) | |||
53 | 73 | ||
54 | spin_lock_irq(&sighand->siglock); | 74 | spin_lock_irq(&sighand->siglock); |
55 | old_val = current->signal->oom_score_adj; | 75 | old_val = current->signal->oom_score_adj; |
56 | if (new_val != old_val) { | 76 | current->signal->oom_score_adj = new_val; |
57 | if (new_val == OOM_SCORE_ADJ_MIN) | ||
58 | atomic_inc(¤t->mm->oom_disable_count); | ||
59 | else if (old_val == OOM_SCORE_ADJ_MIN) | ||
60 | atomic_dec(¤t->mm->oom_disable_count); | ||
61 | current->signal->oom_score_adj = new_val; | ||
62 | } | ||
63 | spin_unlock_irq(&sighand->siglock); | 77 | spin_unlock_irq(&sighand->siglock); |
64 | 78 | ||
65 | return old_val; | 79 | return old_val; |
@@ -172,16 +186,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | |||
172 | return 0; | 186 | return 0; |
173 | 187 | ||
174 | /* | 188 | /* |
175 | * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN | ||
176 | * so the entire heuristic doesn't need to be executed for something | ||
177 | * that cannot be killed. | ||
178 | */ | ||
179 | if (atomic_read(&p->mm->oom_disable_count)) { | ||
180 | task_unlock(p); | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | /* | ||
185 | * The memory controller may have a limit of 0 bytes, so avoid a divide | 189 | * The memory controller may have a limit of 0 bytes, so avoid a divide |
186 | * by zero, if necessary. | 190 | * by zero, if necessary. |
187 | */ | 191 | */ |
@@ -303,7 +307,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
303 | do_each_thread(g, p) { | 307 | do_each_thread(g, p) { |
304 | unsigned int points; | 308 | unsigned int points; |
305 | 309 | ||
306 | if (!p->mm) | 310 | if (p->exit_state) |
307 | continue; | 311 | continue; |
308 | if (oom_unkillable_task(p, mem, nodemask)) | 312 | if (oom_unkillable_task(p, mem, nodemask)) |
309 | continue; | 313 | continue; |
@@ -317,8 +321,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
317 | * blocked waiting for another task which itself is waiting | 321 | * blocked waiting for another task which itself is waiting |
318 | * for memory. Is there a better alternative? | 322 | * for memory. Is there a better alternative? |
319 | */ | 323 | */ |
320 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) | 324 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { |
325 | if (unlikely(frozen(p))) | ||
326 | thaw_process(p); | ||
321 | return ERR_PTR(-1UL); | 327 | return ERR_PTR(-1UL); |
328 | } | ||
329 | if (!p->mm) | ||
330 | continue; | ||
322 | 331 | ||
323 | if (p->flags & PF_EXITING) { | 332 | if (p->flags & PF_EXITING) { |
324 | /* | 333 | /* |
@@ -433,7 +442,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) | |||
433 | task_unlock(p); | 442 | task_unlock(p); |
434 | 443 | ||
435 | /* | 444 | /* |
436 | * Kill all processes sharing p->mm in other thread groups, if any. | 445 | * Kill all user processes sharing p->mm in other thread groups, if any. |
437 | * They don't get access to memory reserves or a higher scheduler | 446 | * They don't get access to memory reserves or a higher scheduler |
438 | * priority, though, to avoid depletion of all memory or task | 447 | * priority, though, to avoid depletion of all memory or task |
439 | * starvation. This prevents mm->mmap_sem livelock when an oom killed | 448 | * starvation. This prevents mm->mmap_sem livelock when an oom killed |
@@ -443,7 +452,11 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) | |||
443 | * signal. | 452 | * signal. |
444 | */ | 453 | */ |
445 | for_each_process(q) | 454 | for_each_process(q) |
446 | if (q->mm == mm && !same_thread_group(q, p)) { | 455 | if (q->mm == mm && !same_thread_group(q, p) && |
456 | !(q->flags & PF_KTHREAD)) { | ||
457 | if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
458 | continue; | ||
459 | |||
447 | task_lock(q); /* Protect ->comm from prctl() */ | 460 | task_lock(q); /* Protect ->comm from prctl() */ |
448 | pr_err("Kill process %d (%s) sharing same memory\n", | 461 | pr_err("Kill process %d (%s) sharing same memory\n", |
449 | task_pid_nr(q), q->comm); | 462 | task_pid_nr(q), q->comm); |
@@ -720,7 +733,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
720 | read_lock(&tasklist_lock); | 733 | read_lock(&tasklist_lock); |
721 | if (sysctl_oom_kill_allocating_task && | 734 | if (sysctl_oom_kill_allocating_task && |
722 | !oom_unkillable_task(current, NULL, nodemask) && | 735 | !oom_unkillable_task(current, NULL, nodemask) && |
723 | current->mm && !atomic_read(¤t->mm->oom_disable_count)) { | 736 | current->mm) { |
724 | /* | 737 | /* |
725 | * oom_kill_process() needs tasklist_lock held. If it returns | 738 | * oom_kill_process() needs tasklist_lock held. If it returns |
726 | * non-zero, current could not be killed so we must fallback to | 739 | * non-zero, current could not be killed so we must fallback to |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d1960744f881..a3278f005230 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -12,7 +12,7 @@ | |||
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
@@ -46,26 +46,14 @@ | |||
46 | */ | 46 | */ |
47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) | 47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) |
48 | 48 | ||
49 | #define RATELIMIT_CALC_SHIFT 10 | ||
50 | |||
49 | /* | 51 | /* |
50 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 52 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
51 | * will look to see if it needs to force writeback or throttling. | 53 | * will look to see if it needs to force writeback or throttling. |
52 | */ | 54 | */ |
53 | static long ratelimit_pages = 32; | 55 | static long ratelimit_pages = 32; |
54 | 56 | ||
55 | /* | ||
56 | * When balance_dirty_pages decides that the caller needs to perform some | ||
57 | * non-background writeback, this is how many pages it will attempt to write. | ||
58 | * It should be somewhat larger than dirtied pages to ensure that reasonably | ||
59 | * large amounts of I/O are submitted. | ||
60 | */ | ||
61 | static inline long sync_writeback_pages(unsigned long dirtied) | ||
62 | { | ||
63 | if (dirtied < ratelimit_pages) | ||
64 | dirtied = ratelimit_pages; | ||
65 | |||
66 | return dirtied + dirtied / 2; | ||
67 | } | ||
68 | |||
69 | /* The following parameters are exported via /proc/sys/vm */ | 57 | /* The following parameters are exported via /proc/sys/vm */ |
70 | 58 | ||
71 | /* | 59 | /* |
@@ -167,6 +155,8 @@ static void update_completion_period(void) | |||
167 | int shift = calc_period_shift(); | 155 | int shift = calc_period_shift(); |
168 | prop_change_shift(&vm_completions, shift); | 156 | prop_change_shift(&vm_completions, shift); |
169 | prop_change_shift(&vm_dirties, shift); | 157 | prop_change_shift(&vm_dirties, shift); |
158 | |||
159 | writeback_set_ratelimit(); | ||
170 | } | 160 | } |
171 | 161 | ||
172 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 162 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
@@ -260,52 +250,10 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, | |||
260 | numerator, denominator); | 250 | numerator, denominator); |
261 | } | 251 | } |
262 | 252 | ||
263 | static inline void task_dirties_fraction(struct task_struct *tsk, | ||
264 | long *numerator, long *denominator) | ||
265 | { | ||
266 | prop_fraction_single(&vm_dirties, &tsk->dirties, | ||
267 | numerator, denominator); | ||
268 | } | ||
269 | |||
270 | /* | 253 | /* |
271 | * task_dirty_limit - scale down dirty throttling threshold for one task | 254 | * bdi_min_ratio keeps the sum of the minimum dirty shares of all |
272 | * | 255 | * registered backing devices, which, for obvious reasons, can not |
273 | * task specific dirty limit: | 256 | * exceed 100%. |
274 | * | ||
275 | * dirty -= (dirty/8) * p_{t} | ||
276 | * | ||
277 | * To protect light/slow dirtying tasks from heavier/fast ones, we start | ||
278 | * throttling individual tasks before reaching the bdi dirty limit. | ||
279 | * Relatively low thresholds will be allocated to heavy dirtiers. So when | ||
280 | * dirty pages grow large, heavy dirtiers will be throttled first, which will | ||
281 | * effectively curb the growth of dirty pages. Light dirtiers with high enough | ||
282 | * dirty threshold may never get throttled. | ||
283 | */ | ||
284 | #define TASK_LIMIT_FRACTION 8 | ||
285 | static unsigned long task_dirty_limit(struct task_struct *tsk, | ||
286 | unsigned long bdi_dirty) | ||
287 | { | ||
288 | long numerator, denominator; | ||
289 | unsigned long dirty = bdi_dirty; | ||
290 | u64 inv = dirty / TASK_LIMIT_FRACTION; | ||
291 | |||
292 | task_dirties_fraction(tsk, &numerator, &denominator); | ||
293 | inv *= numerator; | ||
294 | do_div(inv, denominator); | ||
295 | |||
296 | dirty -= inv; | ||
297 | |||
298 | return max(dirty, bdi_dirty/2); | ||
299 | } | ||
300 | |||
301 | /* Minimum limit for any task */ | ||
302 | static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) | ||
303 | { | ||
304 | return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * | ||
309 | */ | 257 | */ |
310 | static unsigned int bdi_min_ratio; | 258 | static unsigned int bdi_min_ratio; |
311 | 259 | ||
@@ -411,6 +359,12 @@ unsigned long determine_dirtyable_memory(void) | |||
411 | return x + 1; /* Ensure that we never return 0 */ | 359 | return x + 1; /* Ensure that we never return 0 */ |
412 | } | 360 | } |
413 | 361 | ||
362 | static unsigned long dirty_freerun_ceiling(unsigned long thresh, | ||
363 | unsigned long bg_thresh) | ||
364 | { | ||
365 | return (thresh + bg_thresh) / 2; | ||
366 | } | ||
367 | |||
414 | static unsigned long hard_dirty_limit(unsigned long thresh) | 368 | static unsigned long hard_dirty_limit(unsigned long thresh) |
415 | { | 369 | { |
416 | return max(thresh, global_dirty_limit); | 370 | return max(thresh, global_dirty_limit); |
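
dirty_freerun_ceiling() above defines the point below which dirtiers are not throttled at all: the midpoint of the two thresholds. Worked example with assumed numbers (not kernel defaults): background_thresh = 25600 pages and dirty_thresh = 51200 pages give

    freerun = (thresh + bg_thresh) / 2 = (51200 + 25600) / 2 = 38400 pages

so tasks dirty freely up to 38400 pages (about 150 MB with 4 KiB pages), while hard_dirty_limit() caps the other end at whichever is larger, the passed-in thresh or the global_dirty_limit.
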
@@ -495,6 +449,198 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
495 | return bdi_dirty; | 449 | return bdi_dirty; |
496 | } | 450 | } |
497 | 451 | ||
452 | /* | ||
453 | * Dirty position control. | ||
454 | * | ||
455 | * (o) global/bdi setpoints | ||
456 | * | ||
457 | * We want the dirty pages to be balanced around the global/bdi setpoints. | ||
458 | * When the number of dirty pages is higher/lower than the setpoint, the | ||
459 | * dirty position control ratio (and hence task dirty ratelimit) will be | ||
460 | * decreased/increased to bring the dirty pages back to the setpoint. | ||
461 | * | ||
462 | * pos_ratio = 1 << RATELIMIT_CALC_SHIFT | ||
463 | * | ||
464 | * if (dirty < setpoint) scale up pos_ratio | ||
465 | * if (dirty > setpoint) scale down pos_ratio | ||
466 | * | ||
467 | * if (bdi_dirty < bdi_setpoint) scale up pos_ratio | ||
468 | * if (bdi_dirty > bdi_setpoint) scale down pos_ratio | ||
469 | * | ||
470 | * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT | ||
471 | * | ||
472 | * (o) global control line | ||
473 | * | ||
474 | * ^ pos_ratio | ||
475 | * | | ||
476 | * | |<===== global dirty control scope ======>| | ||
477 | * 2.0 .............* | ||
478 | * | .* | ||
479 | * | . * | ||
480 | * | . * | ||
481 | * | . * | ||
482 | * | . * | ||
483 | * | . * | ||
484 | * 1.0 ................................* | ||
485 | * | . . * | ||
486 | * | . . * | ||
487 | * | . . * | ||
488 | * | . . * | ||
489 | * | . . * | ||
490 | * 0 +------------.------------------.----------------------*-------------> | ||
491 | * freerun^ setpoint^ limit^ dirty pages | ||
492 | * | ||
493 | * (o) bdi control line | ||
494 | * | ||
495 | * ^ pos_ratio | ||
496 | * | | ||
497 | * | * | ||
498 | * | * | ||
499 | * | * | ||
500 | * | * | ||
501 | * | * |<=========== span ============>| | ||
502 | * 1.0 .......................* | ||
503 | * | . * | ||
504 | * | . * | ||
505 | * | . * | ||
506 | * | . * | ||
507 | * | . * | ||
508 | * | . * | ||
509 | * | . * | ||
510 | * | . * | ||
511 | * | . * | ||
512 | * | . * | ||
513 | * | . * | ||
514 | * 1/4 ...............................................* * * * * * * * * * * * | ||
515 | * | . . | ||
516 | * | . . | ||
517 | * | . . | ||
518 | * 0 +----------------------.-------------------------------.-------------> | ||
519 | * bdi_setpoint^ x_intercept^ | ||
520 | * | ||
521 | * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can | ||
522 | * be smoothly throttled down to normal if it starts high in situations like | ||
523 | * - start writing to a slow SD card and a fast disk at the same time. The SD | ||
524 | * card's bdi_dirty may rush to many times higher than bdi_setpoint. | ||
525 | * - the bdi dirty thresh drops quickly due to change of JBOD workload | ||
526 | */ | ||
527 | static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, | ||
528 | unsigned long thresh, | ||
529 | unsigned long bg_thresh, | ||
530 | unsigned long dirty, | ||
531 | unsigned long bdi_thresh, | ||
532 | unsigned long bdi_dirty) | ||
533 | { | ||
534 | unsigned long write_bw = bdi->avg_write_bandwidth; | ||
535 | unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); | ||
536 | unsigned long limit = hard_dirty_limit(thresh); | ||
537 | unsigned long x_intercept; | ||
538 | unsigned long setpoint; /* dirty pages' target balance point */ | ||
539 | unsigned long bdi_setpoint; | ||
540 | unsigned long span; | ||
541 | long long pos_ratio; /* for scaling up/down the rate limit */ | ||
542 | long x; | ||
543 | |||
544 | if (unlikely(dirty >= limit)) | ||
545 | return 0; | ||
546 | |||
547 | /* | ||
548 | * global setpoint | ||
549 | * | ||
550 | * setpoint - dirty 3 | ||
551 | * f(dirty) := 1.0 + (----------------) | ||
552 | * limit - setpoint | ||
553 | * | ||
554 | * it's a 3rd order polynomial that subjects to | ||
555 | * | ||
556 | * (1) f(freerun) = 2.0 => ramp up dirty_ratelimit reasonably fast | ||
557 | * (2) f(setpoint) = 1.0 => the balance point | ||
558 | * (3) f(limit) = 0 => the hard limit | ||
559 | * (4) df/dx <= 0 => negative feedback control | ||
560 | * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) | ||
561 | * => fast response on large errors; small oscillation near setpoint | ||
562 | */ | ||
563 | setpoint = (freerun + limit) / 2; | ||
564 | x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT, | ||
565 | limit - setpoint + 1); | ||
566 | pos_ratio = x; | ||
567 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
568 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
569 | pos_ratio += 1 << RATELIMIT_CALC_SHIFT; | ||
570 | |||
571 | /* | ||
572 | * We have computed basic pos_ratio above based on global situation. If | ||
573 | * the bdi is over/under its share of dirty pages, we want to scale | ||
574 | * pos_ratio further down/up. That is done by the following mechanism. | ||
575 | */ | ||
576 | |||
577 | /* | ||
578 | * bdi setpoint | ||
579 | * | ||
580 | * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint) | ||
581 | * | ||
582 | * x_intercept - bdi_dirty | ||
583 | * := -------------------------- | ||
584 | * x_intercept - bdi_setpoint | ||
585 | * | ||
586 | * The main bdi control line is a linear function that subjects to | ||
587 | * | ||
588 | * (1) f(bdi_setpoint) = 1.0 | ||
589 | * (2) k = - 1 / (8 * write_bw) (in single bdi case) | ||
590 | * or equally: x_intercept = bdi_setpoint + 8 * write_bw | ||
591 | * | ||
592 | * For single bdi case, the dirty pages are observed to fluctuate | ||
593 | * regularly within range | ||
594 | * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2] | ||
596 | * for various filesystems, where (2) can yield a reasonable 12.5% | ||
596 | * fluctuation range for pos_ratio. | ||
597 | * | ||
598 | * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its | ||
599 | * own size, so move the slope over accordingly and choose a slope that | ||
600 | * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh. | ||
601 | */ | ||
602 | if (unlikely(bdi_thresh > thresh)) | ||
603 | bdi_thresh = thresh; | ||
604 | bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); | ||
605 | /* | ||
606 | * scale global setpoint to bdi's: | ||
607 | * bdi_setpoint = setpoint * bdi_thresh / thresh | ||
608 | */ | ||
609 | x = div_u64((u64)bdi_thresh << 16, thresh + 1); | ||
610 | bdi_setpoint = setpoint * (u64)x >> 16; | ||
611 | /* | ||
612 | * Use span=(8*write_bw) in single bdi case as indicated by | ||
613 | * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. | ||
614 | * | ||
615 | * bdi_thresh thresh - bdi_thresh | ||
616 | * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh | ||
617 | * thresh thresh | ||
618 | */ | ||
619 | span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; | ||
620 | x_intercept = bdi_setpoint + span; | ||
621 | |||
622 | if (bdi_dirty < x_intercept - span / 4) { | ||
623 | pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), | ||
624 | x_intercept - bdi_setpoint + 1); | ||
625 | } else | ||
626 | pos_ratio /= 4; | ||
627 | |||
628 | /* | ||
629 | * bdi reserve area, safeguard against dirty pool underrun and disk idle. | ||
630 | * It may push the desired control point of global dirty pages higher | ||
631 | * than setpoint. | ||
632 | */ | ||
633 | x_intercept = bdi_thresh / 2; | ||
634 | if (bdi_dirty < x_intercept) { | ||
635 | if (bdi_dirty > x_intercept / 8) | ||
636 | pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty); | ||
637 | else | ||
638 | pos_ratio *= 8; | ||
639 | } | ||
640 | |||
641 | return pos_ratio; | ||
642 | } | ||
643 | |||
498 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | 644 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, |
499 | unsigned long elapsed, | 645 | unsigned long elapsed, |
500 | unsigned long written) | 646 | unsigned long written) |
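
To make the fixed-point arithmetic of the global control line concrete, here is a self-contained model of just the 3rd-order polynomial above, using the RATELIMIT_CALC_SHIFT value this patch adds near the top of the file (global_pos_ratio() and the thresholds in main() are invented example values, not kernel defaults):

#include <stdio.h>

#define RATELIMIT_CALC_SHIFT 10   /* pos_ratio is a 10-bit fixed-point number */

/* Global control line only: 1.0 + ((setpoint - dirty) / (limit - setpoint))^3 */
static double global_pos_ratio(unsigned long freerun, unsigned long limit,
                               unsigned long dirty)
{
    unsigned long setpoint = (freerun + limit) / 2;
    long long x, pos_ratio;

    if (dirty >= limit)
        return 0.0;

    /* x = (setpoint - dirty) / (limit - setpoint) in fixed point */
    x = (((long long)setpoint - (long long)dirty) << RATELIMIT_CALC_SHIFT) /
        (long long)(limit - setpoint + 1);
    pos_ratio = x;
    pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;   /* x^2 */
    pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;   /* x^3 */
    pos_ratio += 1 << RATELIMIT_CALC_SHIFT;              /* + 1.0 */

    return (double)pos_ratio / (1 << RATELIMIT_CALC_SHIFT);
}

int main(void)
{
    unsigned long freerun = 1000, limit = 3000;  /* example page counts */

    /* ~2.0 at the freerun ceiling, 1.0 at the setpoint, 0 at the limit */
    printf("%.3f %.3f %.3f\n",
           global_pos_ratio(freerun, limit, 1000),
           global_pos_ratio(freerun, limit, 2000),
           global_pos_ratio(freerun, limit, 3000));
    return 0;
}
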
@@ -591,8 +737,153 @@ static void global_update_bandwidth(unsigned long thresh, | |||
591 | spin_unlock(&dirty_lock); | 737 | spin_unlock(&dirty_lock); |
592 | } | 738 | } |
593 | 739 | ||
740 | /* | ||
741 | * Maintain bdi->dirty_ratelimit, the base dirty throttle rate. | ||
742 | * | ||
743 | * Normal bdi tasks will be curbed at or below it in the long term. | ||
744 | * Obviously it should be around (write_bw / N) when there are N dd tasks. | ||
745 | */ | ||
746 | static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | ||
747 | unsigned long thresh, | ||
748 | unsigned long bg_thresh, | ||
749 | unsigned long dirty, | ||
750 | unsigned long bdi_thresh, | ||
751 | unsigned long bdi_dirty, | ||
752 | unsigned long dirtied, | ||
753 | unsigned long elapsed) | ||
754 | { | ||
755 | unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); | ||
756 | unsigned long limit = hard_dirty_limit(thresh); | ||
757 | unsigned long setpoint = (freerun + limit) / 2; | ||
758 | unsigned long write_bw = bdi->avg_write_bandwidth; | ||
759 | unsigned long dirty_ratelimit = bdi->dirty_ratelimit; | ||
760 | unsigned long dirty_rate; | ||
761 | unsigned long task_ratelimit; | ||
762 | unsigned long balanced_dirty_ratelimit; | ||
763 | unsigned long pos_ratio; | ||
764 | unsigned long step; | ||
765 | unsigned long x; | ||
766 | |||
767 | /* | ||
768 | * The dirty rate will match the writeout rate in the long term, except | ||
769 | * when dirty pages are truncated by userspace or re-dirtied by FS. | ||
770 | */ | ||
771 | dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; | ||
772 | |||
773 | pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, | ||
774 | bdi_thresh, bdi_dirty); | ||
775 | /* | ||
776 | * task_ratelimit reflects each dd's dirty rate for the past 200ms. | ||
777 | */ | ||
778 | task_ratelimit = (u64)dirty_ratelimit * | ||
779 | pos_ratio >> RATELIMIT_CALC_SHIFT; | ||
780 | task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */ | ||
781 | |||
782 | /* | ||
783 | * A linear estimation of the "balanced" throttle rate. The theory is, | ||
784 | * if there are N dd tasks, each throttled at task_ratelimit, the bdi's | ||
785 | * dirty_rate will be measured to be (N * task_ratelimit). So the below | ||
786 | * formula will yield the balanced rate limit (write_bw / N). | ||
787 | * | ||
788 | * Note that the expanded form is not a pure rate feedback: | ||
789 | * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) | ||
790 | * but also takes pos_ratio into account: | ||
791 | * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) | ||
792 | * | ||
793 | * (1) is not realistic because pos_ratio also takes part in balancing | ||
794 | * the dirty rate. Consider the state | ||
795 | * pos_ratio = 0.5 (3) | ||
796 | * rate = 2 * (write_bw / N) (4) | ||
797 | * If (1) is used, it will stuck in that state! Because each dd will | ||
798 | * be throttled at | ||
799 | * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) | ||
800 | * yielding | ||
801 | * dirty_rate = N * task_ratelimit = write_bw (6) | ||
802 | * put (6) into (1) we get | ||
803 | * rate_(i+1) = rate_(i) (7) | ||
804 | * | ||
805 | * So we end up using (2) to always keep | ||
806 | * rate_(i+1) ~= (write_bw / N) (8) | ||
807 | * regardless of the value of pos_ratio. As long as (8) is satisfied, | ||
808 | * pos_ratio is able to drive itself to 1.0, which is not only where | ||
809 | * the dirty count meet the setpoint, but also where the slope of | ||
810 | * pos_ratio is most flat and hence task_ratelimit is least fluctuated. | ||
811 | */ | ||
812 | balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, | ||
813 | dirty_rate | 1); | ||
814 | |||
815 | /* | ||
816 | * We could safely do this and return immediately: | ||
817 | * | ||
818 | * bdi->dirty_ratelimit = balanced_dirty_ratelimit; | ||
819 | * | ||
820 | * However, to get a more stable dirty_ratelimit, the more elaborate | ||
821 | * code below makes use of task_ratelimit to filter out singular points and | ||
822 | * limit the step size. | ||
823 | * | ||
824 | * The below code essentially only uses the relative value of | ||
825 | * | ||
826 | * task_ratelimit - dirty_ratelimit | ||
827 | * = (pos_ratio - 1) * dirty_ratelimit | ||
828 | * | ||
829 | * which reflects the direction and size of dirty position error. | ||
830 | */ | ||
831 | |||
832 | /* | ||
833 | * dirty_ratelimit will follow balanced_dirty_ratelimit iff | ||
834 | * task_ratelimit is on the same side of dirty_ratelimit, too. | ||
835 | * For example, when | ||
836 | * - dirty_ratelimit > balanced_dirty_ratelimit | ||
837 | * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) | ||
838 | * lowering dirty_ratelimit will help meet both the position and rate | ||
839 | * control targets. Otherwise, don't update dirty_ratelimit if it will | ||
840 | * only help meet the rate target. After all, what the users ultimately | ||
841 | * feel and care are stable dirty rate and small position error. | ||
842 | * | ||
843 | * |task_ratelimit - dirty_ratelimit| is used to limit the step size | ||
844 | * and filter out the sigular points of balanced_dirty_ratelimit. Which | ||
845 | * keeps jumping around randomly and can even leap far away at times | ||
846 | * due to the small 200ms estimation period of dirty_rate (we want to | ||
847 | * keep that period small to reduce time lags). | ||
848 | */ | ||
849 | step = 0; | ||
850 | if (dirty < setpoint) { | ||
851 | x = min(bdi->balanced_dirty_ratelimit, | ||
852 | min(balanced_dirty_ratelimit, task_ratelimit)); | ||
853 | if (dirty_ratelimit < x) | ||
854 | step = x - dirty_ratelimit; | ||
855 | } else { | ||
856 | x = max(bdi->balanced_dirty_ratelimit, | ||
857 | max(balanced_dirty_ratelimit, task_ratelimit)); | ||
858 | if (dirty_ratelimit > x) | ||
859 | step = dirty_ratelimit - x; | ||
860 | } | ||
861 | |||
862 | /* | ||
863 | * Don't pursue 100% rate matching. It's impossible since the balanced | ||
864 | * rate itself is constantly fluctuating. So decrease the track speed | ||
865 | * when it gets close to the target. Helps eliminate pointless tremors. | ||
866 | */ | ||
867 | step >>= dirty_ratelimit / (2 * step + 1); | ||
868 | /* | ||
869 | * Limit the tracking speed to avoid overshooting. | ||
870 | */ | ||
871 | step = (step + 7) / 8; | ||
872 | |||
873 | if (dirty_ratelimit < balanced_dirty_ratelimit) | ||
874 | dirty_ratelimit += step; | ||
875 | else | ||
876 | dirty_ratelimit -= step; | ||
877 | |||
878 | bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); | ||
879 | bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit; | ||
880 | |||
881 | trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit); | ||
882 | } | ||
883 | |||
594 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, | 884 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, |
595 | unsigned long thresh, | 885 | unsigned long thresh, |
886 | unsigned long bg_thresh, | ||
596 | unsigned long dirty, | 887 | unsigned long dirty, |
597 | unsigned long bdi_thresh, | 888 | unsigned long bdi_thresh, |
598 | unsigned long bdi_dirty, | 889 | unsigned long bdi_dirty, |
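
The key step in the hunk above is the "balanced" estimate: if N tasks are each throttled at task_ratelimit, the measured dirty_rate comes out near N * task_ratelimit, so scaling by write_bw / dirty_rate lands on write_bw / N regardless of where the rate started. A self-contained numeric check (all figures are made-up example numbers in MB/s, not kernel defaults):

#include <stdio.h>

/* Balanced rate: task_ratelimit * write_bw / dirty_rate == write_bw / N
 * when N tasks are each being throttled at task_ratelimit. */
int main(void)
{
    unsigned long write_bw = 100;                 /* device writeout rate   */
    unsigned long ndd = 4;                        /* concurrent dirtiers    */
    unsigned long task_ratelimit = 40;            /* current per-task rate  */
    unsigned long dirty_rate = ndd * task_ratelimit;  /* what gets measured */
    unsigned long balanced = task_ratelimit * write_bw / dirty_rate;

    printf("balanced_dirty_ratelimit = %lu MB/s (write_bw / N = %lu)\n",
           balanced, write_bw / ndd);
    return 0;
}
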
@@ -600,6 +891,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, | |||
600 | { | 891 | { |
601 | unsigned long now = jiffies; | 892 | unsigned long now = jiffies; |
602 | unsigned long elapsed = now - bdi->bw_time_stamp; | 893 | unsigned long elapsed = now - bdi->bw_time_stamp; |
894 | unsigned long dirtied; | ||
603 | unsigned long written; | 895 | unsigned long written; |
604 | 896 | ||
605 | /* | 897 | /* |
@@ -608,6 +900,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, | |||
608 | if (elapsed < BANDWIDTH_INTERVAL) | 900 | if (elapsed < BANDWIDTH_INTERVAL) |
609 | return; | 901 | return; |
610 | 902 | ||
903 | dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); | ||
611 | written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); | 904 | written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); |
612 | 905 | ||
613 | /* | 906 | /* |
@@ -617,18 +910,23 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, | |||
617 | if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) | 910 | if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) |
618 | goto snapshot; | 911 | goto snapshot; |
619 | 912 | ||
620 | if (thresh) | 913 | if (thresh) { |
621 | global_update_bandwidth(thresh, dirty, now); | 914 | global_update_bandwidth(thresh, dirty, now); |
622 | 915 | bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, | |
916 | bdi_thresh, bdi_dirty, | ||
917 | dirtied, elapsed); | ||
918 | } | ||
623 | bdi_update_write_bandwidth(bdi, elapsed, written); | 919 | bdi_update_write_bandwidth(bdi, elapsed, written); |
624 | 920 | ||
625 | snapshot: | 921 | snapshot: |
922 | bdi->dirtied_stamp = dirtied; | ||
626 | bdi->written_stamp = written; | 923 | bdi->written_stamp = written; |
627 | bdi->bw_time_stamp = now; | 924 | bdi->bw_time_stamp = now; |
628 | } | 925 | } |
629 | 926 | ||
630 | static void bdi_update_bandwidth(struct backing_dev_info *bdi, | 927 | static void bdi_update_bandwidth(struct backing_dev_info *bdi, |
631 | unsigned long thresh, | 928 | unsigned long thresh, |
929 | unsigned long bg_thresh, | ||
632 | unsigned long dirty, | 930 | unsigned long dirty, |
633 | unsigned long bdi_thresh, | 931 | unsigned long bdi_thresh, |
634 | unsigned long bdi_dirty, | 932 | unsigned long bdi_dirty, |
@@ -637,37 +935,99 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, | |||
637 | if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) | 935 | if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) |
638 | return; | 936 | return; |
639 | spin_lock(&bdi->wb.list_lock); | 937 | spin_lock(&bdi->wb.list_lock); |
640 | __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, | 938 | __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, |
641 | start_time); | 939 | bdi_thresh, bdi_dirty, start_time); |
642 | spin_unlock(&bdi->wb.list_lock); | 940 | spin_unlock(&bdi->wb.list_lock); |
643 | } | 941 | } |
644 | 942 | ||
645 | /* | 943 | /* |
944 | * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() | ||
945 | * will look to see if it needs to start dirty throttling. | ||
946 | * | ||
947 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive | ||
948 | * global_page_state() too often. So scale it near-sqrt to the safety margin | ||
949 | * (the number of pages we may dirty without exceeding the dirty limits). | ||
950 | */ | ||
951 | static unsigned long dirty_poll_interval(unsigned long dirty, | ||
952 | unsigned long thresh) | ||
953 | { | ||
954 | if (thresh > dirty) | ||
955 | return 1UL << (ilog2(thresh - dirty) >> 1); | ||
956 | |||
957 | return 1; | ||
958 | } | ||
959 | |||
960 | static unsigned long bdi_max_pause(struct backing_dev_info *bdi, | ||
961 | unsigned long bdi_dirty) | ||
962 | { | ||
963 | unsigned long bw = bdi->avg_write_bandwidth; | ||
964 | unsigned long hi = ilog2(bw); | ||
965 | unsigned long lo = ilog2(bdi->dirty_ratelimit); | ||
966 | unsigned long t; | ||
967 | |||
968 | /* target for 20ms max pause on 1-dd case */ | ||
969 | t = HZ / 50; | ||
970 | |||
971 | /* | ||
972 | * Scale up pause time for concurrent dirtiers in order to reduce CPU | ||
973 | * overheads. | ||
974 | * | ||
975 | * (N * 20ms) on 2^N concurrent tasks. | ||
976 | */ | ||
977 | if (hi > lo) | ||
978 | t += (hi - lo) * (20 * HZ) / 1024; | ||
979 | |||
980 | /* | ||
981 | * Limit pause time for small memory systems. If we sleep for too long, | ||
982 | * a small pool of dirty/writeback pages may go empty and the disk may go | ||
983 | * idle. | ||
984 | * | ||
985 | * 8 serves as the safety ratio. | ||
986 | */ | ||
987 | if (bdi_dirty) | ||
988 | t = min(t, bdi_dirty * HZ / (8 * bw + 1)); | ||
989 | |||
990 | /* | ||
991 | * The pause time will be settled within range (max_pause/4, max_pause). | ||
992 | * Apply a minimal value of 4 to get a non-zero max_pause/4. | ||
993 | */ | ||
994 | return clamp_val(t, 4, MAX_PAUSE); | ||
995 | } | ||
996 | |||
997 | /* | ||
646 | * balance_dirty_pages() must be called by processes which are generating dirty | 998 | * balance_dirty_pages() must be called by processes which are generating dirty |
647 | * data. It looks at the number of dirty pages in the machine and will force | 999 | * data. It looks at the number of dirty pages in the machine and will force |
648 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. | 1000 | * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. |
649 | * If we're over `background_thresh' then the writeback threads are woken to | 1001 | * If we're over `background_thresh' then the writeback threads are woken to |
650 | * perform some writeout. | 1002 | * perform some writeout. |
651 | */ | 1003 | */ |
652 | static void balance_dirty_pages(struct address_space *mapping, | 1004 | static void balance_dirty_pages(struct address_space *mapping, |
653 | unsigned long write_chunk) | 1005 | unsigned long pages_dirtied) |
654 | { | 1006 | { |
655 | unsigned long nr_reclaimable, bdi_nr_reclaimable; | 1007 | unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ |
1008 | unsigned long bdi_reclaimable; | ||
656 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ | 1009 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ |
657 | unsigned long bdi_dirty; | 1010 | unsigned long bdi_dirty; |
1011 | unsigned long freerun; | ||
658 | unsigned long background_thresh; | 1012 | unsigned long background_thresh; |
659 | unsigned long dirty_thresh; | 1013 | unsigned long dirty_thresh; |
660 | unsigned long bdi_thresh; | 1014 | unsigned long bdi_thresh; |
661 | unsigned long task_bdi_thresh; | 1015 | long pause = 0; |
662 | unsigned long min_task_bdi_thresh; | 1016 | long uninitialized_var(max_pause); |
663 | unsigned long pages_written = 0; | ||
664 | unsigned long pause = 1; | ||
665 | bool dirty_exceeded = false; | 1017 | bool dirty_exceeded = false; |
666 | bool clear_dirty_exceeded = true; | 1018 | unsigned long task_ratelimit; |
1019 | unsigned long uninitialized_var(dirty_ratelimit); | ||
1020 | unsigned long pos_ratio; | ||
667 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1021 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
668 | unsigned long start_time = jiffies; | 1022 | unsigned long start_time = jiffies; |
669 | 1023 | ||
670 | for (;;) { | 1024 | for (;;) { |
1025 | /* | ||
1026 | * Unstable writes are a feature of certain networked | ||
1027 | * filesystems (i.e. NFS) in which data may have been | ||
1028 | * written to the server's write cache, but has not yet | ||
1029 | * been flushed to permanent storage. | ||
1030 | */ | ||
671 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 1031 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
672 | global_page_state(NR_UNSTABLE_NFS); | 1032 | global_page_state(NR_UNSTABLE_NFS); |
673 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); | 1033 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); |
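
The two helpers introduced in this hunk decide how often and how long a dirtier is parked: dirty_poll_interval() spaces out the expensive global counter reads roughly with the square root of the remaining headroom, and bdi_max_pause() bounds a single nap. A self-contained model of the poll-interval scaling only (ilog2_ul() stands in for the kernel's ilog2(); the page counts are arbitrary examples):

#include <stdio.h>

/* Integer log2, standing in for the kernel's ilog2() in this model. */
static unsigned int ilog2_ul(unsigned long v)
{
    unsigned int r = 0;

    while (v >>= 1)
        r++;
    return r;
}

/* Pages a task may dirty before polling the global counters again. */
static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh)
{
    if (thresh > dirty)
        return 1UL << (ilog2_ul(thresh - dirty) >> 1);
    return 1;
}

int main(void)
{
    /* 16384 pages of headroom -> poll every 128 dirtied pages (~sqrt) */
    printf("%lu\n", dirty_poll_interval(10000, 26384));
    /* 100 pages of headroom -> poll every 8 pages */
    printf("%lu\n", dirty_poll_interval(26284, 26384));
    return 0;
}
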
@@ -679,12 +1039,28 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
679 | * catch-up. This avoids (excessively) small writeouts | 1039 | * catch-up. This avoids (excessively) small writeouts |
680 | * when the bdi limits are ramping up. | 1040 | * when the bdi limits are ramping up. |
681 | */ | 1041 | */ |
682 | if (nr_dirty <= (background_thresh + dirty_thresh) / 2) | 1042 | freerun = dirty_freerun_ceiling(dirty_thresh, |
1043 | background_thresh); | ||
1044 | if (nr_dirty <= freerun) | ||
683 | break; | 1045 | break; |
684 | 1046 | ||
1047 | if (unlikely(!writeback_in_progress(bdi))) | ||
1048 | bdi_start_background_writeback(bdi); | ||
1049 | |||
1050 | /* | ||
1051 | * bdi_thresh is not treated as a hard limiting factor the way | ||
1052 | * dirty_thresh is, for two reasons: | ||
1053 | * - in JBOD setup, bdi_thresh can fluctuate a lot | ||
1054 | * - in a system with HDD and USB key, the USB key may somehow | ||
1055 | * go into state (bdi_dirty >> bdi_thresh) either because | ||
1056 | * bdi_dirty starts high, or because bdi_thresh drops low. | ||
1057 | * In this case we don't want to hard throttle the USB key | ||
1058 | * dirtiers for 100 seconds until bdi_dirty drops under | ||
1059 | * bdi_thresh. Instead the auxiliary bdi control line in | ||
1060 | * bdi_position_ratio() will let the dirtier task progress | ||
1061 | * at some rate <= (write_bw / 2) for bringing down bdi_dirty. | ||
1062 | */ | ||
685 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 1063 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
686 | min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); | ||
687 | task_bdi_thresh = task_dirty_limit(current, bdi_thresh); | ||
688 | 1064 | ||
689 | /* | 1065 | /* |
690 | * In order to avoid the stacked BDI deadlock we need | 1066 | * In order to avoid the stacked BDI deadlock we need |
@@ -696,56 +1072,69 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
696 | * actually dirty; with m+n sitting in the percpu | 1072 | * actually dirty; with m+n sitting in the percpu |
697 | * deltas. | 1073 | * deltas. |
698 | */ | 1074 | */ |
699 | if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { | 1075 | if (bdi_thresh < 2 * bdi_stat_error(bdi)) { |
700 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | 1076 | bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
701 | bdi_dirty = bdi_nr_reclaimable + | 1077 | bdi_dirty = bdi_reclaimable + |
702 | bdi_stat_sum(bdi, BDI_WRITEBACK); | 1078 | bdi_stat_sum(bdi, BDI_WRITEBACK); |
703 | } else { | 1079 | } else { |
704 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 1080 | bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
705 | bdi_dirty = bdi_nr_reclaimable + | 1081 | bdi_dirty = bdi_reclaimable + |
706 | bdi_stat(bdi, BDI_WRITEBACK); | 1082 | bdi_stat(bdi, BDI_WRITEBACK); |
707 | } | 1083 | } |
708 | 1084 | ||
709 | /* | 1085 | dirty_exceeded = (bdi_dirty > bdi_thresh) || |
710 | * The bdi thresh is somehow "soft" limit derived from the | ||
711 | * global "hard" limit. The former helps to prevent heavy IO | ||
712 | * bdi or process from holding back light ones; The latter is | ||
713 | * the last resort safeguard. | ||
714 | */ | ||
715 | dirty_exceeded = (bdi_dirty > task_bdi_thresh) || | ||
716 | (nr_dirty > dirty_thresh); | 1086 | (nr_dirty > dirty_thresh); |
717 | clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && | 1087 | if (dirty_exceeded && !bdi->dirty_exceeded) |
718 | (nr_dirty <= dirty_thresh); | ||
719 | |||
720 | if (!dirty_exceeded) | ||
721 | break; | ||
722 | |||
723 | if (!bdi->dirty_exceeded) | ||
724 | bdi->dirty_exceeded = 1; | 1088 | bdi->dirty_exceeded = 1; |
725 | 1089 | ||
726 | bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, | 1090 | bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, |
727 | bdi_thresh, bdi_dirty, start_time); | 1091 | nr_dirty, bdi_thresh, bdi_dirty, |
728 | 1092 | start_time); | |
729 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 1093 | |
730 | * Unstable writes are a feature of certain networked | 1094 | max_pause = bdi_max_pause(bdi, bdi_dirty); |
731 | * filesystems (i.e. NFS) in which data may have been | 1095 | |
732 | * written to the server's write cache, but has not yet | 1096 | dirty_ratelimit = bdi->dirty_ratelimit; |
733 | * been flushed to permanent storage. | 1097 | pos_ratio = bdi_position_ratio(bdi, dirty_thresh, |
734 | * Only move pages to writeback if this bdi is over its | 1098 | background_thresh, nr_dirty, |
735 | * threshold otherwise wait until the disk writes catch | 1099 | bdi_thresh, bdi_dirty); |
736 | * up. | 1100 | task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> |
737 | */ | 1101 | RATELIMIT_CALC_SHIFT; |
738 | trace_balance_dirty_start(bdi); | 1102 | if (unlikely(task_ratelimit == 0)) { |
739 | if (bdi_nr_reclaimable > task_bdi_thresh) { | 1103 | pause = max_pause; |
740 | pages_written += writeback_inodes_wb(&bdi->wb, | 1104 | goto pause; |
741 | write_chunk); | 1105 | } |
742 | trace_balance_dirty_written(bdi, pages_written); | 1106 | pause = HZ * pages_dirtied / task_ratelimit; |
743 | if (pages_written >= write_chunk) | 1107 | if (unlikely(pause <= 0)) { |
744 | break; /* We've done our duty */ | 1108 | trace_balance_dirty_pages(bdi, |
1109 | dirty_thresh, | ||
1110 | background_thresh, | ||
1111 | nr_dirty, | ||
1112 | bdi_thresh, | ||
1113 | bdi_dirty, | ||
1114 | dirty_ratelimit, | ||
1115 | task_ratelimit, | ||
1116 | pages_dirtied, | ||
1117 | pause, | ||
1118 | start_time); | ||
1119 | pause = 1; /* avoid resetting nr_dirtied_pause below */ | ||
1120 | break; | ||
745 | } | 1121 | } |
1122 | pause = min(pause, max_pause); | ||
1123 | |||
1124 | pause: | ||
1125 | trace_balance_dirty_pages(bdi, | ||
1126 | dirty_thresh, | ||
1127 | background_thresh, | ||
1128 | nr_dirty, | ||
1129 | bdi_thresh, | ||
1130 | bdi_dirty, | ||
1131 | dirty_ratelimit, | ||
1132 | task_ratelimit, | ||
1133 | pages_dirtied, | ||
1134 | pause, | ||
1135 | start_time); | ||
746 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1136 | __set_current_state(TASK_UNINTERRUPTIBLE); |
747 | io_schedule_timeout(pause); | 1137 | io_schedule_timeout(pause); |
748 | trace_balance_dirty_wait(bdi); | ||
749 | 1138 | ||
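
The pause computed above is simply the time needed to pay for the pages dirtied since the last call at the currently allowed rate, clamped to the bdi_max_pause() value obtained earlier in the loop. Worked example with assumed numbers: HZ = 1000, pages_dirtied = 32 and task_ratelimit = 1600 pages/s give

    pause = HZ * pages_dirtied / task_ratelimit = 1000 * 32 / 1600 = 20 jiffies

i.e. roughly the 20 ms nap the max-pause heuristic targets for a single dirtier.
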
750 | dirty_thresh = hard_dirty_limit(dirty_thresh); | 1139 | dirty_thresh = hard_dirty_limit(dirty_thresh); |
751 | /* | 1140 | /* |
@@ -754,35 +1143,30 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
754 | * 200ms is typically more than enough to curb heavy dirtiers; | 1143 | * 200ms is typically more than enough to curb heavy dirtiers; |
755 | * (b) the pause time limit makes the dirtiers more responsive. | 1144 | * (b) the pause time limit makes the dirtiers more responsive. |
756 | */ | 1145 | */ |
757 | if (nr_dirty < dirty_thresh + | 1146 | if (nr_dirty < dirty_thresh) |
758 | dirty_thresh / DIRTY_MAXPAUSE_AREA && | ||
759 | time_after(jiffies, start_time + MAX_PAUSE)) | ||
760 | break; | ||
761 | /* | ||
762 | * pass-good area. When some bdi gets blocked (eg. NFS server | ||
763 | * not responding), or write bandwidth dropped dramatically due | ||
764 | * to concurrent reads, or dirty threshold suddenly dropped and | ||
765 | * the dirty pages cannot be brought down anytime soon (eg. on | ||
766 | * slow USB stick), at least let go of the good bdi's. | ||
767 | */ | ||
768 | if (nr_dirty < dirty_thresh + | ||
769 | dirty_thresh / DIRTY_PASSGOOD_AREA && | ||
770 | bdi_dirty < bdi_thresh) | ||
771 | break; | 1147 | break; |
772 | |||
773 | /* | ||
774 | * Increase the delay for each loop, up to our previous | ||
775 | * default of taking a 100ms nap. | ||
776 | */ | ||
777 | pause <<= 1; | ||
778 | if (pause > HZ / 10) | ||
779 | pause = HZ / 10; | ||
780 | } | 1148 | } |
781 | 1149 | ||
782 | /* Clear dirty_exceeded flag only when no task can exceed the limit */ | 1150 | if (!dirty_exceeded && bdi->dirty_exceeded) |
783 | if (clear_dirty_exceeded && bdi->dirty_exceeded) | ||
784 | bdi->dirty_exceeded = 0; | 1151 | bdi->dirty_exceeded = 0; |
785 | 1152 | ||
1153 | current->nr_dirtied = 0; | ||
1154 | if (pause == 0) { /* in freerun area */ | ||
1155 | current->nr_dirtied_pause = | ||
1156 | dirty_poll_interval(nr_dirty, dirty_thresh); | ||
1157 | } else if (pause <= max_pause / 4 && | ||
1158 | pages_dirtied >= current->nr_dirtied_pause) { | ||
1159 | current->nr_dirtied_pause = clamp_val( | ||
1160 | dirty_ratelimit * (max_pause / 2) / HZ, | ||
1161 | pages_dirtied + pages_dirtied / 8, | ||
1162 | pages_dirtied * 4); | ||
1163 | } else if (pause >= max_pause) { | ||
1164 | current->nr_dirtied_pause = 1 | clamp_val( | ||
1165 | dirty_ratelimit * (max_pause / 2) / HZ, | ||
1166 | pages_dirtied / 4, | ||
1167 | pages_dirtied - pages_dirtied / 8); | ||
1168 | } | ||
1169 | |||
786 | if (writeback_in_progress(bdi)) | 1170 | if (writeback_in_progress(bdi)) |
787 | return; | 1171 | return; |
788 | 1172 | ||
@@ -794,8 +1178,10 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
794 | * In normal mode, we start background writeout at the lower | 1178 | * In normal mode, we start background writeout at the lower |
795 | * background_thresh, to keep the amount of dirty memory low. | 1179 | * background_thresh, to keep the amount of dirty memory low. |
796 | */ | 1180 | */ |
797 | if ((laptop_mode && pages_written) || | 1181 | if (laptop_mode) |
798 | (!laptop_mode && (nr_reclaimable > background_thresh))) | 1182 | return; |
1183 | |||
1184 | if (nr_reclaimable > background_thresh) | ||
799 | bdi_start_background_writeback(bdi); | 1185 | bdi_start_background_writeback(bdi); |
800 | } | 1186 | } |
801 | 1187 | ||
@@ -809,7 +1195,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) | |||
809 | } | 1195 | } |
810 | } | 1196 | } |
811 | 1197 | ||
812 | static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | 1198 | static DEFINE_PER_CPU(int, bdp_ratelimits); |
813 | 1199 | ||
814 | /** | 1200 | /** |
815 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 1201 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
@@ -829,31 +1215,39 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
829 | unsigned long nr_pages_dirtied) | 1215 | unsigned long nr_pages_dirtied) |
830 | { | 1216 | { |
831 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1217 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
832 | unsigned long ratelimit; | 1218 | int ratelimit; |
833 | unsigned long *p; | 1219 | int *p; |
834 | 1220 | ||
835 | if (!bdi_cap_account_dirty(bdi)) | 1221 | if (!bdi_cap_account_dirty(bdi)) |
836 | return; | 1222 | return; |
837 | 1223 | ||
838 | ratelimit = ratelimit_pages; | 1224 | ratelimit = current->nr_dirtied_pause; |
839 | if (mapping->backing_dev_info->dirty_exceeded) | 1225 | if (bdi->dirty_exceeded) |
840 | ratelimit = 8; | 1226 | ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); |
1227 | |||
1228 | current->nr_dirtied += nr_pages_dirtied; | ||
841 | 1229 | ||
1230 | preempt_disable(); | ||
842 | /* | 1231 | /* |
843 | * Check the rate limiting. Also, we do not want to throttle real-time | 1232 | * This prevents one CPU to accumulate too many dirtied pages without |
844 | * tasks in balance_dirty_pages(). Period. | 1233 | * calling into balance_dirty_pages(), which can happen when there are |
1234 | * 1000+ tasks, all of them start dirtying pages at exactly the same | ||
1235 | * time, hence all honoured too large initial task->nr_dirtied_pause. | ||
845 | */ | 1236 | */ |
846 | preempt_disable(); | ||
847 | p = &__get_cpu_var(bdp_ratelimits); | 1237 | p = &__get_cpu_var(bdp_ratelimits); |
848 | *p += nr_pages_dirtied; | 1238 | if (unlikely(current->nr_dirtied >= ratelimit)) |
849 | if (unlikely(*p >= ratelimit)) { | ||
850 | ratelimit = sync_writeback_pages(*p); | ||
851 | *p = 0; | 1239 | *p = 0; |
852 | preempt_enable(); | 1240 | else { |
853 | balance_dirty_pages(mapping, ratelimit); | 1241 | *p += nr_pages_dirtied; |
854 | return; | 1242 | if (unlikely(*p >= ratelimit_pages)) { |
1243 | *p = 0; | ||
1244 | ratelimit = 0; | ||
1245 | } | ||
855 | } | 1246 | } |
856 | preempt_enable(); | 1247 | preempt_enable(); |
1248 | |||
1249 | if (unlikely(current->nr_dirtied >= ratelimit)) | ||
1250 | balance_dirty_pages(mapping, current->nr_dirtied); | ||
857 | } | 1251 | } |
858 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | 1252 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); |
859 | 1253 | ||
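The hunks above replace the old per-CPU dirtied-page counter with a per-task budget: a task accumulates current->nr_dirtied and only drops into balance_dirty_pages() once that crosses its own nr_dirtied_pause (recomputed after every pause), while the per-CPU bdp_ratelimits counter stays on as a backstop for the case the new comment describes, where many tasks on one CPU all sit below a stale, too-large pause value. A minimal userspace sketch of just that decision logic follows; every name and number in it is illustrative, not the kernel's real accounting.

/* Toy model of the per-task dirty-throttling decision (illustrative only). */
#include <stdio.h>

#define RATELIMIT_PAGES	1024		/* per-CPU backstop, in pages */

struct task {
	unsigned long nr_dirtied;	/* pages dirtied since last pause */
	unsigned long nr_dirtied_pause;	/* per-task budget before pausing */
};

static unsigned long bdp_ratelimits;	/* single-CPU stand-in */

/* Returns 1 if the task should call balance_dirty_pages() now. */
static int should_throttle(struct task *t, unsigned long pages_dirtied)
{
	t->nr_dirtied += pages_dirtied;

	if (t->nr_dirtied >= t->nr_dirtied_pause) {
		bdp_ratelimits = 0;	/* budget hit: reset the backstop */
		return 1;
	}
	/* Backstop: tasks that each stay under budget must still not let
	 * one CPU dirty an unbounded number of pages between calls. */
	bdp_ratelimits += pages_dirtied;
	if (bdp_ratelimits >= RATELIMIT_PAGES) {
		bdp_ratelimits = 0;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct task t = { .nr_dirtied = 0, .nr_dirtied_pause = 256 };
	unsigned long i;

	for (i = 0; i < 64; i++) {
		if (should_throttle(&t, 16)) {
			printf("throttle after %lu dirtied pages\n",
			       t.nr_dirtied);
			t.nr_dirtied = 0; /* balance_dirty_pages() resets it */
		}
	}
	return 0;
}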
@@ -909,7 +1303,8 @@ void laptop_mode_timer_fn(unsigned long data) | |||
909 | * threshold | 1303 | * threshold |
910 | */ | 1304 | */ |
911 | if (bdi_has_dirty_io(&q->backing_dev_info)) | 1305 | if (bdi_has_dirty_io(&q->backing_dev_info)) |
912 | bdi_start_writeback(&q->backing_dev_info, nr_pages); | 1306 | bdi_start_writeback(&q->backing_dev_info, nr_pages, |
1307 | WB_REASON_LAPTOP_TIMER); | ||
913 | } | 1308 | } |
914 | 1309 | ||
915 | /* | 1310 | /* |
@@ -948,22 +1343,17 @@ void laptop_sync_completion(void) | |||
948 | * | 1343 | * |
949 | * Here we set ratelimit_pages to a level which ensures that when all CPUs are | 1344 | * Here we set ratelimit_pages to a level which ensures that when all CPUs are |
950 | * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory | 1345 | * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory |
951 | * thresholds before writeback cuts in. | 1346 | * thresholds. |
952 | * | ||
953 | * But the limit should not be set too high. Because it also controls the | ||
954 | * amount of memory which the balance_dirty_pages() caller has to write back. | ||
955 | * If this is too large then the caller will block on the IO queue all the | ||
956 | * time. So limit it to four megabytes - the balance_dirty_pages() caller | ||
957 | * will write six megabyte chunks, max. | ||
958 | */ | 1347 | */ |
959 | 1348 | ||
960 | void writeback_set_ratelimit(void) | 1349 | void writeback_set_ratelimit(void) |
961 | { | 1350 | { |
962 | ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); | 1351 | unsigned long background_thresh; |
1352 | unsigned long dirty_thresh; | ||
1353 | global_dirty_limits(&background_thresh, &dirty_thresh); | ||
1354 | ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); | ||
963 | if (ratelimit_pages < 16) | 1355 | if (ratelimit_pages < 16) |
964 | ratelimit_pages = 16; | 1356 | ratelimit_pages = 16; |
965 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) | ||
966 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; | ||
967 | } | 1357 | } |
968 | 1358 | ||
969 | static int __cpuinit | 1359 | static int __cpuinit |
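The rewritten writeback_set_ratelimit() above derives the per-CPU backstop from the global dirty threshold instead of from total memory, and the old 4MB upper clamp is gone. A tiny standalone illustration of the new arithmetic, with made-up example numbers:

#include <stdio.h>

int main(void)
{
	unsigned long dirty_thresh = 200000;	/* example: pages */
	unsigned int ncpus = 8;			/* example: online CPUs */
	unsigned long ratelimit_pages = dirty_thresh / (ncpus * 32);

	if (ratelimit_pages < 16)
		ratelimit_pages = 16;
	printf("ratelimit_pages = %lu\n", ratelimit_pages);	/* 781 */
	return 0;
}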
@@ -1333,6 +1723,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
1333 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 1723 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
1334 | __inc_zone_page_state(page, NR_DIRTIED); | 1724 | __inc_zone_page_state(page, NR_DIRTIED); |
1335 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 1725 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); |
1726 | __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | ||
1336 | task_dirty_inc(current); | 1727 | task_dirty_inc(current); |
1337 | task_io_account_write(PAGE_CACHE_SIZE); | 1728 | task_io_account_write(PAGE_CACHE_SIZE); |
1338 | } | 1729 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1dbcf8888f14..9dd443d89d8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -318,6 +318,7 @@ static void bad_page(struct page *page) | |||
318 | current->comm, page_to_pfn(page)); | 318 | current->comm, page_to_pfn(page)); |
319 | dump_page(page); | 319 | dump_page(page); |
320 | 320 | ||
321 | print_modules(); | ||
321 | dump_stack(); | 322 | dump_stack(); |
322 | out: | 323 | out: |
323 | /* Leave bad fields for debug, except PageBuddy could make trouble */ | 324 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
@@ -1409,14 +1410,11 @@ static int __init fail_page_alloc_debugfs(void) | |||
1409 | { | 1410 | { |
1410 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 1411 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
1411 | struct dentry *dir; | 1412 | struct dentry *dir; |
1412 | int err; | ||
1413 | 1413 | ||
1414 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | 1414 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, |
1415 | "fail_page_alloc"); | 1415 | &fail_page_alloc.attr); |
1416 | if (err) | 1416 | if (IS_ERR(dir)) |
1417 | return err; | 1417 | return PTR_ERR(dir); |
1418 | |||
1419 | dir = fail_page_alloc.attr.dir; | ||
1420 | 1418 | ||
1421 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, | 1419 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, |
1422 | &fail_page_alloc.ignore_gfp_wait)) | 1420 | &fail_page_alloc.ignore_gfp_wait)) |
@@ -1430,7 +1428,7 @@ static int __init fail_page_alloc_debugfs(void) | |||
1430 | 1428 | ||
1431 | return 0; | 1429 | return 0; |
1432 | fail: | 1430 | fail: |
1433 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | 1431 | debugfs_remove_recursive(dir); |
1434 | 1432 | ||
1435 | return -ENOMEM; | 1433 | return -ENOMEM; |
1436 | } | 1434 | } |
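The fail_page_alloc conversion above is the generic fault-injection debugfs pattern: fault_create_debugfs_attr() builds the directory plus the standard attribute files and returns the dentry (or an ERR_PTR), replacing the init_fault_attr_dentries()/cleanup_fault_attr_dentries() pair. A sketch of the same pattern for a hypothetical fault-injection point (assumes CONFIG_FAULT_INJECTION_DEBUG_FS; the names are made up):

#include <linux/init.h>
#include <linux/err.h>
#include <linux/debugfs.h>
#include <linux/fault-inject.h>

/* Hypothetical fault-injection point, for illustration only. */
static DECLARE_FAULT_ATTR(fail_my_subsys);

static int __init fail_my_subsys_debugfs(void)
{
	/* Creates <debugfs>/fail_my_subsys/ together with the standard
	 * probability/interval/times/... attributes in one call. */
	struct dentry *dir = fault_create_debugfs_attr("fail_my_subsys",
						       NULL, &fail_my_subsys);

	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
}
late_initcall(fail_my_subsys_debugfs);

Any extra per-site knobs can still be added under the returned dentry with debugfs_create_bool() and friends, exactly as fail_page_alloc does above.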
@@ -1756,7 +1754,6 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, | |||
1756 | 1754 | ||
1757 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | 1755 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) |
1758 | { | 1756 | { |
1759 | va_list args; | ||
1760 | unsigned int filter = SHOW_MEM_FILTER_NODES; | 1757 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
1761 | 1758 | ||
1762 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) | 1759 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) |
@@ -1775,14 +1772,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
1775 | filter &= ~SHOW_MEM_FILTER_NODES; | 1772 | filter &= ~SHOW_MEM_FILTER_NODES; |
1776 | 1773 | ||
1777 | if (fmt) { | 1774 | if (fmt) { |
1778 | printk(KERN_WARNING); | 1775 | struct va_format vaf; |
1776 | va_list args; | ||
1777 | |||
1779 | va_start(args, fmt); | 1778 | va_start(args, fmt); |
1780 | vprintk(fmt, args); | 1779 | |
1780 | vaf.fmt = fmt; | ||
1781 | vaf.va = &args; | ||
1782 | |||
1783 | pr_warn("%pV", &vaf); | ||
1784 | |||
1781 | va_end(args); | 1785 | va_end(args); |
1782 | } | 1786 | } |
1783 | 1787 | ||
1784 | pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", | 1788 | pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", |
1785 | current->comm, order, gfp_mask); | 1789 | current->comm, order, gfp_mask); |
1786 | 1790 | ||
1787 | dump_stack(); | 1791 | dump_stack(); |
1788 | if (!should_suppress_show_mem()) | 1792 | if (!should_suppress_show_mem()) |
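warn_alloc_failed() now pushes the caller's format string through a struct va_format and printk's %pV extension, so the message is emitted by a single pr_warn() instead of a bare printk(KERN_WARNING) followed by vprintk(). The same forwarding pattern in a hypothetical helper (kernel-context sketch; the function name and prefix are made up):

#include <linux/kernel.h>
#include <linux/printk.h>

static void my_warn(const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* printk expands the caller's va_list itself via %pV, so no
	 * intermediate buffer or second printk call is needed. */
	pr_warn("mydrv: %pV\n", &vaf);
	va_end(args);
}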
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 39d216d535ea..2d123f94a8df 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc) | |||
133 | static void *__meminit alloc_page_cgroup(size_t size, int nid) | 133 | static void *__meminit alloc_page_cgroup(size_t size, int nid) |
134 | { | 134 | { |
135 | void *addr = NULL; | 135 | void *addr = NULL; |
136 | gfp_t flags = GFP_KERNEL | __GFP_NOWARN; | ||
136 | 137 | ||
137 | addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); | 138 | addr = alloc_pages_exact_nid(nid, size, flags); |
138 | if (addr) | 139 | if (addr) { |
140 | kmemleak_alloc(addr, size, 1, flags); | ||
139 | return addr; | 141 | return addr; |
142 | } | ||
140 | 143 | ||
141 | if (node_state(nid, N_HIGH_MEMORY)) | 144 | if (node_state(nid, N_HIGH_MEMORY)) |
142 | addr = vmalloc_node(size, nid); | 145 | addr = vmalloc_node(size, nid); |
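kmemleak only tracks allocations made through the hooks it instruments (slab, vmalloc and friends); memory taken straight from the page allocator, as alloc_pages_exact_nid() does here, is invisible to it unless registered by hand, which is what the added kmemleak_alloc() call does. The pattern in isolation (sketch; the function name is made up):

#include <linux/gfp.h>
#include <linux/kmemleak.h>

static void *alloc_tracked_buffer(size_t size, int nid)
{
	gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
	void *addr = alloc_pages_exact_nid(nid, size, flags);

	/* min_count = 1: report the region if kmemleak later finds no
	 * references to it while scanning. */
	if (addr)
		kmemleak_alloc(addr, size, 1, flags);
	return addr;
}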
@@ -357,7 +360,7 @@ struct swap_cgroup_ctrl { | |||
357 | spinlock_t lock; | 360 | spinlock_t lock; |
358 | }; | 361 | }; |
359 | 362 | ||
360 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | 363 | static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; |
361 | 364 | ||
362 | struct swap_cgroup { | 365 | struct swap_cgroup { |
363 | unsigned short id; | 366 | unsigned short id; |
@@ -513,11 +516,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
513 | length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); | 516 | length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); |
514 | array_size = length * sizeof(void *); | 517 | array_size = length * sizeof(void *); |
515 | 518 | ||
516 | array = vmalloc(array_size); | 519 | array = vzalloc(array_size); |
517 | if (!array) | 520 | if (!array) |
518 | goto nomem; | 521 | goto nomem; |
519 | 522 | ||
520 | memset(array, 0, array_size); | ||
521 | ctrl = &swap_cgroup_ctrl[type]; | 523 | ctrl = &swap_cgroup_ctrl[type]; |
522 | mutex_lock(&swap_cgroup_mutex); | 524 | mutex_lock(&swap_cgroup_mutex); |
523 | ctrl->length = length; | 525 | ctrl->length = length; |
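swap_cgroup_swapon() drops the open-coded vmalloc() + memset() pair for vzalloc(), which returns already-zeroed memory. The conversion in isolation (sketch, hypothetical helpers):

#include <linux/vmalloc.h>
#include <linux/string.h>

/* Before: allocate, then remember to clear. */
static void *old_way(size_t array_size)
{
	void *array = vmalloc(array_size);

	if (array)
		memset(array, 0, array_size);
	return array;
}

/* After: one call, already zeroed. */
static void *new_way(size_t array_size)
{
	return vzalloc(array_size);
}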
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c new file mode 100644 index 000000000000..e920aa3ce104 --- /dev/null +++ b/mm/process_vm_access.c | |||
@@ -0,0 +1,496 @@ | |||
1 | /* | ||
2 | * linux/mm/process_vm_access.c | ||
3 | * | ||
4 | * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/mm.h> | ||
13 | #include <linux/uio.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/highmem.h> | ||
16 | #include <linux/ptrace.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/syscalls.h> | ||
19 | |||
20 | #ifdef CONFIG_COMPAT | ||
21 | #include <linux/compat.h> | ||
22 | #endif | ||
23 | |||
24 | /** | ||
25 | * process_vm_rw_pages - read/write pages from task specified | ||
26 | * @task: task to read/write from | ||
27 | * @mm: mm for task | ||
28 | * @process_pages: struct pages area that can store at least | ||
29 | * nr_pages_to_copy struct page pointers | ||
30 | * @pa: address of page in task to start copying from/to | ||
31 | * @start_offset: offset in page to start copying from/to | ||
32 | * @len: number of bytes to copy | ||
33 | * @lvec: iovec array specifying where to copy to/from | ||
34 | * @lvec_cnt: number of elements in iovec array | ||
35 | * @lvec_current: index in iovec array we are up to | ||
36 | * @lvec_offset: offset in bytes from current iovec iov_base we are up to | ||
37 | * @vm_write: 0 means copy from, 1 means copy to | ||
38 | * @nr_pages_to_copy: number of pages to copy | ||
39 | * @bytes_copied: returns number of bytes successfully copied | ||
40 | * Returns 0 on success, error code otherwise | ||
41 | */ | ||
42 | static int process_vm_rw_pages(struct task_struct *task, | ||
43 | struct mm_struct *mm, | ||
44 | struct page **process_pages, | ||
45 | unsigned long pa, | ||
46 | unsigned long start_offset, | ||
47 | unsigned long len, | ||
48 | const struct iovec *lvec, | ||
49 | unsigned long lvec_cnt, | ||
50 | unsigned long *lvec_current, | ||
51 | size_t *lvec_offset, | ||
52 | int vm_write, | ||
53 | unsigned int nr_pages_to_copy, | ||
54 | ssize_t *bytes_copied) | ||
55 | { | ||
56 | int pages_pinned; | ||
57 | void *target_kaddr; | ||
58 | int pgs_copied = 0; | ||
59 | int j; | ||
60 | int ret; | ||
61 | ssize_t bytes_to_copy; | ||
62 | ssize_t rc = 0; | ||
63 | |||
64 | *bytes_copied = 0; | ||
65 | |||
66 | /* Get the pages we're interested in */ | ||
67 | down_read(&mm->mmap_sem); | ||
68 | pages_pinned = get_user_pages(task, mm, pa, | ||
69 | nr_pages_to_copy, | ||
70 | vm_write, 0, process_pages, NULL); | ||
71 | up_read(&mm->mmap_sem); | ||
72 | |||
73 | if (pages_pinned != nr_pages_to_copy) { | ||
74 | rc = -EFAULT; | ||
75 | goto end; | ||
76 | } | ||
77 | |||
78 | /* Do the copy for each page */ | ||
79 | for (pgs_copied = 0; | ||
80 | (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); | ||
81 | pgs_copied++) { | ||
82 | /* Make sure we have a non zero length iovec */ | ||
83 | while (*lvec_current < lvec_cnt | ||
84 | && lvec[*lvec_current].iov_len == 0) | ||
85 | (*lvec_current)++; | ||
86 | if (*lvec_current == lvec_cnt) | ||
87 | break; | ||
88 | |||
89 | /* | ||
90 | * Will copy smallest of: | ||
91 | * - bytes remaining in page | ||
92 | * - bytes remaining in destination iovec | ||
93 | */ | ||
94 | bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, | ||
95 | len - *bytes_copied); | ||
96 | bytes_to_copy = min_t(ssize_t, bytes_to_copy, | ||
97 | lvec[*lvec_current].iov_len | ||
98 | - *lvec_offset); | ||
99 | |||
100 | target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; | ||
101 | |||
102 | if (vm_write) | ||
103 | ret = copy_from_user(target_kaddr, | ||
104 | lvec[*lvec_current].iov_base | ||
105 | + *lvec_offset, | ||
106 | bytes_to_copy); | ||
107 | else | ||
108 | ret = copy_to_user(lvec[*lvec_current].iov_base | ||
109 | + *lvec_offset, | ||
110 | target_kaddr, bytes_to_copy); | ||
111 | kunmap(process_pages[pgs_copied]); | ||
112 | if (ret) { | ||
113 | *bytes_copied += bytes_to_copy - ret; | ||
114 | pgs_copied++; | ||
115 | rc = -EFAULT; | ||
116 | goto end; | ||
117 | } | ||
118 | *bytes_copied += bytes_to_copy; | ||
119 | *lvec_offset += bytes_to_copy; | ||
120 | if (*lvec_offset == lvec[*lvec_current].iov_len) { | ||
121 | /* | ||
122 | * Need to copy remaining part of page into the | ||
123 | * next iovec if there are any bytes left in page | ||
124 | */ | ||
125 | (*lvec_current)++; | ||
126 | *lvec_offset = 0; | ||
127 | start_offset = (start_offset + bytes_to_copy) | ||
128 | % PAGE_SIZE; | ||
129 | if (start_offset) | ||
130 | pgs_copied--; | ||
131 | } else { | ||
132 | start_offset = 0; | ||
133 | } | ||
134 | } | ||
135 | |||
136 | end: | ||
137 | if (vm_write) { | ||
138 | for (j = 0; j < pages_pinned; j++) { | ||
139 | if (j < pgs_copied) | ||
140 | set_page_dirty_lock(process_pages[j]); | ||
141 | put_page(process_pages[j]); | ||
142 | } | ||
143 | } else { | ||
144 | for (j = 0; j < pages_pinned; j++) | ||
145 | put_page(process_pages[j]); | ||
146 | } | ||
147 | |||
148 | return rc; | ||
149 | } | ||
150 | |||
151 | /* Maximum number of pages kmalloc'd to hold struct page's during copy */ | ||
152 | #define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2) | ||
153 | |||
154 | /** | ||
155 | * process_vm_rw_single_vec - read/write pages from task specified | ||
156 | * @addr: start memory address of target process | ||
157 | * @len: size of area to copy to/from | ||
158 | * @lvec: iovec array specifying where to copy to/from locally | ||
159 | * @lvec_cnt: number of elements in iovec array | ||
160 | * @lvec_current: index in iovec array we are up to | ||
161 | * @lvec_offset: offset in bytes from current iovec iov_base we are up to | ||
162 | * @process_pages: struct pages area that can store at least | ||
163 | * nr_pages_to_copy struct page pointers | ||
164 | * @mm: mm for task | ||
165 | * @task: task to read/write from | ||
166 | * @vm_write: 0 means copy from, 1 means copy to | ||
167 | * @bytes_copied: returns number of bytes successfully copied | ||
168 | * Returns 0 on success or on failure error code | ||
169 | */ | ||
170 | static int process_vm_rw_single_vec(unsigned long addr, | ||
171 | unsigned long len, | ||
172 | const struct iovec *lvec, | ||
173 | unsigned long lvec_cnt, | ||
174 | unsigned long *lvec_current, | ||
175 | size_t *lvec_offset, | ||
176 | struct page **process_pages, | ||
177 | struct mm_struct *mm, | ||
178 | struct task_struct *task, | ||
179 | int vm_write, | ||
180 | ssize_t *bytes_copied) | ||
181 | { | ||
182 | unsigned long pa = addr & PAGE_MASK; | ||
183 | unsigned long start_offset = addr - pa; | ||
184 | unsigned long nr_pages; | ||
185 | ssize_t bytes_copied_loop; | ||
186 | ssize_t rc = 0; | ||
187 | unsigned long nr_pages_copied = 0; | ||
188 | unsigned long nr_pages_to_copy; | ||
189 | unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES | ||
190 | / sizeof(struct pages *); | ||
191 | |||
192 | *bytes_copied = 0; | ||
193 | |||
194 | /* Work out address and page range required */ | ||
195 | if (len == 0) | ||
196 | return 0; | ||
197 | nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; | ||
198 | |||
199 | while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { | ||
200 | nr_pages_to_copy = min(nr_pages - nr_pages_copied, | ||
201 | max_pages_per_loop); | ||
202 | |||
203 | rc = process_vm_rw_pages(task, mm, process_pages, pa, | ||
204 | start_offset, len, | ||
205 | lvec, lvec_cnt, | ||
206 | lvec_current, lvec_offset, | ||
207 | vm_write, nr_pages_to_copy, | ||
208 | &bytes_copied_loop); | ||
209 | start_offset = 0; | ||
210 | *bytes_copied += bytes_copied_loop; | ||
211 | |||
212 | if (rc < 0) { | ||
213 | return rc; | ||
214 | } else { | ||
215 | len -= bytes_copied_loop; | ||
216 | nr_pages_copied += nr_pages_to_copy; | ||
217 | pa += nr_pages_to_copy * PAGE_SIZE; | ||
218 | } | ||
219 | } | ||
220 | |||
221 | return rc; | ||
222 | } | ||
223 | |||
224 | /* Maximum number of entries for process pages array | ||
225 | which lives on stack */ | ||
226 | #define PVM_MAX_PP_ARRAY_COUNT 16 | ||
227 | |||
228 | /** | ||
229 | * process_vm_rw_core - core of reading/writing pages from task specified | ||
230 | * @pid: PID of process to read/write from/to | ||
231 | * @lvec: iovec array specifying where to copy to/from locally | ||
232 | * @liovcnt: size of lvec array | ||
233 | * @rvec: iovec array specifying where to copy to/from in the other process | ||
234 | * @riovcnt: size of rvec array | ||
235 | * @flags: currently unused | ||
236 | * @vm_write: 0 if reading from other process, 1 if writing to other process | ||
237 | * Returns the number of bytes read/written or error code. May | ||
238 | * return less bytes than expected if an error occurs during the copying | ||
239 | * process. | ||
240 | */ | ||
241 | static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, | ||
242 | unsigned long liovcnt, | ||
243 | const struct iovec *rvec, | ||
244 | unsigned long riovcnt, | ||
245 | unsigned long flags, int vm_write) | ||
246 | { | ||
247 | struct task_struct *task; | ||
248 | struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; | ||
249 | struct page **process_pages = pp_stack; | ||
250 | struct mm_struct *mm; | ||
251 | unsigned long i; | ||
252 | ssize_t rc = 0; | ||
253 | ssize_t bytes_copied_loop; | ||
254 | ssize_t bytes_copied = 0; | ||
255 | unsigned long nr_pages = 0; | ||
256 | unsigned long nr_pages_iov; | ||
257 | unsigned long iov_l_curr_idx = 0; | ||
258 | size_t iov_l_curr_offset = 0; | ||
259 | ssize_t iov_len; | ||
260 | |||
261 | /* | ||
262 | * Work out how many pages of struct pages we're going to need | ||
263 | * when eventually calling get_user_pages | ||
264 | */ | ||
265 | for (i = 0; i < riovcnt; i++) { | ||
266 | iov_len = rvec[i].iov_len; | ||
267 | if (iov_len > 0) { | ||
268 | nr_pages_iov = ((unsigned long)rvec[i].iov_base | ||
269 | + iov_len) | ||
270 | / PAGE_SIZE - (unsigned long)rvec[i].iov_base | ||
271 | / PAGE_SIZE + 1; | ||
272 | nr_pages = max(nr_pages, nr_pages_iov); | ||
273 | } | ||
274 | } | ||
275 | |||
276 | if (nr_pages == 0) | ||
277 | return 0; | ||
278 | |||
279 | if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { | ||
280 | /* For reliability don't try to kmalloc more than | ||
281 | 2 pages worth */ | ||
282 | process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, | ||
283 | sizeof(struct pages *)*nr_pages), | ||
284 | GFP_KERNEL); | ||
285 | |||
286 | if (!process_pages) | ||
287 | return -ENOMEM; | ||
288 | } | ||
289 | |||
290 | /* Get process information */ | ||
291 | rcu_read_lock(); | ||
292 | task = find_task_by_vpid(pid); | ||
293 | if (task) | ||
294 | get_task_struct(task); | ||
295 | rcu_read_unlock(); | ||
296 | if (!task) { | ||
297 | rc = -ESRCH; | ||
298 | goto free_proc_pages; | ||
299 | } | ||
300 | |||
301 | task_lock(task); | ||
302 | if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { | ||
303 | task_unlock(task); | ||
304 | rc = -EPERM; | ||
305 | goto put_task_struct; | ||
306 | } | ||
307 | mm = task->mm; | ||
308 | |||
309 | if (!mm || (task->flags & PF_KTHREAD)) { | ||
310 | task_unlock(task); | ||
311 | rc = -EINVAL; | ||
312 | goto put_task_struct; | ||
313 | } | ||
314 | |||
315 | atomic_inc(&mm->mm_users); | ||
316 | task_unlock(task); | ||
317 | |||
318 | for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { | ||
319 | rc = process_vm_rw_single_vec( | ||
320 | (unsigned long)rvec[i].iov_base, rvec[i].iov_len, | ||
321 | lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, | ||
322 | process_pages, mm, task, vm_write, &bytes_copied_loop); | ||
323 | bytes_copied += bytes_copied_loop; | ||
324 | if (rc != 0) { | ||
325 | /* If we have managed to copy any data at all then | ||
326 | we return the number of bytes copied. Otherwise | ||
327 | we return the error code */ | ||
328 | if (bytes_copied) | ||
329 | rc = bytes_copied; | ||
330 | goto put_mm; | ||
331 | } | ||
332 | } | ||
333 | |||
334 | rc = bytes_copied; | ||
335 | put_mm: | ||
336 | mmput(mm); | ||
337 | |||
338 | put_task_struct: | ||
339 | put_task_struct(task); | ||
340 | |||
341 | free_proc_pages: | ||
342 | if (process_pages != pp_stack) | ||
343 | kfree(process_pages); | ||
344 | return rc; | ||
345 | } | ||
346 | |||
347 | /** | ||
348 | * process_vm_rw - check iovecs before calling core routine | ||
349 | * @pid: PID of process to read/write from/to | ||
350 | * @lvec: iovec array specifying where to copy to/from locally | ||
351 | * @liovcnt: size of lvec array | ||
352 | * @rvec: iovec array specifying where to copy to/from in the other process | ||
353 | * @riovcnt: size of rvec array | ||
354 | * @flags: currently unused | ||
355 | * @vm_write: 0 if reading from other process, 1 if writing to other process | ||
356 | * Returns the number of bytes read/written or error code. May | ||
357 | * return less bytes than expected if an error occurs during the copying | ||
358 | * process. | ||
359 | */ | ||
360 | static ssize_t process_vm_rw(pid_t pid, | ||
361 | const struct iovec __user *lvec, | ||
362 | unsigned long liovcnt, | ||
363 | const struct iovec __user *rvec, | ||
364 | unsigned long riovcnt, | ||
365 | unsigned long flags, int vm_write) | ||
366 | { | ||
367 | struct iovec iovstack_l[UIO_FASTIOV]; | ||
368 | struct iovec iovstack_r[UIO_FASTIOV]; | ||
369 | struct iovec *iov_l = iovstack_l; | ||
370 | struct iovec *iov_r = iovstack_r; | ||
371 | ssize_t rc; | ||
372 | |||
373 | if (flags != 0) | ||
374 | return -EINVAL; | ||
375 | |||
376 | /* Check iovecs */ | ||
377 | if (vm_write) | ||
378 | rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, | ||
379 | iovstack_l, &iov_l, 1); | ||
380 | else | ||
381 | rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, | ||
382 | iovstack_l, &iov_l, 1); | ||
383 | if (rc <= 0) | ||
384 | goto free_iovecs; | ||
385 | |||
386 | rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, | ||
387 | iovstack_r, &iov_r, 0); | ||
388 | if (rc <= 0) | ||
389 | goto free_iovecs; | ||
390 | |||
391 | rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, | ||
392 | vm_write); | ||
393 | |||
394 | free_iovecs: | ||
395 | if (iov_r != iovstack_r) | ||
396 | kfree(iov_r); | ||
397 | if (iov_l != iovstack_l) | ||
398 | kfree(iov_l); | ||
399 | |||
400 | return rc; | ||
401 | } | ||
402 | |||
403 | SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, | ||
404 | unsigned long, liovcnt, const struct iovec __user *, rvec, | ||
405 | unsigned long, riovcnt, unsigned long, flags) | ||
406 | { | ||
407 | return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); | ||
408 | } | ||
409 | |||
410 | SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, | ||
411 | const struct iovec __user *, lvec, | ||
412 | unsigned long, liovcnt, const struct iovec __user *, rvec, | ||
413 | unsigned long, riovcnt, unsigned long, flags) | ||
414 | { | ||
415 | return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); | ||
416 | } | ||
417 | |||
418 | #ifdef CONFIG_COMPAT | ||
419 | |||
420 | asmlinkage ssize_t | ||
421 | compat_process_vm_rw(compat_pid_t pid, | ||
422 | const struct compat_iovec __user *lvec, | ||
423 | unsigned long liovcnt, | ||
424 | const struct compat_iovec __user *rvec, | ||
425 | unsigned long riovcnt, | ||
426 | unsigned long flags, int vm_write) | ||
427 | { | ||
428 | struct iovec iovstack_l[UIO_FASTIOV]; | ||
429 | struct iovec iovstack_r[UIO_FASTIOV]; | ||
430 | struct iovec *iov_l = iovstack_l; | ||
431 | struct iovec *iov_r = iovstack_r; | ||
432 | ssize_t rc = -EFAULT; | ||
433 | |||
434 | if (flags != 0) | ||
435 | return -EINVAL; | ||
436 | |||
437 | if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) | ||
438 | goto out; | ||
439 | |||
440 | if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) | ||
441 | goto out; | ||
442 | |||
443 | if (vm_write) | ||
444 | rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, | ||
445 | UIO_FASTIOV, iovstack_l, | ||
446 | &iov_l, 1); | ||
447 | else | ||
448 | rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, | ||
449 | UIO_FASTIOV, iovstack_l, | ||
450 | &iov_l, 1); | ||
451 | if (rc <= 0) | ||
452 | goto free_iovecs; | ||
453 | rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, | ||
454 | UIO_FASTIOV, iovstack_r, | ||
455 | &iov_r, 0); | ||
456 | if (rc <= 0) | ||
457 | goto free_iovecs; | ||
458 | |||
459 | rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, | ||
460 | vm_write); | ||
461 | |||
462 | free_iovecs: | ||
463 | if (iov_r != iovstack_r) | ||
464 | kfree(iov_r); | ||
465 | if (iov_l != iovstack_l) | ||
466 | kfree(iov_l); | ||
467 | |||
468 | out: | ||
469 | return rc; | ||
470 | } | ||
471 | |||
472 | asmlinkage ssize_t | ||
473 | compat_sys_process_vm_readv(compat_pid_t pid, | ||
474 | const struct compat_iovec __user *lvec, | ||
475 | unsigned long liovcnt, | ||
476 | const struct compat_iovec __user *rvec, | ||
477 | unsigned long riovcnt, | ||
478 | unsigned long flags) | ||
479 | { | ||
480 | return compat_process_vm_rw(pid, lvec, liovcnt, rvec, | ||
481 | riovcnt, flags, 0); | ||
482 | } | ||
483 | |||
484 | asmlinkage ssize_t | ||
485 | compat_sys_process_vm_writev(compat_pid_t pid, | ||
486 | const struct compat_iovec __user *lvec, | ||
487 | unsigned long liovcnt, | ||
488 | const struct compat_iovec __user *rvec, | ||
489 | unsigned long riovcnt, | ||
490 | unsigned long flags) | ||
491 | { | ||
492 | return compat_process_vm_rw(pid, lvec, liovcnt, rvec, | ||
493 | riovcnt, flags, 1); | ||
494 | } | ||
495 | |||
496 | #endif | ||
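The new mm/process_vm_access.c wires up process_vm_readv()/process_vm_writev(): a single syscall copies data directly between the caller's iovecs and another process's address space, gated by the same permission check as PTRACE_MODE_ATTACH and with flags required to be zero. A userspace sketch of reading remote memory with it; it assumes a libc that exposes the wrapper (glibc 2.15 and later), otherwise syscall(__NR_process_vm_readv, ...) is needed, and the pid and address arguments are whatever you want to inspect:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>

int main(int argc, char **argv)
{
	char buf[64];
	struct iovec local = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct iovec remote;
	ssize_t nread;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <hex-address>\n", argv[0]);
		return 1;
	}
	remote.iov_base = (void *)strtoul(argv[2], NULL, 16);
	remote.iov_len = sizeof(buf);

	/* flags must be 0; fewer bytes than requested may be returned. */
	nread = process_vm_readv((pid_t)atoi(argv[1]), &local, 1,
				 &remote, 1, 0);
	if (nread < 0) {
		perror("process_vm_readv");
		return 1;
	}
	printf("read %zd bytes\n", nread);
	return 0;
}

The compat entry points at the bottom of the file convert 32-bit iovecs and then call the same core, so 32-bit callers on a 64-bit kernel take an identical path.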
diff --git a/mm/quicklist.c b/mm/quicklist.c index 2876349339a7..942212970529 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/gfp.h> | 17 | #include <linux/gfp.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/mmzone.h> | 19 | #include <linux/mmzone.h> |
20 | #include <linux/module.h> | ||
21 | #include <linux/quicklist.h> | 20 | #include <linux/quicklist.h> |
22 | 21 | ||
23 | DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); | 22 | DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); |
diff --git a/mm/readahead.c b/mm/readahead.c index 867f9dd82dcd..cbcbb02f3e28 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -11,7 +11,7 @@ | |||
11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
12 | #include <linux/gfp.h> | 12 | #include <linux/gfp.h> |
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/task_io_accounting_ops.h> | 17 | #include <linux/task_io_accounting_ops.h> |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -51,7 +51,7 @@ | |||
51 | #include <linux/ksm.h> | 51 | #include <linux/ksm.h> |
52 | #include <linux/rmap.h> | 52 | #include <linux/rmap.h> |
53 | #include <linux/rcupdate.h> | 53 | #include <linux/rcupdate.h> |
54 | #include <linux/module.h> | 54 | #include <linux/export.h> |
55 | #include <linux/memcontrol.h> | 55 | #include <linux/memcontrol.h> |
56 | #include <linux/mmu_notifier.h> | 56 | #include <linux/mmu_notifier.h> |
57 | #include <linux/migrate.h> | 57 | #include <linux/migrate.h> |
@@ -1164,7 +1164,7 @@ void page_remove_rmap(struct page *page) | |||
1164 | 1164 | ||
1165 | /* | 1165 | /* |
1166 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 1166 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
1167 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 1167 | * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. |
1168 | */ | 1168 | */ |
1169 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1169 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1170 | unsigned long address, enum ttu_flags flags) | 1170 | unsigned long address, enum ttu_flags flags) |
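The readahead.c and rmap.c hunks above are include trimming: a file that only uses EXPORT_SYMBOL, and none of the module-loading machinery, can include the much lighter <linux/export.h> instead of <linux/module.h>. The pattern for any such file (sketch, hypothetical helper):

#include <linux/export.h>

int my_mm_helper(int pages)
{
	return pages << 1;	/* placeholder work */
}
EXPORT_SYMBOL(my_mm_helper);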
diff --git a/mm/shmem.c b/mm/shmem.c index 5cc21f8b4cd3..d6722506d2da 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -6,7 +6,8 @@ | |||
6 | * 2000-2001 Christoph Rohland | 6 | * 2000-2001 Christoph Rohland |
7 | * 2000-2001 SAP AG | 7 | * 2000-2001 SAP AG |
8 | * 2002 Red Hat Inc. | 8 | * 2002 Red Hat Inc. |
9 | * Copyright (C) 2002-2005 Hugh Dickins. | 9 | * Copyright (C) 2002-2011 Hugh Dickins. |
10 | * Copyright (C) 2011 Google Inc. | ||
10 | * Copyright (C) 2002-2005 VERITAS Software Corporation. | 11 | * Copyright (C) 2002-2005 VERITAS Software Corporation. |
11 | * Copyright (C) 2004 Andi Kleen, SuSE Labs | 12 | * Copyright (C) 2004 Andi Kleen, SuSE Labs |
12 | * | 13 | * |
@@ -27,8 +28,7 @@ | |||
27 | #include <linux/pagemap.h> | 28 | #include <linux/pagemap.h> |
28 | #include <linux/file.h> | 29 | #include <linux/file.h> |
29 | #include <linux/mm.h> | 30 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 31 | #include <linux/export.h> |
31 | #include <linux/percpu_counter.h> | ||
32 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
33 | 33 | ||
34 | static struct vfsmount *shm_mnt; | 34 | static struct vfsmount *shm_mnt; |
@@ -51,6 +51,8 @@ static struct vfsmount *shm_mnt; | |||
51 | #include <linux/shmem_fs.h> | 51 | #include <linux/shmem_fs.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/pagevec.h> | ||
55 | #include <linux/percpu_counter.h> | ||
54 | #include <linux/splice.h> | 56 | #include <linux/splice.h> |
55 | #include <linux/security.h> | 57 | #include <linux/security.h> |
56 | #include <linux/swapops.h> | 58 | #include <linux/swapops.h> |
@@ -63,43 +65,17 @@ static struct vfsmount *shm_mnt; | |||
63 | #include <linux/magic.h> | 65 | #include <linux/magic.h> |
64 | 66 | ||
65 | #include <asm/uaccess.h> | 67 | #include <asm/uaccess.h> |
66 | #include <asm/div64.h> | ||
67 | #include <asm/pgtable.h> | 68 | #include <asm/pgtable.h> |
68 | 69 | ||
69 | /* | ||
70 | * The maximum size of a shmem/tmpfs file is limited by the maximum size of | ||
71 | * its triple-indirect swap vector - see illustration at shmem_swp_entry(). | ||
72 | * | ||
73 | * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, | ||
74 | * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum | ||
75 | * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, | ||
76 | * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. | ||
77 | * | ||
78 | * We use / and * instead of shifts in the definitions below, so that the swap | ||
79 | * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. | ||
80 | */ | ||
81 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) | ||
82 | #define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) | ||
83 | |||
84 | #define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) | ||
85 | #define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) | ||
86 | |||
87 | #define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) | ||
88 | #define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) | ||
89 | |||
90 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) | 70 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) |
91 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) | 71 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) |
92 | 72 | ||
93 | /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ | ||
94 | #define SHMEM_PAGEIN VM_READ | ||
95 | #define SHMEM_TRUNCATE VM_WRITE | ||
96 | |||
97 | /* Definition to limit shmem_truncate's steps between cond_rescheds */ | ||
98 | #define LATENCY_LIMIT 64 | ||
99 | |||
100 | /* Pretend that each entry is of this size in directory's i_size */ | 73 | /* Pretend that each entry is of this size in directory's i_size */ |
101 | #define BOGO_DIRENT_SIZE 20 | 74 | #define BOGO_DIRENT_SIZE 20 |
102 | 75 | ||
76 | /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ | ||
77 | #define SHORT_SYMLINK_LEN 128 | ||
78 | |||
103 | struct shmem_xattr { | 79 | struct shmem_xattr { |
104 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ | 80 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ |
105 | char *name; /* xattr name */ | 81 | char *name; /* xattr name */ |
@@ -107,7 +83,7 @@ struct shmem_xattr { | |||
107 | char value[0]; | 83 | char value[0]; |
108 | }; | 84 | }; |
109 | 85 | ||
110 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ | 86 | /* Flag allocation requirements to shmem_getpage */ |
111 | enum sgp_type { | 87 | enum sgp_type { |
112 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 88 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
113 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | 89 | SGP_CACHE, /* don't exceed i_size, may allocate page */ |
@@ -137,56 +113,6 @@ static inline int shmem_getpage(struct inode *inode, pgoff_t index, | |||
137 | mapping_gfp_mask(inode->i_mapping), fault_type); | 113 | mapping_gfp_mask(inode->i_mapping), fault_type); |
138 | } | 114 | } |
139 | 115 | ||
140 | static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | ||
141 | { | ||
142 | /* | ||
143 | * The above definition of ENTRIES_PER_PAGE, and the use of | ||
144 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: | ||
145 | * might be reconsidered if it ever diverges from PAGE_SIZE. | ||
146 | * | ||
147 | * Mobility flags are masked out as swap vectors cannot move | ||
148 | */ | ||
149 | return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, | ||
150 | PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
151 | } | ||
152 | |||
153 | static inline void shmem_dir_free(struct page *page) | ||
154 | { | ||
155 | __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
156 | } | ||
157 | |||
158 | static struct page **shmem_dir_map(struct page *page) | ||
159 | { | ||
160 | return (struct page **)kmap_atomic(page, KM_USER0); | ||
161 | } | ||
162 | |||
163 | static inline void shmem_dir_unmap(struct page **dir) | ||
164 | { | ||
165 | kunmap_atomic(dir, KM_USER0); | ||
166 | } | ||
167 | |||
168 | static swp_entry_t *shmem_swp_map(struct page *page) | ||
169 | { | ||
170 | return (swp_entry_t *)kmap_atomic(page, KM_USER1); | ||
171 | } | ||
172 | |||
173 | static inline void shmem_swp_balance_unmap(void) | ||
174 | { | ||
175 | /* | ||
176 | * When passing a pointer to an i_direct entry, to code which | ||
177 | * also handles indirect entries and so will shmem_swp_unmap, | ||
178 | * we must arrange for the preempt count to remain in balance. | ||
179 | * What kmap_atomic of a lowmem page does depends on config | ||
180 | * and architecture, so pretend to kmap_atomic some lowmem page. | ||
181 | */ | ||
182 | (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); | ||
183 | } | ||
184 | |||
185 | static inline void shmem_swp_unmap(swp_entry_t *entry) | ||
186 | { | ||
187 | kunmap_atomic(entry, KM_USER1); | ||
188 | } | ||
189 | |||
190 | static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) | 116 | static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) |
191 | { | 117 | { |
192 | return sb->s_fs_info; | 118 | return sb->s_fs_info; |
@@ -244,15 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | |||
244 | static LIST_HEAD(shmem_swaplist); | 170 | static LIST_HEAD(shmem_swaplist); |
245 | static DEFINE_MUTEX(shmem_swaplist_mutex); | 171 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
246 | 172 | ||
247 | static void shmem_free_blocks(struct inode *inode, long pages) | ||
248 | { | ||
249 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
250 | if (sbinfo->max_blocks) { | ||
251 | percpu_counter_add(&sbinfo->used_blocks, -pages); | ||
252 | inode->i_blocks -= pages*BLOCKS_PER_PAGE; | ||
253 | } | ||
254 | } | ||
255 | |||
256 | static int shmem_reserve_inode(struct super_block *sb) | 173 | static int shmem_reserve_inode(struct super_block *sb) |
257 | { | 174 | { |
258 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 175 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
@@ -279,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb) | |||
279 | } | 196 | } |
280 | 197 | ||
281 | /** | 198 | /** |
282 | * shmem_recalc_inode - recalculate the size of an inode | 199 | * shmem_recalc_inode - recalculate the block usage of an inode |
283 | * @inode: inode to recalc | 200 | * @inode: inode to recalc |
284 | * | 201 | * |
285 | * We have to calculate the free blocks since the mm can drop | 202 | * We have to calculate the free blocks since the mm can drop |
@@ -297,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode) | |||
297 | 214 | ||
298 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; | 215 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; |
299 | if (freed > 0) { | 216 | if (freed > 0) { |
217 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
218 | if (sbinfo->max_blocks) | ||
219 | percpu_counter_add(&sbinfo->used_blocks, -freed); | ||
300 | info->alloced -= freed; | 220 | info->alloced -= freed; |
221 | inode->i_blocks -= freed * BLOCKS_PER_PAGE; | ||
301 | shmem_unacct_blocks(info->flags, freed); | 222 | shmem_unacct_blocks(info->flags, freed); |
302 | shmem_free_blocks(inode, freed); | ||
303 | } | 223 | } |
304 | } | 224 | } |
305 | 225 | ||
306 | /** | 226 | /* |
307 | * shmem_swp_entry - find the swap vector position in the info structure | 227 | * Replace item expected in radix tree by a new item, while holding tree lock. |
308 | * @info: info structure for the inode | ||
309 | * @index: index of the page to find | ||
310 | * @page: optional page to add to the structure. Has to be preset to | ||
311 | * all zeros | ||
312 | * | ||
313 | * If there is no space allocated yet it will return NULL when | ||
314 | * page is NULL, else it will use the page for the needed block, | ||
315 | * setting it to NULL on return to indicate that it has been used. | ||
316 | * | ||
317 | * The swap vector is organized the following way: | ||
318 | * | ||
319 | * There are SHMEM_NR_DIRECT entries directly stored in the | ||
320 | * shmem_inode_info structure. So small files do not need an addional | ||
321 | * allocation. | ||
322 | * | ||
323 | * For pages with index > SHMEM_NR_DIRECT there is the pointer | ||
324 | * i_indirect which points to a page which holds in the first half | ||
325 | * doubly indirect blocks, in the second half triple indirect blocks: | ||
326 | * | ||
327 | * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the | ||
328 | * following layout (for SHMEM_NR_DIRECT == 16): | ||
329 | * | ||
330 | * i_indirect -> dir --> 16-19 | ||
331 | * | +-> 20-23 | ||
332 | * | | ||
333 | * +-->dir2 --> 24-27 | ||
334 | * | +-> 28-31 | ||
335 | * | +-> 32-35 | ||
336 | * | +-> 36-39 | ||
337 | * | | ||
338 | * +-->dir3 --> 40-43 | ||
339 | * +-> 44-47 | ||
340 | * +-> 48-51 | ||
341 | * +-> 52-55 | ||
342 | */ | 228 | */ |
343 | static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) | 229 | static int shmem_radix_tree_replace(struct address_space *mapping, |
344 | { | 230 | pgoff_t index, void *expected, void *replacement) |
345 | unsigned long offset; | 231 | { |
346 | struct page **dir; | 232 | void **pslot; |
347 | struct page *subdir; | 233 | void *item = NULL; |
348 | 234 | ||
349 | if (index < SHMEM_NR_DIRECT) { | 235 | VM_BUG_ON(!expected); |
350 | shmem_swp_balance_unmap(); | 236 | pslot = radix_tree_lookup_slot(&mapping->page_tree, index); |
351 | return info->i_direct+index; | 237 | if (pslot) |
352 | } | 238 | item = radix_tree_deref_slot_protected(pslot, |
353 | if (!info->i_indirect) { | 239 | &mapping->tree_lock); |
354 | if (page) { | 240 | if (item != expected) |
355 | info->i_indirect = *page; | 241 | return -ENOENT; |
356 | *page = NULL; | 242 | if (replacement) |
357 | } | 243 | radix_tree_replace_slot(pslot, replacement); |
358 | return NULL; /* need another page */ | 244 | else |
359 | } | 245 | radix_tree_delete(&mapping->page_tree, index); |
360 | 246 | return 0; | |
361 | index -= SHMEM_NR_DIRECT; | 247 | } |
362 | offset = index % ENTRIES_PER_PAGE; | ||
363 | index /= ENTRIES_PER_PAGE; | ||
364 | dir = shmem_dir_map(info->i_indirect); | ||
365 | |||
366 | if (index >= ENTRIES_PER_PAGE/2) { | ||
367 | index -= ENTRIES_PER_PAGE/2; | ||
368 | dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; | ||
369 | index %= ENTRIES_PER_PAGE; | ||
370 | subdir = *dir; | ||
371 | if (!subdir) { | ||
372 | if (page) { | ||
373 | *dir = *page; | ||
374 | *page = NULL; | ||
375 | } | ||
376 | shmem_dir_unmap(dir); | ||
377 | return NULL; /* need another page */ | ||
378 | } | ||
379 | shmem_dir_unmap(dir); | ||
380 | dir = shmem_dir_map(subdir); | ||
381 | } | ||
382 | 248 | ||
383 | dir += index; | 249 | /* |
384 | subdir = *dir; | 250 | * Like add_to_page_cache_locked, but error if expected item has gone. |
385 | if (!subdir) { | 251 | */ |
386 | if (!page || !(subdir = *page)) { | 252 | static int shmem_add_to_page_cache(struct page *page, |
387 | shmem_dir_unmap(dir); | 253 | struct address_space *mapping, |
388 | return NULL; /* need a page */ | 254 | pgoff_t index, gfp_t gfp, void *expected) |
255 | { | ||
256 | int error = 0; | ||
257 | |||
258 | VM_BUG_ON(!PageLocked(page)); | ||
259 | VM_BUG_ON(!PageSwapBacked(page)); | ||
260 | |||
261 | if (!expected) | ||
262 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | ||
263 | if (!error) { | ||
264 | page_cache_get(page); | ||
265 | page->mapping = mapping; | ||
266 | page->index = index; | ||
267 | |||
268 | spin_lock_irq(&mapping->tree_lock); | ||
269 | if (!expected) | ||
270 | error = radix_tree_insert(&mapping->page_tree, | ||
271 | index, page); | ||
272 | else | ||
273 | error = shmem_radix_tree_replace(mapping, index, | ||
274 | expected, page); | ||
275 | if (!error) { | ||
276 | mapping->nrpages++; | ||
277 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
278 | __inc_zone_page_state(page, NR_SHMEM); | ||
279 | spin_unlock_irq(&mapping->tree_lock); | ||
280 | } else { | ||
281 | page->mapping = NULL; | ||
282 | spin_unlock_irq(&mapping->tree_lock); | ||
283 | page_cache_release(page); | ||
389 | } | 284 | } |
390 | *dir = subdir; | 285 | if (!expected) |
391 | *page = NULL; | 286 | radix_tree_preload_end(); |
392 | } | 287 | } |
393 | shmem_dir_unmap(dir); | 288 | if (error) |
394 | return shmem_swp_map(subdir) + offset; | 289 | mem_cgroup_uncharge_cache_page(page); |
290 | return error; | ||
395 | } | 291 | } |
396 | 292 | ||
397 | static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) | 293 | /* |
294 | * Like delete_from_page_cache, but substitutes swap for page. | ||
295 | */ | ||
296 | static void shmem_delete_from_page_cache(struct page *page, void *radswap) | ||
398 | { | 297 | { |
399 | long incdec = value? 1: -1; | 298 | struct address_space *mapping = page->mapping; |
299 | int error; | ||
400 | 300 | ||
401 | entry->val = value; | 301 | spin_lock_irq(&mapping->tree_lock); |
402 | info->swapped += incdec; | 302 | error = shmem_radix_tree_replace(mapping, page->index, page, radswap); |
403 | if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { | 303 | page->mapping = NULL; |
404 | struct page *page = kmap_atomic_to_page(entry); | 304 | mapping->nrpages--; |
405 | set_page_private(page, page_private(page) + incdec); | 305 | __dec_zone_page_state(page, NR_FILE_PAGES); |
406 | } | 306 | __dec_zone_page_state(page, NR_SHMEM); |
307 | spin_unlock_irq(&mapping->tree_lock); | ||
308 | page_cache_release(page); | ||
309 | BUG_ON(error); | ||
407 | } | 310 | } |
408 | 311 | ||
409 | /** | 312 | /* |
410 | * shmem_swp_alloc - get the position of the swap entry for the page. | 313 | * Like find_get_pages, but collecting swap entries as well as pages. |
411 | * @info: info structure for the inode | ||
412 | * @index: index of the page to find | ||
413 | * @sgp: check and recheck i_size? skip allocation? | ||
414 | * @gfp: gfp mask to use for any page allocation | ||
415 | * | ||
416 | * If the entry does not exist, allocate it. | ||
417 | */ | 314 | */ |
418 | static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, | 315 | static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, |
419 | unsigned long index, enum sgp_type sgp, gfp_t gfp) | 316 | pgoff_t start, unsigned int nr_pages, |
420 | { | 317 | struct page **pages, pgoff_t *indices) |
421 | struct inode *inode = &info->vfs_inode; | 318 | { |
422 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 319 | unsigned int i; |
423 | struct page *page = NULL; | 320 | unsigned int ret; |
424 | swp_entry_t *entry; | 321 | unsigned int nr_found; |
425 | 322 | ||
426 | if (sgp != SGP_WRITE && | 323 | rcu_read_lock(); |
427 | ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | 324 | restart: |
428 | return ERR_PTR(-EINVAL); | 325 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
429 | 326 | (void ***)pages, indices, start, nr_pages); | |
430 | while (!(entry = shmem_swp_entry(info, index, &page))) { | 327 | ret = 0; |
431 | if (sgp == SGP_READ) | 328 | for (i = 0; i < nr_found; i++) { |
432 | return shmem_swp_map(ZERO_PAGE(0)); | 329 | struct page *page; |
433 | /* | 330 | repeat: |
434 | * Test used_blocks against 1 less max_blocks, since we have 1 data | 331 | page = radix_tree_deref_slot((void **)pages[i]); |
435 | * page (and perhaps indirect index pages) yet to allocate: | 332 | if (unlikely(!page)) |
436 | * a waste to allocate index if we cannot allocate data. | 333 | continue; |
437 | */ | 334 | if (radix_tree_exception(page)) { |
438 | if (sbinfo->max_blocks) { | 335 | if (radix_tree_deref_retry(page)) |
439 | if (percpu_counter_compare(&sbinfo->used_blocks, | 336 | goto restart; |
440 | sbinfo->max_blocks - 1) >= 0) | 337 | /* |
441 | return ERR_PTR(-ENOSPC); | 338 | * Otherwise, we must be storing a swap entry |
442 | percpu_counter_inc(&sbinfo->used_blocks); | 339 | * here as an exceptional entry: so return it |
443 | inode->i_blocks += BLOCKS_PER_PAGE; | 340 | * without attempting to raise page count. |
341 | */ | ||
342 | goto export; | ||
444 | } | 343 | } |
344 | if (!page_cache_get_speculative(page)) | ||
345 | goto repeat; | ||
445 | 346 | ||
446 | spin_unlock(&info->lock); | 347 | /* Has the page moved? */ |
447 | page = shmem_dir_alloc(gfp); | 348 | if (unlikely(page != *((void **)pages[i]))) { |
448 | spin_lock(&info->lock); | 349 | page_cache_release(page); |
449 | 350 | goto repeat; | |
450 | if (!page) { | ||
451 | shmem_free_blocks(inode, 1); | ||
452 | return ERR_PTR(-ENOMEM); | ||
453 | } | ||
454 | if (sgp != SGP_WRITE && | ||
455 | ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | ||
456 | entry = ERR_PTR(-EINVAL); | ||
457 | break; | ||
458 | } | 351 | } |
459 | if (info->next_index <= index) | 352 | export: |
460 | info->next_index = index + 1; | 353 | indices[ret] = indices[i]; |
461 | } | 354 | pages[ret] = page; |
462 | if (page) { | 355 | ret++; |
463 | /* another task gave its page, or truncated the file */ | 356 | } |
464 | shmem_free_blocks(inode, 1); | 357 | if (unlikely(!ret && nr_found)) |
465 | shmem_dir_free(page); | 358 | goto restart; |
466 | } | 359 | rcu_read_unlock(); |
467 | if (info->next_index <= index && !IS_ERR(entry)) | 360 | return ret; |
468 | info->next_index = index + 1; | ||
469 | return entry; | ||
470 | } | 361 | } |
471 | 362 | ||
472 | /** | 363 | /* |
473 | * shmem_free_swp - free some swap entries in a directory | 364 | * Remove swap entry from radix tree, free the swap and its page cache. |
474 | * @dir: pointer to the directory | ||
475 | * @edir: pointer after last entry of the directory | ||
476 | * @punch_lock: pointer to spinlock when needed for the holepunch case | ||
477 | */ | 365 | */ |
478 | static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, | 366 | static int shmem_free_swap(struct address_space *mapping, |
479 | spinlock_t *punch_lock) | 367 | pgoff_t index, void *radswap) |
480 | { | 368 | { |
481 | spinlock_t *punch_unlock = NULL; | 369 | int error; |
482 | swp_entry_t *ptr; | 370 | |
483 | int freed = 0; | 371 | spin_lock_irq(&mapping->tree_lock); |
484 | 372 | error = shmem_radix_tree_replace(mapping, index, radswap, NULL); | |
485 | for (ptr = dir; ptr < edir; ptr++) { | 373 | spin_unlock_irq(&mapping->tree_lock); |
486 | if (ptr->val) { | 374 | if (!error) |
487 | if (unlikely(punch_lock)) { | 375 | free_swap_and_cache(radix_to_swp_entry(radswap)); |
488 | punch_unlock = punch_lock; | 376 | return error; |
489 | punch_lock = NULL; | ||
490 | spin_lock(punch_unlock); | ||
491 | if (!ptr->val) | ||
492 | continue; | ||
493 | } | ||
494 | free_swap_and_cache(*ptr); | ||
495 | *ptr = (swp_entry_t){0}; | ||
496 | freed++; | ||
497 | } | ||
498 | } | ||
499 | if (punch_unlock) | ||
500 | spin_unlock(punch_unlock); | ||
501 | return freed; | ||
502 | } | ||
503 | |||
504 | static int shmem_map_and_free_swp(struct page *subdir, int offset, | ||
505 | int limit, struct page ***dir, spinlock_t *punch_lock) | ||
506 | { | ||
507 | swp_entry_t *ptr; | ||
508 | int freed = 0; | ||
509 | |||
510 | ptr = shmem_swp_map(subdir); | ||
511 | for (; offset < limit; offset += LATENCY_LIMIT) { | ||
512 | int size = limit - offset; | ||
513 | if (size > LATENCY_LIMIT) | ||
514 | size = LATENCY_LIMIT; | ||
515 | freed += shmem_free_swp(ptr+offset, ptr+offset+size, | ||
516 | punch_lock); | ||
517 | if (need_resched()) { | ||
518 | shmem_swp_unmap(ptr); | ||
519 | if (*dir) { | ||
520 | shmem_dir_unmap(*dir); | ||
521 | *dir = NULL; | ||
522 | } | ||
523 | cond_resched(); | ||
524 | ptr = shmem_swp_map(subdir); | ||
525 | } | ||
526 | } | ||
527 | shmem_swp_unmap(ptr); | ||
528 | return freed; | ||
529 | } | 377 | } |
530 | 378 | ||
531 | static void shmem_free_pages(struct list_head *next) | 379 | /* |
380 | * Pagevec may contain swap entries, so shuffle up pages before releasing. | ||
381 | */ | ||
382 | static void shmem_pagevec_release(struct pagevec *pvec) | ||
532 | { | 383 | { |
533 | struct page *page; | 384 | int i, j; |
534 | int freed = 0; | 385 | |
535 | 386 | for (i = 0, j = 0; i < pagevec_count(pvec); i++) { | |
536 | do { | 387 | struct page *page = pvec->pages[i]; |
537 | page = container_of(next, struct page, lru); | 388 | if (!radix_tree_exceptional_entry(page)) |
538 | next = next->next; | 389 | pvec->pages[j++] = page; |
539 | shmem_dir_free(page); | 390 | } |
540 | freed++; | 391 | pvec->nr = j; |
541 | if (freed >= LATENCY_LIMIT) { | 392 | pagevec_release(pvec); |
542 | cond_resched(); | ||
543 | freed = 0; | ||
544 | } | ||
545 | } while (next); | ||
546 | } | 393 | } |
547 | 394 | ||
548 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 395 | /* |
396 | * Remove range of pages and swap entries from radix tree, and free them. | ||
397 | */ | ||
398 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
549 | { | 399 | { |
400 | struct address_space *mapping = inode->i_mapping; | ||
550 | struct shmem_inode_info *info = SHMEM_I(inode); | 401 | struct shmem_inode_info *info = SHMEM_I(inode); |
551 | unsigned long idx; | 402 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
552 | unsigned long size; | 403 | unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
553 | unsigned long limit; | 404 | pgoff_t end = (lend >> PAGE_CACHE_SHIFT); |
554 | unsigned long stage; | 405 | struct pagevec pvec; |
555 | unsigned long diroff; | 406 | pgoff_t indices[PAGEVEC_SIZE]; |
556 | struct page **dir; | ||
557 | struct page *topdir; | ||
558 | struct page *middir; | ||
559 | struct page *subdir; | ||
560 | swp_entry_t *ptr; | ||
561 | LIST_HEAD(pages_to_free); | ||
562 | long nr_pages_to_free = 0; | ||
563 | long nr_swaps_freed = 0; | 407 | long nr_swaps_freed = 0; |
564 | int offset; | 408 | pgoff_t index; |
565 | int freed; | 409 | int i; |
566 | int punch_hole; | ||
567 | spinlock_t *needs_lock; | ||
568 | spinlock_t *punch_lock; | ||
569 | unsigned long upper_limit; | ||
570 | 410 | ||
571 | truncate_inode_pages_range(inode->i_mapping, start, end); | 411 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); |
572 | 412 | ||
573 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 413 | pagevec_init(&pvec, 0); |
574 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 414 | index = start; |
575 | if (idx >= info->next_index) | 415 | while (index <= end) { |
576 | return; | 416 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
417 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | ||
418 | pvec.pages, indices); | ||
419 | if (!pvec.nr) | ||
420 | break; | ||
421 | mem_cgroup_uncharge_start(); | ||
422 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
423 | struct page *page = pvec.pages[i]; | ||
577 | 424 | ||
578 | spin_lock(&info->lock); | 425 | index = indices[i]; |
579 | info->flags |= SHMEM_TRUNCATE; | 426 | if (index > end) |
580 | if (likely(end == (loff_t) -1)) { | 427 | break; |
581 | limit = info->next_index; | 428 | |
582 | upper_limit = SHMEM_MAX_INDEX; | 429 | if (radix_tree_exceptional_entry(page)) { |
583 | info->next_index = idx; | 430 | nr_swaps_freed += !shmem_free_swap(mapping, |
584 | needs_lock = NULL; | 431 | index, page); |
585 | punch_hole = 0; | 432 | continue; |
586 | } else { | 433 | } |
587 | if (end + 1 >= inode->i_size) { /* we may free a little more */ | ||
588 | limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> | ||
589 | PAGE_CACHE_SHIFT; | ||
590 | upper_limit = SHMEM_MAX_INDEX; | ||
591 | } else { | ||
592 | limit = (end + 1) >> PAGE_CACHE_SHIFT; | ||
593 | upper_limit = limit; | ||
594 | } | ||
595 | needs_lock = &info->lock; | ||
596 | punch_hole = 1; | ||
597 | } | ||
598 | 434 | ||
599 | topdir = info->i_indirect; | 435 | if (!trylock_page(page)) |
600 | if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { | 436 | continue; |
601 | info->i_indirect = NULL; | 437 | if (page->mapping == mapping) { |
602 | nr_pages_to_free++; | 438 | VM_BUG_ON(PageWriteback(page)); |
603 | list_add(&topdir->lru, &pages_to_free); | 439 | truncate_inode_page(mapping, page); |
440 | } | ||
441 | unlock_page(page); | ||
442 | } | ||
443 | shmem_pagevec_release(&pvec); | ||
444 | mem_cgroup_uncharge_end(); | ||
445 | cond_resched(); | ||
446 | index++; | ||
604 | } | 447 | } |
605 | spin_unlock(&info->lock); | ||
606 | 448 | ||
607 | if (info->swapped && idx < SHMEM_NR_DIRECT) { | 449 | if (partial) { |
608 | ptr = info->i_direct; | 450 | struct page *page = NULL; |
609 | size = limit; | 451 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); |
610 | if (size > SHMEM_NR_DIRECT) | 452 | if (page) { |
611 | size = SHMEM_NR_DIRECT; | 453 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); |
612 | nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); | 454 | set_page_dirty(page); |
455 | unlock_page(page); | ||
456 | page_cache_release(page); | ||
457 | } | ||
613 | } | 458 | } |
614 | 459 | ||
615 | /* | 460 | index = start; |
616 | * If there are no indirect blocks or we are punching a hole | 461 | for ( ; ; ) { |
617 | * below indirect blocks, nothing to be done. | 462 | cond_resched(); |
618 | */ | 463 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
619 | if (!topdir || limit <= SHMEM_NR_DIRECT) | 464 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, |
620 | goto done2; | 465 | pvec.pages, indices); |
466 | if (!pvec.nr) { | ||
467 | if (index == start) | ||
468 | break; | ||
469 | index = start; | ||
470 | continue; | ||
471 | } | ||
472 | if (index == start && indices[0] > end) { | ||
473 | shmem_pagevec_release(&pvec); | ||
474 | break; | ||
475 | } | ||
476 | mem_cgroup_uncharge_start(); | ||
477 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
478 | struct page *page = pvec.pages[i]; | ||
621 | 479 | ||
622 | /* | 480 | index = indices[i]; |
623 | * The truncation case has already dropped info->lock, and we're safe | 481 | if (index > end) |
624 | * because i_size and next_index have already been lowered, preventing | 482 | break; |
625 | * access beyond. But in the punch_hole case, we still need to take | ||
626 | * the lock when updating the swap directory, because there might be | ||
627 | * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or | ||
628 | * shmem_writepage. However, whenever we find we can remove a whole | ||
629 | * directory page (not at the misaligned start or end of the range), | ||
630 | * we first NULLify its pointer in the level above, and then have no | ||
631 | * need to take the lock when updating its contents: needs_lock and | ||
632 | * punch_lock (either pointing to info->lock or NULL) manage this. | ||
633 | */ | ||
634 | 483 | ||
635 | upper_limit -= SHMEM_NR_DIRECT; | 484 | if (radix_tree_exceptional_entry(page)) { |
636 | limit -= SHMEM_NR_DIRECT; | 485 | nr_swaps_freed += !shmem_free_swap(mapping, |
637 | idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; | 486 | index, page); |
638 | offset = idx % ENTRIES_PER_PAGE; | 487 | continue; |
639 | idx -= offset; | ||
640 | |||
641 | dir = shmem_dir_map(topdir); | ||
642 | stage = ENTRIES_PER_PAGEPAGE/2; | ||
643 | if (idx < ENTRIES_PER_PAGEPAGE/2) { | ||
644 | middir = topdir; | ||
645 | diroff = idx/ENTRIES_PER_PAGE; | ||
646 | } else { | ||
647 | dir += ENTRIES_PER_PAGE/2; | ||
648 | dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; | ||
649 | while (stage <= idx) | ||
650 | stage += ENTRIES_PER_PAGEPAGE; | ||
651 | middir = *dir; | ||
652 | if (*dir) { | ||
653 | diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % | ||
654 | ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; | ||
655 | if (!diroff && !offset && upper_limit >= stage) { | ||
656 | if (needs_lock) { | ||
657 | spin_lock(needs_lock); | ||
658 | *dir = NULL; | ||
659 | spin_unlock(needs_lock); | ||
660 | needs_lock = NULL; | ||
661 | } else | ||
662 | *dir = NULL; | ||
663 | nr_pages_to_free++; | ||
664 | list_add(&middir->lru, &pages_to_free); | ||
665 | } | 488 | } |
666 | shmem_dir_unmap(dir); | ||
667 | dir = shmem_dir_map(middir); | ||
668 | } else { | ||
669 | diroff = 0; | ||
670 | offset = 0; | ||
671 | idx = stage; | ||
672 | } | ||
673 | } | ||
674 | 489 | ||
675 | for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { | 490 | lock_page(page); |
676 | if (unlikely(idx == stage)) { | 491 | if (page->mapping == mapping) { |
677 | shmem_dir_unmap(dir); | 492 | VM_BUG_ON(PageWriteback(page)); |
678 | dir = shmem_dir_map(topdir) + | 493 | truncate_inode_page(mapping, page); |
679 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; | ||
680 | while (!*dir) { | ||
681 | dir++; | ||
682 | idx += ENTRIES_PER_PAGEPAGE; | ||
683 | if (idx >= limit) | ||
684 | goto done1; | ||
685 | } | 494 | } |
686 | stage = idx + ENTRIES_PER_PAGEPAGE; | 495 | unlock_page(page); |
687 | middir = *dir; | ||
688 | if (punch_hole) | ||
689 | needs_lock = &info->lock; | ||
690 | if (upper_limit >= stage) { | ||
691 | if (needs_lock) { | ||
692 | spin_lock(needs_lock); | ||
693 | *dir = NULL; | ||
694 | spin_unlock(needs_lock); | ||
695 | needs_lock = NULL; | ||
696 | } else | ||
697 | *dir = NULL; | ||
698 | nr_pages_to_free++; | ||
699 | list_add(&middir->lru, &pages_to_free); | ||
700 | } | ||
701 | shmem_dir_unmap(dir); | ||
702 | cond_resched(); | ||
703 | dir = shmem_dir_map(middir); | ||
704 | diroff = 0; | ||
705 | } | ||
706 | punch_lock = needs_lock; | ||
707 | subdir = dir[diroff]; | ||
708 | if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { | ||
709 | if (needs_lock) { | ||
710 | spin_lock(needs_lock); | ||
711 | dir[diroff] = NULL; | ||
712 | spin_unlock(needs_lock); | ||
713 | punch_lock = NULL; | ||
714 | } else | ||
715 | dir[diroff] = NULL; | ||
716 | nr_pages_to_free++; | ||
717 | list_add(&subdir->lru, &pages_to_free); | ||
718 | } | ||
719 | if (subdir && page_private(subdir) /* has swap entries */) { | ||
720 | size = limit - idx; | ||
721 | if (size > ENTRIES_PER_PAGE) | ||
722 | size = ENTRIES_PER_PAGE; | ||
723 | freed = shmem_map_and_free_swp(subdir, | ||
724 | offset, size, &dir, punch_lock); | ||
725 | if (!dir) | ||
726 | dir = shmem_dir_map(middir); | ||
727 | nr_swaps_freed += freed; | ||
728 | if (offset || punch_lock) { | ||
729 | spin_lock(&info->lock); | ||
730 | set_page_private(subdir, | ||
731 | page_private(subdir) - freed); | ||
732 | spin_unlock(&info->lock); | ||
733 | } else | ||
734 | BUG_ON(page_private(subdir) != freed); | ||
735 | } | 496 | } |
736 | offset = 0; | 497 | shmem_pagevec_release(&pvec); |
737 | } | 498 | mem_cgroup_uncharge_end(); |
738 | done1: | 499 | index++; |
739 | shmem_dir_unmap(dir); | ||
740 | done2: | ||
741 | if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { | ||
742 | /* | ||
743 | * Call truncate_inode_pages again: racing shmem_unuse_inode | ||
744 | * may have swizzled a page in from swap since | ||
745 | * truncate_pagecache or generic_delete_inode did it, before we | ||
746 | * lowered next_index. Also, though shmem_getpage checks | ||
747 | * i_size before adding to cache, no recheck after: so fix the | ||
748 | * narrow window there too. | ||
749 | */ | ||
750 | truncate_inode_pages_range(inode->i_mapping, start, end); | ||
751 | } | 500 | } |
752 | 501 | ||
753 | spin_lock(&info->lock); | 502 | spin_lock(&info->lock); |
754 | info->flags &= ~SHMEM_TRUNCATE; | ||
755 | info->swapped -= nr_swaps_freed; | 503 | info->swapped -= nr_swaps_freed; |
756 | if (nr_pages_to_free) | ||
757 | shmem_free_blocks(inode, nr_pages_to_free); | ||
758 | shmem_recalc_inode(inode); | 504 | shmem_recalc_inode(inode); |
759 | spin_unlock(&info->lock); | 505 | spin_unlock(&info->lock); |
760 | 506 | ||
761 | /* | 507 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
762 | * Empty swap vector directory pages to be freed? | ||
763 | */ | ||
764 | if (!list_empty(&pages_to_free)) { | ||
765 | pages_to_free.prev->next = NULL; | ||
766 | shmem_free_pages(pages_to_free.next); | ||
767 | } | ||
768 | } | 508 | } |
769 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 509 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
770 | 510 | ||
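The index arithmetic at the top of the new shmem_truncate_range() above maps the byte range [lstart, lend] onto page indices: the first page to drop whole, the partial offset to zero in the page just before it, and the last index covered by lend. A minimal userspace sketch of the same rounding, assuming 4096-byte pages (PAGE_SHIFT, lstart and lend here are illustrative stand-ins, not the kernel symbols):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12                      /* assume 4096-byte pages */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    int main(void)
    {
        /* Truncate bytes [lstart, lend] of a file, as the code above does. */
        uint64_t lstart = 5000, lend = 20479;   /* lend must end on a page boundary - 1 */

        uint64_t start   = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; /* first page removed whole */
        unsigned partial = lstart & (PAGE_SIZE - 1);               /* offset to zero in page start-1 */
        uint64_t end     = lend >> PAGE_SHIFT;                     /* last page index touched */

        printf("start=%llu partial=%u end=%llu\n",
               (unsigned long long)start, partial, (unsigned long long)end);
        /* Prints start=2 partial=904 end=4: pages 2..4 are dropped whole, and
         * bytes 904..4095 of page 1 are cleared, matching the
         * zero_user_segment(page, partial, PAGE_CACHE_SIZE) step above. */
        return 0;
    }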
@@ -780,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
780 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { | 520 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
781 | loff_t oldsize = inode->i_size; | 521 | loff_t oldsize = inode->i_size; |
782 | loff_t newsize = attr->ia_size; | 522 | loff_t newsize = attr->ia_size; |
783 | struct page *page = NULL; | ||
784 | 523 | ||
785 | if (newsize < oldsize) { | ||
786 | /* | ||
787 | * If truncating down to a partial page, then | ||
788 | * if that page is already allocated, hold it | ||
789 | * in memory until the truncation is over, so | ||
790 | * truncate_partial_page cannot miss it were | ||
791 | * it assigned to swap. | ||
792 | */ | ||
793 | if (newsize & (PAGE_CACHE_SIZE-1)) { | ||
794 | (void) shmem_getpage(inode, | ||
795 | newsize >> PAGE_CACHE_SHIFT, | ||
796 | &page, SGP_READ, NULL); | ||
797 | if (page) | ||
798 | unlock_page(page); | ||
799 | } | ||
800 | /* | ||
801 | * Reset SHMEM_PAGEIN flag so that shmem_truncate can | ||
802 | * detect if any pages might have been added to cache | ||
803 | * after truncate_inode_pages. But we needn't bother | ||
804 | * if it's being fully truncated to zero-length: the | ||
805 | * nrpages check is efficient enough in that case. | ||
806 | */ | ||
807 | if (newsize) { | ||
808 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
809 | spin_lock(&info->lock); | ||
810 | info->flags &= ~SHMEM_PAGEIN; | ||
811 | spin_unlock(&info->lock); | ||
812 | } | ||
813 | } | ||
814 | if (newsize != oldsize) { | 524 | if (newsize != oldsize) { |
815 | i_size_write(inode, newsize); | 525 | i_size_write(inode, newsize); |
816 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 526 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
@@ -822,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
822 | /* unmap again to remove racily COWed private pages */ | 532 | /* unmap again to remove racily COWed private pages */ |
823 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | 533 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); |
824 | } | 534 | } |
825 | if (page) | ||
826 | page_cache_release(page); | ||
827 | } | 535 | } |
828 | 536 | ||
829 | setattr_copy(inode, attr); | 537 | setattr_copy(inode, attr); |
@@ -848,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode) | |||
848 | list_del_init(&info->swaplist); | 556 | list_del_init(&info->swaplist); |
849 | mutex_unlock(&shmem_swaplist_mutex); | 557 | mutex_unlock(&shmem_swaplist_mutex); |
850 | } | 558 | } |
851 | } | 559 | } else |
560 | kfree(info->symlink); | ||
852 | 561 | ||
853 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { | 562 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { |
854 | kfree(xattr->name); | 563 | kfree(xattr->name); |
@@ -859,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode) | |||
859 | end_writeback(inode); | 568 | end_writeback(inode); |
860 | } | 569 | } |
861 | 570 | ||
862 | static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) | 571 | /* |
863 | { | 572 | * If swap found in inode, free it and move page from swapcache to filecache. |
864 | swp_entry_t *ptr; | 573 | */ |
865 | 574 | static int shmem_unuse_inode(struct shmem_inode_info *info, | |
866 | for (ptr = dir; ptr < edir; ptr++) { | 575 | swp_entry_t swap, struct page *page) |
867 | if (ptr->val == entry.val) | ||
868 | return ptr - dir; | ||
869 | } | ||
870 | return -1; | ||
871 | } | ||
872 | |||
873 | static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) | ||
874 | { | 576 | { |
875 | struct address_space *mapping; | 577 | struct address_space *mapping = info->vfs_inode.i_mapping; |
876 | unsigned long idx; | 578 | void *radswap; |
877 | unsigned long size; | 579 | pgoff_t index; |
878 | unsigned long limit; | ||
879 | unsigned long stage; | ||
880 | struct page **dir; | ||
881 | struct page *subdir; | ||
882 | swp_entry_t *ptr; | ||
883 | int offset; | ||
884 | int error; | 580 | int error; |
885 | 581 | ||
886 | idx = 0; | 582 | radswap = swp_to_radix_entry(swap); |
887 | ptr = info->i_direct; | 583 | index = radix_tree_locate_item(&mapping->page_tree, radswap); |
888 | spin_lock(&info->lock); | 584 | if (index == -1) |
889 | if (!info->swapped) { | 585 | return 0; |
890 | list_del_init(&info->swaplist); | ||
891 | goto lost2; | ||
892 | } | ||
893 | limit = info->next_index; | ||
894 | size = limit; | ||
895 | if (size > SHMEM_NR_DIRECT) | ||
896 | size = SHMEM_NR_DIRECT; | ||
897 | offset = shmem_find_swp(entry, ptr, ptr+size); | ||
898 | if (offset >= 0) { | ||
899 | shmem_swp_balance_unmap(); | ||
900 | goto found; | ||
901 | } | ||
902 | if (!info->i_indirect) | ||
903 | goto lost2; | ||
904 | |||
905 | dir = shmem_dir_map(info->i_indirect); | ||
906 | stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; | ||
907 | |||
908 | for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { | ||
909 | if (unlikely(idx == stage)) { | ||
910 | shmem_dir_unmap(dir-1); | ||
911 | if (cond_resched_lock(&info->lock)) { | ||
912 | /* check it has not been truncated */ | ||
913 | if (limit > info->next_index) { | ||
914 | limit = info->next_index; | ||
915 | if (idx >= limit) | ||
916 | goto lost2; | ||
917 | } | ||
918 | } | ||
919 | dir = shmem_dir_map(info->i_indirect) + | ||
920 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; | ||
921 | while (!*dir) { | ||
922 | dir++; | ||
923 | idx += ENTRIES_PER_PAGEPAGE; | ||
924 | if (idx >= limit) | ||
925 | goto lost1; | ||
926 | } | ||
927 | stage = idx + ENTRIES_PER_PAGEPAGE; | ||
928 | subdir = *dir; | ||
929 | shmem_dir_unmap(dir); | ||
930 | dir = shmem_dir_map(subdir); | ||
931 | } | ||
932 | subdir = *dir; | ||
933 | if (subdir && page_private(subdir)) { | ||
934 | ptr = shmem_swp_map(subdir); | ||
935 | size = limit - idx; | ||
936 | if (size > ENTRIES_PER_PAGE) | ||
937 | size = ENTRIES_PER_PAGE; | ||
938 | offset = shmem_find_swp(entry, ptr, ptr+size); | ||
939 | shmem_swp_unmap(ptr); | ||
940 | if (offset >= 0) { | ||
941 | shmem_dir_unmap(dir); | ||
942 | ptr = shmem_swp_map(subdir); | ||
943 | goto found; | ||
944 | } | ||
945 | } | ||
946 | } | ||
947 | lost1: | ||
948 | shmem_dir_unmap(dir-1); | ||
949 | lost2: | ||
950 | spin_unlock(&info->lock); | ||
951 | return 0; | ||
952 | found: | ||
953 | idx += offset; | ||
954 | ptr += offset; | ||
955 | 586 | ||
956 | /* | 587 | /* |
957 | * Move _head_ to start search for next from here. | 588 | * Move _head_ to start search for next from here. |
958 | * But be careful: shmem_evict_inode checks list_empty without taking | 589 | * But be careful: shmem_evict_inode checks list_empty without taking |
959 | * mutex, and there's an instant in list_move_tail when info->swaplist | 590 | * mutex, and there's an instant in list_move_tail when info->swaplist |
960 | * would appear empty, if it were the only one on shmem_swaplist. We | 591 | * would appear empty, if it were the only one on shmem_swaplist. |
961 | * could avoid doing it if inode NULL; or use this minor optimization. | ||
962 | */ | 592 | */ |
963 | if (shmem_swaplist.next != &info->swaplist) | 593 | if (shmem_swaplist.next != &info->swaplist) |
964 | list_move_tail(&shmem_swaplist, &info->swaplist); | 594 | list_move_tail(&shmem_swaplist, &info->swaplist); |
@@ -968,29 +598,34 @@ found: | |||
968 | * but also to hold up shmem_evict_inode(): so inode cannot be freed | 598 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
969 | * beneath us (pagelock doesn't help until the page is in pagecache). | 599 | * beneath us (pagelock doesn't help until the page is in pagecache). |
970 | */ | 600 | */ |
971 | mapping = info->vfs_inode.i_mapping; | 601 | error = shmem_add_to_page_cache(page, mapping, index, |
972 | error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); | 602 | GFP_NOWAIT, radswap); |
973 | /* which does mem_cgroup_uncharge_cache_page on error */ | 603 | /* which does mem_cgroup_uncharge_cache_page on error */ |
974 | 604 | ||
975 | if (error != -ENOMEM) { | 605 | if (error != -ENOMEM) { |
606 | /* | ||
607 | * Truncation and eviction use free_swap_and_cache(), which | ||
608 | * only does trylock page: if we raced, best clean up here. | ||
609 | */ | ||
976 | delete_from_swap_cache(page); | 610 | delete_from_swap_cache(page); |
977 | set_page_dirty(page); | 611 | set_page_dirty(page); |
978 | info->flags |= SHMEM_PAGEIN; | 612 | if (!error) { |
979 | shmem_swp_set(info, ptr, 0); | 613 | spin_lock(&info->lock); |
980 | swap_free(entry); | 614 | info->swapped--; |
615 | spin_unlock(&info->lock); | ||
616 | swap_free(swap); | ||
617 | } | ||
981 | error = 1; /* not an error, but entry was found */ | 618 | error = 1; /* not an error, but entry was found */ |
982 | } | 619 | } |
983 | shmem_swp_unmap(ptr); | ||
984 | spin_unlock(&info->lock); | ||
985 | return error; | 620 | return error; |
986 | } | 621 | } |
987 | 622 | ||
988 | /* | 623 | /* |
989 | * shmem_unuse() search for an eventually swapped out shmem page. | 624 | * Search through swapped inodes to find and replace swap by page. |
990 | */ | 625 | */ |
991 | int shmem_unuse(swp_entry_t entry, struct page *page) | 626 | int shmem_unuse(swp_entry_t swap, struct page *page) |
992 | { | 627 | { |
993 | struct list_head *p, *next; | 628 | struct list_head *this, *next; |
994 | struct shmem_inode_info *info; | 629 | struct shmem_inode_info *info; |
995 | int found = 0; | 630 | int found = 0; |
996 | int error; | 631 | int error; |
@@ -999,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
999 | * Charge page using GFP_KERNEL while we can wait, before taking | 634 | * Charge page using GFP_KERNEL while we can wait, before taking |
1000 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | 635 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). |
1001 | * Charged back to the user (not to caller) when swap account is used. | 636 | * Charged back to the user (not to caller) when swap account is used. |
1002 | * add_to_page_cache() will be called with GFP_NOWAIT. | ||
1003 | */ | 637 | */ |
1004 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 638 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); |
1005 | if (error) | 639 | if (error) |
1006 | goto out; | 640 | goto out; |
1007 | /* | 641 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ |
1008 | * Try to preload while we can wait, to not make a habit of | ||
1009 | * draining atomic reserves; but don't latch on to this cpu, | ||
1010 | * it's okay if sometimes we get rescheduled after this. | ||
1011 | */ | ||
1012 | error = radix_tree_preload(GFP_KERNEL); | ||
1013 | if (error) | ||
1014 | goto uncharge; | ||
1015 | radix_tree_preload_end(); | ||
1016 | 642 | ||
1017 | mutex_lock(&shmem_swaplist_mutex); | 643 | mutex_lock(&shmem_swaplist_mutex); |
1018 | list_for_each_safe(p, next, &shmem_swaplist) { | 644 | list_for_each_safe(this, next, &shmem_swaplist) { |
1019 | info = list_entry(p, struct shmem_inode_info, swaplist); | 645 | info = list_entry(this, struct shmem_inode_info, swaplist); |
1020 | found = shmem_unuse_inode(info, entry, page); | 646 | if (info->swapped) |
647 | found = shmem_unuse_inode(info, swap, page); | ||
648 | else | ||
649 | list_del_init(&info->swaplist); | ||
1021 | cond_resched(); | 650 | cond_resched(); |
1022 | if (found) | 651 | if (found) |
1023 | break; | 652 | break; |
1024 | } | 653 | } |
1025 | mutex_unlock(&shmem_swaplist_mutex); | 654 | mutex_unlock(&shmem_swaplist_mutex); |
1026 | 655 | ||
1027 | uncharge: | ||
1028 | if (!found) | 656 | if (!found) |
1029 | mem_cgroup_uncharge_cache_page(page); | 657 | mem_cgroup_uncharge_cache_page(page); |
1030 | if (found < 0) | 658 | if (found < 0) |
@@ -1041,10 +669,10 @@ out: | |||
1041 | static int shmem_writepage(struct page *page, struct writeback_control *wbc) | 669 | static int shmem_writepage(struct page *page, struct writeback_control *wbc) |
1042 | { | 670 | { |
1043 | struct shmem_inode_info *info; | 671 | struct shmem_inode_info *info; |
1044 | swp_entry_t *entry, swap; | ||
1045 | struct address_space *mapping; | 672 | struct address_space *mapping; |
1046 | unsigned long index; | ||
1047 | struct inode *inode; | 673 | struct inode *inode; |
674 | swp_entry_t swap; | ||
675 | pgoff_t index; | ||
1048 | 676 | ||
1049 | BUG_ON(!PageLocked(page)); | 677 | BUG_ON(!PageLocked(page)); |
1050 | mapping = page->mapping; | 678 | mapping = page->mapping; |
@@ -1073,50 +701,32 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1073 | 701 | ||
1074 | /* | 702 | /* |
1075 | * Add inode to shmem_unuse()'s list of swapped-out inodes, | 703 | * Add inode to shmem_unuse()'s list of swapped-out inodes, |
1076 | * if it's not already there. Do it now because we cannot take | 704 | * if it's not already there. Do it now before the page is |
1077 | * mutex while holding spinlock, and must do so before the page | 705 | * moved to swap cache, when its pagelock no longer protects |
1078 | * is moved to swap cache, when its pagelock no longer protects | ||
1079 | * the inode from eviction. But don't unlock the mutex until | 706 | * the inode from eviction. But don't unlock the mutex until |
1080 | * we've taken the spinlock, because shmem_unuse_inode() will | 707 | * we've incremented swapped, because shmem_unuse_inode() will |
1081 | * prune a !swapped inode from the swaplist under both locks. | 708 | * prune a !swapped inode from the swaplist under this mutex. |
1082 | */ | 709 | */ |
1083 | mutex_lock(&shmem_swaplist_mutex); | 710 | mutex_lock(&shmem_swaplist_mutex); |
1084 | if (list_empty(&info->swaplist)) | 711 | if (list_empty(&info->swaplist)) |
1085 | list_add_tail(&info->swaplist, &shmem_swaplist); | 712 | list_add_tail(&info->swaplist, &shmem_swaplist); |
1086 | 713 | ||
1087 | spin_lock(&info->lock); | ||
1088 | mutex_unlock(&shmem_swaplist_mutex); | ||
1089 | |||
1090 | if (index >= info->next_index) { | ||
1091 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); | ||
1092 | goto unlock; | ||
1093 | } | ||
1094 | entry = shmem_swp_entry(info, index, NULL); | ||
1095 | if (entry->val) { | ||
1096 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ | ||
1097 | free_swap_and_cache(*entry); | ||
1098 | shmem_swp_set(info, entry, 0); | ||
1099 | } | ||
1100 | shmem_recalc_inode(inode); | ||
1101 | |||
1102 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { | 714 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
1103 | delete_from_page_cache(page); | ||
1104 | shmem_swp_set(info, entry, swap.val); | ||
1105 | shmem_swp_unmap(entry); | ||
1106 | swap_shmem_alloc(swap); | 715 | swap_shmem_alloc(swap); |
716 | shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); | ||
717 | |||
718 | spin_lock(&info->lock); | ||
719 | info->swapped++; | ||
720 | shmem_recalc_inode(inode); | ||
1107 | spin_unlock(&info->lock); | 721 | spin_unlock(&info->lock); |
722 | |||
723 | mutex_unlock(&shmem_swaplist_mutex); | ||
1108 | BUG_ON(page_mapped(page)); | 724 | BUG_ON(page_mapped(page)); |
1109 | swap_writepage(page, wbc); | 725 | swap_writepage(page, wbc); |
1110 | return 0; | 726 | return 0; |
1111 | } | 727 | } |
1112 | 728 | ||
1113 | shmem_swp_unmap(entry); | 729 | mutex_unlock(&shmem_swaplist_mutex); |
1114 | unlock: | ||
1115 | spin_unlock(&info->lock); | ||
1116 | /* | ||
1117 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | ||
1118 | * clear SWAP_HAS_CACHE flag. | ||
1119 | */ | ||
1120 | swapcache_free(swap, NULL); | 730 | swapcache_free(swap, NULL); |
1121 | redirty: | 731 | redirty: |
1122 | set_page_dirty(page); | 732 | set_page_dirty(page); |
@@ -1153,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1153 | } | 763 | } |
1154 | #endif /* CONFIG_TMPFS */ | 764 | #endif /* CONFIG_TMPFS */ |
1155 | 765 | ||
1156 | static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 766 | static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
1157 | struct shmem_inode_info *info, unsigned long idx) | 767 | struct shmem_inode_info *info, pgoff_t index) |
1158 | { | 768 | { |
1159 | struct mempolicy mpol, *spol; | 769 | struct mempolicy mpol, *spol; |
1160 | struct vm_area_struct pvma; | 770 | struct vm_area_struct pvma; |
1161 | struct page *page; | ||
1162 | 771 | ||
1163 | spol = mpol_cond_copy(&mpol, | 772 | spol = mpol_cond_copy(&mpol, |
1164 | mpol_shared_policy_lookup(&info->policy, idx)); | 773 | mpol_shared_policy_lookup(&info->policy, index)); |
1165 | 774 | ||
1166 | /* Create a pseudo vma that just contains the policy */ | 775 | /* Create a pseudo vma that just contains the policy */ |
1167 | pvma.vm_start = 0; | 776 | pvma.vm_start = 0; |
1168 | pvma.vm_pgoff = idx; | 777 | pvma.vm_pgoff = index; |
1169 | pvma.vm_ops = NULL; | 778 | pvma.vm_ops = NULL; |
1170 | pvma.vm_policy = spol; | 779 | pvma.vm_policy = spol; |
1171 | page = swapin_readahead(entry, gfp, &pvma, 0); | 780 | return swapin_readahead(swap, gfp, &pvma, 0); |
1172 | return page; | ||
1173 | } | 781 | } |
1174 | 782 | ||
1175 | static struct page *shmem_alloc_page(gfp_t gfp, | 783 | static struct page *shmem_alloc_page(gfp_t gfp, |
1176 | struct shmem_inode_info *info, unsigned long idx) | 784 | struct shmem_inode_info *info, pgoff_t index) |
1177 | { | 785 | { |
1178 | struct vm_area_struct pvma; | 786 | struct vm_area_struct pvma; |
1179 | 787 | ||
1180 | /* Create a pseudo vma that just contains the policy */ | 788 | /* Create a pseudo vma that just contains the policy */ |
1181 | pvma.vm_start = 0; | 789 | pvma.vm_start = 0; |
1182 | pvma.vm_pgoff = idx; | 790 | pvma.vm_pgoff = index; |
1183 | pvma.vm_ops = NULL; | 791 | pvma.vm_ops = NULL; |
1184 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | 792 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
1185 | 793 | ||
1186 | /* | 794 | /* |
1187 | * alloc_page_vma() will drop the shared policy reference | 795 | * alloc_page_vma() will drop the shared policy reference |
@@ -1190,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
1190 | } | 798 | } |
1191 | #else /* !CONFIG_NUMA */ | 799 | #else /* !CONFIG_NUMA */ |
1192 | #ifdef CONFIG_TMPFS | 800 | #ifdef CONFIG_TMPFS |
1193 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) | 801 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) |
1194 | { | 802 | { |
1195 | } | 803 | } |
1196 | #endif /* CONFIG_TMPFS */ | 804 | #endif /* CONFIG_TMPFS */ |
1197 | 805 | ||
1198 | static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 806 | static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
1199 | struct shmem_inode_info *info, unsigned long idx) | 807 | struct shmem_inode_info *info, pgoff_t index) |
1200 | { | 808 | { |
1201 | return swapin_readahead(entry, gfp, NULL, 0); | 809 | return swapin_readahead(swap, gfp, NULL, 0); |
1202 | } | 810 | } |
1203 | 811 | ||
1204 | static inline struct page *shmem_alloc_page(gfp_t gfp, | 812 | static inline struct page *shmem_alloc_page(gfp_t gfp, |
1205 | struct shmem_inode_info *info, unsigned long idx) | 813 | struct shmem_inode_info *info, pgoff_t index) |
1206 | { | 814 | { |
1207 | return alloc_page(gfp); | 815 | return alloc_page(gfp); |
1208 | } | 816 | } |
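The hunks above drop the old i_direct/i_indirect swap vectors and instead keep swapped-out entries directly in the mapping's radix tree as exceptional entries, via swp_to_radix_entry(), radix_to_swp_entry() and radix_tree_exceptional_entry(). The real encoding lives in the kernel's radix-tree and swapops headers; what follows is only a simplified userspace model of the tagged-pointer idea, with made-up helper names:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: a slot holds either a real pointer (aligned, so its
     * low bits are clear) or a small cookie shifted up with a tag bit set,
     * so the two cases can be told apart without extra storage. */
    #define EXCEPTIONAL_TAG 0x2UL

    static void *cookie_to_entry(unsigned long cookie)
    {
        return (void *)((cookie << 2) | EXCEPTIONAL_TAG);
    }

    static int entry_is_exceptional(const void *entry)
    {
        return ((uintptr_t)entry & EXCEPTIONAL_TAG) != 0;
    }

    static unsigned long entry_to_cookie(const void *entry)
    {
        assert(entry_is_exceptional(entry));
        return (uintptr_t)entry >> 2;
    }

    int main(void)
    {
        long page = 42;                 /* stands in for a struct page */
        void *slots[2] = { &page, cookie_to_entry(0x1234) };

        for (int i = 0; i < 2; i++) {
            if (entry_is_exceptional(slots[i]))
                printf("slot %d: swap cookie %#lx\n", i, entry_to_cookie(slots[i]));
            else
                printf("slot %d: page pointer %p\n", i, slots[i]);
        }
        return 0;
    }

The point is only that one slot can answer both "which page is cached here" and "which swap entry holds it" without a second lookup structure, which is what lets shmem_truncate_range() and shmem_unuse_inode() above walk a single tree.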
@@ -1222,243 +830,190 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1222 | * vm. If we swap it in we mark it dirty since we also free the swap | 830 | * vm. If we swap it in we mark it dirty since we also free the swap |
1223 | * entry since a page cannot live in both the swap and page cache | 831 | * entry since a page cannot live in both the swap and page cache |
1224 | */ | 832 | */ |
1225 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, | 833 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
1226 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) | 834 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) |
1227 | { | 835 | { |
1228 | struct address_space *mapping = inode->i_mapping; | 836 | struct address_space *mapping = inode->i_mapping; |
1229 | struct shmem_inode_info *info = SHMEM_I(inode); | 837 | struct shmem_inode_info *info; |
1230 | struct shmem_sb_info *sbinfo; | 838 | struct shmem_sb_info *sbinfo; |
1231 | struct page *page; | 839 | struct page *page; |
1232 | struct page *prealloc_page = NULL; | ||
1233 | swp_entry_t *entry; | ||
1234 | swp_entry_t swap; | 840 | swp_entry_t swap; |
1235 | int error; | 841 | int error; |
1236 | int ret; | 842 | int once = 0; |
1237 | 843 | ||
1238 | if (idx >= SHMEM_MAX_INDEX) | 844 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) |
1239 | return -EFBIG; | 845 | return -EFBIG; |
1240 | repeat: | 846 | repeat: |
1241 | page = find_lock_page(mapping, idx); | 847 | swap.val = 0; |
1242 | if (page) { | 848 | page = find_lock_page(mapping, index); |
849 | if (radix_tree_exceptional_entry(page)) { | ||
850 | swap = radix_to_swp_entry(page); | ||
851 | page = NULL; | ||
852 | } | ||
853 | |||
854 | if (sgp != SGP_WRITE && | ||
855 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | ||
856 | error = -EINVAL; | ||
857 | goto failed; | ||
858 | } | ||
859 | |||
860 | if (page || (sgp == SGP_READ && !swap.val)) { | ||
1243 | /* | 861 | /* |
1244 | * Once we can get the page lock, it must be uptodate: | 862 | * Once we can get the page lock, it must be uptodate: |
1245 | * if there were an error in reading back from swap, | 863 | * if there were an error in reading back from swap, |
1246 | * the page would not be inserted into the filecache. | 864 | * the page would not be inserted into the filecache. |
1247 | */ | 865 | */ |
1248 | BUG_ON(!PageUptodate(page)); | 866 | BUG_ON(page && !PageUptodate(page)); |
1249 | goto done; | 867 | *pagep = page; |
868 | return 0; | ||
1250 | } | 869 | } |
1251 | 870 | ||
1252 | /* | 871 | /* |
1253 | * Try to preload while we can wait, to not make a habit of | 872 | * Fast cache lookup did not find it: |
1254 | * draining atomic reserves; but don't latch on to this cpu. | 873 | * bring it back from swap or allocate. |
1255 | */ | 874 | */ |
1256 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | 875 | info = SHMEM_I(inode); |
1257 | if (error) | 876 | sbinfo = SHMEM_SB(inode->i_sb); |
1258 | goto out; | ||
1259 | radix_tree_preload_end(); | ||
1260 | |||
1261 | if (sgp != SGP_READ && !prealloc_page) { | ||
1262 | prealloc_page = shmem_alloc_page(gfp, info, idx); | ||
1263 | if (prealloc_page) { | ||
1264 | SetPageSwapBacked(prealloc_page); | ||
1265 | if (mem_cgroup_cache_charge(prealloc_page, | ||
1266 | current->mm, GFP_KERNEL)) { | ||
1267 | page_cache_release(prealloc_page); | ||
1268 | prealloc_page = NULL; | ||
1269 | } | ||
1270 | } | ||
1271 | } | ||
1272 | |||
1273 | spin_lock(&info->lock); | ||
1274 | shmem_recalc_inode(inode); | ||
1275 | entry = shmem_swp_alloc(info, idx, sgp, gfp); | ||
1276 | if (IS_ERR(entry)) { | ||
1277 | spin_unlock(&info->lock); | ||
1278 | error = PTR_ERR(entry); | ||
1279 | goto out; | ||
1280 | } | ||
1281 | swap = *entry; | ||
1282 | 877 | ||
1283 | if (swap.val) { | 878 | if (swap.val) { |
1284 | /* Look it up and read it in.. */ | 879 | /* Look it up and read it in.. */ |
1285 | page = lookup_swap_cache(swap); | 880 | page = lookup_swap_cache(swap); |
1286 | if (!page) { | 881 | if (!page) { |
1287 | shmem_swp_unmap(entry); | ||
1288 | spin_unlock(&info->lock); | ||
1289 | /* here we actually do the io */ | 882 | /* here we actually do the io */ |
1290 | if (fault_type) | 883 | if (fault_type) |
1291 | *fault_type |= VM_FAULT_MAJOR; | 884 | *fault_type |= VM_FAULT_MAJOR; |
1292 | page = shmem_swapin(swap, gfp, info, idx); | 885 | page = shmem_swapin(swap, gfp, info, index); |
1293 | if (!page) { | 886 | if (!page) { |
1294 | spin_lock(&info->lock); | 887 | error = -ENOMEM; |
1295 | entry = shmem_swp_alloc(info, idx, sgp, gfp); | 888 | goto failed; |
1296 | if (IS_ERR(entry)) | ||
1297 | error = PTR_ERR(entry); | ||
1298 | else { | ||
1299 | if (entry->val == swap.val) | ||
1300 | error = -ENOMEM; | ||
1301 | shmem_swp_unmap(entry); | ||
1302 | } | ||
1303 | spin_unlock(&info->lock); | ||
1304 | if (error) | ||
1305 | goto out; | ||
1306 | goto repeat; | ||
1307 | } | 889 | } |
1308 | wait_on_page_locked(page); | ||
1309 | page_cache_release(page); | ||
1310 | goto repeat; | ||
1311 | } | 890 | } |
1312 | 891 | ||
1313 | /* We have to do this with page locked to prevent races */ | 892 | /* We have to do this with page locked to prevent races */ |
1314 | if (!trylock_page(page)) { | 893 | lock_page(page); |
1315 | shmem_swp_unmap(entry); | ||
1316 | spin_unlock(&info->lock); | ||
1317 | wait_on_page_locked(page); | ||
1318 | page_cache_release(page); | ||
1319 | goto repeat; | ||
1320 | } | ||
1321 | if (PageWriteback(page)) { | ||
1322 | shmem_swp_unmap(entry); | ||
1323 | spin_unlock(&info->lock); | ||
1324 | wait_on_page_writeback(page); | ||
1325 | unlock_page(page); | ||
1326 | page_cache_release(page); | ||
1327 | goto repeat; | ||
1328 | } | ||
1329 | if (!PageUptodate(page)) { | 894 | if (!PageUptodate(page)) { |
1330 | shmem_swp_unmap(entry); | ||
1331 | spin_unlock(&info->lock); | ||
1332 | unlock_page(page); | ||
1333 | page_cache_release(page); | ||
1334 | error = -EIO; | 895 | error = -EIO; |
1335 | goto out; | 896 | goto failed; |
1336 | } | 897 | } |
1337 | 898 | wait_on_page_writeback(page); | |
1338 | error = add_to_page_cache_locked(page, mapping, | 899 | |
1339 | idx, GFP_NOWAIT); | 900 | /* Someone may have already done it for us */ |
1340 | if (error) { | 901 | if (page->mapping) { |
1341 | shmem_swp_unmap(entry); | 902 | if (page->mapping == mapping && |
1342 | spin_unlock(&info->lock); | 903 | page->index == index) |
1343 | if (error == -ENOMEM) { | 904 | goto done; |
1344 | /* | 905 | error = -EEXIST; |
1345 | * reclaim from proper memory cgroup and | 906 | goto failed; |
1346 | * call memcg's OOM if needed. | ||
1347 | */ | ||
1348 | error = mem_cgroup_shmem_charge_fallback( | ||
1349 | page, current->mm, gfp); | ||
1350 | if (error) { | ||
1351 | unlock_page(page); | ||
1352 | page_cache_release(page); | ||
1353 | goto out; | ||
1354 | } | ||
1355 | } | ||
1356 | unlock_page(page); | ||
1357 | page_cache_release(page); | ||
1358 | goto repeat; | ||
1359 | } | 907 | } |
1360 | 908 | ||
1361 | info->flags |= SHMEM_PAGEIN; | 909 | error = mem_cgroup_cache_charge(page, current->mm, |
1362 | shmem_swp_set(info, entry, 0); | 910 | gfp & GFP_RECLAIM_MASK); |
1363 | shmem_swp_unmap(entry); | 911 | if (!error) |
1364 | delete_from_swap_cache(page); | 912 | error = shmem_add_to_page_cache(page, mapping, index, |
913 | gfp, swp_to_radix_entry(swap)); | ||
914 | if (error) | ||
915 | goto failed; | ||
916 | |||
917 | spin_lock(&info->lock); | ||
918 | info->swapped--; | ||
919 | shmem_recalc_inode(inode); | ||
1365 | spin_unlock(&info->lock); | 920 | spin_unlock(&info->lock); |
921 | |||
922 | delete_from_swap_cache(page); | ||
1366 | set_page_dirty(page); | 923 | set_page_dirty(page); |
1367 | swap_free(swap); | 924 | swap_free(swap); |
1368 | 925 | ||
1369 | } else if (sgp == SGP_READ) { | 926 | } else { |
1370 | shmem_swp_unmap(entry); | 927 | if (shmem_acct_block(info->flags)) { |
1371 | page = find_get_page(mapping, idx); | 928 | error = -ENOSPC; |
1372 | if (page && !trylock_page(page)) { | 929 | goto failed; |
1373 | spin_unlock(&info->lock); | ||
1374 | wait_on_page_locked(page); | ||
1375 | page_cache_release(page); | ||
1376 | goto repeat; | ||
1377 | } | 930 | } |
1378 | spin_unlock(&info->lock); | ||
1379 | |||
1380 | } else if (prealloc_page) { | ||
1381 | shmem_swp_unmap(entry); | ||
1382 | sbinfo = SHMEM_SB(inode->i_sb); | ||
1383 | if (sbinfo->max_blocks) { | 931 | if (sbinfo->max_blocks) { |
1384 | if (percpu_counter_compare(&sbinfo->used_blocks, | 932 | if (percpu_counter_compare(&sbinfo->used_blocks, |
1385 | sbinfo->max_blocks) >= 0 || | 933 | sbinfo->max_blocks) >= 0) { |
1386 | shmem_acct_block(info->flags)) | 934 | error = -ENOSPC; |
1387 | goto nospace; | 935 | goto unacct; |
936 | } | ||
1388 | percpu_counter_inc(&sbinfo->used_blocks); | 937 | percpu_counter_inc(&sbinfo->used_blocks); |
1389 | inode->i_blocks += BLOCKS_PER_PAGE; | ||
1390 | } else if (shmem_acct_block(info->flags)) | ||
1391 | goto nospace; | ||
1392 | |||
1393 | page = prealloc_page; | ||
1394 | prealloc_page = NULL; | ||
1395 | |||
1396 | entry = shmem_swp_alloc(info, idx, sgp, gfp); | ||
1397 | if (IS_ERR(entry)) | ||
1398 | error = PTR_ERR(entry); | ||
1399 | else { | ||
1400 | swap = *entry; | ||
1401 | shmem_swp_unmap(entry); | ||
1402 | } | 938 | } |
1403 | ret = error || swap.val; | 939 | |
1404 | if (ret) | 940 | page = shmem_alloc_page(gfp, info, index); |
1405 | mem_cgroup_uncharge_cache_page(page); | 941 | if (!page) { |
1406 | else | 942 | error = -ENOMEM; |
1407 | ret = add_to_page_cache_lru(page, mapping, | 943 | goto decused; |
1408 | idx, GFP_NOWAIT); | ||
1409 | /* | ||
1410 | * At add_to_page_cache_lru() failure, | ||
1411 | * uncharge will be done automatically. | ||
1412 | */ | ||
1413 | if (ret) { | ||
1414 | shmem_unacct_blocks(info->flags, 1); | ||
1415 | shmem_free_blocks(inode, 1); | ||
1416 | spin_unlock(&info->lock); | ||
1417 | page_cache_release(page); | ||
1418 | if (error) | ||
1419 | goto out; | ||
1420 | goto repeat; | ||
1421 | } | 944 | } |
1422 | 945 | ||
1423 | info->flags |= SHMEM_PAGEIN; | 946 | SetPageSwapBacked(page); |
947 | __set_page_locked(page); | ||
948 | error = mem_cgroup_cache_charge(page, current->mm, | ||
949 | gfp & GFP_RECLAIM_MASK); | ||
950 | if (!error) | ||
951 | error = shmem_add_to_page_cache(page, mapping, index, | ||
952 | gfp, NULL); | ||
953 | if (error) | ||
954 | goto decused; | ||
955 | lru_cache_add_anon(page); | ||
956 | |||
957 | spin_lock(&info->lock); | ||
1424 | info->alloced++; | 958 | info->alloced++; |
959 | inode->i_blocks += BLOCKS_PER_PAGE; | ||
960 | shmem_recalc_inode(inode); | ||
1425 | spin_unlock(&info->lock); | 961 | spin_unlock(&info->lock); |
962 | |||
1426 | clear_highpage(page); | 963 | clear_highpage(page); |
1427 | flush_dcache_page(page); | 964 | flush_dcache_page(page); |
1428 | SetPageUptodate(page); | 965 | SetPageUptodate(page); |
1429 | if (sgp == SGP_DIRTY) | 966 | if (sgp == SGP_DIRTY) |
1430 | set_page_dirty(page); | 967 | set_page_dirty(page); |
1431 | |||
1432 | } else { | ||
1433 | spin_unlock(&info->lock); | ||
1434 | error = -ENOMEM; | ||
1435 | goto out; | ||
1436 | } | 968 | } |
1437 | done: | 969 | done: |
1438 | *pagep = page; | 970 | /* Perhaps the file has been truncated since we checked */ |
1439 | error = 0; | 971 | if (sgp != SGP_WRITE && |
1440 | out: | 972 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
1441 | if (prealloc_page) { | 973 | error = -EINVAL; |
1442 | mem_cgroup_uncharge_cache_page(prealloc_page); | 974 | goto trunc; |
1443 | page_cache_release(prealloc_page); | ||
1444 | } | 975 | } |
1445 | return error; | 976 | *pagep = page; |
977 | return 0; | ||
1446 | 978 | ||
1447 | nospace: | ||
1448 | /* | 979 | /* |
1449 | * Perhaps the page was brought in from swap between find_lock_page | 980 | * Error recovery. |
1450 | * and taking info->lock? We allow for that at add_to_page_cache_lru, | ||
1451 | * but must also avoid reporting a spurious ENOSPC while working on a | ||
1452 | * full tmpfs. | ||
1453 | */ | 981 | */ |
1454 | page = find_get_page(mapping, idx); | 982 | trunc: |
983 | ClearPageDirty(page); | ||
984 | delete_from_page_cache(page); | ||
985 | spin_lock(&info->lock); | ||
986 | info->alloced--; | ||
987 | inode->i_blocks -= BLOCKS_PER_PAGE; | ||
1455 | spin_unlock(&info->lock); | 988 | spin_unlock(&info->lock); |
989 | decused: | ||
990 | if (sbinfo->max_blocks) | ||
991 | percpu_counter_add(&sbinfo->used_blocks, -1); | ||
992 | unacct: | ||
993 | shmem_unacct_blocks(info->flags, 1); | ||
994 | failed: | ||
995 | if (swap.val && error != -EINVAL) { | ||
996 | struct page *test = find_get_page(mapping, index); | ||
997 | if (test && !radix_tree_exceptional_entry(test)) | ||
998 | page_cache_release(test); | ||
999 | /* Have another try if the entry has changed */ | ||
1000 | if (test != swp_to_radix_entry(swap)) | ||
1001 | error = -EEXIST; | ||
1002 | } | ||
1456 | if (page) { | 1003 | if (page) { |
1004 | unlock_page(page); | ||
1457 | page_cache_release(page); | 1005 | page_cache_release(page); |
1006 | } | ||
1007 | if (error == -ENOSPC && !once++) { | ||
1008 | info = SHMEM_I(inode); | ||
1009 | spin_lock(&info->lock); | ||
1010 | shmem_recalc_inode(inode); | ||
1011 | spin_unlock(&info->lock); | ||
1458 | goto repeat; | 1012 | goto repeat; |
1459 | } | 1013 | } |
1460 | error = -ENOSPC; | 1014 | if (error == -EEXIST) |
1461 | goto out; | 1015 | goto repeat; |
1016 | return error; | ||
1462 | } | 1017 | } |
1463 | 1018 | ||
1464 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 1019 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
@@ -1467,9 +1022,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1467 | int error; | 1022 | int error; |
1468 | int ret = VM_FAULT_LOCKED; | 1023 | int ret = VM_FAULT_LOCKED; |
1469 | 1024 | ||
1470 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
1471 | return VM_FAULT_SIGBUS; | ||
1472 | |||
1473 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1025 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1474 | if (error) | 1026 | if (error) |
1475 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1027 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
@@ -1482,20 +1034,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1482 | } | 1034 | } |
1483 | 1035 | ||
1484 | #ifdef CONFIG_NUMA | 1036 | #ifdef CONFIG_NUMA |
1485 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1037 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) |
1486 | { | 1038 | { |
1487 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1039 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1488 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1040 | return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); |
1489 | } | 1041 | } |
1490 | 1042 | ||
1491 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, | 1043 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, |
1492 | unsigned long addr) | 1044 | unsigned long addr) |
1493 | { | 1045 | { |
1494 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1046 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1495 | unsigned long idx; | 1047 | pgoff_t index; |
1496 | 1048 | ||
1497 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 1049 | index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
1498 | return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); | 1050 | return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); |
1499 | } | 1051 | } |
1500 | #endif | 1052 | #endif |
1501 | 1053 | ||
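shmem_get_policy() above derives the file page index for a faulting address from the VMA start and its pgoff. The calculation is worth seeing with numbers; a standalone sketch with assumed values (vm_start, vm_pgoff and the fault address are invented for illustration, PAGE_SHIFT assumes 4096-byte pages):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assume 4096-byte pages */

    int main(void)
    {
        /* A mapping of the file starting at file page 3 (vm_pgoff),
         * placed at vm_start in the process address space. */
        unsigned long vm_start = 0x10000000UL;
        unsigned long vm_pgoff = 3;
        unsigned long addr     = vm_start + 5 * 4096 + 123;  /* fault in the 6th mapped page */

        unsigned long index = ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;

        printf("fault at %#lx -> file page index %lu\n", addr, index);
        /* Prints index 8: the 6th page of the mapping is file page 3 + 5. */
        return 0;
    }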
@@ -1516,6 +1068,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
1516 | user_shm_unlock(inode->i_size, user); | 1068 | user_shm_unlock(inode->i_size, user); |
1517 | info->flags &= ~VM_LOCKED; | 1069 | info->flags &= ~VM_LOCKED; |
1518 | mapping_clear_unevictable(file->f_mapping); | 1070 | mapping_clear_unevictable(file->f_mapping); |
1071 | /* | ||
1072 | * Ensure that a racing putback_lru_page() can see | ||
1073 | * the pages of this mapping are evictable when we | ||
1074 | * skip them due to !PageLRU during the scan. | ||
1075 | */ | ||
1076 | smp_mb__after_clear_bit(); | ||
1519 | scan_mapping_unevictable_pages(file->f_mapping); | 1077 | scan_mapping_unevictable_pages(file->f_mapping); |
1520 | } | 1078 | } |
1521 | retval = 0; | 1079 | retval = 0; |
@@ -1593,7 +1151,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1593 | 1151 | ||
1594 | #ifdef CONFIG_TMPFS | 1152 | #ifdef CONFIG_TMPFS |
1595 | static const struct inode_operations shmem_symlink_inode_operations; | 1153 | static const struct inode_operations shmem_symlink_inode_operations; |
1596 | static const struct inode_operations shmem_symlink_inline_operations; | 1154 | static const struct inode_operations shmem_short_symlink_operations; |
1597 | 1155 | ||
1598 | static int | 1156 | static int |
1599 | shmem_write_begin(struct file *file, struct address_space *mapping, | 1157 | shmem_write_begin(struct file *file, struct address_space *mapping, |
@@ -1626,7 +1184,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1626 | { | 1184 | { |
1627 | struct inode *inode = filp->f_path.dentry->d_inode; | 1185 | struct inode *inode = filp->f_path.dentry->d_inode; |
1628 | struct address_space *mapping = inode->i_mapping; | 1186 | struct address_space *mapping = inode->i_mapping; |
1629 | unsigned long index, offset; | 1187 | pgoff_t index; |
1188 | unsigned long offset; | ||
1630 | enum sgp_type sgp = SGP_READ; | 1189 | enum sgp_type sgp = SGP_READ; |
1631 | 1190 | ||
1632 | /* | 1191 | /* |
@@ -1642,7 +1201,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1642 | 1201 | ||
1643 | for (;;) { | 1202 | for (;;) { |
1644 | struct page *page = NULL; | 1203 | struct page *page = NULL; |
1645 | unsigned long end_index, nr, ret; | 1204 | pgoff_t end_index; |
1205 | unsigned long nr, ret; | ||
1646 | loff_t i_size = i_size_read(inode); | 1206 | loff_t i_size = i_size_read(inode); |
1647 | 1207 | ||
1648 | end_index = i_size >> PAGE_CACHE_SHIFT; | 1208 | end_index = i_size >> PAGE_CACHE_SHIFT; |
@@ -1880,8 +1440,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
1880 | buf->f_namelen = NAME_MAX; | 1440 | buf->f_namelen = NAME_MAX; |
1881 | if (sbinfo->max_blocks) { | 1441 | if (sbinfo->max_blocks) { |
1882 | buf->f_blocks = sbinfo->max_blocks; | 1442 | buf->f_blocks = sbinfo->max_blocks; |
1883 | buf->f_bavail = buf->f_bfree = | 1443 | buf->f_bavail = |
1884 | sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); | 1444 | buf->f_bfree = sbinfo->max_blocks - |
1445 | percpu_counter_sum(&sbinfo->used_blocks); | ||
1885 | } | 1446 | } |
1886 | if (sbinfo->max_inodes) { | 1447 | if (sbinfo->max_inodes) { |
1887 | buf->f_files = sbinfo->max_inodes; | 1448 | buf->f_files = sbinfo->max_inodes; |
@@ -1903,7 +1464,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1903 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); | 1464 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); |
1904 | if (inode) { | 1465 | if (inode) { |
1905 | error = security_inode_init_security(inode, dir, | 1466 | error = security_inode_init_security(inode, dir, |
1906 | &dentry->d_name, NULL, | 1467 | &dentry->d_name, |
1907 | NULL, NULL); | 1468 | NULL, NULL); |
1908 | if (error) { | 1469 | if (error) { |
1909 | if (error != -EOPNOTSUPP) { | 1470 | if (error != -EOPNOTSUPP) { |
@@ -2043,7 +1604,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2043 | if (!inode) | 1604 | if (!inode) |
2044 | return -ENOSPC; | 1605 | return -ENOSPC; |
2045 | 1606 | ||
2046 | error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, | 1607 | error = security_inode_init_security(inode, dir, &dentry->d_name, |
2047 | NULL, NULL); | 1608 | NULL, NULL); |
2048 | if (error) { | 1609 | if (error) { |
2049 | if (error != -EOPNOTSUPP) { | 1610 | if (error != -EOPNOTSUPP) { |
@@ -2055,10 +1616,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2055 | 1616 | ||
2056 | info = SHMEM_I(inode); | 1617 | info = SHMEM_I(inode); |
2057 | inode->i_size = len-1; | 1618 | inode->i_size = len-1; |
2058 | if (len <= SHMEM_SYMLINK_INLINE_LEN) { | 1619 | if (len <= SHORT_SYMLINK_LEN) { |
2059 | /* do it inline */ | 1620 | info->symlink = kmemdup(symname, len, GFP_KERNEL); |
2060 | memcpy(info->inline_symlink, symname, len); | 1621 | if (!info->symlink) { |
2061 | inode->i_op = &shmem_symlink_inline_operations; | 1622 | iput(inode); |
1623 | return -ENOMEM; | ||
1624 | } | ||
1625 | inode->i_op = &shmem_short_symlink_operations; | ||
2062 | } else { | 1626 | } else { |
2063 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); | 1627 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); |
2064 | if (error) { | 1628 | if (error) { |
@@ -2081,17 +1645,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2081 | return 0; | 1645 | return 0; |
2082 | } | 1646 | } |
2083 | 1647 | ||
2084 | static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) | 1648 | static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) |
2085 | { | 1649 | { |
2086 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); | 1650 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); |
2087 | return NULL; | 1651 | return NULL; |
2088 | } | 1652 | } |
2089 | 1653 | ||
2090 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) | 1654 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) |
2091 | { | 1655 | { |
2092 | struct page *page = NULL; | 1656 | struct page *page = NULL; |
2093 | int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); | 1657 | int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); |
2094 | nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); | 1658 | nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); |
2095 | if (page) | 1659 | if (page) |
2096 | unlock_page(page); | 1660 | unlock_page(page); |
2097 | return page; | 1661 | return page; |
@@ -2202,7 +1766,6 @@ out: | |||
2202 | return err; | 1766 | return err; |
2203 | } | 1767 | } |
2204 | 1768 | ||
2205 | |||
2206 | static const struct xattr_handler *shmem_xattr_handlers[] = { | 1769 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
2207 | #ifdef CONFIG_TMPFS_POSIX_ACL | 1770 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2208 | &generic_acl_access_handler, | 1771 | &generic_acl_access_handler, |
@@ -2332,9 +1895,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) | |||
2332 | } | 1895 | } |
2333 | #endif /* CONFIG_TMPFS_XATTR */ | 1896 | #endif /* CONFIG_TMPFS_XATTR */ |
2334 | 1897 | ||
2335 | static const struct inode_operations shmem_symlink_inline_operations = { | 1898 | static const struct inode_operations shmem_short_symlink_operations = { |
2336 | .readlink = generic_readlink, | 1899 | .readlink = generic_readlink, |
2337 | .follow_link = shmem_follow_link_inline, | 1900 | .follow_link = shmem_follow_short_symlink, |
2338 | #ifdef CONFIG_TMPFS_XATTR | 1901 | #ifdef CONFIG_TMPFS_XATTR |
2339 | .setxattr = shmem_setxattr, | 1902 | .setxattr = shmem_setxattr, |
2340 | .getxattr = shmem_getxattr, | 1903 | .getxattr = shmem_getxattr, |
@@ -2534,8 +2097,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2534 | if (config.max_inodes < inodes) | 2097 | if (config.max_inodes < inodes) |
2535 | goto out; | 2098 | goto out; |
2536 | /* | 2099 | /* |
2537 | * Those tests also disallow limited->unlimited while any are in | 2100 | * Those tests disallow limited->unlimited while any are in use; |
2538 | * use, so i_blocks will always be zero when max_blocks is zero; | ||
2539 | * but we must separately disallow unlimited->limited, because | 2101 | * but we must separately disallow unlimited->limited, because |
2540 | * in that case we have no record of how much is already in use. | 2102 | * in that case we have no record of how much is already in use. |
2541 | */ | 2103 | */ |
@@ -2627,7 +2189,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2627 | goto failed; | 2189 | goto failed; |
2628 | sbinfo->free_inodes = sbinfo->max_inodes; | 2190 | sbinfo->free_inodes = sbinfo->max_inodes; |
2629 | 2191 | ||
2630 | sb->s_maxbytes = SHMEM_MAX_BYTES; | 2192 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
2631 | sb->s_blocksize = PAGE_CACHE_SIZE; | 2193 | sb->s_blocksize = PAGE_CACHE_SIZE; |
2632 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | 2194 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; |
2633 | sb->s_magic = TMPFS_MAGIC; | 2195 | sb->s_magic = TMPFS_MAGIC; |
@@ -2662,14 +2224,14 @@ static struct kmem_cache *shmem_inode_cachep; | |||
2662 | 2224 | ||
2663 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2225 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
2664 | { | 2226 | { |
2665 | struct shmem_inode_info *p; | 2227 | struct shmem_inode_info *info; |
2666 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); | 2228 | info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); |
2667 | if (!p) | 2229 | if (!info) |
2668 | return NULL; | 2230 | return NULL; |
2669 | return &p->vfs_inode; | 2231 | return &info->vfs_inode; |
2670 | } | 2232 | } |
2671 | 2233 | ||
2672 | static void shmem_i_callback(struct rcu_head *head) | 2234 | static void shmem_destroy_callback(struct rcu_head *head) |
2673 | { | 2235 | { |
2674 | struct inode *inode = container_of(head, struct inode, i_rcu); | 2236 | struct inode *inode = container_of(head, struct inode, i_rcu); |
2675 | INIT_LIST_HEAD(&inode->i_dentry); | 2237 | INIT_LIST_HEAD(&inode->i_dentry); |
@@ -2678,29 +2240,26 @@ static void shmem_i_callback(struct rcu_head *head) | |||
2678 | 2240 | ||
2679 | static void shmem_destroy_inode(struct inode *inode) | 2241 | static void shmem_destroy_inode(struct inode *inode) |
2680 | { | 2242 | { |
2681 | if ((inode->i_mode & S_IFMT) == S_IFREG) { | 2243 | if ((inode->i_mode & S_IFMT) == S_IFREG) |
2682 | /* only struct inode is valid if it's an inline symlink */ | ||
2683 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2244 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
2684 | } | 2245 | call_rcu(&inode->i_rcu, shmem_destroy_callback); |
2685 | call_rcu(&inode->i_rcu, shmem_i_callback); | ||
2686 | } | 2246 | } |
2687 | 2247 | ||
2688 | static void init_once(void *foo) | 2248 | static void shmem_init_inode(void *foo) |
2689 | { | 2249 | { |
2690 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2250 | struct shmem_inode_info *info = foo; |
2691 | 2251 | inode_init_once(&info->vfs_inode); | |
2692 | inode_init_once(&p->vfs_inode); | ||
2693 | } | 2252 | } |
2694 | 2253 | ||
2695 | static int init_inodecache(void) | 2254 | static int shmem_init_inodecache(void) |
2696 | { | 2255 | { |
2697 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", | 2256 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", |
2698 | sizeof(struct shmem_inode_info), | 2257 | sizeof(struct shmem_inode_info), |
2699 | 0, SLAB_PANIC, init_once); | 2258 | 0, SLAB_PANIC, shmem_init_inode); |
2700 | return 0; | 2259 | return 0; |
2701 | } | 2260 | } |
2702 | 2261 | ||
2703 | static void destroy_inodecache(void) | 2262 | static void shmem_destroy_inodecache(void) |
2704 | { | 2263 | { |
2705 | kmem_cache_destroy(shmem_inode_cachep); | 2264 | kmem_cache_destroy(shmem_inode_cachep); |
2706 | } | 2265 | } |
@@ -2797,21 +2356,20 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
2797 | #endif | 2356 | #endif |
2798 | }; | 2357 | }; |
2799 | 2358 | ||
2800 | |||
2801 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 2359 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
2802 | int flags, const char *dev_name, void *data) | 2360 | int flags, const char *dev_name, void *data) |
2803 | { | 2361 | { |
2804 | return mount_nodev(fs_type, flags, data, shmem_fill_super); | 2362 | return mount_nodev(fs_type, flags, data, shmem_fill_super); |
2805 | } | 2363 | } |
2806 | 2364 | ||
2807 | static struct file_system_type tmpfs_fs_type = { | 2365 | static struct file_system_type shmem_fs_type = { |
2808 | .owner = THIS_MODULE, | 2366 | .owner = THIS_MODULE, |
2809 | .name = "tmpfs", | 2367 | .name = "tmpfs", |
2810 | .mount = shmem_mount, | 2368 | .mount = shmem_mount, |
2811 | .kill_sb = kill_litter_super, | 2369 | .kill_sb = kill_litter_super, |
2812 | }; | 2370 | }; |
2813 | 2371 | ||
2814 | int __init init_tmpfs(void) | 2372 | int __init shmem_init(void) |
2815 | { | 2373 | { |
2816 | int error; | 2374 | int error; |
2817 | 2375 | ||
@@ -2819,18 +2377,18 @@ int __init init_tmpfs(void) | |||
2819 | if (error) | 2377 | if (error) |
2820 | goto out4; | 2378 | goto out4; |
2821 | 2379 | ||
2822 | error = init_inodecache(); | 2380 | error = shmem_init_inodecache(); |
2823 | if (error) | 2381 | if (error) |
2824 | goto out3; | 2382 | goto out3; |
2825 | 2383 | ||
2826 | error = register_filesystem(&tmpfs_fs_type); | 2384 | error = register_filesystem(&shmem_fs_type); |
2827 | if (error) { | 2385 | if (error) { |
2828 | printk(KERN_ERR "Could not register tmpfs\n"); | 2386 | printk(KERN_ERR "Could not register tmpfs\n"); |
2829 | goto out2; | 2387 | goto out2; |
2830 | } | 2388 | } |
2831 | 2389 | ||
2832 | shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, | 2390 | shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, |
2833 | tmpfs_fs_type.name, NULL); | 2391 | shmem_fs_type.name, NULL); |
2834 | if (IS_ERR(shm_mnt)) { | 2392 | if (IS_ERR(shm_mnt)) { |
2835 | error = PTR_ERR(shm_mnt); | 2393 | error = PTR_ERR(shm_mnt); |
2836 | printk(KERN_ERR "Could not kern_mount tmpfs\n"); | 2394 | printk(KERN_ERR "Could not kern_mount tmpfs\n"); |
@@ -2839,9 +2397,9 @@ int __init init_tmpfs(void) | |||
2839 | return 0; | 2397 | return 0; |
2840 | 2398 | ||
2841 | out1: | 2399 | out1: |
2842 | unregister_filesystem(&tmpfs_fs_type); | 2400 | unregister_filesystem(&shmem_fs_type); |
2843 | out2: | 2401 | out2: |
2844 | destroy_inodecache(); | 2402 | shmem_destroy_inodecache(); |
2845 | out3: | 2403 | out3: |
2846 | bdi_destroy(&shmem_backing_dev_info); | 2404 | bdi_destroy(&shmem_backing_dev_info); |
2847 | out4: | 2405 | out4: |
@@ -2849,45 +2407,6 @@ out4: | |||
2849 | return error; | 2407 | return error; |
2850 | } | 2408 | } |
2851 | 2409 | ||
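
shmem_init() above uses the kernel's usual goto-based unwind: each successful acquisition gets a matching label, and every error path jumps to the point that releases only what has already been taken, in reverse order. A small standalone C sketch of the same idiom; the resources and names are illustrative:

#include <stdio.h>
#include <stdlib.h>

/* Acquire two resources; on failure undo only what was already acquired. */
static int setup(FILE **out_log, char **out_buf)
{
	char *buf;
	FILE *log;
	int err;

	buf = malloc(4096);
	if (!buf) {
		err = -1;
		goto out;
	}

	log = fopen("/tmp/demo.log", "w");
	if (!log) {
		err = -2;
		goto out_free_buf;
	}

	*out_buf = buf;
	*out_log = log;
	return 0;		/* success skips the unwind labels entirely */

out_free_buf:
	free(buf);
out:
	return err;
}

int main(void)
{
	FILE *log;
	char *buf;

	if (setup(&log, &buf) == 0) {
		fclose(log);
		free(buf);
	}
	return 0;
}
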
2852 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2853 | /** | ||
2854 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2855 | * @inode: the inode to be searched | ||
2856 | * @pgoff: the offset to be searched | ||
2857 | * @pagep: the pointer for the found page to be stored | ||
2858 | * @ent: the pointer for the found swap entry to be stored | ||
2859 | * | ||
2860 | * If a page is found, refcount of it is incremented. Callers should handle | ||
2861 | * these refcount. | ||
2862 | */ | ||
2863 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2864 | struct page **pagep, swp_entry_t *ent) | ||
2865 | { | ||
2866 | swp_entry_t entry = { .val = 0 }, *ptr; | ||
2867 | struct page *page = NULL; | ||
2868 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
2869 | |||
2870 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2871 | goto out; | ||
2872 | |||
2873 | spin_lock(&info->lock); | ||
2874 | ptr = shmem_swp_entry(info, pgoff, NULL); | ||
2875 | #ifdef CONFIG_SWAP | ||
2876 | if (ptr && ptr->val) { | ||
2877 | entry.val = ptr->val; | ||
2878 | page = find_get_page(&swapper_space, entry.val); | ||
2879 | } else | ||
2880 | #endif | ||
2881 | page = find_get_page(inode->i_mapping, pgoff); | ||
2882 | if (ptr) | ||
2883 | shmem_swp_unmap(ptr); | ||
2884 | spin_unlock(&info->lock); | ||
2885 | out: | ||
2886 | *pagep = page; | ||
2887 | *ent = entry; | ||
2888 | } | ||
2889 | #endif | ||
2890 | |||
2891 | #else /* !CONFIG_SHMEM */ | 2410 | #else /* !CONFIG_SHMEM */ |
2892 | 2411 | ||
2893 | /* | 2412 | /* |
@@ -2901,23 +2420,23 @@ out: | |||
2901 | 2420 | ||
2902 | #include <linux/ramfs.h> | 2421 | #include <linux/ramfs.h> |
2903 | 2422 | ||
2904 | static struct file_system_type tmpfs_fs_type = { | 2423 | static struct file_system_type shmem_fs_type = { |
2905 | .name = "tmpfs", | 2424 | .name = "tmpfs", |
2906 | .mount = ramfs_mount, | 2425 | .mount = ramfs_mount, |
2907 | .kill_sb = kill_litter_super, | 2426 | .kill_sb = kill_litter_super, |
2908 | }; | 2427 | }; |
2909 | 2428 | ||
2910 | int __init init_tmpfs(void) | 2429 | int __init shmem_init(void) |
2911 | { | 2430 | { |
2912 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | 2431 | BUG_ON(register_filesystem(&shmem_fs_type) != 0); |
2913 | 2432 | ||
2914 | shm_mnt = kern_mount(&tmpfs_fs_type); | 2433 | shm_mnt = kern_mount(&shmem_fs_type); |
2915 | BUG_ON(IS_ERR(shm_mnt)); | 2434 | BUG_ON(IS_ERR(shm_mnt)); |
2916 | 2435 | ||
2917 | return 0; | 2436 | return 0; |
2918 | } | 2437 | } |
2919 | 2438 | ||
2920 | int shmem_unuse(swp_entry_t entry, struct page *page) | 2439 | int shmem_unuse(swp_entry_t swap, struct page *page) |
2921 | { | 2440 | { |
2922 | return 0; | 2441 | return 0; |
2923 | } | 2442 | } |
@@ -2927,43 +2446,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
2927 | return 0; | 2446 | return 0; |
2928 | } | 2447 | } |
2929 | 2448 | ||
2930 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 2449 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) |
2931 | { | 2450 | { |
2932 | truncate_inode_pages_range(inode->i_mapping, start, end); | 2451 | truncate_inode_pages_range(inode->i_mapping, lstart, lend); |
2933 | } | 2452 | } |
2934 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 2453 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
2935 | 2454 | ||
2936 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2937 | /** | ||
2938 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2939 | * @inode: the inode to be searched | ||
2940 | * @pgoff: the offset to be searched | ||
2941 | * @pagep: the pointer for the found page to be stored | ||
2942 | * @ent: the pointer for the found swap entry to be stored | ||
2943 | * | ||
2944 | * If a page is found, refcount of it is incremented. Callers should handle | ||
2945 | * these refcount. | ||
2946 | */ | ||
2947 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2948 | struct page **pagep, swp_entry_t *ent) | ||
2949 | { | ||
2950 | struct page *page = NULL; | ||
2951 | |||
2952 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2953 | goto out; | ||
2954 | page = find_get_page(inode->i_mapping, pgoff); | ||
2955 | out: | ||
2956 | *pagep = page; | ||
2957 | *ent = (swp_entry_t){ .val = 0 }; | ||
2958 | } | ||
2959 | #endif | ||
2960 | |||
2961 | #define shmem_vm_ops generic_file_vm_ops | 2455 | #define shmem_vm_ops generic_file_vm_ops |
2962 | #define shmem_file_operations ramfs_file_operations | 2456 | #define shmem_file_operations ramfs_file_operations |
2963 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) | 2457 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) |
2964 | #define shmem_acct_size(flags, size) 0 | 2458 | #define shmem_acct_size(flags, size) 0 |
2965 | #define shmem_unacct_size(flags, size) do {} while (0) | 2459 | #define shmem_unacct_size(flags, size) do {} while (0) |
2966 | #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE | ||
2967 | 2460 | ||
2968 | #endif /* CONFIG_SHMEM */ | 2461 | #endif /* CONFIG_SHMEM */ |
2969 | 2462 | ||
@@ -2987,7 +2480,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2987 | if (IS_ERR(shm_mnt)) | 2480 | if (IS_ERR(shm_mnt)) |
2988 | return (void *)shm_mnt; | 2481 | return (void *)shm_mnt; |
2989 | 2482 | ||
2990 | if (size < 0 || size > SHMEM_MAX_BYTES) | 2483 | if (size < 0 || size > MAX_LFS_FILESIZE) |
2991 | return ERR_PTR(-EINVAL); | 2484 | return ERR_PTR(-EINVAL); |
2992 | 2485 | ||
2993 | if (shmem_acct_size(flags, size)) | 2486 | if (shmem_acct_size(flags, size)) |
@@ -3010,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
3010 | 2503 | ||
3011 | d_instantiate(path.dentry, inode); | 2504 | d_instantiate(path.dentry, inode); |
3012 | inode->i_size = size; | 2505 | inode->i_size = size; |
3013 | inode->i_nlink = 0; /* It is unlinked */ | 2506 | clear_nlink(inode); /* It is unlinked */ |
3014 | #ifndef CONFIG_MMU | 2507 | #ifndef CONFIG_MMU |
3015 | error = ramfs_nommu_expand_for_mapping(inode, size); | 2508 | error = ramfs_nommu_expand_for_mapping(inode, size); |
3016 | if (error) | 2509 | if (error) |
@@ -622,6 +622,51 @@ int slab_is_available(void) | |||
622 | static struct lock_class_key on_slab_l3_key; | 622 | static struct lock_class_key on_slab_l3_key; |
623 | static struct lock_class_key on_slab_alc_key; | 623 | static struct lock_class_key on_slab_alc_key; |
624 | 624 | ||
625 | static struct lock_class_key debugobj_l3_key; | ||
626 | static struct lock_class_key debugobj_alc_key; | ||
627 | |||
628 | static void slab_set_lock_classes(struct kmem_cache *cachep, | ||
629 | struct lock_class_key *l3_key, struct lock_class_key *alc_key, | ||
630 | int q) | ||
631 | { | ||
632 | struct array_cache **alc; | ||
633 | struct kmem_list3 *l3; | ||
634 | int r; | ||
635 | |||
636 | l3 = cachep->nodelists[q]; | ||
637 | if (!l3) | ||
638 | return; | ||
639 | |||
640 | lockdep_set_class(&l3->list_lock, l3_key); | ||
641 | alc = l3->alien; | ||
642 | /* | ||
643 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
644 | * should go away when common slab code is taught to | ||
645 | * work even without alien caches. | ||
646 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
647 | * for alloc_alien_cache, | ||
648 | */ | ||
649 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
650 | return; | ||
651 | for_each_node(r) { | ||
652 | if (alc[r]) | ||
653 | lockdep_set_class(&alc[r]->lock, alc_key); | ||
654 | } | ||
655 | } | ||
656 | |||
657 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
658 | { | ||
659 | slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); | ||
660 | } | ||
661 | |||
662 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
663 | { | ||
664 | int node; | ||
665 | |||
666 | for_each_online_node(node) | ||
667 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
668 | } | ||
669 | |||
625 | static void init_node_lock_keys(int q) | 670 | static void init_node_lock_keys(int q) |
626 | { | 671 | { |
627 | struct cache_sizes *s = malloc_sizes; | 672 | struct cache_sizes *s = malloc_sizes; |
@@ -630,29 +675,14 @@ static void init_node_lock_keys(int q) | |||
630 | return; | 675 | return; |
631 | 676 | ||
632 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { | 677 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
633 | struct array_cache **alc; | ||
634 | struct kmem_list3 *l3; | 678 | struct kmem_list3 *l3; |
635 | int r; | ||
636 | 679 | ||
637 | l3 = s->cs_cachep->nodelists[q]; | 680 | l3 = s->cs_cachep->nodelists[q]; |
638 | if (!l3 || OFF_SLAB(s->cs_cachep)) | 681 | if (!l3 || OFF_SLAB(s->cs_cachep)) |
639 | continue; | 682 | continue; |
640 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | 683 | |
641 | alc = l3->alien; | 684 | slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, |
642 | /* | 685 | &on_slab_alc_key, q); |
643 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
644 | * should go away when common slab code is taught to | ||
645 | * work even without alien caches. | ||
646 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
647 | * for alloc_alien_cache, | ||
648 | */ | ||
649 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
650 | continue; | ||
651 | for_each_node(r) { | ||
652 | if (alc[r]) | ||
653 | lockdep_set_class(&alc[r]->lock, | ||
654 | &on_slab_alc_key); | ||
655 | } | ||
656 | } | 686 | } |
657 | } | 687 | } |
658 | 688 | ||
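
slab_set_lock_classes() above simply walks a cache's per-node list locks and alien-cache locks and puts them all in one lockdep class; the primitive underneath is lockdep_set_class(), where a static struct lock_class_key gives a family of dynamically initialized locks a single identity for lock-order checking. A kernel-style sketch of that primitive, assuming CONFIG_LOCKDEP and illustrative names:

#include <linux/spinlock.h>
#include <linux/lockdep.h>

/* One class key shared by every lock in this object family. */
static struct lock_class_key demo_lock_key;

struct demo_node {
	spinlock_t lock;
};

static void demo_node_init(struct demo_node *n)
{
	spin_lock_init(&n->lock);
	/*
	 * Group this lock with all other demo_node locks so lockdep
	 * validates ordering for the family as a whole, the same way
	 * slab_set_lock_classes() groups list_lock and alien locks.
	 */
	lockdep_set_class(&n->lock, &demo_lock_key);
}
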
@@ -671,6 +701,14 @@ static void init_node_lock_keys(int q) | |||
671 | static inline void init_lock_keys(void) | 701 | static inline void init_lock_keys(void) |
672 | { | 702 | { |
673 | } | 703 | } |
704 | |||
705 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
706 | { | ||
707 | } | ||
708 | |||
709 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
710 | { | ||
711 | } | ||
674 | #endif | 712 | #endif |
675 | 713 | ||
676 | /* | 714 | /* |
@@ -1264,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1264 | spin_unlock_irq(&l3->list_lock); | 1302 | spin_unlock_irq(&l3->list_lock); |
1265 | kfree(shared); | 1303 | kfree(shared); |
1266 | free_alien_cache(alien); | 1304 | free_alien_cache(alien); |
1305 | if (cachep->flags & SLAB_DEBUG_OBJECTS) | ||
1306 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
1267 | } | 1307 | } |
1268 | init_node_lock_keys(node); | 1308 | init_node_lock_keys(node); |
1269 | 1309 | ||
@@ -1626,6 +1666,9 @@ void __init kmem_cache_init_late(void) | |||
1626 | { | 1666 | { |
1627 | struct kmem_cache *cachep; | 1667 | struct kmem_cache *cachep; |
1628 | 1668 | ||
1669 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1670 | init_lock_keys(); | ||
1671 | |||
1629 | /* 6) resize the head arrays to their final sizes */ | 1672 | /* 6) resize the head arrays to their final sizes */ |
1630 | mutex_lock(&cache_chain_mutex); | 1673 | mutex_lock(&cache_chain_mutex); |
1631 | list_for_each_entry(cachep, &cache_chain, next) | 1674 | list_for_each_entry(cachep, &cache_chain, next) |
@@ -1636,9 +1679,6 @@ void __init kmem_cache_init_late(void) | |||
1636 | /* Done! */ | 1679 | /* Done! */ |
1637 | g_cpucache_up = FULL; | 1680 | g_cpucache_up = FULL; |
1638 | 1681 | ||
1639 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1640 | init_lock_keys(); | ||
1641 | |||
1642 | /* | 1682 | /* |
1643 | * Register a cpu startup notifier callback that initializes | 1683 | * Register a cpu startup notifier callback that initializes |
1644 | * cpu_cache_get for all new cpus | 1684 | * cpu_cache_get for all new cpus |
@@ -1811,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit) | |||
1811 | unsigned char error = 0; | 1851 | unsigned char error = 0; |
1812 | int bad_count = 0; | 1852 | int bad_count = 0; |
1813 | 1853 | ||
1814 | printk(KERN_ERR "%03x:", offset); | 1854 | printk(KERN_ERR "%03x: ", offset); |
1815 | for (i = 0; i < limit; i++) { | 1855 | for (i = 0; i < limit; i++) { |
1816 | if (data[offset + i] != POISON_FREE) { | 1856 | if (data[offset + i] != POISON_FREE) { |
1817 | error = data[offset + i]; | 1857 | error = data[offset + i]; |
1818 | bad_count++; | 1858 | bad_count++; |
1819 | } | 1859 | } |
1820 | printk(" %02x", (unsigned char)data[offset + i]); | ||
1821 | } | 1860 | } |
1822 | printk("\n"); | 1861 | print_hex_dump(KERN_CONT, "", 0, 16, 1, |
1862 | &data[offset], limit, 1); | ||
1823 | 1863 | ||
1824 | if (bad_count == 1) { | 1864 | if (bad_count == 1) { |
1825 | error ^= POISON_FREE; | 1865 | error ^= POISON_FREE; |
@@ -2426,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2426 | goto oops; | 2466 | goto oops; |
2427 | } | 2467 | } |
2428 | 2468 | ||
2469 | if (flags & SLAB_DEBUG_OBJECTS) { | ||
2470 | /* | ||
2471 | * Would deadlock through slab_destroy()->call_rcu()-> | ||
2472 | * debug_object_activate()->kmem_cache_alloc(). | ||
2473 | */ | ||
2474 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); | ||
2475 | |||
2476 | slab_set_debugobj_lock_classes(cachep); | ||
2477 | } | ||
2478 | |||
2429 | /* cache setup completed, link it into the list */ | 2479 | /* cache setup completed, link it into the list */ |
2430 | list_add(&cachep->next, &cache_chain); | 2480 | list_add(&cachep->next, &cache_chain); |
2431 | oops: | 2481 | oops: |
@@ -2989,14 +3039,9 @@ bad: | |||
2989 | printk(KERN_ERR "slab: Internal list corruption detected in " | 3039 | printk(KERN_ERR "slab: Internal list corruption detected in " |
2990 | "cache '%s'(%d), slabp %p(%d). Hexdump:\n", | 3040 | "cache '%s'(%d), slabp %p(%d). Hexdump:\n", |
2991 | cachep->name, cachep->num, slabp, slabp->inuse); | 3041 | cachep->name, cachep->num, slabp, slabp->inuse); |
2992 | for (i = 0; | 3042 | print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, |
2993 | i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); | 3043 | sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), |
2994 | i++) { | 3044 | 1); |
2995 | if (i % 16 == 0) | ||
2996 | printk("\n%03x:", i); | ||
2997 | printk(" %02x", ((unsigned char *)slabp)[i]); | ||
2998 | } | ||
2999 | printk("\n"); | ||
3000 | BUG(); | 3045 | BUG(); |
3001 | } | 3046 | } |
3002 | } | 3047 | } |
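
Both slab.c hunks above drop hand-rolled "%02x" printk loops in favour of print_hex_dump(level, prefix_str, prefix_type, rowsize, groupsize, buf, len, ascii), which emits 16 bytes per row with an optional ASCII column. As a point of reference, a rough userspace approximation of that output format; this is illustrative only and ignores group sizes and address prefixes:

#include <stdio.h>

/* Crude stand-in for print_hex_dump(..., 16, 1, buf, len, 1). */
static void hex_dump(const unsigned char *buf, size_t len)
{
	size_t i, j;

	for (i = 0; i < len; i += 16) {
		printf("%03zx: ", i);
		for (j = i; j < i + 16 && j < len; j++)
			printf("%02x ", buf[j]);
		printf(" ");
		for (j = i; j < i + 16 && j < len; j++)
			putchar(buf[j] >= 0x20 && buf[j] < 0x7f ? buf[j] : '.');
		putchar('\n');
	}
}

int main(void)
{
	const unsigned char msg[] = "slab poison check demo";

	hex_dump(msg, sizeof(msg));
	return 0;
}
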
@@ -3403,7 +3448,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3403 | cache_alloc_debugcheck_before(cachep, flags); | 3448 | cache_alloc_debugcheck_before(cachep, flags); |
3404 | local_irq_save(save_flags); | 3449 | local_irq_save(save_flags); |
3405 | 3450 | ||
3406 | if (nodeid == -1) | 3451 | if (nodeid == NUMA_NO_NODE) |
3407 | nodeid = slab_node; | 3452 | nodeid = slab_node; |
3408 | 3453 | ||
3409 | if (unlikely(!cachep->nodelists[nodeid])) { | 3454 | if (unlikely(!cachep->nodelists[nodeid])) { |
@@ -3934,7 +3979,7 @@ fail: | |||
3934 | 3979 | ||
3935 | struct ccupdate_struct { | 3980 | struct ccupdate_struct { |
3936 | struct kmem_cache *cachep; | 3981 | struct kmem_cache *cachep; |
3937 | struct array_cache *new[NR_CPUS]; | 3982 | struct array_cache *new[0]; |
3938 | }; | 3983 | }; |
3939 | 3984 | ||
3940 | static void do_ccupdate_local(void *info) | 3985 | static void do_ccupdate_local(void *info) |
@@ -3956,7 +4001,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3956 | struct ccupdate_struct *new; | 4001 | struct ccupdate_struct *new; |
3957 | int i; | 4002 | int i; |
3958 | 4003 | ||
3959 | new = kzalloc(sizeof(*new), gfp); | 4004 | new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), |
4005 | gfp); | ||
3960 | if (!new) | 4006 | if (!new) |
3961 | return -ENOMEM; | 4007 | return -ENOMEM; |
3962 | 4008 | ||
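
The ccupdate_struct hunks above replace a fixed new[NR_CPUS] array with a zero-length array whose storage is sized at allocation time from nr_cpu_ids, so the allocation only covers CPUs that can actually exist. The same idiom in standard C99 uses a flexible array member in place of GCC's zero-length array; the names here are illustrative:

#include <stdlib.h>

struct percpu_update {
	int nr_slots;
	void *slot[];	/* flexible array member, sized at allocation time */
};

static struct percpu_update *percpu_update_alloc(int nr_slots)
{
	struct percpu_update *u;

	/* One allocation covers the header plus exactly nr_slots pointers. */
	u = calloc(1, sizeof(*u) + nr_slots * sizeof(u->slot[0]));
	if (u)
		u->nr_slots = nr_slots;
	return u;
}
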
@@ -4533,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = { | |||
4533 | 4579 | ||
4534 | static int __init slab_proc_init(void) | 4580 | static int __init slab_proc_init(void) |
4535 | { | 4581 | { |
4536 | proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); | 4582 | proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); |
4537 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4583 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4538 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); | 4584 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); |
4539 | #endif | 4585 | #endif |
@@ -63,7 +63,7 @@ | |||
63 | #include <linux/swap.h> /* struct reclaim_state */ | 63 | #include <linux/swap.h> /* struct reclaim_state */ |
64 | #include <linux/cache.h> | 64 | #include <linux/cache.h> |
65 | #include <linux/init.h> | 65 | #include <linux/init.h> |
66 | #include <linux/module.h> | 66 | #include <linux/export.h> |
67 | #include <linux/rcupdate.h> | 67 | #include <linux/rcupdate.h> |
68 | #include <linux/list.h> | 68 | #include <linux/list.h> |
69 | #include <linux/kmemleak.h> | 69 | #include <linux/kmemleak.h> |
@@ -2,10 +2,11 @@ | |||
2 | * SLUB: A slab allocator that limits cache line use instead of queuing | 2 | * SLUB: A slab allocator that limits cache line use instead of queuing |
3 | * objects in per cpu and per node lists. | 3 | * objects in per cpu and per node lists. |
4 | * | 4 | * |
5 | * The allocator synchronizes using per slab locks and only | 5 | * The allocator synchronizes using per slab locks or atomic operations |
6 | * uses a centralized lock to manage a pool of partial slabs. | 6 | * and only uses a centralized lock to manage a pool of partial slabs. |
7 | * | 7 | * |
8 | * (C) 2007 SGI, Christoph Lameter | 8 | * (C) 2007 SGI, Christoph Lameter |
9 | * (C) 2011 Linux Foundation, Christoph Lameter | ||
9 | */ | 10 | */ |
10 | 11 | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
@@ -33,15 +34,27 @@ | |||
33 | 34 | ||
34 | /* | 35 | /* |
35 | * Lock order: | 36 | * Lock order: |
36 | * 1. slab_lock(page) | 37 | * 1. slub_lock (Global Semaphore) |
37 | * 2. slab->list_lock | 38 | * 2. node->list_lock |
39 | * 3. slab_lock(page) (Only on some arches and for debugging) | ||
38 | * | 40 | * |
39 | * The slab_lock protects operations on the object of a particular | 41 | * slub_lock |
40 | * slab and its metadata in the page struct. If the slab lock | 42 | * |
41 | * has been taken then no allocations nor frees can be performed | 43 | * The role of the slub_lock is to protect the list of all the slabs |
42 | * on the objects in the slab nor can the slab be added or removed | 44 | * and to synchronize major metadata changes to slab cache structures. |
43 | * from the partial or full lists since this would mean modifying | 45 | * |
44 | * the page_struct of the slab. | 46 | * The slab_lock is only used for debugging and on arches that do not |
47 | * have the ability to do a cmpxchg_double. It only protects the second | ||
48 | * double word in the page struct. Meaning | ||
49 | * A. page->freelist -> List of free objects in a page | ||
50 | * B. page->counters -> Counters of objects | ||
51 | * C. page->frozen -> frozen state | ||
52 | * | ||
53 | * If a slab is frozen then it is exempt from list management. It is not | ||
54 | * on any list. The processor that froze the slab is the one who can | ||
55 | * perform list operations on the page. Other processors may put objects | ||
56 | * onto the freelist but the processor that froze the slab is the only | ||
57 | * one that can retrieve the objects from the page's freelist. | ||
45 | * | 58 | * |
46 | * The list_lock protects the partial and full list on each node and | 59 | * The list_lock protects the partial and full list on each node and |
47 | * the partial slab counter. If taken then no new slabs may be added or | 60 | * the partial slab counter. If taken then no new slabs may be added or |
@@ -54,20 +67,6 @@ | |||
54 | * slabs, operations can continue without any centralized lock. F.e. | 67 | * slabs, operations can continue without any centralized lock. F.e. |
55 | * allocating a long series of objects that fill up slabs does not require | 68 | * allocating a long series of objects that fill up slabs does not require |
56 | * the list lock. | 69 | * the list lock. |
57 | * | ||
58 | * The lock order is sometimes inverted when we are trying to get a slab | ||
59 | * off a list. We take the list_lock and then look for a page on the list | ||
60 | * to use. While we do that objects in the slabs may be freed. We can | ||
61 | * only operate on the slab if we have also taken the slab_lock. So we use | ||
62 | * a slab_trylock() on the slab. If trylock was successful then no frees | ||
63 | * can occur anymore and we can use the slab for allocations etc. If the | ||
64 | * slab_trylock() does not succeed then frees are in progress in the slab and | ||
65 | * we must stay away from it for a while since we may cause a bouncing | ||
66 | * cacheline if we try to acquire the lock. So go onto the next slab. | ||
67 | * If all pages are busy then we may allocate a new slab instead of reusing | ||
68 | * a partial slab. A new slab has no one operating on it and thus there is | ||
69 | * no danger of cacheline contention. | ||
70 | * | ||
71 | * Interrupts are disabled during allocation and deallocation in order to | 70 | * Interrupts are disabled during allocation and deallocation in order to |
72 | * make the slab allocator safe to use in the context of an irq. In addition | 71 | * make the slab allocator safe to use in the context of an irq. In addition |
73 | * interrupts are disabled to ensure that the processor does not change | 72 | * interrupts are disabled to ensure that the processor does not change |
@@ -132,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
132 | /* Enable to test recovery from slab corruption on boot */ | 131 | /* Enable to test recovery from slab corruption on boot */ |
133 | #undef SLUB_RESILIENCY_TEST | 132 | #undef SLUB_RESILIENCY_TEST |
134 | 133 | ||
134 | /* Enable to log cmpxchg failures */ | ||
135 | #undef SLUB_DEBUG_CMPXCHG | ||
136 | |||
135 | /* | 137 | /* |
136 | * Minimum number of partial slabs. These will be left on the partial | 138 | * Minimum number of partial slabs. These will be left on the partial |
137 | * lists even if they are empty. kmem_cache_shrink may reclaim them. | 139 | * lists even if they are empty. kmem_cache_shrink may reclaim them. |
@@ -167,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
167 | 169 | ||
168 | #define OO_SHIFT 16 | 170 | #define OO_SHIFT 16 |
169 | #define OO_MASK ((1 << OO_SHIFT) - 1) | 171 | #define OO_MASK ((1 << OO_SHIFT) - 1) |
170 | #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ | 172 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ |
171 | 173 | ||
172 | /* Internal SLUB flags */ | 174 | /* Internal SLUB flags */ |
173 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ | 175 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ |
176 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ | ||
174 | 177 | ||
175 | static int kmem_size = sizeof(struct kmem_cache); | 178 | static int kmem_size = sizeof(struct kmem_cache); |
176 | 179 | ||
@@ -343,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x) | |||
343 | return x.x & OO_MASK; | 346 | return x.x & OO_MASK; |
344 | } | 347 | } |
345 | 348 | ||
349 | /* | ||
350 | * Per slab locking using the pagelock | ||
351 | */ | ||
352 | static __always_inline void slab_lock(struct page *page) | ||
353 | { | ||
354 | bit_spin_lock(PG_locked, &page->flags); | ||
355 | } | ||
356 | |||
357 | static __always_inline void slab_unlock(struct page *page) | ||
358 | { | ||
359 | __bit_spin_unlock(PG_locked, &page->flags); | ||
360 | } | ||
361 | |||
362 | /* Interrupts must be disabled (for the fallback code to work right) */ | ||
363 | static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
364 | void *freelist_old, unsigned long counters_old, | ||
365 | void *freelist_new, unsigned long counters_new, | ||
366 | const char *n) | ||
367 | { | ||
368 | VM_BUG_ON(!irqs_disabled()); | ||
369 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
370 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
371 | if (cmpxchg_double(&page->freelist, | ||
372 | freelist_old, counters_old, | ||
373 | freelist_new, counters_new)) | ||
374 | return 1; | ||
375 | } else | ||
376 | #endif | ||
377 | { | ||
378 | slab_lock(page); | ||
379 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
380 | page->freelist = freelist_new; | ||
381 | page->counters = counters_new; | ||
382 | slab_unlock(page); | ||
383 | return 1; | ||
384 | } | ||
385 | slab_unlock(page); | ||
386 | } | ||
387 | |||
388 | cpu_relax(); | ||
389 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
390 | |||
391 | #ifdef SLUB_DEBUG_CMPXCHG | ||
392 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
393 | #endif | ||
394 | |||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
399 | void *freelist_old, unsigned long counters_old, | ||
400 | void *freelist_new, unsigned long counters_new, | ||
401 | const char *n) | ||
402 | { | ||
403 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
404 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
405 | if (cmpxchg_double(&page->freelist, | ||
406 | freelist_old, counters_old, | ||
407 | freelist_new, counters_new)) | ||
408 | return 1; | ||
409 | } else | ||
410 | #endif | ||
411 | { | ||
412 | unsigned long flags; | ||
413 | |||
414 | local_irq_save(flags); | ||
415 | slab_lock(page); | ||
416 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
417 | page->freelist = freelist_new; | ||
418 | page->counters = counters_new; | ||
419 | slab_unlock(page); | ||
420 | local_irq_restore(flags); | ||
421 | return 1; | ||
422 | } | ||
423 | slab_unlock(page); | ||
424 | local_irq_restore(flags); | ||
425 | } | ||
426 | |||
427 | cpu_relax(); | ||
428 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
429 | |||
430 | #ifdef SLUB_DEBUG_CMPXCHG | ||
431 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
432 | #endif | ||
433 | |||
434 | return 0; | ||
435 | } | ||
436 | |||
346 | #ifdef CONFIG_SLUB_DEBUG | 437 | #ifdef CONFIG_SLUB_DEBUG |
347 | /* | 438 | /* |
348 | * Determine a map of object in use on a page. | 439 | * Determine a map of object in use on a page. |
349 | * | 440 | * |
350 | * Slab lock or node listlock must be held to guarantee that the page does | 441 | * Node listlock must be held to guarantee that the page does |
351 | * not vanish from under us. | 442 | * not vanish from under us. |
352 | */ | 443 | */ |
353 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) | 444 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) |
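
__cmpxchg_double_slab() above follows the canonical lock-free shape: snapshot the current freelist and counters, build the desired new pair, and publish it with a compare-and-exchange, falling back to slab_lock() where the hardware lacks cmpxchg_double; callers then wrap it in a do/while retry loop. A minimal userspace sketch of just that retry shape, using a single C11 atomic word rather than the kernel's paired freelist/counters update; it is illustrative, not the kernel API:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long counters;

/* Fold "delta" into a shared counter word, retrying if another thread raced us. */
static void counters_add(unsigned long delta)
{
	unsigned long old, new;

	do {
		old = atomic_load_explicit(&counters, memory_order_relaxed);
		new = old + delta;	/* desired state computed from the snapshot */
	} while (!atomic_compare_exchange_weak(&counters, &old, new));
}

int main(void)
{
	counters_add(3);
	counters_add(2);
	printf("%lu\n", atomic_load(&counters));
	return 0;
}
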
@@ -376,34 +467,8 @@ static int disable_higher_order_debug; | |||
376 | */ | 467 | */ |
377 | static void print_section(char *text, u8 *addr, unsigned int length) | 468 | static void print_section(char *text, u8 *addr, unsigned int length) |
378 | { | 469 | { |
379 | int i, offset; | 470 | print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, |
380 | int newline = 1; | 471 | length, 1); |
381 | char ascii[17]; | ||
382 | |||
383 | ascii[16] = 0; | ||
384 | |||
385 | for (i = 0; i < length; i++) { | ||
386 | if (newline) { | ||
387 | printk(KERN_ERR "%8s 0x%p: ", text, addr + i); | ||
388 | newline = 0; | ||
389 | } | ||
390 | printk(KERN_CONT " %02x", addr[i]); | ||
391 | offset = i % 16; | ||
392 | ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; | ||
393 | if (offset == 15) { | ||
394 | printk(KERN_CONT " %s\n", ascii); | ||
395 | newline = 1; | ||
396 | } | ||
397 | } | ||
398 | if (!newline) { | ||
399 | i %= 16; | ||
400 | while (i < 16) { | ||
401 | printk(KERN_CONT " "); | ||
402 | ascii[i] = ' '; | ||
403 | i++; | ||
404 | } | ||
405 | printk(KERN_CONT " %s\n", ascii); | ||
406 | } | ||
407 | } | 472 | } |
408 | 473 | ||
409 | static struct track *get_track(struct kmem_cache *s, void *object, | 474 | static struct track *get_track(struct kmem_cache *s, void *object, |
@@ -534,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
534 | p, p - addr, get_freepointer(s, p)); | 599 | p, p - addr, get_freepointer(s, p)); |
535 | 600 | ||
536 | if (p > addr + 16) | 601 | if (p > addr + 16) |
537 | print_section("Bytes b4", p - 16, 16); | 602 | print_section("Bytes b4 ", p - 16, 16); |
538 | |||
539 | print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); | ||
540 | 603 | ||
604 | print_section("Object ", p, min_t(unsigned long, s->objsize, | ||
605 | PAGE_SIZE)); | ||
541 | if (s->flags & SLAB_RED_ZONE) | 606 | if (s->flags & SLAB_RED_ZONE) |
542 | print_section("Redzone", p + s->objsize, | 607 | print_section("Redzone ", p + s->objsize, |
543 | s->inuse - s->objsize); | 608 | s->inuse - s->objsize); |
544 | 609 | ||
545 | if (s->offset) | 610 | if (s->offset) |
@@ -552,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
552 | 617 | ||
553 | if (off != s->size) | 618 | if (off != s->size) |
554 | /* Beginning of the filler is the free pointer */ | 619 | /* Beginning of the filler is the free pointer */ |
555 | print_section("Padding", p + off, s->size - off); | 620 | print_section("Padding ", p + off, s->size - off); |
556 | 621 | ||
557 | dump_stack(); | 622 | dump_stack(); |
558 | } | 623 | } |
@@ -590,49 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) | |||
590 | memset(p + s->objsize, val, s->inuse - s->objsize); | 655 | memset(p + s->objsize, val, s->inuse - s->objsize); |
591 | } | 656 | } |
592 | 657 | ||
593 | static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) | ||
594 | { | ||
595 | while (bytes) { | ||
596 | if (*start != value) | ||
597 | return start; | ||
598 | start++; | ||
599 | bytes--; | ||
600 | } | ||
601 | return NULL; | ||
602 | } | ||
603 | |||
604 | static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) | ||
605 | { | ||
606 | u64 value64; | ||
607 | unsigned int words, prefix; | ||
608 | |||
609 | if (bytes <= 16) | ||
610 | return check_bytes8(start, value, bytes); | ||
611 | |||
612 | value64 = value | value << 8 | value << 16 | value << 24; | ||
613 | value64 = value64 | value64 << 32; | ||
614 | prefix = 8 - ((unsigned long)start) % 8; | ||
615 | |||
616 | if (prefix) { | ||
617 | u8 *r = check_bytes8(start, value, prefix); | ||
618 | if (r) | ||
619 | return r; | ||
620 | start += prefix; | ||
621 | bytes -= prefix; | ||
622 | } | ||
623 | |||
624 | words = bytes / 8; | ||
625 | |||
626 | while (words) { | ||
627 | if (*(u64 *)start != value64) | ||
628 | return check_bytes8(start, value, 8); | ||
629 | start += 8; | ||
630 | words--; | ||
631 | } | ||
632 | |||
633 | return check_bytes8(start, value, bytes % 8); | ||
634 | } | ||
635 | |||
636 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, | 658 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, |
637 | void *from, void *to) | 659 | void *from, void *to) |
638 | { | 660 | { |
@@ -647,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
647 | u8 *fault; | 669 | u8 *fault; |
648 | u8 *end; | 670 | u8 *end; |
649 | 671 | ||
650 | fault = check_bytes(start, value, bytes); | 672 | fault = memchr_inv(start, value, bytes); |
651 | if (!fault) | 673 | if (!fault) |
652 | return 1; | 674 | return 1; |
653 | 675 | ||
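
check_bytes_and_report() above now relies on memchr_inv(), which returns a pointer to the first byte in a region that does not equal the given value, or NULL when every byte matches; that is exactly the poison check the removed check_bytes()/check_bytes8() helpers open-coded. A byte-at-a-time userspace equivalent of that contract (the kernel version adds a word-at-a-time fast path):

#include <stddef.h>

/* Return the first byte in [s, s + n) that differs from c, or NULL if none does. */
static void *memchr_inv_demo(const void *s, int c, size_t n)
{
	const unsigned char *p = s;

	while (n--) {
		if (*p != (unsigned char)c)
			return (void *)p;
		p++;
	}
	return NULL;
}

Verifying a poisoned region then reduces to one call: a non-NULL return is the first corrupted byte to report.
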
@@ -740,14 +762,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
740 | if (!remainder) | 762 | if (!remainder) |
741 | return 1; | 763 | return 1; |
742 | 764 | ||
743 | fault = check_bytes(end - remainder, POISON_INUSE, remainder); | 765 | fault = memchr_inv(end - remainder, POISON_INUSE, remainder); |
744 | if (!fault) | 766 | if (!fault) |
745 | return 1; | 767 | return 1; |
746 | while (end > fault && end[-1] == POISON_INUSE) | 768 | while (end > fault && end[-1] == POISON_INUSE) |
747 | end--; | 769 | end--; |
748 | 770 | ||
749 | slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); | 771 | slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); |
750 | print_section("Padding", end - remainder, remainder); | 772 | print_section("Padding ", end - remainder, remainder); |
751 | 773 | ||
752 | restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); | 774 | restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); |
753 | return 0; | 775 | return 0; |
@@ -838,10 +860,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
838 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 860 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
839 | { | 861 | { |
840 | int nr = 0; | 862 | int nr = 0; |
841 | void *fp = page->freelist; | 863 | void *fp; |
842 | void *object = NULL; | 864 | void *object = NULL; |
843 | unsigned long max_objects; | 865 | unsigned long max_objects; |
844 | 866 | ||
867 | fp = page->freelist; | ||
845 | while (fp && nr <= page->objects) { | 868 | while (fp && nr <= page->objects) { |
846 | if (fp == search) | 869 | if (fp == search) |
847 | return 1; | 870 | return 1; |
@@ -895,7 +918,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, | |||
895 | page->freelist); | 918 | page->freelist); |
896 | 919 | ||
897 | if (!alloc) | 920 | if (!alloc) |
898 | print_section("Object", (void *)object, s->objsize); | 921 | print_section("Object ", (void *)object, s->objsize); |
899 | 922 | ||
900 | dump_stack(); | 923 | dump_stack(); |
901 | } | 924 | } |
@@ -946,26 +969,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
946 | 969 | ||
947 | /* | 970 | /* |
948 | * Tracking of fully allocated slabs for debugging purposes. | 971 | * Tracking of fully allocated slabs for debugging purposes. |
972 | * | ||
973 | * list_lock must be held. | ||
949 | */ | 974 | */ |
950 | static void add_full(struct kmem_cache_node *n, struct page *page) | 975 | static void add_full(struct kmem_cache *s, |
976 | struct kmem_cache_node *n, struct page *page) | ||
951 | { | 977 | { |
952 | spin_lock(&n->list_lock); | 978 | if (!(s->flags & SLAB_STORE_USER)) |
979 | return; | ||
980 | |||
953 | list_add(&page->lru, &n->full); | 981 | list_add(&page->lru, &n->full); |
954 | spin_unlock(&n->list_lock); | ||
955 | } | 982 | } |
956 | 983 | ||
984 | /* | ||
985 | * list_lock must be held. | ||
986 | */ | ||
957 | static void remove_full(struct kmem_cache *s, struct page *page) | 987 | static void remove_full(struct kmem_cache *s, struct page *page) |
958 | { | 988 | { |
959 | struct kmem_cache_node *n; | ||
960 | |||
961 | if (!(s->flags & SLAB_STORE_USER)) | 989 | if (!(s->flags & SLAB_STORE_USER)) |
962 | return; | 990 | return; |
963 | 991 | ||
964 | n = get_node(s, page_to_nid(page)); | ||
965 | |||
966 | spin_lock(&n->list_lock); | ||
967 | list_del(&page->lru); | 992 | list_del(&page->lru); |
968 | spin_unlock(&n->list_lock); | ||
969 | } | 993 | } |
970 | 994 | ||
971 | /* Tracking of the number of slabs for debugging purposes */ | 995 | /* Tracking of the number of slabs for debugging purposes */ |
@@ -1021,11 +1045,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa | |||
1021 | if (!check_slab(s, page)) | 1045 | if (!check_slab(s, page)) |
1022 | goto bad; | 1046 | goto bad; |
1023 | 1047 | ||
1024 | if (!on_freelist(s, page, object)) { | ||
1025 | object_err(s, page, object, "Object already allocated"); | ||
1026 | goto bad; | ||
1027 | } | ||
1028 | |||
1029 | if (!check_valid_pointer(s, page, object)) { | 1048 | if (!check_valid_pointer(s, page, object)) { |
1030 | object_err(s, page, object, "Freelist Pointer check fails"); | 1049 | object_err(s, page, object, "Freelist Pointer check fails"); |
1031 | goto bad; | 1050 | goto bad; |
@@ -1058,6 +1077,12 @@ bad: | |||
1058 | static noinline int free_debug_processing(struct kmem_cache *s, | 1077 | static noinline int free_debug_processing(struct kmem_cache *s, |
1059 | struct page *page, void *object, unsigned long addr) | 1078 | struct page *page, void *object, unsigned long addr) |
1060 | { | 1079 | { |
1080 | unsigned long flags; | ||
1081 | int rc = 0; | ||
1082 | |||
1083 | local_irq_save(flags); | ||
1084 | slab_lock(page); | ||
1085 | |||
1061 | if (!check_slab(s, page)) | 1086 | if (!check_slab(s, page)) |
1062 | goto fail; | 1087 | goto fail; |
1063 | 1088 | ||
@@ -1072,7 +1097,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1072 | } | 1097 | } |
1073 | 1098 | ||
1074 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) | 1099 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) |
1075 | return 0; | 1100 | goto out; |
1076 | 1101 | ||
1077 | if (unlikely(s != page->slab)) { | 1102 | if (unlikely(s != page->slab)) { |
1078 | if (!PageSlab(page)) { | 1103 | if (!PageSlab(page)) { |
@@ -1089,18 +1114,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1089 | goto fail; | 1114 | goto fail; |
1090 | } | 1115 | } |
1091 | 1116 | ||
1092 | /* Special debug activities for freeing objects */ | ||
1093 | if (!PageSlubFrozen(page) && !page->freelist) | ||
1094 | remove_full(s, page); | ||
1095 | if (s->flags & SLAB_STORE_USER) | 1117 | if (s->flags & SLAB_STORE_USER) |
1096 | set_track(s, object, TRACK_FREE, addr); | 1118 | set_track(s, object, TRACK_FREE, addr); |
1097 | trace(s, page, object, 0); | 1119 | trace(s, page, object, 0); |
1098 | init_object(s, object, SLUB_RED_INACTIVE); | 1120 | init_object(s, object, SLUB_RED_INACTIVE); |
1099 | return 1; | 1121 | rc = 1; |
1122 | out: | ||
1123 | slab_unlock(page); | ||
1124 | local_irq_restore(flags); | ||
1125 | return rc; | ||
1100 | 1126 | ||
1101 | fail: | 1127 | fail: |
1102 | slab_fix(s, "Object at 0x%p not freed", object); | 1128 | slab_fix(s, "Object at 0x%p not freed", object); |
1103 | return 0; | 1129 | goto out; |
1104 | } | 1130 | } |
1105 | 1131 | ||
1106 | static int __init setup_slub_debug(char *str) | 1132 | static int __init setup_slub_debug(char *str) |
@@ -1200,7 +1226,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
1200 | { return 1; } | 1226 | { return 1; } |
1201 | static inline int check_object(struct kmem_cache *s, struct page *page, | 1227 | static inline int check_object(struct kmem_cache *s, struct page *page, |
1202 | void *object, u8 val) { return 1; } | 1228 | void *object, u8 val) { return 1; } |
1203 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1229 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1230 | struct page *page) {} | ||
1231 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | ||
1204 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1232 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1205 | unsigned long flags, const char *name, | 1233 | unsigned long flags, const char *name, |
1206 | void (*ctor)(void *)) | 1234 | void (*ctor)(void *)) |
@@ -1252,6 +1280,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1252 | struct kmem_cache_order_objects oo = s->oo; | 1280 | struct kmem_cache_order_objects oo = s->oo; |
1253 | gfp_t alloc_gfp; | 1281 | gfp_t alloc_gfp; |
1254 | 1282 | ||
1283 | flags &= gfp_allowed_mask; | ||
1284 | |||
1285 | if (flags & __GFP_WAIT) | ||
1286 | local_irq_enable(); | ||
1287 | |||
1255 | flags |= s->allocflags; | 1288 | flags |= s->allocflags; |
1256 | 1289 | ||
1257 | /* | 1290 | /* |
@@ -1268,12 +1301,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1268 | * Try a lower order alloc if possible | 1301 | * Try a lower order alloc if possible |
1269 | */ | 1302 | */ |
1270 | page = alloc_slab_page(flags, node, oo); | 1303 | page = alloc_slab_page(flags, node, oo); |
1271 | if (!page) | ||
1272 | return NULL; | ||
1273 | 1304 | ||
1274 | stat(s, ORDER_FALLBACK); | 1305 | if (page) |
1306 | stat(s, ORDER_FALLBACK); | ||
1275 | } | 1307 | } |
1276 | 1308 | ||
1309 | if (flags & __GFP_WAIT) | ||
1310 | local_irq_disable(); | ||
1311 | |||
1312 | if (!page) | ||
1313 | return NULL; | ||
1314 | |||
1277 | if (kmemcheck_enabled | 1315 | if (kmemcheck_enabled |
1278 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | 1316 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1279 | int pages = 1 << oo_order(oo); | 1317 | int pages = 1 << oo_order(oo); |
@@ -1340,7 +1378,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1340 | set_freepointer(s, last, NULL); | 1378 | set_freepointer(s, last, NULL); |
1341 | 1379 | ||
1342 | page->freelist = start; | 1380 | page->freelist = start; |
1343 | page->inuse = 0; | 1381 | page->inuse = page->objects; |
1382 | page->frozen = 1; | ||
1344 | out: | 1383 | out: |
1345 | return page; | 1384 | return page; |
1346 | } | 1385 | } |
@@ -1418,79 +1457,80 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
1418 | } | 1457 | } |
1419 | 1458 | ||
1420 | /* | 1459 | /* |
1421 | * Per slab locking using the pagelock | 1460 | * Management of partially allocated slabs. |
1422 | */ | 1461 | * |
1423 | static __always_inline void slab_lock(struct page *page) | 1462 | * list_lock must be held. |
1424 | { | ||
1425 | bit_spin_lock(PG_locked, &page->flags); | ||
1426 | } | ||
1427 | |||
1428 | static __always_inline void slab_unlock(struct page *page) | ||
1429 | { | ||
1430 | __bit_spin_unlock(PG_locked, &page->flags); | ||
1431 | } | ||
1432 | |||
1433 | static __always_inline int slab_trylock(struct page *page) | ||
1434 | { | ||
1435 | int rc = 1; | ||
1436 | |||
1437 | rc = bit_spin_trylock(PG_locked, &page->flags); | ||
1438 | return rc; | ||
1439 | } | ||
1440 | |||
1441 | /* | ||
1442 | * Management of partially allocated slabs | ||
1443 | */ | 1463 | */ |
1444 | static void add_partial(struct kmem_cache_node *n, | 1464 | static inline void add_partial(struct kmem_cache_node *n, |
1445 | struct page *page, int tail) | 1465 | struct page *page, int tail) |
1446 | { | 1466 | { |
1447 | spin_lock(&n->list_lock); | ||
1448 | n->nr_partial++; | 1467 | n->nr_partial++; |
1449 | if (tail) | 1468 | if (tail == DEACTIVATE_TO_TAIL) |
1450 | list_add_tail(&page->lru, &n->partial); | 1469 | list_add_tail(&page->lru, &n->partial); |
1451 | else | 1470 | else |
1452 | list_add(&page->lru, &n->partial); | 1471 | list_add(&page->lru, &n->partial); |
1453 | spin_unlock(&n->list_lock); | ||
1454 | } | 1472 | } |
1455 | 1473 | ||
1456 | static inline void __remove_partial(struct kmem_cache_node *n, | 1474 | /* |
1475 | * list_lock must be held. | ||
1476 | */ | ||
1477 | static inline void remove_partial(struct kmem_cache_node *n, | ||
1457 | struct page *page) | 1478 | struct page *page) |
1458 | { | 1479 | { |
1459 | list_del(&page->lru); | 1480 | list_del(&page->lru); |
1460 | n->nr_partial--; | 1481 | n->nr_partial--; |
1461 | } | 1482 | } |
1462 | 1483 | ||
1463 | static void remove_partial(struct kmem_cache *s, struct page *page) | ||
1464 | { | ||
1465 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1466 | |||
1467 | spin_lock(&n->list_lock); | ||
1468 | __remove_partial(n, page); | ||
1469 | spin_unlock(&n->list_lock); | ||
1470 | } | ||
1471 | |||
1472 | /* | 1484 | /* |
1473 | * Lock slab and remove from the partial list. | 1485 | * Lock slab, remove from the partial list and put the object into the |
1486 | * per cpu freelist. | ||
1487 | * | ||
1488 | * Returns a list of objects or NULL if it fails. | ||
1474 | * | 1489 | * |
1475 | * Must hold list_lock. | 1490 | * Must hold list_lock. |
1476 | */ | 1491 | */ |
1477 | static inline int lock_and_freeze_slab(struct kmem_cache_node *n, | 1492 | static inline void *acquire_slab(struct kmem_cache *s, |
1478 | struct page *page) | 1493 | struct kmem_cache_node *n, struct page *page, |
1494 | int mode) | ||
1479 | { | 1495 | { |
1480 | if (slab_trylock(page)) { | 1496 | void *freelist; |
1481 | __remove_partial(n, page); | 1497 | unsigned long counters; |
1482 | __SetPageSlubFrozen(page); | 1498 | struct page new; |
1483 | return 1; | 1499 | |
1484 | } | 1500 | /* |
1485 | return 0; | 1501 | * Zap the freelist and set the frozen bit. |
1502 | * The old freelist is the list of objects for the | ||
1503 | * per cpu allocation list. | ||
1504 | */ | ||
1505 | do { | ||
1506 | freelist = page->freelist; | ||
1507 | counters = page->counters; | ||
1508 | new.counters = counters; | ||
1509 | if (mode) | ||
1510 | new.inuse = page->objects; | ||
1511 | |||
1512 | VM_BUG_ON(new.frozen); | ||
1513 | new.frozen = 1; | ||
1514 | |||
1515 | } while (!__cmpxchg_double_slab(s, page, | ||
1516 | freelist, counters, | ||
1517 | NULL, new.counters, | ||
1518 | "lock and freeze")); | ||
1519 | |||
1520 | remove_partial(n, page); | ||
1521 | return freelist; | ||
1486 | } | 1522 | } |
1487 | 1523 | ||
1524 | static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); | ||
1525 | |||
1488 | /* | 1526 | /* |
1489 | * Try to allocate a partial slab from a specific node. | 1527 | * Try to allocate a partial slab from a specific node. |
1490 | */ | 1528 | */ |
1491 | static struct page *get_partial_node(struct kmem_cache_node *n) | 1529 | static void *get_partial_node(struct kmem_cache *s, |
1530 | struct kmem_cache_node *n, struct kmem_cache_cpu *c) | ||
1492 | { | 1531 | { |
1493 | struct page *page; | 1532 | struct page *page, *page2; |
1533 | void *object = NULL; | ||
1494 | 1534 | ||
1495 | /* | 1535 | /* |
1496 | * Racy check. If we mistakenly see no partial slabs then we | 1536 | * Racy check. If we mistakenly see no partial slabs then we |
@@ -1502,26 +1542,43 @@ static struct page *get_partial_node(struct kmem_cache_node *n) | |||
1502 | return NULL; | 1542 | return NULL; |
1503 | 1543 | ||
1504 | spin_lock(&n->list_lock); | 1544 | spin_lock(&n->list_lock); |
1505 | list_for_each_entry(page, &n->partial, lru) | 1545 | list_for_each_entry_safe(page, page2, &n->partial, lru) { |
1506 | if (lock_and_freeze_slab(n, page)) | 1546 | void *t = acquire_slab(s, n, page, object == NULL); |
1507 | goto out; | 1547 | int available; |
1508 | page = NULL; | 1548 | |
1509 | out: | 1549 | if (!t) |
1550 | break; | ||
1551 | |||
1552 | if (!object) { | ||
1553 | c->page = page; | ||
1554 | c->node = page_to_nid(page); | ||
1555 | stat(s, ALLOC_FROM_PARTIAL); | ||
1556 | object = t; | ||
1557 | available = page->objects - page->inuse; | ||
1558 | } else { | ||
1559 | page->freelist = t; | ||
1560 | available = put_cpu_partial(s, page, 0); | ||
1561 | } | ||
1562 | if (kmem_cache_debug(s) || available > s->cpu_partial / 2) | ||
1563 | break; | ||
1564 | |||
1565 | } | ||
1510 | spin_unlock(&n->list_lock); | 1566 | spin_unlock(&n->list_lock); |
1511 | return page; | 1567 | return object; |
1512 | } | 1568 | } |
1513 | 1569 | ||
1514 | /* | 1570 | /* |
1515 | * Get a page from somewhere. Search in increasing NUMA distances. | 1571 | * Get a page from somewhere. Search in increasing NUMA distances. |
1516 | */ | 1572 | */ |
1517 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | 1573 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, |
1574 | struct kmem_cache_cpu *c) | ||
1518 | { | 1575 | { |
1519 | #ifdef CONFIG_NUMA | 1576 | #ifdef CONFIG_NUMA |
1520 | struct zonelist *zonelist; | 1577 | struct zonelist *zonelist; |
1521 | struct zoneref *z; | 1578 | struct zoneref *z; |
1522 | struct zone *zone; | 1579 | struct zone *zone; |
1523 | enum zone_type high_zoneidx = gfp_zone(flags); | 1580 | enum zone_type high_zoneidx = gfp_zone(flags); |
1524 | struct page *page; | 1581 | void *object; |
1525 | 1582 | ||
1526 | /* | 1583 | /* |
1527 | * The defrag ratio allows a configuration of the tradeoffs between | 1584 | * The defrag ratio allows a configuration of the tradeoffs between |
@@ -1554,10 +1611,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1554 | 1611 | ||
1555 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1612 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1556 | n->nr_partial > s->min_partial) { | 1613 | n->nr_partial > s->min_partial) { |
1557 | page = get_partial_node(n); | 1614 | object = get_partial_node(s, n, c); |
1558 | if (page) { | 1615 | if (object) { |
1559 | put_mems_allowed(); | 1616 | put_mems_allowed(); |
1560 | return page; | 1617 | return object; |
1561 | } | 1618 | } |
1562 | } | 1619 | } |
1563 | } | 1620 | } |
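
In get_partial_node(), shown a few hunks above, the walk over n->partial moves to list_for_each_entry_safe() because acquire_slab() now unlinks pages inside the loop body; the "_safe" variant caches each element's successor before the body runs, so removal cannot break the traversal. The same idea on a plain singly linked list in standalone C; this is illustrative and not the kernel list API:

#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/* Delete all even-valued nodes; the successor is saved before a node may be freed. */
static void remove_even(struct node **head)
{
	struct node **link = head;
	struct node *cur, *next;

	for (cur = *head; cur; cur = next) {
		next = cur->next;		/* cache the successor up front */
		if (cur->val % 2 == 0) {
			*link = next;		/* unlink, then free safely */
			free(cur);
		} else {
			link = &cur->next;
		}
	}
}
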
@@ -1569,63 +1626,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1569 | /* | 1626 | /* |
1570 | * Get a partial page, lock it and return it. | 1627 | * Get a partial page, lock it and return it. |
1571 | */ | 1628 | */ |
1572 | static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | 1629 | static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, |
1630 | struct kmem_cache_cpu *c) | ||
1573 | { | 1631 | { |
1574 | struct page *page; | 1632 | void *object; |
1575 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; | 1633 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; |
1576 | 1634 | ||
1577 | page = get_partial_node(get_node(s, searchnode)); | 1635 | object = get_partial_node(s, get_node(s, searchnode), c); |
1578 | if (page || node != NUMA_NO_NODE) | 1636 | if (object || node != NUMA_NO_NODE) |
1579 | return page; | 1637 | return object; |
1580 | 1638 | ||
1581 | return get_any_partial(s, flags); | 1639 | return get_any_partial(s, flags, c); |
1582 | } | ||
1583 | |||
1584 | /* | ||
1585 | * Move a page back to the lists. | ||
1586 | * | ||
1587 | * Must be called with the slab lock held. | ||
1588 | * | ||
1589 | * On exit the slab lock will have been dropped. | ||
1590 | */ | ||
1591 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | ||
1592 | __releases(bitlock) | ||
1593 | { | ||
1594 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1595 | |||
1596 | __ClearPageSlubFrozen(page); | ||
1597 | if (page->inuse) { | ||
1598 | |||
1599 | if (page->freelist) { | ||
1600 | add_partial(n, page, tail); | ||
1601 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | ||
1602 | } else { | ||
1603 | stat(s, DEACTIVATE_FULL); | ||
1604 | if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) | ||
1605 | add_full(n, page); | ||
1606 | } | ||
1607 | slab_unlock(page); | ||
1608 | } else { | ||
1609 | stat(s, DEACTIVATE_EMPTY); | ||
1610 | if (n->nr_partial < s->min_partial) { | ||
1611 | /* | ||
1612 | * Adding an empty slab to the partial slabs in order | ||
1613 | * to avoid page allocator overhead. This slab needs | ||
1614 | * to come after the other slabs with objects in | ||
1615 | * so that the others get filled first. That way the | ||
1616 | * size of the partial list stays small. | ||
1617 | * | ||
1618 | * kmem_cache_shrink can reclaim any empty slabs from | ||
1619 | * the partial list. | ||
1620 | */ | ||
1621 | add_partial(n, page, 1); | ||
1622 | slab_unlock(page); | ||
1623 | } else { | ||
1624 | slab_unlock(page); | ||
1625 | stat(s, FREE_SLAB); | ||
1626 | discard_slab(s, page); | ||
1627 | } | ||
1628 | } | ||
1629 | } | 1640 | } |
1630 | 1641 | ||
1631 | #ifdef CONFIG_PREEMPT | 1642 | #ifdef CONFIG_PREEMPT |
@@ -1694,45 +1705,278 @@ void init_kmem_cache_cpus(struct kmem_cache *s) | |||
1694 | for_each_possible_cpu(cpu) | 1705 | for_each_possible_cpu(cpu) |
1695 | per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); | 1706 | per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); |
1696 | } | 1707 | } |
1708 | |||
1697 | /* | 1709 | /* |
1698 | * Remove the cpu slab | 1710 | * Remove the cpu slab |
1699 | */ | 1711 | */ |
1700 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1712 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1701 | __releases(bitlock) | ||
1702 | { | 1713 | { |
1714 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; | ||
1703 | struct page *page = c->page; | 1715 | struct page *page = c->page; |
1704 | int tail = 1; | 1716 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1705 | 1717 | int lock = 0; | |
1706 | if (page->freelist) | 1718 | enum slab_modes l = M_NONE, m = M_NONE; |
1719 | void *freelist; | ||
1720 | void *nextfree; | ||
1721 | int tail = DEACTIVATE_TO_HEAD; | ||
1722 | struct page new; | ||
1723 | struct page old; | ||
1724 | |||
1725 | if (page->freelist) { | ||
1707 | stat(s, DEACTIVATE_REMOTE_FREES); | 1726 | stat(s, DEACTIVATE_REMOTE_FREES); |
1727 | tail = DEACTIVATE_TO_TAIL; | ||
1728 | } | ||
1729 | |||
1730 | c->tid = next_tid(c->tid); | ||
1731 | c->page = NULL; | ||
1732 | freelist = c->freelist; | ||
1733 | c->freelist = NULL; | ||
1734 | |||
1735 | /* | ||
1736 | * Stage one: Free all available per cpu objects back | ||
1737 | * to the page freelist while it is still frozen. Leave the | ||
1738 | * last one. | ||
1739 | * | ||
1740 | * There is no need to take the list->lock because the page | ||
1741 | * is still frozen. | ||
1742 | */ | ||
1743 | while (freelist && (nextfree = get_freepointer(s, freelist))) { | ||
1744 | void *prior; | ||
1745 | unsigned long counters; | ||
1746 | |||
1747 | do { | ||
1748 | prior = page->freelist; | ||
1749 | counters = page->counters; | ||
1750 | set_freepointer(s, freelist, prior); | ||
1751 | new.counters = counters; | ||
1752 | new.inuse--; | ||
1753 | VM_BUG_ON(!new.frozen); | ||
1754 | |||
1755 | } while (!__cmpxchg_double_slab(s, page, | ||
1756 | prior, counters, | ||
1757 | freelist, new.counters, | ||
1758 | "drain percpu freelist")); | ||
1759 | |||
1760 | freelist = nextfree; | ||
1761 | } | ||
1762 | |||
1708 | /* | 1763 | /* |
1709 | * Merge cpu freelist into slab freelist. Typically we get here | 1764 | * Stage two: Ensure that the page is unfrozen while the |
1710 | * because both freelists are empty. So this is unlikely | 1765 | * list presence reflects the actual number of objects |
1711 | * to occur. | 1766 | * during unfreeze. |
1767 | * | ||
1768 | * We setup the list membership and then perform a cmpxchg | ||
1769 | * with the count. If there is a mismatch then the page | ||
1770 | * is not unfrozen but the page is on the wrong list. | ||
1771 | * | ||
1772 | * Then we restart the process which may have to remove | ||
1773 | * the page from the list that we just put it on again | ||
1774 | * because the number of objects in the slab may have | ||
1775 | * changed. | ||
1712 | */ | 1776 | */ |
1713 | while (unlikely(c->freelist)) { | 1777 | redo: |
1714 | void **object; | 1778 | |
1779 | old.freelist = page->freelist; | ||
1780 | old.counters = page->counters; | ||
1781 | VM_BUG_ON(!old.frozen); | ||
1715 | 1782 | ||
1716 | tail = 0; /* Hot objects. Put the slab first */ | 1783 | /* Determine target state of the slab */ |
1784 | new.counters = old.counters; | ||
1785 | if (freelist) { | ||
1786 | new.inuse--; | ||
1787 | set_freepointer(s, freelist, old.freelist); | ||
1788 | new.freelist = freelist; | ||
1789 | } else | ||
1790 | new.freelist = old.freelist; | ||
1717 | 1791 | ||
1718 | /* Retrieve object from cpu_freelist */ | 1792 | new.frozen = 0; |
1719 | object = c->freelist; | ||
1720 | c->freelist = get_freepointer(s, c->freelist); | ||
1721 | 1793 | ||
1722 | /* And put onto the regular freelist */ | 1794 | if (!new.inuse && n->nr_partial > s->min_partial) |
1723 | set_freepointer(s, object, page->freelist); | 1795 | m = M_FREE; |
1724 | page->freelist = object; | 1796 | else if (new.freelist) { |
1725 | page->inuse--; | 1797 | m = M_PARTIAL; |
1798 | if (!lock) { | ||
1799 | lock = 1; | ||
1800 | /* | ||
1801 | * Taking the spinlock removes the possibility | ||
1802 | * that acquire_slab() will see a slab page that | ||
1803 | * is frozen. | ||
1804 | */ | ||
1805 | spin_lock(&n->list_lock); | ||
1806 | } | ||
1807 | } else { | ||
1808 | m = M_FULL; | ||
1809 | if (kmem_cache_debug(s) && !lock) { | ||
1810 | lock = 1; | ||
1811 | /* | ||
1812 | * This also ensures that the scanning of full | ||
1813 | * slabs from diagnostic functions will not see | ||
1814 | * any frozen slabs. | ||
1815 | */ | ||
1816 | spin_lock(&n->list_lock); | ||
1817 | } | ||
1818 | } | ||
1819 | |||
1820 | if (l != m) { | ||
1821 | |||
1822 | if (l == M_PARTIAL) | ||
1823 | |||
1824 | remove_partial(n, page); | ||
1825 | |||
1826 | else if (l == M_FULL) | ||
1827 | |||
1828 | remove_full(s, page); | ||
1829 | |||
1830 | if (m == M_PARTIAL) { | ||
1831 | |||
1832 | add_partial(n, page, tail); | ||
1833 | stat(s, tail); | ||
1834 | |||
1835 | } else if (m == M_FULL) { | ||
1836 | |||
1837 | stat(s, DEACTIVATE_FULL); | ||
1838 | add_full(s, n, page); | ||
1839 | |||
1840 | } | ||
1841 | } | ||
1842 | |||
1843 | l = m; | ||
1844 | if (!__cmpxchg_double_slab(s, page, | ||
1845 | old.freelist, old.counters, | ||
1846 | new.freelist, new.counters, | ||
1847 | "unfreezing slab")) | ||
1848 | goto redo; | ||
1849 | |||
1850 | if (lock) | ||
1851 | spin_unlock(&n->list_lock); | ||
1852 | |||
1853 | if (m == M_FREE) { | ||
1854 | stat(s, DEACTIVATE_EMPTY); | ||
1855 | discard_slab(s, page); | ||
1856 | stat(s, FREE_SLAB); | ||
1726 | } | 1857 | } |
1727 | c->page = NULL; | 1858 | } |
1728 | c->tid = next_tid(c->tid); | 1859 | |
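The stage-one drain above is an instance of the standard lock-free update idiom: snapshot the shared state, build the desired new state from that snapshot, and let __cmpxchg_double_slab() retry when another CPU raced with us. A minimal standalone sketch of the same idiom, using a single C11 atomic pointer rather than the kernel's paired freelist/counters words:

    #include <stdatomic.h>

    struct node { struct node *next; };

    /* Push one node onto a lock-free LIFO: snapshot, link, retry on a race. */
    static void lockfree_push(_Atomic(struct node *) *head, struct node *n)
    {
            struct node *old = atomic_load(head);

            do {
                    n->next = old;  /* rebuild the new state from the snapshot */
            } while (!atomic_compare_exchange_weak(head, &old, n));
    }

The kernel version must update the freelist pointer and the packed counters together, which is why it needs the double-word cmpxchg rather than a single-pointer one.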
1729 | unfreeze_slab(s, page, tail); | 1860 | /* Unfreeze all the cpu partial slabs */ |
1861 | static void unfreeze_partials(struct kmem_cache *s) | ||
1862 | { | ||
1863 | struct kmem_cache_node *n = NULL; | ||
1864 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); | ||
1865 | struct page *page; | ||
1866 | |||
1867 | while ((page = c->partial)) { | ||
1868 | enum slab_modes { M_PARTIAL, M_FREE }; | ||
1869 | enum slab_modes l, m; | ||
1870 | struct page new; | ||
1871 | struct page old; | ||
1872 | |||
1873 | c->partial = page->next; | ||
1874 | l = M_FREE; | ||
1875 | |||
1876 | do { | ||
1877 | |||
1878 | old.freelist = page->freelist; | ||
1879 | old.counters = page->counters; | ||
1880 | VM_BUG_ON(!old.frozen); | ||
1881 | |||
1882 | new.counters = old.counters; | ||
1883 | new.freelist = old.freelist; | ||
1884 | |||
1885 | new.frozen = 0; | ||
1886 | |||
1887 | if (!new.inuse && (!n || n->nr_partial > s->min_partial)) | ||
1888 | m = M_FREE; | ||
1889 | else { | ||
1890 | struct kmem_cache_node *n2 = get_node(s, | ||
1891 | page_to_nid(page)); | ||
1892 | |||
1893 | m = M_PARTIAL; | ||
1894 | if (n != n2) { | ||
1895 | if (n) | ||
1896 | spin_unlock(&n->list_lock); | ||
1897 | |||
1898 | n = n2; | ||
1899 | spin_lock(&n->list_lock); | ||
1900 | } | ||
1901 | } | ||
1902 | |||
1903 | if (l != m) { | ||
1904 | if (l == M_PARTIAL) | ||
1905 | remove_partial(n, page); | ||
1906 | else | ||
1907 | add_partial(n, page, 1); | ||
1908 | |||
1909 | l = m; | ||
1910 | } | ||
1911 | |||
1912 | } while (!cmpxchg_double_slab(s, page, | ||
1913 | old.freelist, old.counters, | ||
1914 | new.freelist, new.counters, | ||
1915 | "unfreezing slab")); | ||
1916 | |||
1917 | if (m == M_FREE) { | ||
1918 | stat(s, DEACTIVATE_EMPTY); | ||
1919 | discard_slab(s, page); | ||
1920 | stat(s, FREE_SLAB); | ||
1921 | } | ||
1922 | } | ||
1923 | |||
1924 | if (n) | ||
1925 | spin_unlock(&n->list_lock); | ||
1926 | } | ||
1927 | |||
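unfreeze_partials() may touch pages belonging to different NUMA nodes, so it holds one node's list_lock at a time and only swaps locks when the node changes. The same batching shape, reduced to a self-contained sketch with pthread mutexes (all names here are illustrative, not kernel API):

    #include <pthread.h>
    #include <stddef.h>

    struct bucket { pthread_mutex_t lock; };
    struct item   { struct item *next; struct bucket *home; int value; };

    /* Walk a list, re-taking the per-bucket lock only when the bucket changes,
     * the same way unfreeze_partials() juggles n->list_lock across nodes. */
    static long sum_items(struct item *list)
    {
            struct bucket *locked = NULL;
            long total = 0;
            struct item *it;

            for (it = list; it; it = it->next) {
                    if (it->home != locked) {
                            if (locked)
                                    pthread_mutex_unlock(&locked->lock);
                            locked = it->home;
                            pthread_mutex_lock(&locked->lock);
                    }
                    total += it->value;
            }
            if (locked)
                    pthread_mutex_unlock(&locked->lock);
            return total;
    }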
1928 | /* | ||
1929 | * Put a page that was just frozen (in __slab_free) into a partial page | ||
1930 | * slot if available. This is done with interrupts and preemption | ||
1931 | * enabled. The cmpxchg is racy and may put the partial page | ||
1932 | * onto a random cpu's partial slot. | ||
1933 | * | ||
1934 | * If we did not find a slot then simply move all the partials to the | ||
1935 | * per node partial list. | ||
1936 | */ | ||
1937 | int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | ||
1938 | { | ||
1939 | struct page *oldpage; | ||
1940 | int pages; | ||
1941 | int pobjects; | ||
1942 | |||
1943 | do { | ||
1944 | pages = 0; | ||
1945 | pobjects = 0; | ||
1946 | oldpage = this_cpu_read(s->cpu_slab->partial); | ||
1947 | |||
1948 | if (oldpage) { | ||
1949 | pobjects = oldpage->pobjects; | ||
1950 | pages = oldpage->pages; | ||
1951 | if (drain && pobjects > s->cpu_partial) { | ||
1952 | unsigned long flags; | ||
1953 | /* | ||
1954 | * The partial array is full. Move the existing | ||
1955 | * set to the per node partial list. | ||
1956 | */ | ||
1957 | local_irq_save(flags); | ||
1958 | unfreeze_partials(s); | ||
1959 | local_irq_restore(flags); | ||
1960 | pobjects = 0; | ||
1961 | pages = 0; | ||
1962 | } | ||
1963 | } | ||
1964 | |||
1965 | pages++; | ||
1966 | pobjects += page->objects - page->inuse; | ||
1967 | |||
1968 | page->pages = pages; | ||
1969 | page->pobjects = pobjects; | ||
1970 | page->next = oldpage; | ||
1971 | |||
1972 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); | ||
1973 | stat(s, CPU_PARTIAL_FREE); | ||
1974 | return pobjects; | ||
1730 | } | 1975 | } |
1731 | 1976 | ||
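The cpu partial list that put_cpu_partial() maintains is just a singly linked chain of slab pages threaded through page->next, with the head page caching the running totals in page->pages and page->pobjects. A standalone model of that layout, for illustration only:

    /* Standalone model of the cpu partial chain: pages linked through ->next,
     * with running totals cached in the head page only. */
    struct partial_page {
            struct partial_page *next;
            int pages;      /* total pages on the chain, valid in the head */
            int pobjects;   /* total free objects on the chain, head only */
    };

    static int chain_length(const struct partial_page *head)
    {
            int nr = 0;

            for (; head; head = head->next)
                    nr++;
            return nr;
    }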
1732 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1977 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1733 | { | 1978 | { |
1734 | stat(s, CPUSLAB_FLUSH); | 1979 | stat(s, CPUSLAB_FLUSH); |
1735 | slab_lock(c->page); | ||
1736 | deactivate_slab(s, c); | 1980 | deactivate_slab(s, c); |
1737 | } | 1981 | } |
1738 | 1982 | ||
@@ -1745,8 +1989,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | |||
1745 | { | 1989 | { |
1746 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); | 1990 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
1747 | 1991 | ||
1748 | if (likely(c && c->page)) | 1992 | if (likely(c)) { |
1749 | flush_slab(s, c); | 1993 | if (c->page) |
1994 | flush_slab(s, c); | ||
1995 | |||
1996 | unfreeze_partials(s); | ||
1997 | } | ||
1750 | } | 1998 | } |
1751 | 1999 | ||
1752 | static void flush_cpu_slab(void *d) | 2000 | static void flush_cpu_slab(void *d) |
@@ -1837,12 +2085,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
1837 | } | 2085 | } |
1838 | } | 2086 | } |
1839 | 2087 | ||
2088 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | ||
2089 | int node, struct kmem_cache_cpu **pc) | ||
2090 | { | ||
2091 | void *object; | ||
2092 | struct kmem_cache_cpu *c; | ||
2093 | struct page *page = new_slab(s, flags, node); | ||
2094 | |||
2095 | if (page) { | ||
2096 | c = __this_cpu_ptr(s->cpu_slab); | ||
2097 | if (c->page) | ||
2098 | flush_slab(s, c); | ||
2099 | |||
2100 | /* | ||
2101 | * No other reference to the page exists yet, so we can | ||
2102 | * muck around with it freely without cmpxchg. | ||
2103 | */ | ||
2104 | object = page->freelist; | ||
2105 | page->freelist = NULL; | ||
2106 | |||
2107 | stat(s, ALLOC_SLAB); | ||
2108 | c->node = page_to_nid(page); | ||
2109 | c->page = page; | ||
2110 | *pc = c; | ||
2111 | } else | ||
2112 | object = NULL; | ||
2113 | |||
2114 | return object; | ||
2115 | } | ||
2116 | |||
1840 | /* | 2117 | /* |
1841 | * Slow path. The lockless freelist is empty or we need to perform | 2118 | * Slow path. The lockless freelist is empty or we need to perform |
1842 | * debugging duties. | 2119 | * debugging duties. |
1843 | * | 2120 | * |
1844 | * Interrupts are disabled. | ||
1845 | * | ||
1846 | * Processing is still very fast if new objects have been freed to the | 2121 | * Processing is still very fast if new objects have been freed to the |
1847 | * regular freelist. In that case we simply take over the regular freelist | 2122 | * regular freelist. In that case we simply take over the regular freelist |
1848 | * as the lockless freelist and zap the regular freelist. | 2123 | * as the lockless freelist and zap the regular freelist. |
@@ -1859,8 +2134,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1859 | unsigned long addr, struct kmem_cache_cpu *c) | 2134 | unsigned long addr, struct kmem_cache_cpu *c) |
1860 | { | 2135 | { |
1861 | void **object; | 2136 | void **object; |
1862 | struct page *page; | ||
1863 | unsigned long flags; | 2137 | unsigned long flags; |
2138 | struct page new; | ||
2139 | unsigned long counters; | ||
1864 | 2140 | ||
1865 | local_irq_save(flags); | 2141 | local_irq_save(flags); |
1866 | #ifdef CONFIG_PREEMPT | 2142 | #ifdef CONFIG_PREEMPT |
@@ -1872,81 +2148,91 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1872 | c = this_cpu_ptr(s->cpu_slab); | 2148 | c = this_cpu_ptr(s->cpu_slab); |
1873 | #endif | 2149 | #endif |
1874 | 2150 | ||
1875 | /* We handle __GFP_ZERO in the caller */ | 2151 | if (!c->page) |
1876 | gfpflags &= ~__GFP_ZERO; | ||
1877 | |||
1878 | page = c->page; | ||
1879 | if (!page) | ||
1880 | goto new_slab; | 2152 | goto new_slab; |
2153 | redo: | ||
2154 | if (unlikely(!node_match(c, node))) { | ||
2155 | stat(s, ALLOC_NODE_MISMATCH); | ||
2156 | deactivate_slab(s, c); | ||
2157 | goto new_slab; | ||
2158 | } | ||
1881 | 2159 | ||
1882 | slab_lock(page); | 2160 | stat(s, ALLOC_SLOWPATH); |
1883 | if (unlikely(!node_match(c, node))) | 2161 | |
1884 | goto another_slab; | 2162 | do { |
2163 | object = c->page->freelist; | ||
2164 | counters = c->page->counters; | ||
2165 | new.counters = counters; | ||
2166 | VM_BUG_ON(!new.frozen); | ||
2167 | |||
2168 | /* | ||
2169 | * If there is no object left then we use this loop to | ||
2170 | * deactivate the slab, which is simple since no objects | ||
2171 | * are left in the slab and therefore we do not need to | ||
2172 | * put the page back onto the partial list. | ||
2173 | * | ||
2174 | * If there are objects left then we retrieve them | ||
2175 | * and use them to refill the per cpu queue. | ||
2176 | */ | ||
2177 | |||
2178 | new.inuse = c->page->objects; | ||
2179 | new.frozen = object != NULL; | ||
2180 | |||
2181 | } while (!__cmpxchg_double_slab(s, c->page, | ||
2182 | object, counters, | ||
2183 | NULL, new.counters, | ||
2184 | "__slab_alloc")); | ||
2185 | |||
2186 | if (!object) { | ||
2187 | c->page = NULL; | ||
2188 | stat(s, DEACTIVATE_BYPASS); | ||
2189 | goto new_slab; | ||
2190 | } | ||
1885 | 2191 | ||
1886 | stat(s, ALLOC_REFILL); | 2192 | stat(s, ALLOC_REFILL); |
1887 | 2193 | ||
1888 | load_freelist: | 2194 | load_freelist: |
1889 | object = page->freelist; | ||
1890 | if (unlikely(!object)) | ||
1891 | goto another_slab; | ||
1892 | if (kmem_cache_debug(s)) | ||
1893 | goto debug; | ||
1894 | |||
1895 | c->freelist = get_freepointer(s, object); | 2195 | c->freelist = get_freepointer(s, object); |
1896 | page->inuse = page->objects; | ||
1897 | page->freelist = NULL; | ||
1898 | |||
1899 | slab_unlock(page); | ||
1900 | c->tid = next_tid(c->tid); | 2196 | c->tid = next_tid(c->tid); |
1901 | local_irq_restore(flags); | 2197 | local_irq_restore(flags); |
1902 | stat(s, ALLOC_SLOWPATH); | ||
1903 | return object; | 2198 | return object; |
1904 | 2199 | ||
1905 | another_slab: | ||
1906 | deactivate_slab(s, c); | ||
1907 | |||
1908 | new_slab: | 2200 | new_slab: |
1909 | page = get_partial(s, gfpflags, node); | 2201 | |
1910 | if (page) { | 2202 | if (c->partial) { |
1911 | stat(s, ALLOC_FROM_PARTIAL); | 2203 | c->page = c->partial; |
1912 | c->node = page_to_nid(page); | 2204 | c->partial = c->page->next; |
1913 | c->page = page; | 2205 | c->node = page_to_nid(c->page); |
1914 | goto load_freelist; | 2206 | stat(s, CPU_PARTIAL_ALLOC); |
2207 | c->freelist = NULL; | ||
2208 | goto redo; | ||
1915 | } | 2209 | } |
1916 | 2210 | ||
1917 | gfpflags &= gfp_allowed_mask; | 2211 | /* Then do expensive stuff like retrieving pages from the partial lists */ |
1918 | if (gfpflags & __GFP_WAIT) | 2212 | object = get_partial(s, gfpflags, node, c); |
1919 | local_irq_enable(); | ||
1920 | 2213 | ||
1921 | page = new_slab(s, gfpflags, node); | 2214 | if (unlikely(!object)) { |
1922 | 2215 | ||
1923 | if (gfpflags & __GFP_WAIT) | 2216 | object = new_slab_objects(s, gfpflags, node, &c); |
1924 | local_irq_disable(); | ||
1925 | 2217 | ||
1926 | if (page) { | 2218 | if (unlikely(!object)) { |
1927 | c = __this_cpu_ptr(s->cpu_slab); | 2219 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) |
1928 | stat(s, ALLOC_SLAB); | 2220 | slab_out_of_memory(s, gfpflags, node); |
1929 | if (c->page) | ||
1930 | flush_slab(s, c); | ||
1931 | 2221 | ||
1932 | slab_lock(page); | 2222 | local_irq_restore(flags); |
1933 | __SetPageSlubFrozen(page); | 2223 | return NULL; |
1934 | c->node = page_to_nid(page); | 2224 | } |
1935 | c->page = page; | ||
1936 | goto load_freelist; | ||
1937 | } | 2225 | } |
1938 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | ||
1939 | slab_out_of_memory(s, gfpflags, node); | ||
1940 | local_irq_restore(flags); | ||
1941 | return NULL; | ||
1942 | debug: | ||
1943 | if (!alloc_debug_processing(s, page, object, addr)) | ||
1944 | goto another_slab; | ||
1945 | 2226 | ||
1946 | page->inuse++; | 2227 | if (likely(!kmem_cache_debug(s))) |
1947 | page->freelist = get_freepointer(s, object); | 2228 | goto load_freelist; |
2229 | |||
2230 | /* Only entered in the debug case */ | ||
2231 | if (!alloc_debug_processing(s, c->page, object, addr)) | ||
2232 | goto new_slab; /* Slab failed checks. Next slab needed */ | ||
2233 | |||
2234 | c->freelist = get_freepointer(s, object); | ||
1948 | deactivate_slab(s, c); | 2235 | deactivate_slab(s, c); |
1949 | c->page = NULL; | ||
1950 | c->node = NUMA_NO_NODE; | 2236 | c->node = NUMA_NO_NODE; |
1951 | local_irq_restore(flags); | 2237 | local_irq_restore(flags); |
1952 | return object; | 2238 | return object; |
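The redo loop in __slab_alloc() uses __cmpxchg_double_slab() to take the whole page freelist in one shot, leaving NULL behind, so ownership of every free object moves to this CPU atomically. Stripped of the frozen/counters bookkeeping, the core move is a single atomic exchange; a toy version on one atomic pointer:

    #include <stdatomic.h>
    #include <stddef.h>

    struct obj { struct obj *next; };

    /* Detach an entire freelist in one atomic exchange; the caller now owns it. */
    static struct obj *take_all(_Atomic(struct obj *) *freelist)
    {
            return atomic_exchange(freelist, NULL);
    }

In the real code the exchange also publishes new.frozen and the updated inuse count, which is what the packed counters word carries.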
@@ -2096,52 +2382,110 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2096 | { | 2382 | { |
2097 | void *prior; | 2383 | void *prior; |
2098 | void **object = (void *)x; | 2384 | void **object = (void *)x; |
2099 | unsigned long flags; | 2385 | int was_frozen; |
2386 | int inuse; | ||
2387 | struct page new; | ||
2388 | unsigned long counters; | ||
2389 | struct kmem_cache_node *n = NULL; | ||
2390 | unsigned long uninitialized_var(flags); | ||
2100 | 2391 | ||
2101 | local_irq_save(flags); | ||
2102 | slab_lock(page); | ||
2103 | stat(s, FREE_SLOWPATH); | 2392 | stat(s, FREE_SLOWPATH); |
2104 | 2393 | ||
2105 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) | 2394 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) |
2106 | goto out_unlock; | 2395 | return; |
2107 | 2396 | ||
2108 | prior = page->freelist; | 2397 | do { |
2109 | set_freepointer(s, object, prior); | 2398 | prior = page->freelist; |
2110 | page->freelist = object; | 2399 | counters = page->counters; |
2111 | page->inuse--; | 2400 | set_freepointer(s, object, prior); |
2401 | new.counters = counters; | ||
2402 | was_frozen = new.frozen; | ||
2403 | new.inuse--; | ||
2404 | if ((!new.inuse || !prior) && !was_frozen && !n) { | ||
2112 | 2405 | ||
2113 | if (unlikely(PageSlubFrozen(page))) { | 2406 | if (!kmem_cache_debug(s) && !prior) |
2114 | stat(s, FREE_FROZEN); | 2407 | |
2115 | goto out_unlock; | 2408 | /* |
2116 | } | 2409 | * Slab was on no list before and will be partially empty |
2410 | * We can defer the list move and instead freeze it. | ||
2411 | */ | ||
2412 | new.frozen = 1; | ||
2413 | |||
2414 | else { /* Needs to be taken off a list */ | ||
2415 | |||
2416 | n = get_node(s, page_to_nid(page)); | ||
2417 | /* | ||
2418 | * Speculatively acquire the list_lock. | ||
2419 | * If the cmpxchg does not succeed then we may | ||
2420 | * drop the list_lock without any processing. | ||
2421 | * | ||
2422 | * Otherwise the list_lock will synchronize with | ||
2423 | * other processors updating the list of slabs. | ||
2424 | */ | ||
2425 | spin_lock_irqsave(&n->list_lock, flags); | ||
2426 | |||
2427 | } | ||
2428 | } | ||
2429 | inuse = new.inuse; | ||
2117 | 2430 | ||
2118 | if (unlikely(!page->inuse)) | 2431 | } while (!cmpxchg_double_slab(s, page, |
2119 | goto slab_empty; | 2432 | prior, counters, |
2433 | object, new.counters, | ||
2434 | "__slab_free")); | ||
2435 | |||
2436 | if (likely(!n)) { | ||
2437 | |||
2438 | /* | ||
2439 | * If we just froze the page then put it onto the | ||
2440 | * per cpu partial list. | ||
2441 | */ | ||
2442 | if (new.frozen && !was_frozen) | ||
2443 | put_cpu_partial(s, page, 1); | ||
2444 | |||
2445 | /* | ||
2446 | * The list lock was not taken, therefore no list | ||
2447 | * activity can be necessary. | ||
2448 | */ | ||
2449 | if (was_frozen) | ||
2450 | stat(s, FREE_FROZEN); | ||
2451 | return; | ||
2452 | } | ||
2120 | 2453 | ||
2121 | /* | 2454 | /* |
2122 | * Objects left in the slab. If it was not on the partial list before | 2455 | * was_frozen may have been set after we acquired the list_lock in |
2123 | * then add it. | 2456 | * an earlier loop. So we need to check it here again. |
2124 | */ | 2457 | */ |
2125 | if (unlikely(!prior)) { | 2458 | if (was_frozen) |
2126 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 2459 | stat(s, FREE_FROZEN); |
2127 | stat(s, FREE_ADD_PARTIAL); | 2460 | else { |
2128 | } | 2461 | if (unlikely(!inuse && n->nr_partial > s->min_partial)) |
2462 | goto slab_empty; | ||
2129 | 2463 | ||
2130 | out_unlock: | 2464 | /* |
2131 | slab_unlock(page); | 2465 | * Objects left in the slab. If it was not on the partial list before |
2132 | local_irq_restore(flags); | 2466 | * then add it. |
2467 | */ | ||
2468 | if (unlikely(!prior)) { | ||
2469 | remove_full(s, page); | ||
2470 | add_partial(n, page, DEACTIVATE_TO_TAIL); | ||
2471 | stat(s, FREE_ADD_PARTIAL); | ||
2472 | } | ||
2473 | } | ||
2474 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2133 | return; | 2475 | return; |
2134 | 2476 | ||
2135 | slab_empty: | 2477 | slab_empty: |
2136 | if (prior) { | 2478 | if (prior) { |
2137 | /* | 2479 | /* |
2138 | * Slab still on the partial list. | 2480 | * Slab on the partial list. |
2139 | */ | 2481 | */ |
2140 | remove_partial(s, page); | 2482 | remove_partial(n, page); |
2141 | stat(s, FREE_REMOVE_PARTIAL); | 2483 | stat(s, FREE_REMOVE_PARTIAL); |
2142 | } | 2484 | } else |
2143 | slab_unlock(page); | 2485 | /* Slab must be on the full list */ |
2144 | local_irq_restore(flags); | 2486 | remove_full(s, page); |
2487 | |||
2488 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2145 | stat(s, FREE_SLAB); | 2489 | stat(s, FREE_SLAB); |
2146 | discard_slab(s, page); | 2490 | discard_slab(s, page); |
2147 | } | 2491 | } |
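The rewritten __slab_free() is optimistic: the cmpxchg_double loop runs with no lock held, and n->list_lock is taken speculatively only when the computed transition might move the page between lists. The "take the lock only for the rare transition" shape, reduced to a self-contained refcount example (pthread names are stand-ins for the kernel's spinlock):

    #include <pthread.h>
    #include <stdatomic.h>

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Drop a reference; only take the list lock for the 1 -> 0 transition. */
    static void put_ref(_Atomic int *refs)
    {
            if (atomic_fetch_sub(refs, 1) != 1)
                    return;                 /* common case: no list work, no lock */

            pthread_mutex_lock(&list_lock);
            /* ... move the now-unused object off the active list ... */
            pthread_mutex_unlock(&list_lock);
    }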
@@ -2167,7 +2511,6 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
2167 | slab_free_hook(s, x); | 2511 | slab_free_hook(s, x); |
2168 | 2512 | ||
2169 | redo: | 2513 | redo: |
2170 | |||
2171 | /* | 2514 | /* |
2172 | * Determine the current cpu's per cpu slab. | 2515 | * Determine the current cpu's per cpu slab. |
2173 | * The cpu may change afterward. However that does not matter since | 2516 | * The cpu may change afterward. However that does not matter since |
@@ -2415,7 +2758,6 @@ static void early_kmem_cache_node_alloc(int node) | |||
2415 | { | 2758 | { |
2416 | struct page *page; | 2759 | struct page *page; |
2417 | struct kmem_cache_node *n; | 2760 | struct kmem_cache_node *n; |
2418 | unsigned long flags; | ||
2419 | 2761 | ||
2420 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); | 2762 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); |
2421 | 2763 | ||
@@ -2432,7 +2774,8 @@ static void early_kmem_cache_node_alloc(int node) | |||
2432 | n = page->freelist; | 2774 | n = page->freelist; |
2433 | BUG_ON(!n); | 2775 | BUG_ON(!n); |
2434 | page->freelist = get_freepointer(kmem_cache_node, n); | 2776 | page->freelist = get_freepointer(kmem_cache_node, n); |
2435 | page->inuse++; | 2777 | page->inuse = 1; |
2778 | page->frozen = 0; | ||
2436 | kmem_cache_node->node[node] = n; | 2779 | kmem_cache_node->node[node] = n; |
2437 | #ifdef CONFIG_SLUB_DEBUG | 2780 | #ifdef CONFIG_SLUB_DEBUG |
2438 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2781 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
@@ -2441,14 +2784,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2441 | init_kmem_cache_node(n, kmem_cache_node); | 2784 | init_kmem_cache_node(n, kmem_cache_node); |
2442 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2785 | inc_slabs_node(kmem_cache_node, node, page->objects); |
2443 | 2786 | ||
2444 | /* | 2787 | add_partial(n, page, DEACTIVATE_TO_HEAD); |
2445 | * lockdep requires consistent irq usage for each lock | ||
2446 | * so even though there cannot be a race this early in | ||
2447 | * the boot sequence, we still disable irqs. | ||
2448 | */ | ||
2449 | local_irq_save(flags); | ||
2450 | add_partial(n, page, 0); | ||
2451 | local_irq_restore(flags); | ||
2452 | } | 2788 | } |
2453 | 2789 | ||
2454 | static void free_kmem_cache_nodes(struct kmem_cache *s) | 2790 | static void free_kmem_cache_nodes(struct kmem_cache *s) |
@@ -2654,11 +2990,44 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
2654 | } | 2990 | } |
2655 | } | 2991 | } |
2656 | 2992 | ||
2993 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
2994 | if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) | ||
2995 | /* Enable fast mode */ | ||
2996 | s->flags |= __CMPXCHG_DOUBLE; | ||
2997 | #endif | ||
2998 | |||
2657 | /* | 2999 | /* |
2658 | * The larger the object size is, the more pages we want on the partial | 3000 | * The larger the object size is, the more pages we want on the partial |
2659 | * list to avoid pounding the page allocator excessively. | 3001 | * list to avoid pounding the page allocator excessively. |
2660 | */ | 3002 | */ |
2661 | set_min_partial(s, ilog2(s->size)); | 3003 | set_min_partial(s, ilog2(s->size) / 2); |
3004 | |||
3005 | /* | ||
3006 | * cpu_partial determines the maximum number of objects kept in the | ||
3007 | * per cpu partial lists of a processor. | ||
3008 | * | ||
3009 | * Per cpu partial lists mainly contain slabs that just have one | ||
3010 | * object freed. If they are used for allocation then they can be | ||
3011 | * filled up again with minimal effort. The slab will never hit the | ||
3012 | * per node partial lists and therefore no locking will be required. | ||
3013 | * | ||
3014 | * This setting also determines | ||
3015 | * | ||
3016 | * A) The number of objects from per cpu partial slabs dumped to the | ||
3017 | * per node list when we reach the limit. | ||
3018 | * B) The number of objects in cpu partial slabs to extract from the | ||
3019 | * per node list when we run out of per cpu objects. We only fetch 50% | ||
3020 | * to keep some capacity around for frees. | ||
3021 | */ | ||
3022 | if (s->size >= PAGE_SIZE) | ||
3023 | s->cpu_partial = 2; | ||
3024 | else if (s->size >= 1024) | ||
3025 | s->cpu_partial = 6; | ||
3026 | else if (s->size >= 256) | ||
3027 | s->cpu_partial = 13; | ||
3028 | else | ||
3029 | s->cpu_partial = 30; | ||
3030 | |||
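The cpu_partial defaults above are a simple step function of the object size: the bigger the object, the fewer spare objects each CPU is allowed to keep. Restated as a helper purely for illustration (the PAGE_SIZE value is an assumption for the standalone sketch):

    #ifndef PAGE_SIZE
    #define PAGE_SIZE 4096UL        /* assumption for the standalone sketch */
    #endif

    /* Illustrative restatement of the cpu_partial sizing policy above. */
    static unsigned int cpu_partial_for_size(unsigned long size)
    {
            if (size >= PAGE_SIZE)
                    return 2;       /* huge objects: keep almost nothing per cpu */
            if (size >= 1024)
                    return 6;
            if (size >= 256)
                    return 13;
            return 30;              /* small objects: a larger reserve is cheap */
    }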
2662 | s->refcount = 1; | 3031 | s->refcount = 1; |
2663 | #ifdef CONFIG_NUMA | 3032 | #ifdef CONFIG_NUMA |
2664 | s->remote_node_defrag_ratio = 1000; | 3033 | s->remote_node_defrag_ratio = 1000; |
@@ -2717,23 +3086,22 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
2717 | 3086 | ||
2718 | /* | 3087 | /* |
2719 | * Attempt to free all partial slabs on a node. | 3088 | * Attempt to free all partial slabs on a node. |
3089 | * This is called from kmem_cache_close(). We must be the last thread | ||
3090 | * using the cache and therefore we do not need to lock anymore. | ||
2720 | */ | 3091 | */ |
2721 | static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) | 3092 | static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) |
2722 | { | 3093 | { |
2723 | unsigned long flags; | ||
2724 | struct page *page, *h; | 3094 | struct page *page, *h; |
2725 | 3095 | ||
2726 | spin_lock_irqsave(&n->list_lock, flags); | ||
2727 | list_for_each_entry_safe(page, h, &n->partial, lru) { | 3096 | list_for_each_entry_safe(page, h, &n->partial, lru) { |
2728 | if (!page->inuse) { | 3097 | if (!page->inuse) { |
2729 | __remove_partial(n, page); | 3098 | remove_partial(n, page); |
2730 | discard_slab(s, page); | 3099 | discard_slab(s, page); |
2731 | } else { | 3100 | } else { |
2732 | list_slab_objects(s, page, | 3101 | list_slab_objects(s, page, |
2733 | "Objects remaining on kmem_cache_close()"); | 3102 | "Objects remaining on kmem_cache_close()"); |
2734 | } | 3103 | } |
2735 | } | 3104 | } |
2736 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2737 | } | 3105 | } |
2738 | 3106 | ||
2739 | /* | 3107 | /* |
@@ -2767,6 +3135,7 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
2767 | s->refcount--; | 3135 | s->refcount--; |
2768 | if (!s->refcount) { | 3136 | if (!s->refcount) { |
2769 | list_del(&s->list); | 3137 | list_del(&s->list); |
3138 | up_write(&slub_lock); | ||
2770 | if (kmem_cache_close(s)) { | 3139 | if (kmem_cache_close(s)) { |
2771 | printk(KERN_ERR "SLUB %s: %s called for cache that " | 3140 | printk(KERN_ERR "SLUB %s: %s called for cache that " |
2772 | "still has objects.\n", s->name, __func__); | 3141 | "still has objects.\n", s->name, __func__); |
@@ -2775,8 +3144,8 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
2775 | if (s->flags & SLAB_DESTROY_BY_RCU) | 3144 | if (s->flags & SLAB_DESTROY_BY_RCU) |
2776 | rcu_barrier(); | 3145 | rcu_barrier(); |
2777 | sysfs_slab_remove(s); | 3146 | sysfs_slab_remove(s); |
2778 | } | 3147 | } else |
2779 | up_write(&slub_lock); | 3148 | up_write(&slub_lock); |
2780 | } | 3149 | } |
2781 | EXPORT_SYMBOL(kmem_cache_destroy); | 3150 | EXPORT_SYMBOL(kmem_cache_destroy); |
2782 | 3151 | ||
@@ -3094,29 +3463,23 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
3094 | * list_lock. page->inuse here is the upper limit. | 3463 | * list_lock. page->inuse here is the upper limit. |
3095 | */ | 3464 | */ |
3096 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 3465 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
3097 | if (!page->inuse && slab_trylock(page)) { | 3466 | list_move(&page->lru, slabs_by_inuse + page->inuse); |
3098 | /* | 3467 | if (!page->inuse) |
3099 | * Must hold slab lock here because slab_free | 3468 | n->nr_partial--; |
3100 | * may have freed the last object and be | ||
3101 | * waiting to release the slab. | ||
3102 | */ | ||
3103 | __remove_partial(n, page); | ||
3104 | slab_unlock(page); | ||
3105 | discard_slab(s, page); | ||
3106 | } else { | ||
3107 | list_move(&page->lru, | ||
3108 | slabs_by_inuse + page->inuse); | ||
3109 | } | ||
3110 | } | 3469 | } |
3111 | 3470 | ||
3112 | /* | 3471 | /* |
3113 | * Rebuild the partial list with the slabs filled up most | 3472 | * Rebuild the partial list with the slabs filled up most |
3114 | * first and the least used slabs at the end. | 3473 | * first and the least used slabs at the end. |
3115 | */ | 3474 | */ |
3116 | for (i = objects - 1; i >= 0; i--) | 3475 | for (i = objects - 1; i > 0; i--) |
3117 | list_splice(slabs_by_inuse + i, n->partial.prev); | 3476 | list_splice(slabs_by_inuse + i, n->partial.prev); |
3118 | 3477 | ||
3119 | spin_unlock_irqrestore(&n->list_lock, flags); | 3478 | spin_unlock_irqrestore(&n->list_lock, flags); |
3479 | |||
3480 | /* Release empty slabs */ | ||
3481 | list_for_each_entry_safe(page, t, slabs_by_inuse, lru) | ||
3482 | discard_slab(s, page); | ||
3120 | } | 3483 | } |
3121 | 3484 | ||
3122 | kfree(slabs_by_inuse); | 3485 | kfree(slabs_by_inuse); |
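The shrink pass above is effectively a counting sort: each partial slab is dropped into the bucket matching its inuse count, the buckets are spliced back fullest-first, and bucket 0 (completely empty slabs) is released instead of being requeued, which is why the rebuild loop now stops at i > 0. A toy demonstration of the same bucketing, using plain integers instead of slab pages:

    #include <stdio.h>

    /* Counting-sort flavour of the shrink pass: bucket values by a small key
     * (here 0..OBJECTS-1), then emit the fullest buckets first. */
    #define OBJECTS 4

    int main(void)
    {
            int inuse[] = { 1, 3, 0, 2, 3, 1, 0 };
            int buckets[OBJECTS] = { 0 };
            int i, k;

            for (i = 0; i < (int)(sizeof(inuse) / sizeof(inuse[0])); i++)
                    buckets[inuse[i]]++;

            /* "Rebuild the partial list with the slabs filled up most first" */
            for (k = OBJECTS - 1; k > 0; k--)
                    printf("inuse=%d: %d slabs\n", k, buckets[k]);

            /* bucket 0 is discarded, mirroring the i > 0 loop bound above */
            printf("empty slabs released: %d\n", buckets[0]);
            return 0;
    }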
@@ -3689,12 +4052,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, | |||
3689 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, | 4052 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, |
3690 | unsigned long *map) | 4053 | unsigned long *map) |
3691 | { | 4054 | { |
3692 | if (slab_trylock(page)) { | 4055 | slab_lock(page); |
3693 | validate_slab(s, page, map); | 4056 | validate_slab(s, page, map); |
3694 | slab_unlock(page); | 4057 | slab_unlock(page); |
3695 | } else | ||
3696 | printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", | ||
3697 | s->name, page); | ||
3698 | } | 4058 | } |
3699 | 4059 | ||
3700 | static int validate_slab_node(struct kmem_cache *s, | 4060 | static int validate_slab_node(struct kmem_cache *s, |
@@ -4075,6 +4435,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4075 | 4435 | ||
4076 | for_each_possible_cpu(cpu) { | 4436 | for_each_possible_cpu(cpu) { |
4077 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); | 4437 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
4438 | struct page *page; | ||
4078 | 4439 | ||
4079 | if (!c || c->node < 0) | 4440 | if (!c || c->node < 0) |
4080 | continue; | 4441 | continue; |
@@ -4090,6 +4451,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4090 | total += x; | 4451 | total += x; |
4091 | nodes[c->node] += x; | 4452 | nodes[c->node] += x; |
4092 | } | 4453 | } |
4454 | page = c->partial; | ||
4455 | |||
4456 | if (page) { | ||
4457 | x = page->pobjects; | ||
4458 | total += x; | ||
4459 | nodes[c->node] += x; | ||
4460 | } | ||
4093 | per_cpu[c->node]++; | 4461 | per_cpu[c->node]++; |
4094 | } | 4462 | } |
4095 | } | 4463 | } |
@@ -4168,11 +4536,12 @@ struct slab_attribute { | |||
4168 | }; | 4536 | }; |
4169 | 4537 | ||
4170 | #define SLAB_ATTR_RO(_name) \ | 4538 | #define SLAB_ATTR_RO(_name) \ |
4171 | static struct slab_attribute _name##_attr = __ATTR_RO(_name) | 4539 | static struct slab_attribute _name##_attr = \ |
4540 | __ATTR(_name, 0400, _name##_show, NULL) | ||
4172 | 4541 | ||
4173 | #define SLAB_ATTR(_name) \ | 4542 | #define SLAB_ATTR(_name) \ |
4174 | static struct slab_attribute _name##_attr = \ | 4543 | static struct slab_attribute _name##_attr = \ |
4175 | __ATTR(_name, 0644, _name##_show, _name##_store) | 4544 | __ATTR(_name, 0600, _name##_show, _name##_store) |
4176 | 4545 | ||
4177 | static ssize_t slab_size_show(struct kmem_cache *s, char *buf) | 4546 | static ssize_t slab_size_show(struct kmem_cache *s, char *buf) |
4178 | { | 4547 | { |
@@ -4241,6 +4610,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, | |||
4241 | } | 4610 | } |
4242 | SLAB_ATTR(min_partial); | 4611 | SLAB_ATTR(min_partial); |
4243 | 4612 | ||
4613 | static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) | ||
4614 | { | ||
4615 | return sprintf(buf, "%u\n", s->cpu_partial); | ||
4616 | } | ||
4617 | |||
4618 | static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, | ||
4619 | size_t length) | ||
4620 | { | ||
4621 | unsigned long objects; | ||
4622 | int err; | ||
4623 | |||
4624 | err = strict_strtoul(buf, 10, &objects); | ||
4625 | if (err) | ||
4626 | return err; | ||
4627 | |||
4628 | s->cpu_partial = objects; | ||
4629 | flush_all(s); | ||
4630 | return length; | ||
4631 | } | ||
4632 | SLAB_ATTR(cpu_partial); | ||
4633 | |||
4244 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) | 4634 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) |
4245 | { | 4635 | { |
4246 | if (!s->ctor) | 4636 | if (!s->ctor) |
@@ -4279,6 +4669,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) | |||
4279 | } | 4669 | } |
4280 | SLAB_ATTR_RO(objects_partial); | 4670 | SLAB_ATTR_RO(objects_partial); |
4281 | 4671 | ||
4672 | static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) | ||
4673 | { | ||
4674 | int objects = 0; | ||
4675 | int pages = 0; | ||
4676 | int cpu; | ||
4677 | int len; | ||
4678 | |||
4679 | for_each_online_cpu(cpu) { | ||
4680 | struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; | ||
4681 | |||
4682 | if (page) { | ||
4683 | pages += page->pages; | ||
4684 | objects += page->pobjects; | ||
4685 | } | ||
4686 | } | ||
4687 | |||
4688 | len = sprintf(buf, "%d(%d)", objects, pages); | ||
4689 | |||
4690 | #ifdef CONFIG_SMP | ||
4691 | for_each_online_cpu(cpu) { | ||
4692 | struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; | ||
4693 | |||
4694 | if (page && len < PAGE_SIZE - 20) | ||
4695 | len += sprintf(buf + len, " C%d=%d(%d)", cpu, | ||
4696 | page->pobjects, page->pages); | ||
4697 | } | ||
4698 | #endif | ||
4699 | return len + sprintf(buf + len, "\n"); | ||
4700 | } | ||
4701 | SLAB_ATTR_RO(slabs_cpu_partial); | ||
4702 | |||
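Given the format strings above ("%d(%d)" for the totals and " C%d=%d(%d)" per CPU), reading the new slabs_cpu_partial file, presumably exposed under /sys/kernel/slab/<cache>/, would produce a line like the following (numbers invented for illustration):

    104(13) C0=40(5) C1=32(4) C3=32(4)

That is, 104 free objects spread over 13 pages on cpu partial lists, with the per-cpu breakdown only compiled in on SMP and CPUs without a partial page omitted.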
4282 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) | 4703 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) |
4283 | { | 4704 | { |
4284 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); | 4705 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); |
@@ -4342,8 +4763,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, | |||
4342 | const char *buf, size_t length) | 4763 | const char *buf, size_t length) |
4343 | { | 4764 | { |
4344 | s->flags &= ~SLAB_DEBUG_FREE; | 4765 | s->flags &= ~SLAB_DEBUG_FREE; |
4345 | if (buf[0] == '1') | 4766 | if (buf[0] == '1') { |
4767 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4346 | s->flags |= SLAB_DEBUG_FREE; | 4768 | s->flags |= SLAB_DEBUG_FREE; |
4769 | } | ||
4347 | return length; | 4770 | return length; |
4348 | } | 4771 | } |
4349 | SLAB_ATTR(sanity_checks); | 4772 | SLAB_ATTR(sanity_checks); |
@@ -4357,8 +4780,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
4357 | size_t length) | 4780 | size_t length) |
4358 | { | 4781 | { |
4359 | s->flags &= ~SLAB_TRACE; | 4782 | s->flags &= ~SLAB_TRACE; |
4360 | if (buf[0] == '1') | 4783 | if (buf[0] == '1') { |
4784 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4361 | s->flags |= SLAB_TRACE; | 4785 | s->flags |= SLAB_TRACE; |
4786 | } | ||
4362 | return length; | 4787 | return length; |
4363 | } | 4788 | } |
4364 | SLAB_ATTR(trace); | 4789 | SLAB_ATTR(trace); |
@@ -4375,8 +4800,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, | |||
4375 | return -EBUSY; | 4800 | return -EBUSY; |
4376 | 4801 | ||
4377 | s->flags &= ~SLAB_RED_ZONE; | 4802 | s->flags &= ~SLAB_RED_ZONE; |
4378 | if (buf[0] == '1') | 4803 | if (buf[0] == '1') { |
4804 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4379 | s->flags |= SLAB_RED_ZONE; | 4805 | s->flags |= SLAB_RED_ZONE; |
4806 | } | ||
4380 | calculate_sizes(s, -1); | 4807 | calculate_sizes(s, -1); |
4381 | return length; | 4808 | return length; |
4382 | } | 4809 | } |
@@ -4394,8 +4821,10 @@ static ssize_t poison_store(struct kmem_cache *s, | |||
4394 | return -EBUSY; | 4821 | return -EBUSY; |
4395 | 4822 | ||
4396 | s->flags &= ~SLAB_POISON; | 4823 | s->flags &= ~SLAB_POISON; |
4397 | if (buf[0] == '1') | 4824 | if (buf[0] == '1') { |
4825 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4398 | s->flags |= SLAB_POISON; | 4826 | s->flags |= SLAB_POISON; |
4827 | } | ||
4399 | calculate_sizes(s, -1); | 4828 | calculate_sizes(s, -1); |
4400 | return length; | 4829 | return length; |
4401 | } | 4830 | } |
@@ -4413,8 +4842,10 @@ static ssize_t store_user_store(struct kmem_cache *s, | |||
4413 | return -EBUSY; | 4842 | return -EBUSY; |
4414 | 4843 | ||
4415 | s->flags &= ~SLAB_STORE_USER; | 4844 | s->flags &= ~SLAB_STORE_USER; |
4416 | if (buf[0] == '1') | 4845 | if (buf[0] == '1') { |
4846 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4417 | s->flags |= SLAB_STORE_USER; | 4847 | s->flags |= SLAB_STORE_USER; |
4848 | } | ||
4418 | calculate_sizes(s, -1); | 4849 | calculate_sizes(s, -1); |
4419 | return length; | 4850 | return length; |
4420 | } | 4851 | } |
@@ -4579,6 +5010,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); | |||
4579 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); | 5010 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); |
4580 | STAT_ATTR(ALLOC_SLAB, alloc_slab); | 5011 | STAT_ATTR(ALLOC_SLAB, alloc_slab); |
4581 | STAT_ATTR(ALLOC_REFILL, alloc_refill); | 5012 | STAT_ATTR(ALLOC_REFILL, alloc_refill); |
5013 | STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); | ||
4582 | STAT_ATTR(FREE_SLAB, free_slab); | 5014 | STAT_ATTR(FREE_SLAB, free_slab); |
4583 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); | 5015 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); |
4584 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); | 5016 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); |
@@ -4586,7 +5018,12 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); | |||
4586 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); | 5018 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); |
4587 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); | 5019 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); |
4588 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); | 5020 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); |
5021 | STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); | ||
4589 | STAT_ATTR(ORDER_FALLBACK, order_fallback); | 5022 | STAT_ATTR(ORDER_FALLBACK, order_fallback); |
5023 | STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); | ||
5024 | STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); | ||
5025 | STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); | ||
5026 | STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); | ||
4590 | #endif | 5027 | #endif |
4591 | 5028 | ||
4592 | static struct attribute *slab_attrs[] = { | 5029 | static struct attribute *slab_attrs[] = { |
@@ -4595,6 +5032,7 @@ static struct attribute *slab_attrs[] = { | |||
4595 | &objs_per_slab_attr.attr, | 5032 | &objs_per_slab_attr.attr, |
4596 | &order_attr.attr, | 5033 | &order_attr.attr, |
4597 | &min_partial_attr.attr, | 5034 | &min_partial_attr.attr, |
5035 | &cpu_partial_attr.attr, | ||
4598 | &objects_attr.attr, | 5036 | &objects_attr.attr, |
4599 | &objects_partial_attr.attr, | 5037 | &objects_partial_attr.attr, |
4600 | &partial_attr.attr, | 5038 | &partial_attr.attr, |
@@ -4607,6 +5045,7 @@ static struct attribute *slab_attrs[] = { | |||
4607 | &destroy_by_rcu_attr.attr, | 5045 | &destroy_by_rcu_attr.attr, |
4608 | &shrink_attr.attr, | 5046 | &shrink_attr.attr, |
4609 | &reserved_attr.attr, | 5047 | &reserved_attr.attr, |
5048 | &slabs_cpu_partial_attr.attr, | ||
4610 | #ifdef CONFIG_SLUB_DEBUG | 5049 | #ifdef CONFIG_SLUB_DEBUG |
4611 | &total_objects_attr.attr, | 5050 | &total_objects_attr.attr, |
4612 | &slabs_attr.attr, | 5051 | &slabs_attr.attr, |
@@ -4636,6 +5075,7 @@ static struct attribute *slab_attrs[] = { | |||
4636 | &alloc_from_partial_attr.attr, | 5075 | &alloc_from_partial_attr.attr, |
4637 | &alloc_slab_attr.attr, | 5076 | &alloc_slab_attr.attr, |
4638 | &alloc_refill_attr.attr, | 5077 | &alloc_refill_attr.attr, |
5078 | &alloc_node_mismatch_attr.attr, | ||
4639 | &free_slab_attr.attr, | 5079 | &free_slab_attr.attr, |
4640 | &cpuslab_flush_attr.attr, | 5080 | &cpuslab_flush_attr.attr, |
4641 | &deactivate_full_attr.attr, | 5081 | &deactivate_full_attr.attr, |
@@ -4643,7 +5083,12 @@ static struct attribute *slab_attrs[] = { | |||
4643 | &deactivate_to_head_attr.attr, | 5083 | &deactivate_to_head_attr.attr, |
4644 | &deactivate_to_tail_attr.attr, | 5084 | &deactivate_to_tail_attr.attr, |
4645 | &deactivate_remote_frees_attr.attr, | 5085 | &deactivate_remote_frees_attr.attr, |
5086 | &deactivate_bypass_attr.attr, | ||
4646 | &order_fallback_attr.attr, | 5087 | &order_fallback_attr.attr, |
5088 | &cmpxchg_double_fail_attr.attr, | ||
5089 | &cmpxchg_double_cpu_fail_attr.attr, | ||
5090 | &cpu_partial_alloc_attr.attr, | ||
5091 | &cpu_partial_free_attr.attr, | ||
4647 | #endif | 5092 | #endif |
4648 | #ifdef CONFIG_FAILSLAB | 5093 | #ifdef CONFIG_FAILSLAB |
4649 | &failslab_attr.attr, | 5094 | &failslab_attr.attr, |
@@ -4995,7 +5440,7 @@ static const struct file_operations proc_slabinfo_operations = { | |||
4995 | 5440 | ||
4996 | static int __init slab_proc_init(void) | 5441 | static int __init slab_proc_init(void) |
4997 | { | 5442 | { |
4998 | proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); | 5443 | proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); |
4999 | return 0; | 5444 | return 0; |
5000 | } | 5445 | } |
5001 | module_init(slab_proc_init); | 5446 | module_init(slab_proc_init); |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 64b984091edb..1b7e22ab9b09 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -21,7 +21,6 @@ | |||
21 | #include <linux/mmzone.h> | 21 | #include <linux/mmzone.h> |
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
24 | #include <linux/module.h> | ||
25 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
26 | #include <linux/spinlock.h> | 25 | #include <linux/spinlock.h> |
27 | #include <linux/vmalloc.h> | 26 | #include <linux/vmalloc.h> |
diff --git a/mm/sparse.c b/mm/sparse.c index 858e1dff9b2a..61d7cde23111 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -6,7 +6,7 @@ | |||
6 | #include <linux/mmzone.h> | 6 | #include <linux/mmzone.h> |
7 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
8 | #include <linux/highmem.h> | 8 | #include <linux/highmem.h> |
9 | #include <linux/module.h> | 9 | #include <linux/export.h> |
10 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
11 | #include <linux/vmalloc.h> | 11 | #include <linux/vmalloc.h> |
12 | #include "internal.h" | 12 | #include "internal.h" |
@@ -21,7 +21,7 @@ | |||
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/pagevec.h> | 22 | #include <linux/pagevec.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | 24 | #include <linux/export.h> |
25 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ | 26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ |
27 | #include <linux/percpu_counter.h> | 27 | #include <linux/percpu_counter.h> |
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) | |||
78 | { | 78 | { |
79 | if (unlikely(PageTail(page))) { | 79 | if (unlikely(PageTail(page))) { |
80 | /* __split_huge_page_refcount can run under us */ | 80 | /* __split_huge_page_refcount can run under us */ |
81 | struct page *page_head = page->first_page; | 81 | struct page *page_head = compound_trans_head(page); |
82 | smp_rmb(); | 82 | |
83 | /* | 83 | if (likely(page != page_head && |
84 | * If PageTail is still set after smp_rmb() we can be sure | 84 | get_page_unless_zero(page_head))) { |
85 | * that the page->first_page we read wasn't a dangling pointer. | ||
86 | * See __split_huge_page_refcount() smp_wmb(). | ||
87 | */ | ||
88 | if (likely(PageTail(page) && get_page_unless_zero(page_head))) { | ||
89 | unsigned long flags; | 85 | unsigned long flags; |
90 | /* | 86 | /* |
91 | * Verify that our page_head wasn't converted | 87 | * page_head wasn't a dangling pointer but it |
92 | * to a a regular page before we got a | 88 | * may not be a head page anymore by the time |
93 | * reference on it. | 89 | * we obtain the lock. That is ok as long as it |
90 | * can't be freed from under us. | ||
94 | */ | 91 | */ |
95 | if (unlikely(!PageHead(page_head))) { | ||
96 | /* PageHead is cleared after PageTail */ | ||
97 | smp_rmb(); | ||
98 | VM_BUG_ON(PageTail(page)); | ||
99 | goto out_put_head; | ||
100 | } | ||
101 | /* | ||
102 | * Only run compound_lock on a valid PageHead, | ||
103 | * after having it pinned with | ||
104 | * get_page_unless_zero() above. | ||
105 | */ | ||
106 | smp_mb(); | ||
107 | /* page_head wasn't a dangling pointer */ | ||
108 | flags = compound_lock_irqsave(page_head); | 92 | flags = compound_lock_irqsave(page_head); |
109 | if (unlikely(!PageTail(page))) { | 93 | if (unlikely(!PageTail(page))) { |
110 | /* __split_huge_page_refcount run before us */ | 94 | /* __split_huge_page_refcount run before us */ |
111 | compound_unlock_irqrestore(page_head, flags); | 95 | compound_unlock_irqrestore(page_head, flags); |
112 | VM_BUG_ON(PageHead(page_head)); | 96 | VM_BUG_ON(PageHead(page_head)); |
113 | out_put_head: | ||
114 | if (put_page_testzero(page_head)) | 97 | if (put_page_testzero(page_head)) |
115 | __put_single_page(page_head); | 98 | __put_single_page(page_head); |
116 | out_put_single: | 99 | out_put_single: |
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) | |||
121 | VM_BUG_ON(page_head != page->first_page); | 104 | VM_BUG_ON(page_head != page->first_page); |
122 | /* | 105 | /* |
123 | * We can release the refcount taken by | 106 | * We can release the refcount taken by |
124 | * get_page_unless_zero now that | 107 | * get_page_unless_zero() now that |
125 | * split_huge_page_refcount is blocked on the | 108 | * __split_huge_page_refcount() is blocked on |
126 | * compound_lock. | 109 | * the compound_lock. |
127 | */ | 110 | */ |
128 | if (put_page_testzero(page_head)) | 111 | if (put_page_testzero(page_head)) |
129 | VM_BUG_ON(1); | 112 | VM_BUG_ON(1); |
130 | /* __split_huge_page_refcount will wait now */ | 113 | /* __split_huge_page_refcount will wait now */ |
131 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | 114 | VM_BUG_ON(page_mapcount(page) <= 0); |
132 | atomic_dec(&page->_count); | 115 | atomic_dec(&page->_mapcount); |
133 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | 116 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); |
117 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
134 | compound_unlock_irqrestore(page_head, flags); | 118 | compound_unlock_irqrestore(page_head, flags); |
135 | if (put_page_testzero(page_head)) { | 119 | if (put_page_testzero(page_head)) { |
136 | if (PageHead(page_head)) | 120 | if (PageHead(page_head)) |
@@ -160,6 +144,45 @@ void put_page(struct page *page) | |||
160 | } | 144 | } |
161 | EXPORT_SYMBOL(put_page); | 145 | EXPORT_SYMBOL(put_page); |
162 | 146 | ||
147 | /* | ||
148 | * This function is exported but must not be called by anything other | ||
149 | * than get_page(). It implements the slow path of get_page(). | ||
150 | */ | ||
151 | bool __get_page_tail(struct page *page) | ||
152 | { | ||
153 | /* | ||
154 | * This takes care of get_page() if run on a tail page | ||
155 | * returned by one of the get_user_pages/follow_page variants. | ||
156 | * get_user_pages/follow_page itself doesn't need the compound | ||
157 | * lock because it runs __get_page_tail_foll() under the | ||
158 | * proper PT lock that already serializes against | ||
159 | * split_huge_page(). | ||
160 | */ | ||
161 | unsigned long flags; | ||
162 | bool got = false; | ||
163 | struct page *page_head = compound_trans_head(page); | ||
164 | |||
165 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
166 | /* | ||
167 | * page_head wasn't a dangling pointer but it | ||
168 | * may not be a head page anymore by the time | ||
169 | * we obtain the lock. That is ok as long as it | ||
170 | * can't be freed from under us. | ||
171 | */ | ||
172 | flags = compound_lock_irqsave(page_head); | ||
173 | /* here __split_huge_page_refcount won't run anymore */ | ||
174 | if (likely(PageTail(page))) { | ||
175 | __get_page_tail_foll(page, false); | ||
176 | got = true; | ||
177 | } | ||
178 | compound_unlock_irqrestore(page_head, flags); | ||
179 | if (unlikely(!got)) | ||
180 | put_page(page_head); | ||
181 | } | ||
182 | return got; | ||
183 | } | ||
184 | EXPORT_SYMBOL(__get_page_tail); | ||
185 | |||
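Per the comment above, __get_page_tail() exists solely as the slow path of get_page() for tail pages of a transparent huge page. A sketch of the caller shape that the comment implies (kernel context, not standalone, and not necessarily the exact get_page() inline):

    static inline void get_page_sketch(struct page *page)
    {
            /* Tail pages need the compound-aware slow path above ... */
            if (unlikely(PageTail(page)) && likely(__get_page_tail(page)))
                    return;

            /* ... everything else is a plain reference count bump. */
            atomic_inc(&page->_count);
    }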
163 | /** | 186 | /** |
164 | * put_pages_list() - release a list of pages | 187 | * put_pages_list() - release a list of pages |
165 | * @pages: list of pages threaded on page->lru | 188 | * @pages: list of pages threaded on page->lru |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 46680461785b..78cc4d1f6cce 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -6,7 +6,6 @@ | |||
6 | * | 6 | * |
7 | * Rewritten to use page cache, (C) 1998 Stephen Tweedie | 7 | * Rewritten to use page cache, (C) 1998 Stephen Tweedie |
8 | */ | 8 | */ |
9 | #include <linux/module.h> | ||
10 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
11 | #include <linux/gfp.h> | 10 | #include <linux/gfp.h> |
12 | #include <linux/kernel_stat.h> | 11 | #include <linux/kernel_stat.h> |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 1b8c33907242..b1cd12060723 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -21,7 +21,6 @@ | |||
21 | #include <linux/proc_fs.h> | 21 | #include <linux/proc_fs.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/module.h> | ||
25 | #include <linux/ksm.h> | 24 | #include <linux/ksm.h> |
26 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
27 | #include <linux/security.h> | 26 | #include <linux/security.h> |
@@ -1617,7 +1616,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1617 | 1616 | ||
1618 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1617 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
1619 | err = try_to_unuse(type); | 1618 | err = try_to_unuse(type); |
1620 | test_set_oom_score_adj(oom_score_adj); | 1619 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); |
1621 | 1620 | ||
1622 | if (err) { | 1621 | if (err) { |
1623 | /* | 1622 | /* |
@@ -1924,20 +1923,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1924 | 1923 | ||
1925 | /* | 1924 | /* |
1926 | * Find out how many pages are allowed for a single swap | 1925 | * Find out how many pages are allowed for a single swap |
1927 | * device. There are two limiting factors: 1) the number of | 1926 | * device. There are three limiting factors: 1) the number |
1928 | * bits for the swap offset in the swp_entry_t type and | 1927 | * of bits for the swap offset in the swp_entry_t type, and |
1929 | * 2) the number of bits in the a swap pte as defined by | 1928 | * 2) the number of bits in the swap pte as defined by |
1930 | * the different architectures. In order to find the | 1929 | * the different architectures, and 3) the number of free bits |
1931 | * largest possible bit mask a swap entry with swap type 0 | 1930 | * in an exceptional radix_tree entry. In order to find the |
1931 | * largest possible bit mask, a swap entry with swap type 0 | ||
1932 | * and swap offset ~0UL is created, encoded to a swap pte, | 1932 | * and swap offset ~0UL is created, encoded to a swap pte, |
1933 | * decoded to a swp_entry_t again and finally the swap | 1933 | * decoded to a swp_entry_t again, and finally the swap |
1934 | * offset is extracted. This will mask all the bits from | 1934 | * offset is extracted. This will mask all the bits from |
1935 | * the initial ~0UL mask that can't be encoded in either | 1935 | * the initial ~0UL mask that can't be encoded in either |
1936 | * the swp_entry_t or the architecture definition of a | 1936 | * the swp_entry_t or the architecture definition of a |
1937 | * swap pte. | 1937 | * swap pte. Then the same is done for a radix_tree entry. |
1938 | */ | 1938 | */ |
1939 | maxpages = swp_offset(pte_to_swp_entry( | 1939 | maxpages = swp_offset(pte_to_swp_entry( |
1940 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; | 1940 | swp_entry_to_pte(swp_entry(0, ~0UL)))); |
1941 | maxpages = swp_offset(radix_to_swp_entry( | ||
1942 | swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; | ||
1943 | |||
1941 | if (maxpages > swap_header->info.last_page) { | 1944 | if (maxpages > swap_header->info.last_page) { |
1942 | maxpages = swap_header->info.last_page + 1; | 1945 | maxpages = swap_header->info.last_page + 1; |
1943 | /* p->max is an unsigned int: don't overflow it */ | 1946 | /* p->max is an unsigned int: don't overflow it */ |
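The maxpages round-trip a few lines above works by pushing an all-ones offset through each encoding and seeing how much of it survives; whatever bits every encoder preserves is the usable offset range. The trick in miniature, on a toy encoder that only keeps the low bits (the 42 here is an arbitrary assumption on a 64-bit build, not a real swap pte or radix_tree limit):

    /* Toy encoding that can only represent the low 42 bits of an offset. */
    #define TOY_OFFSET_BITS 42

    static unsigned long toy_encode(unsigned long off)
    {
            return off & ((1UL << TOY_OFFSET_BITS) - 1);
    }

    static unsigned long toy_decode(unsigned long val)
    {
            return val;
    }

    /* Round-trip ~0UL to discover how many offsets the encoding supports. */
    static unsigned long toy_max_pages(void)
    {
            return toy_decode(toy_encode(~0UL)) + 1;
    }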
diff --git a/mm/thrash.c b/mm/thrash.c index e53f7d02c17c..57ad495dbd54 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -29,7 +29,7 @@ | |||
29 | 29 | ||
30 | static DEFINE_SPINLOCK(swap_token_lock); | 30 | static DEFINE_SPINLOCK(swap_token_lock); |
31 | struct mm_struct *swap_token_mm; | 31 | struct mm_struct *swap_token_mm; |
32 | struct mem_cgroup *swap_token_memcg; | 32 | static struct mem_cgroup *swap_token_memcg; |
33 | 33 | ||
34 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 34 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
35 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | 35 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) |
diff --git a/mm/truncate.c b/mm/truncate.c index 232eb2736a79..632b15e29f74 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/gfp.h> | 12 | #include <linux/gfp.h> |
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/pagemap.h> | 16 | #include <linux/pagemap.h> |
17 | #include <linux/highmem.h> | 17 | #include <linux/highmem.h> |
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
@@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
336 | unsigned long count = 0; | 336 | unsigned long count = 0; |
337 | int i; | 337 | int i; |
338 | 338 | ||
339 | /* | ||
340 | * Note: this function may get called on a shmem/tmpfs mapping: | ||
341 | * pagevec_lookup() might then return 0 prematurely (because it | ||
342 | * got a gangful of swap entries); but it's hardly worth worrying | ||
343 | * about - it can rarely have anything to free from such a mapping | ||
344 | * (most pages are dirty), and already skips over any difficulties. | ||
345 | */ | ||
346 | |||
339 | pagevec_init(&pvec, 0); | 347 | pagevec_init(&pvec, 0); |
340 | while (index <= end && pagevec_lookup(&pvec, mapping, index, | 348 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
341 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 349 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
@@ -1,7 +1,7 @@ | |||
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
3 | #include <linux/string.h> | 3 | #include <linux/string.h> |
4 | #include <linux/module.h> | 4 | #include <linux/export.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 464621d18eb2..b669aa6f6caf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr) | |||
725 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | 725 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) |
726 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | 726 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ |
727 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | 727 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ |
728 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | 728 | #define VMAP_BBMAP_BITS \ |
729 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | 729 | VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ |
730 | VMALLOC_PAGES / NR_CPUS / 16)) | 730 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ |
731 | VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) | ||
731 | 732 | ||
732 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | 733 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) |
733 | 734 | ||
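VMAP_BBMAP_BITS is simply the per-CPU share of the vmalloc space, VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16, clamped into [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX]; the nested VMAP_MIN/VMAP_MAX macros are a hand-rolled clamp, since min()/max() cannot be used here. Written out as a function for readability (illustrative only):

    /* Equivalent of VMAP_MIN(hi, VMAP_MAX(lo, x)): clamp x into [lo, hi]. */
    static unsigned long clamp_between(unsigned long x, unsigned long lo,
                                       unsigned long hi)
    {
            if (x < lo)
                    return lo;
            if (x > hi)
                    return hi;
            return x;
    }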
@@ -1252,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area); | |||
1252 | DEFINE_RWLOCK(vmlist_lock); | 1253 | DEFINE_RWLOCK(vmlist_lock); |
1253 | struct vm_struct *vmlist; | 1254 | struct vm_struct *vmlist; |
1254 | 1255 | ||
1255 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1256 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1256 | unsigned long flags, void *caller) | 1257 | unsigned long flags, void *caller) |
1257 | { | 1258 | { |
1258 | struct vm_struct *tmp, **p; | ||
1259 | |||
1260 | vm->flags = flags; | 1259 | vm->flags = flags; |
1261 | vm->addr = (void *)va->va_start; | 1260 | vm->addr = (void *)va->va_start; |
1262 | vm->size = va->va_end - va->va_start; | 1261 | vm->size = va->va_end - va->va_start; |
1263 | vm->caller = caller; | 1262 | vm->caller = caller; |
1264 | va->private = vm; | 1263 | va->private = vm; |
1265 | va->flags |= VM_VM_AREA; | 1264 | va->flags |= VM_VM_AREA; |
1265 | } | ||
1266 | 1266 | ||
1267 | static void insert_vmalloc_vmlist(struct vm_struct *vm) | ||
1268 | { | ||
1269 | struct vm_struct *tmp, **p; | ||
1270 | |||
1271 | vm->flags &= ~VM_UNLIST; | ||
1267 | write_lock(&vmlist_lock); | 1272 | write_lock(&vmlist_lock); |
1268 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | 1273 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { |
1269 | if (tmp->addr >= vm->addr) | 1274 | if (tmp->addr >= vm->addr) |
@@ -1274,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1274 | write_unlock(&vmlist_lock); | 1279 | write_unlock(&vmlist_lock); |
1275 | } | 1280 | } |
1276 | 1281 | ||
1282 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | ||
1283 | unsigned long flags, void *caller) | ||
1284 | { | ||
1285 | setup_vmalloc_vm(vm, va, flags, caller); | ||
1286 | insert_vmalloc_vmlist(vm); | ||
1287 | } | ||
1288 | |||
1277 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1289 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1278 | unsigned long align, unsigned long flags, unsigned long start, | 1290 | unsigned long align, unsigned long flags, unsigned long start, |
1279 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 1291 | unsigned long end, int node, gfp_t gfp_mask, void *caller) |
@@ -1312,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1312 | return NULL; | 1324 | return NULL; |
1313 | } | 1325 | } |
1314 | 1326 | ||
1315 | insert_vmalloc_vm(area, va, flags, caller); | 1327 | /* |
1328 | * When this function is called from __vmalloc_node_range, | ||
1329 | * we do not add vm_struct to vmlist here to avoid | ||
1330 | * accessing uninitialized members of vm_struct such as | ||
1331 | * pages and nr_pages fields. They will be set later. | ||
1332 | * To distinguish it from others, we use a VM_UNLIST flag. | ||
1333 | */ | ||
1334 | if (flags & VM_UNLIST) | ||
1335 | setup_vmalloc_vm(area, va, flags, caller); | ||
1336 | else | ||
1337 | insert_vmalloc_vm(area, va, flags, caller); | ||
1338 | |||
1316 | return area; | 1339 | return area; |
1317 | } | 1340 | } |
1318 | 1341 | ||
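The split of insert_vmalloc_vm() into setup_vmalloc_vm() plus insert_vmalloc_vmlist() lets a caller fully initialize a vm_struct before it becomes visible on vmlist, with VM_UNLIST marking the not-yet-published state. Below is a hedged, self-contained sketch of the same two-phase setup/publish pattern; the struct, flag, and function names are illustrative stand-ins, not the kernel's own, and the real code takes vmlist_lock where noted.

```c
/* Two-phase init/publish sketch: set the object up privately, then
 * link it into the shared list once every field is valid.
 * Names (obj, OBJ_UNLISTED, publish()) are illustrative only. */
#include <stdio.h>
#include <stdlib.h>

#define OBJ_UNLISTED 0x1

struct obj {
	unsigned long flags;
	int nr_pages;		/* filled in between setup and publish */
	struct obj *next;
};

static struct obj *list_head;

static void setup(struct obj *o, unsigned long flags)
{
	o->flags = flags | OBJ_UNLISTED;	/* not yet visible to readers */
	o->nr_pages = 0;
}

static void publish(struct obj *o)
{
	o->flags &= ~OBJ_UNLISTED;
	o->next = list_head;			/* the real code holds a lock here */
	list_head = o;
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return 1;
	setup(o, 0);
	o->nr_pages = 4;	/* expensive initialization happens unpublished */
	publish(o);
	printf("published obj with %d pages\n", list_head->nr_pages);
	free(o);
	return 0;
}
```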
@@ -1380,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1380 | va = find_vmap_area((unsigned long)addr); | 1403 | va = find_vmap_area((unsigned long)addr); |
1381 | if (va && va->flags & VM_VM_AREA) { | 1404 | if (va && va->flags & VM_VM_AREA) { |
1382 | struct vm_struct *vm = va->private; | 1405 | struct vm_struct *vm = va->private; |
1383 | struct vm_struct *tmp, **p; | 1406 | |
1384 | /* | 1407 | if (!(vm->flags & VM_UNLIST)) { |
1385 | * remove from list and disallow access to this vm_struct | 1408 | struct vm_struct *tmp, **p; |
1386 | * before unmap. (address range confliction is maintained by | 1409 | /* |
1387 | * vmap.) | 1410 | * remove from list and disallow access to |
1388 | */ | 1411 | * this vm_struct before unmap. (address range |
1389 | write_lock(&vmlist_lock); | 1412 | * confliction is maintained by vmap.) |
1390 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | 1413 | */ |
1391 | ; | 1414 | write_lock(&vmlist_lock); |
1392 | *p = tmp->next; | 1415 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) |
1393 | write_unlock(&vmlist_lock); | 1416 | ; |
1417 | *p = tmp->next; | ||
1418 | write_unlock(&vmlist_lock); | ||
1419 | } | ||
1394 | 1420 | ||
1395 | vmap_debug_free_range(va->va_start, va->va_end); | 1421 | vmap_debug_free_range(va->va_start, va->va_end); |
1396 | free_unmap_vmap_area(va); | 1422 | free_unmap_vmap_area(va); |
@@ -1567,8 +1593,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1567 | return area->addr; | 1593 | return area->addr; |
1568 | 1594 | ||
1569 | fail: | 1595 | fail: |
1570 | warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " | 1596 | warn_alloc_failed(gfp_mask, order, |
1571 | "allocated %ld of %ld bytes\n", | 1597 | "vmalloc: allocation failure, allocated %ld of %ld bytes\n", |
1572 | (area->nr_pages*PAGE_SIZE), area->size); | 1598 | (area->nr_pages*PAGE_SIZE), area->size); |
1573 | vfree(area->addr); | 1599 | vfree(area->addr); |
1574 | return NULL; | 1600 | return NULL; |
@@ -1599,17 +1625,22 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1599 | 1625 | ||
1600 | size = PAGE_ALIGN(size); | 1626 | size = PAGE_ALIGN(size); |
1601 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1627 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1602 | return NULL; | 1628 | goto fail; |
1603 | |||
1604 | area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, | ||
1605 | gfp_mask, caller); | ||
1606 | 1629 | ||
1630 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, | ||
1631 | start, end, node, gfp_mask, caller); | ||
1607 | if (!area) | 1632 | if (!area) |
1608 | return NULL; | 1633 | goto fail; |
1609 | 1634 | ||
1610 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1635 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); |
1611 | 1636 | ||
1612 | /* | 1637 | /* |
1638 | * In this function, the newly allocated vm_struct is not added | ||
1639 | * to vmlist by __get_vm_area_node(), so it is added here. | ||
1640 | */ | ||
1641 | insert_vmalloc_vmlist(area); | ||
1642 | |||
1643 | /* | ||
1613 | * A ref_count = 3 is needed because the vm_struct and vmap_area | 1644 | * A ref_count = 3 is needed because the vm_struct and vmap_area |
1614 | * structures allocated in the __get_vm_area_node() function contain | 1645 | * structures allocated in the __get_vm_area_node() function contain |
1615 | * references to the virtual address of the vmalloc'ed block. | 1646 | * references to the virtual address of the vmalloc'ed block. |
@@ -1617,6 +1648,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1617 | kmemleak_alloc(addr, real_size, 3, gfp_mask); | 1648 | kmemleak_alloc(addr, real_size, 3, gfp_mask); |
1618 | 1649 | ||
1619 | return addr; | 1650 | return addr; |
1651 | |||
1652 | fail: | ||
1653 | warn_alloc_failed(gfp_mask, 0, | ||
1654 | "vmalloc: allocation failure: %lu bytes\n", | ||
1655 | real_size); | ||
1656 | return NULL; | ||
1620 | } | 1657 | } |
1621 | 1658 | ||
1622 | /** | 1659 | /** |
@@ -2139,6 +2176,14 @@ struct vm_struct *alloc_vm_area(size_t size) | |||
2139 | return NULL; | 2176 | return NULL; |
2140 | } | 2177 | } |
2141 | 2178 | ||
2179 | /* | ||
2180 | * If the allocated address space is passed to a hypercall | ||
2181 | * before being used then we cannot rely on a page fault to | ||
2182 | * trigger an update of the page tables. So sync all the page | ||
2183 | * tables here. | ||
2184 | */ | ||
2185 | vmalloc_sync_all(); | ||
2186 | |||
2142 | return area; | 2187 | return area; |
2143 | } | 2188 | } |
2144 | EXPORT_SYMBOL_GPL(alloc_vm_area); | 2189 | EXPORT_SYMBOL_GPL(alloc_vm_area); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ef69124fa3e..a1893c050795 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -105,7 +105,6 @@ struct scan_control { | |||
105 | 105 | ||
106 | /* Which cgroup do we reclaim from */ | 106 | /* Which cgroup do we reclaim from */ |
107 | struct mem_cgroup *mem_cgroup; | 107 | struct mem_cgroup *mem_cgroup; |
108 | struct memcg_scanrecord *memcg_record; | ||
109 | 108 | ||
110 | /* | 109 | /* |
111 | * Nodemask of nodes allowed by the caller. If NULL, all nodes | 110 | * Nodemask of nodes allowed by the caller. If NULL, all nodes |
@@ -496,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
496 | return PAGE_ACTIVATE; | 495 | return PAGE_ACTIVATE; |
497 | } | 496 | } |
498 | 497 | ||
499 | /* | ||
500 | * Wait on writeback if requested to. This happens when | ||
501 | * direct reclaiming a large contiguous area and the | ||
502 | * first attempt to free a range of pages fails. | ||
503 | */ | ||
504 | if (PageWriteback(page) && | ||
505 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) | ||
506 | wait_on_page_writeback(page); | ||
507 | |||
508 | if (!PageWriteback(page)) { | 498 | if (!PageWriteback(page)) { |
509 | /* synchronous write or broken a_ops? */ | 499 | /* synchronous write or broken a_ops? */ |
510 | ClearPageReclaim(page); | 500 | ClearPageReclaim(page); |
@@ -643,13 +633,14 @@ redo: | |||
643 | lru = LRU_UNEVICTABLE; | 633 | lru = LRU_UNEVICTABLE; |
644 | add_page_to_unevictable_list(page); | 634 | add_page_to_unevictable_list(page); |
645 | /* | 635 | /* |
646 | * When racing with an mlock clearing (page is | 636 | * When racing with an mlock or AS_UNEVICTABLE clearing |
647 | * unlocked), make sure that if the other thread does | 637 | * (page is unlocked) make sure that if the other thread |
648 | * not observe our setting of PG_lru and fails | 638 | * does not observe our setting of PG_lru and fails |
649 | * isolation, we see PG_mlocked cleared below and move | 639 | * isolation/check_move_unevictable_page, |
640 | * we see PG_mlocked/AS_UNEVICTABLE cleared below and move | ||
650 | * the page back to the evictable list. | 641 | * the page back to the evictable list. |
651 | * | 642 | * |
652 | * The other side is TestClearPageMlocked(). | 643 | * The other side is TestClearPageMlocked() or shmem_lock(). |
653 | */ | 644 | */ |
654 | smp_mb(); | 645 | smp_mb(); |
655 | } | 646 | } |
@@ -760,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) | |||
760 | */ | 751 | */ |
761 | static unsigned long shrink_page_list(struct list_head *page_list, | 752 | static unsigned long shrink_page_list(struct list_head *page_list, |
762 | struct zone *zone, | 753 | struct zone *zone, |
763 | struct scan_control *sc) | 754 | struct scan_control *sc, |
755 | int priority, | ||
756 | unsigned long *ret_nr_dirty, | ||
757 | unsigned long *ret_nr_writeback) | ||
764 | { | 758 | { |
765 | LIST_HEAD(ret_pages); | 759 | LIST_HEAD(ret_pages); |
766 | LIST_HEAD(free_pages); | 760 | LIST_HEAD(free_pages); |
@@ -768,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
768 | unsigned long nr_dirty = 0; | 762 | unsigned long nr_dirty = 0; |
769 | unsigned long nr_congested = 0; | 763 | unsigned long nr_congested = 0; |
770 | unsigned long nr_reclaimed = 0; | 764 | unsigned long nr_reclaimed = 0; |
765 | unsigned long nr_writeback = 0; | ||
771 | 766 | ||
772 | cond_resched(); | 767 | cond_resched(); |
773 | 768 | ||
@@ -804,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
804 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 799 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
805 | 800 | ||
806 | if (PageWriteback(page)) { | 801 | if (PageWriteback(page)) { |
802 | nr_writeback++; | ||
807 | /* | 803 | /* |
808 | * Synchronous reclaim is performed in two passes, | 804 | * Synchronous reclaim cannot queue pages for |
809 | * first an asynchronous pass over the list to | 805 | * writeback due to the possibility of stack overflow |
810 | * start parallel writeback, and a second synchronous | 806 | * but if it encounters a page under writeback, wait |
811 | * pass to wait for the IO to complete. Wait here | 807 | * for the IO to complete. |
812 | * for any page for which writeback has already | ||
813 | * started. | ||
814 | */ | 808 | */ |
815 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && | 809 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
816 | may_enter_fs) | 810 | may_enter_fs) |
@@ -866,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
866 | if (PageDirty(page)) { | 860 | if (PageDirty(page)) { |
867 | nr_dirty++; | 861 | nr_dirty++; |
868 | 862 | ||
863 | /* | ||
864 | * Only kswapd can writeback filesystem pages to | ||
865 | * avoid risk of stack overflow but do not writeback | ||
866 | * unless under significant pressure. | ||
867 | */ | ||
868 | if (page_is_file_cache(page) && | ||
869 | (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { | ||
870 | /* | ||
871 | * Immediately reclaim when written back. | ||
872 | * Similar in principle to deactivate_page() | ||
873 | * except we already have the page isolated | ||
874 | * and know it's dirty | ||
875 | */ | ||
876 | inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); | ||
877 | SetPageReclaim(page); | ||
878 | |||
879 | goto keep_locked; | ||
880 | } | ||
881 | |||
869 | if (references == PAGEREF_RECLAIM_CLEAN) | 882 | if (references == PAGEREF_RECLAIM_CLEAN) |
870 | goto keep_locked; | 883 | goto keep_locked; |
871 | if (!may_enter_fs) | 884 | if (!may_enter_fs) |
@@ -1000,6 +1013,8 @@ keep_lumpy: | |||
1000 | 1013 | ||
1001 | list_splice(&ret_pages, page_list); | 1014 | list_splice(&ret_pages, page_list); |
1002 | count_vm_events(PGACTIVATE, pgactivate); | 1015 | count_vm_events(PGACTIVATE, pgactivate); |
1016 | *ret_nr_dirty += nr_dirty; | ||
1017 | *ret_nr_writeback += nr_writeback; | ||
1003 | return nr_reclaimed; | 1018 | return nr_reclaimed; |
1004 | } | 1019 | } |
1005 | 1020 | ||
@@ -1013,23 +1028,27 @@ keep_lumpy: | |||
1013 | * | 1028 | * |
1014 | * returns 0 on success, -ve errno on failure. | 1029 | * returns 0 on success, -ve errno on failure. |
1015 | */ | 1030 | */ |
1016 | int __isolate_lru_page(struct page *page, int mode, int file) | 1031 | int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) |
1017 | { | 1032 | { |
1033 | bool all_lru_mode; | ||
1018 | int ret = -EINVAL; | 1034 | int ret = -EINVAL; |
1019 | 1035 | ||
1020 | /* Only take pages on the LRU. */ | 1036 | /* Only take pages on the LRU. */ |
1021 | if (!PageLRU(page)) | 1037 | if (!PageLRU(page)) |
1022 | return ret; | 1038 | return ret; |
1023 | 1039 | ||
1040 | all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == | ||
1041 | (ISOLATE_ACTIVE|ISOLATE_INACTIVE); | ||
1042 | |||
1024 | /* | 1043 | /* |
1025 | * When checking the active state, we need to be sure we are | 1044 | * When checking the active state, we need to be sure we are |
1026 | * dealing with comparable boolean values. Take the logical not | 1045 | * dealing with comparable boolean values. Take the logical not |
1027 | * of each. | 1046 | * of each. |
1028 | */ | 1047 | */ |
1029 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) | 1048 | if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) |
1030 | return ret; | 1049 | return ret; |
1031 | 1050 | ||
1032 | if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) | 1051 | if (!all_lru_mode && !!page_is_file_cache(page) != file) |
1033 | return ret; | 1052 | return ret; |
1034 | 1053 | ||
1035 | /* | 1054 | /* |
@@ -1042,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1042 | 1061 | ||
1043 | ret = -EBUSY; | 1062 | ret = -EBUSY; |
1044 | 1063 | ||
1064 | if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) | ||
1065 | return ret; | ||
1066 | |||
1067 | if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) | ||
1068 | return ret; | ||
1069 | |||
1045 | if (likely(get_page_unless_zero(page))) { | 1070 | if (likely(get_page_unless_zero(page))) { |
1046 | /* | 1071 | /* |
1047 | * Be careful not to clear PageLRU until after we're | 1072 | * Be careful not to clear PageLRU until after we're |
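With isolate_mode_t, the single active/inactive/both enum becomes a bitmask, and callers can additionally pass ISOLATE_CLEAN (skip dirty or writeback pages when the scan may not write) and ISOLATE_UNMAPPED (skip mapped pages when the scan may not unmap). The userspace model below mirrors only those checks; the flag values and the simplified page descriptor are assumptions for illustration.

```c
/* Simplified model of the mode checks in __isolate_lru_page(); flag
 * values and the page struct here are assumptions for illustration. */
#include <stdbool.h>
#include <stdio.h>

#define ISOLATE_INACTIVE (1 << 0)
#define ISOLATE_ACTIVE   (1 << 1)
#define ISOLATE_CLEAN    (1 << 2)
#define ISOLATE_UNMAPPED (1 << 3)

struct fake_page {
	bool active, dirty, writeback, mapped;
};

static bool can_isolate(const struct fake_page *p, unsigned int mode)
{
	bool all_lru = (mode & (ISOLATE_ACTIVE | ISOLATE_INACTIVE)) ==
		       (ISOLATE_ACTIVE | ISOLATE_INACTIVE);

	if (!all_lru && !p->active != !(mode & ISOLATE_ACTIVE))
		return false;			/* wrong LRU for this scan */
	if ((mode & ISOLATE_CLEAN) && (p->dirty || p->writeback))
		return false;			/* caller cannot write pages back */
	if ((mode & ISOLATE_UNMAPPED) && p->mapped)
		return false;			/* caller cannot unmap pages */
	return true;
}

int main(void)
{
	struct fake_page dirty_inactive = { .dirty = true };

	printf("%d\n", can_isolate(&dirty_inactive, ISOLATE_INACTIVE));
	printf("%d\n", can_isolate(&dirty_inactive, ISOLATE_INACTIVE | ISOLATE_CLEAN));
	return 0;
}
```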
@@ -1077,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1077 | */ | 1102 | */ |
1078 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1103 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1079 | struct list_head *src, struct list_head *dst, | 1104 | struct list_head *src, struct list_head *dst, |
1080 | unsigned long *scanned, int order, int mode, int file) | 1105 | unsigned long *scanned, int order, isolate_mode_t mode, |
1106 | int file) | ||
1081 | { | 1107 | { |
1082 | unsigned long nr_taken = 0; | 1108 | unsigned long nr_taken = 0; |
1083 | unsigned long nr_lumpy_taken = 0; | 1109 | unsigned long nr_lumpy_taken = 0; |
@@ -1202,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1202 | static unsigned long isolate_pages_global(unsigned long nr, | 1228 | static unsigned long isolate_pages_global(unsigned long nr, |
1203 | struct list_head *dst, | 1229 | struct list_head *dst, |
1204 | unsigned long *scanned, int order, | 1230 | unsigned long *scanned, int order, |
1205 | int mode, struct zone *z, | 1231 | isolate_mode_t mode, |
1206 | int active, int file) | 1232 | struct zone *z, int active, int file) |
1207 | { | 1233 | { |
1208 | int lru = LRU_BASE; | 1234 | int lru = LRU_BASE; |
1209 | if (active) | 1235 | if (active) |
@@ -1349,8 +1375,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1349 | int file = is_file_lru(lru); | 1375 | int file = is_file_lru(lru); |
1350 | int numpages = hpage_nr_pages(page); | 1376 | int numpages = hpage_nr_pages(page); |
1351 | reclaim_stat->recent_rotated[file] += numpages; | 1377 | reclaim_stat->recent_rotated[file] += numpages; |
1352 | if (!scanning_global_lru(sc)) | ||
1353 | sc->memcg_record->nr_rotated[file] += numpages; | ||
1354 | } | 1378 | } |
1355 | if (!pagevec_add(&pvec, page)) { | 1379 | if (!pagevec_add(&pvec, page)) { |
1356 | spin_unlock_irq(&zone->lru_lock); | 1380 | spin_unlock_irq(&zone->lru_lock); |
@@ -1394,14 +1418,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, | |||
1394 | 1418 | ||
1395 | reclaim_stat->recent_scanned[0] += *nr_anon; | 1419 | reclaim_stat->recent_scanned[0] += *nr_anon; |
1396 | reclaim_stat->recent_scanned[1] += *nr_file; | 1420 | reclaim_stat->recent_scanned[1] += *nr_file; |
1397 | if (!scanning_global_lru(sc)) { | ||
1398 | sc->memcg_record->nr_scanned[0] += *nr_anon; | ||
1399 | sc->memcg_record->nr_scanned[1] += *nr_file; | ||
1400 | } | ||
1401 | } | 1421 | } |
1402 | 1422 | ||
1403 | /* | 1423 | /* |
1404 | * Returns true if the caller should wait to clean dirty/writeback pages. | 1424 | * Returns true if a direct reclaim should wait on pages under writeback. |
1405 | * | 1425 | * |
1406 | * If we are direct reclaiming for contiguous pages and we do not reclaim | 1426 | * If we are direct reclaiming for contiguous pages and we do not reclaim |
1407 | * everything in the list, try again and wait for writeback IO to complete. | 1427 | * everything in the list, try again and wait for writeback IO to complete. |
@@ -1423,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
1423 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) | 1443 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) |
1424 | return false; | 1444 | return false; |
1425 | 1445 | ||
1426 | /* If we have relaimed everything on the isolated list, no stall */ | 1446 | /* If we have reclaimed everything on the isolated list, no stall */ |
1427 | if (nr_freed == nr_taken) | 1447 | if (nr_freed == nr_taken) |
1428 | return false; | 1448 | return false; |
1429 | 1449 | ||
@@ -1455,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1455 | unsigned long nr_taken; | 1475 | unsigned long nr_taken; |
1456 | unsigned long nr_anon; | 1476 | unsigned long nr_anon; |
1457 | unsigned long nr_file; | 1477 | unsigned long nr_file; |
1478 | unsigned long nr_dirty = 0; | ||
1479 | unsigned long nr_writeback = 0; | ||
1480 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; | ||
1458 | 1481 | ||
1459 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1482 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1460 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1483 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1465,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1465 | } | 1488 | } |
1466 | 1489 | ||
1467 | set_reclaim_mode(priority, sc, false); | 1490 | set_reclaim_mode(priority, sc, false); |
1491 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
1492 | reclaim_mode |= ISOLATE_ACTIVE; | ||
1493 | |||
1468 | lru_add_drain(); | 1494 | lru_add_drain(); |
1495 | |||
1496 | if (!sc->may_unmap) | ||
1497 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1498 | if (!sc->may_writepage) | ||
1499 | reclaim_mode |= ISOLATE_CLEAN; | ||
1500 | |||
1469 | spin_lock_irq(&zone->lru_lock); | 1501 | spin_lock_irq(&zone->lru_lock); |
1470 | 1502 | ||
1471 | if (scanning_global_lru(sc)) { | 1503 | if (scanning_global_lru(sc)) { |
1472 | nr_taken = isolate_pages_global(nr_to_scan, | 1504 | nr_taken = isolate_pages_global(nr_to_scan, &page_list, |
1473 | &page_list, &nr_scanned, sc->order, | 1505 | &nr_scanned, sc->order, reclaim_mode, zone, 0, file); |
1474 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | ||
1475 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1476 | zone, 0, file); | ||
1477 | zone->pages_scanned += nr_scanned; | 1506 | zone->pages_scanned += nr_scanned; |
1478 | if (current_is_kswapd()) | 1507 | if (current_is_kswapd()) |
1479 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | 1508 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, |
@@ -1482,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1482 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1511 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
1483 | nr_scanned); | 1512 | nr_scanned); |
1484 | } else { | 1513 | } else { |
1485 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1514 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, |
1486 | &page_list, &nr_scanned, sc->order, | 1515 | &nr_scanned, sc->order, reclaim_mode, zone, |
1487 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | 1516 | sc->mem_cgroup, 0, file); |
1488 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1489 | zone, sc->mem_cgroup, | ||
1490 | 0, file); | ||
1491 | /* | 1517 | /* |
1492 | * mem_cgroup_isolate_pages() keeps track of | 1518 | * mem_cgroup_isolate_pages() keeps track of |
1493 | * scanned pages on its own. | 1519 | * scanned pages on its own. |
@@ -1503,17 +1529,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1503 | 1529 | ||
1504 | spin_unlock_irq(&zone->lru_lock); | 1530 | spin_unlock_irq(&zone->lru_lock); |
1505 | 1531 | ||
1506 | nr_reclaimed = shrink_page_list(&page_list, zone, sc); | 1532 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, |
1533 | &nr_dirty, &nr_writeback); | ||
1507 | 1534 | ||
1508 | /* Check if we should synchronously wait for writeback */ | 1535 | /* Check if we should synchronously wait for writeback */ |
1509 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1536 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1510 | set_reclaim_mode(priority, sc, true); | 1537 | set_reclaim_mode(priority, sc, true); |
1511 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1538 | nr_reclaimed += shrink_page_list(&page_list, zone, sc, |
1539 | priority, &nr_dirty, &nr_writeback); | ||
1512 | } | 1540 | } |
1513 | 1541 | ||
1514 | if (!scanning_global_lru(sc)) | ||
1515 | sc->memcg_record->nr_freed[file] += nr_reclaimed; | ||
1516 | |||
1517 | local_irq_disable(); | 1542 | local_irq_disable(); |
1518 | if (current_is_kswapd()) | 1543 | if (current_is_kswapd()) |
1519 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); | 1544 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); |
@@ -1521,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1521 | 1546 | ||
1522 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); | 1547 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); |
1523 | 1548 | ||
1549 | /* | ||
1550 | * If reclaim is isolating dirty pages under writeback, it implies | ||
1551 | * that the long-lived page allocation rate is exceeding the page | ||
1552 | * laundering rate. Either the global limits are not being effective | ||
1553 | * at throttling processes due to the page distribution throughout | ||
1554 | * zones or there is heavy usage of a slow backing device. The | ||
1555 | * only option is to throttle from reclaim context which is not ideal | ||
1556 | * as there is no guarantee the dirtying process is throttled in the | ||
1557 | * same way balance_dirty_pages() manages. | ||
1558 | * | ||
1559 | * This scales the number of dirty pages that must be under writeback | ||
1560 | * before throttling depending on priority. It is a simple backoff | ||
1561 | * function that has the most effect in the range DEF_PRIORITY to | ||
1562 | * DEF_PRIORITY-2, which is the range in which reclaim is | ||
1563 | * considered to be in trouble. | ||
1564 | * | ||
1565 | * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle | ||
1566 | * DEF_PRIORITY-1 50% must be PageWriteback | ||
1567 | * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble | ||
1568 | * ... | ||
1569 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any | ||
1570 | * isolated page is PageWriteback | ||
1571 | */ | ||
1572 | if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) | ||
1573 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | ||
1574 | |||
1524 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1575 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
1525 | zone_idx(zone), | 1576 | zone_idx(zone), |
1526 | nr_scanned, nr_reclaimed, | 1577 | nr_scanned, nr_reclaimed, |
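The comment above describes a simple backoff: the share of isolated pages that must be under writeback before reclaim throttles halves with each priority drop, implemented as the test nr_writeback >= nr_taken >> (DEF_PRIORITY - priority). A small sketch of that threshold calculation follows; DEF_PRIORITY is assumed to be 12 (the usual kernel value) and the sample page counts are made up.

```c
/* Demonstrates the nr_taken >> (DEF_PRIORITY - priority) backoff used
 * to decide when to throttle; the sample inputs are made up. */
#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12		/* assumed to match the kernel default */

static bool should_throttle(unsigned long nr_taken, unsigned long nr_writeback,
			    int priority)
{
	unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);

	return nr_writeback && nr_writeback >= threshold;
}

int main(void)
{
	int priority;

	/* 32 isolated pages, 16 of them under writeback */
	for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 3; priority--)
		printf("priority %2d: threshold %2lu -> throttle=%d\n",
		       priority, 32UL >> (DEF_PRIORITY - priority),
		       should_throttle(32, 16, priority));
	return 0;
}
```

Running it shows the 100%/50%/25% progression called out in the comment: at DEF_PRIORITY no throttling occurs, and it kicks in once half the isolated pages are under writeback at DEF_PRIORITY-1.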
@@ -1592,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1592 | struct page *page; | 1643 | struct page *page; |
1593 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1644 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1594 | unsigned long nr_rotated = 0; | 1645 | unsigned long nr_rotated = 0; |
1646 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; | ||
1595 | 1647 | ||
1596 | lru_add_drain(); | 1648 | lru_add_drain(); |
1649 | |||
1650 | if (!sc->may_unmap) | ||
1651 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1652 | if (!sc->may_writepage) | ||
1653 | reclaim_mode |= ISOLATE_CLEAN; | ||
1654 | |||
1597 | spin_lock_irq(&zone->lru_lock); | 1655 | spin_lock_irq(&zone->lru_lock); |
1598 | if (scanning_global_lru(sc)) { | 1656 | if (scanning_global_lru(sc)) { |
1599 | nr_taken = isolate_pages_global(nr_pages, &l_hold, | 1657 | nr_taken = isolate_pages_global(nr_pages, &l_hold, |
1600 | &pgscanned, sc->order, | 1658 | &pgscanned, sc->order, |
1601 | ISOLATE_ACTIVE, zone, | 1659 | reclaim_mode, zone, |
1602 | 1, file); | 1660 | 1, file); |
1603 | zone->pages_scanned += pgscanned; | 1661 | zone->pages_scanned += pgscanned; |
1604 | } else { | 1662 | } else { |
1605 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, | 1663 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, |
1606 | &pgscanned, sc->order, | 1664 | &pgscanned, sc->order, |
1607 | ISOLATE_ACTIVE, zone, | 1665 | reclaim_mode, zone, |
1608 | sc->mem_cgroup, 1, file); | 1666 | sc->mem_cgroup, 1, file); |
1609 | /* | 1667 | /* |
1610 | * mem_cgroup_isolate_pages() keeps track of | 1668 | * mem_cgroup_isolate_pages() keeps track of |
@@ -1613,8 +1671,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1613 | } | 1671 | } |
1614 | 1672 | ||
1615 | reclaim_stat->recent_scanned[file] += nr_taken; | 1673 | reclaim_stat->recent_scanned[file] += nr_taken; |
1616 | if (!scanning_global_lru(sc)) | ||
1617 | sc->memcg_record->nr_scanned[file] += nr_taken; | ||
1618 | 1674 | ||
1619 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | 1675 | __count_zone_vm_events(PGREFILL, zone, pgscanned); |
1620 | if (file) | 1676 | if (file) |
@@ -1666,8 +1722,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1666 | * get_scan_ratio. | 1722 | * get_scan_ratio. |
1667 | */ | 1723 | */ |
1668 | reclaim_stat->recent_rotated[file] += nr_rotated; | 1724 | reclaim_stat->recent_rotated[file] += nr_rotated; |
1669 | if (!scanning_global_lru(sc)) | ||
1670 | sc->memcg_record->nr_rotated[file] += nr_rotated; | ||
1671 | 1725 | ||
1672 | move_active_pages_to_lru(zone, &l_active, | 1726 | move_active_pages_to_lru(zone, &l_active, |
1673 | LRU_ACTIVE + file * LRU_FILE); | 1727 | LRU_ACTIVE + file * LRU_FILE); |
@@ -1713,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
1713 | if (scanning_global_lru(sc)) | 1767 | if (scanning_global_lru(sc)) |
1714 | low = inactive_anon_is_low_global(zone); | 1768 | low = inactive_anon_is_low_global(zone); |
1715 | else | 1769 | else |
1716 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); | 1770 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); |
1717 | return low; | 1771 | return low; |
1718 | } | 1772 | } |
1719 | #else | 1773 | #else |
@@ -1756,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | |||
1756 | if (scanning_global_lru(sc)) | 1810 | if (scanning_global_lru(sc)) |
1757 | low = inactive_file_is_low_global(zone); | 1811 | low = inactive_file_is_low_global(zone); |
1758 | else | 1812 | else |
1759 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); | 1813 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); |
1760 | return low; | 1814 | return low; |
1761 | } | 1815 | } |
1762 | 1816 | ||
@@ -1808,23 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1808 | u64 fraction[2], denominator; | 1862 | u64 fraction[2], denominator; |
1809 | enum lru_list l; | 1863 | enum lru_list l; |
1810 | int noswap = 0; | 1864 | int noswap = 0; |
1811 | int force_scan = 0; | 1865 | bool force_scan = false; |
1812 | unsigned long nr_force_scan[2]; | ||
1813 | |||
1814 | 1866 | ||
1815 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1867 | /* |
1816 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1868 | * If the zone or memcg is small, nr[l] can be 0. This |
1817 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | 1869 | * results in no scanning on this priority and a potential |
1818 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 1870 | * priority drop. Global direct reclaim can go to the next |
1819 | 1871 | * zone and tends to have no problems. Global kswapd is for | |
1820 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | 1872 | * zone balancing and it needs to scan a minimum amount. When |
1821 | /* kswapd does zone balancing and need to scan this zone */ | 1873 | * reclaiming for a memcg, a priority drop can cause high |
1822 | if (scanning_global_lru(sc) && current_is_kswapd()) | 1874 | * latencies, so it's better to scan a minimum amount there as |
1823 | force_scan = 1; | 1875 | * well. |
1824 | /* memcg may have small limit and need to avoid priority drop */ | 1876 | */ |
1825 | if (!scanning_global_lru(sc)) | 1877 | if (scanning_global_lru(sc) && current_is_kswapd()) |
1826 | force_scan = 1; | 1878 | force_scan = true; |
1827 | } | 1879 | if (!scanning_global_lru(sc)) |
1880 | force_scan = true; | ||
1828 | 1881 | ||
1829 | /* If we have no swap space, do not bother scanning anon pages. */ | 1882 | /* If we have no swap space, do not bother scanning anon pages. */ |
1830 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1883 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
@@ -1832,11 +1885,14 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1832 | fraction[0] = 0; | 1885 | fraction[0] = 0; |
1833 | fraction[1] = 1; | 1886 | fraction[1] = 1; |
1834 | denominator = 1; | 1887 | denominator = 1; |
1835 | nr_force_scan[0] = 0; | ||
1836 | nr_force_scan[1] = SWAP_CLUSTER_MAX; | ||
1837 | goto out; | 1888 | goto out; |
1838 | } | 1889 | } |
1839 | 1890 | ||
1891 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1892 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1893 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1894 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1895 | |||
1840 | if (scanning_global_lru(sc)) { | 1896 | if (scanning_global_lru(sc)) { |
1841 | free = zone_page_state(zone, NR_FREE_PAGES); | 1897 | free = zone_page_state(zone, NR_FREE_PAGES); |
1842 | /* If we have very few page cache pages, | 1898 | /* If we have very few page cache pages, |
@@ -1845,8 +1901,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1845 | fraction[0] = 1; | 1901 | fraction[0] = 1; |
1846 | fraction[1] = 0; | 1902 | fraction[1] = 0; |
1847 | denominator = 1; | 1903 | denominator = 1; |
1848 | nr_force_scan[0] = SWAP_CLUSTER_MAX; | ||
1849 | nr_force_scan[1] = 0; | ||
1850 | goto out; | 1904 | goto out; |
1851 | } | 1905 | } |
1852 | } | 1906 | } |
@@ -1895,11 +1949,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1895 | fraction[0] = ap; | 1949 | fraction[0] = ap; |
1896 | fraction[1] = fp; | 1950 | fraction[1] = fp; |
1897 | denominator = ap + fp + 1; | 1951 | denominator = ap + fp + 1; |
1898 | if (force_scan) { | ||
1899 | unsigned long scan = SWAP_CLUSTER_MAX; | ||
1900 | nr_force_scan[0] = div64_u64(scan * ap, denominator); | ||
1901 | nr_force_scan[1] = div64_u64(scan * fp, denominator); | ||
1902 | } | ||
1903 | out: | 1952 | out: |
1904 | for_each_evictable_lru(l) { | 1953 | for_each_evictable_lru(l) { |
1905 | int file = is_file_lru(l); | 1954 | int file = is_file_lru(l); |
@@ -1908,20 +1957,10 @@ out: | |||
1908 | scan = zone_nr_lru_pages(zone, sc, l); | 1957 | scan = zone_nr_lru_pages(zone, sc, l); |
1909 | if (priority || noswap) { | 1958 | if (priority || noswap) { |
1910 | scan >>= priority; | 1959 | scan >>= priority; |
1960 | if (!scan && force_scan) | ||
1961 | scan = SWAP_CLUSTER_MAX; | ||
1911 | scan = div64_u64(scan * fraction[file], denominator); | 1962 | scan = div64_u64(scan * fraction[file], denominator); |
1912 | } | 1963 | } |
1913 | |||
1914 | /* | ||
1915 | * If zone is small or memcg is small, nr[l] can be 0. | ||
1916 | * This results no-scan on this priority and priority drop down. | ||
1917 | * For global direct reclaim, it can visit next zone and tend | ||
1918 | * not to have problems. For global kswapd, it's for zone | ||
1919 | * balancing and it need to scan a small amounts. When using | ||
1920 | * memcg, priority drop can cause big latency. So, it's better | ||
1921 | * to scan small amount. See may_noscan above. | ||
1922 | */ | ||
1923 | if (!scan && force_scan) | ||
1924 | scan = nr_force_scan[file]; | ||
1925 | nr[l] = scan; | 1964 | nr[l] = scan; |
1926 | } | 1965 | } |
1927 | } | 1966 | } |
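In the reworked get_scan_count(), the SWAP_CLUSTER_MAX floor is applied before the proportional anon/file split: the LRU size is shifted right by priority, bumped to SWAP_CLUSTER_MAX if force_scan would otherwise see zero, and only then scaled by fraction/denominator. The sketch below reproduces that arithmetic with plain 64-bit division in place of div64_u64(); the LRU sizes and fractions are invented for illustration.

```c
/* Models the per-LRU scan target computed in get_scan_count(); the LRU
 * sizes and anon/file fractions below are invented for illustration. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL	/* assumed to match the kernel value */

static unsigned long scan_target(unsigned long lru_pages, int priority,
				 bool force_scan, uint64_t fraction,
				 uint64_t denominator)
{
	unsigned long scan = lru_pages;

	if (priority) {
		scan >>= priority;
		if (!scan && force_scan)
			scan = SWAP_CLUSTER_MAX;	/* guarantee some progress */
		scan = (unsigned long)(scan * fraction / denominator);
	}
	return scan;
}

int main(void)
{
	/* tiny memcg LRU: without the floor, 100 >> 12 would be 0 */
	printf("small LRU, forced: %lu\n", scan_target(100, 12, true, 1, 3));
	printf("large LRU:         %lu\n", scan_target(1UL << 20, 12, false, 1, 3));
	return 0;
}
```

With the floor folded into the proportional calculation, the separate nr_force_scan[] array removed in the hunks above is no longer needed.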
@@ -2000,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone, | |||
2000 | enum lru_list l; | 2039 | enum lru_list l; |
2001 | unsigned long nr_reclaimed, nr_scanned; | 2040 | unsigned long nr_reclaimed, nr_scanned; |
2002 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 2041 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
2042 | struct blk_plug plug; | ||
2003 | 2043 | ||
2004 | restart: | 2044 | restart: |
2005 | nr_reclaimed = 0; | 2045 | nr_reclaimed = 0; |
2006 | nr_scanned = sc->nr_scanned; | 2046 | nr_scanned = sc->nr_scanned; |
2007 | get_scan_count(zone, sc, nr, priority); | 2047 | get_scan_count(zone, sc, nr, priority); |
2008 | 2048 | ||
2049 | blk_start_plug(&plug); | ||
2009 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 2050 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
2010 | nr[LRU_INACTIVE_FILE]) { | 2051 | nr[LRU_INACTIVE_FILE]) { |
2011 | for_each_evictable_lru(l) { | 2052 | for_each_evictable_lru(l) { |
@@ -2029,6 +2070,7 @@ restart: | |||
2029 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 2070 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
2030 | break; | 2071 | break; |
2031 | } | 2072 | } |
2073 | blk_finish_plug(&plug); | ||
2032 | sc->nr_reclaimed += nr_reclaimed; | 2074 | sc->nr_reclaimed += nr_reclaimed; |
2033 | 2075 | ||
2034 | /* | 2076 | /* |
@@ -2061,14 +2103,19 @@ restart: | |||
2061 | * | 2103 | * |
2062 | * If a zone is deemed to be full of pinned pages then just give it a light | 2104 | * If a zone is deemed to be full of pinned pages then just give it a light |
2063 | * scan then give up on it. | 2105 | * scan then give up on it. |
2106 | * | ||
2107 | * This function returns true if a zone is being reclaimed for a costly | ||
2108 | * high-order allocation and compaction is either ready to begin or deferred. | ||
2109 | * This indicates to the caller that it should retry the allocation or fail. | ||
2064 | */ | 2110 | */ |
2065 | static void shrink_zones(int priority, struct zonelist *zonelist, | 2111 | static bool shrink_zones(int priority, struct zonelist *zonelist, |
2066 | struct scan_control *sc) | 2112 | struct scan_control *sc) |
2067 | { | 2113 | { |
2068 | struct zoneref *z; | 2114 | struct zoneref *z; |
2069 | struct zone *zone; | 2115 | struct zone *zone; |
2070 | unsigned long nr_soft_reclaimed; | 2116 | unsigned long nr_soft_reclaimed; |
2071 | unsigned long nr_soft_scanned; | 2117 | unsigned long nr_soft_scanned; |
2118 | bool should_abort_reclaim = false; | ||
2072 | 2119 | ||
2073 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2120 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2074 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2121 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2083,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2083 | continue; | 2130 | continue; |
2084 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2131 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2085 | continue; /* Let kswapd poll it */ | 2132 | continue; /* Let kswapd poll it */ |
2133 | if (COMPACTION_BUILD) { | ||
2134 | /* | ||
2135 | * If we already have plenty of memory free for | ||
2136 | * compaction in this zone, don't free any more. | ||
2137 | * Even though compaction is invoked for any | ||
2138 | * non-zero order, only frequent costly order | ||
2139 | * reclamation is disruptive enough to become a | ||
2140 | * noticeable problem, like transparent huge page | ||
2141 | * allocations. | ||
2142 | */ | ||
2143 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER && | ||
2144 | (compaction_suitable(zone, sc->order) || | ||
2145 | compaction_deferred(zone))) { | ||
2146 | should_abort_reclaim = true; | ||
2147 | continue; | ||
2148 | } | ||
2149 | } | ||
2086 | /* | 2150 | /* |
2087 | * This steals pages from memory cgroups over softlimit | 2151 | * This steals pages from memory cgroups over softlimit |
2088 | * and returns the number of reclaimed pages and | 2152 | * and returns the number of reclaimed pages and |
@@ -2100,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2100 | 2164 | ||
2101 | shrink_zone(priority, zone, sc); | 2165 | shrink_zone(priority, zone, sc); |
2102 | } | 2166 | } |
2167 | |||
2168 | return should_abort_reclaim; | ||
2103 | } | 2169 | } |
2104 | 2170 | ||
2105 | static bool zone_reclaimable(struct zone *zone) | 2171 | static bool zone_reclaimable(struct zone *zone) |
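shrink_zones() now reports back to do_try_to_free_pages() when a costly high-order reclaim hits a zone where compaction is already suitable or deferred, so the caller can stop reclaiming and retry the allocation. The sketch below shows only that early-abort control flow; the zone array and the compaction_ready predicate are stand-ins, not the real compaction_suitable()/compaction_deferred() helpers.

```c
/* Sketch of the "stop reclaiming once compaction can run" decision;
 * zone data and the suitability check are stand-ins, not kernel APIs. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3	/* assumed to match the kernel value */

struct fake_zone {
	const char *name;
	bool compaction_ready;		/* stands in for suitable/deferred checks */
};

static bool shrink_zones(struct fake_zone *zones, int nr, int order)
{
	bool should_abort = false;
	int i;

	for (i = 0; i < nr; i++) {
		if (order > PAGE_ALLOC_COSTLY_ORDER && zones[i].compaction_ready) {
			should_abort = true;	/* enough free memory for compaction */
			continue;
		}
		printf("reclaiming from %s\n", zones[i].name);
	}
	return should_abort;
}

int main(void)
{
	struct fake_zone zones[] = {
		{ "Normal", true },
		{ "DMA32",  false },
	};
	int priority;

	for (priority = 2; priority >= 0; priority--)
		if (shrink_zones(zones, 2, 9))	/* order-9, e.g. a THP allocation */
			break;			/* caller retries allocation/compaction */
	return 0;
}
```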
@@ -2164,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2164 | sc->nr_scanned = 0; | 2230 | sc->nr_scanned = 0; |
2165 | if (!priority) | 2231 | if (!priority) |
2166 | disable_swap_token(sc->mem_cgroup); | 2232 | disable_swap_token(sc->mem_cgroup); |
2167 | shrink_zones(priority, zonelist, sc); | 2233 | if (shrink_zones(priority, zonelist, sc)) |
2234 | break; | ||
2235 | |||
2168 | /* | 2236 | /* |
2169 | * Don't shrink slabs when reclaiming memory from | 2237 | * Don't shrink slabs when reclaiming memory from |
2170 | * over limit cgroups | 2238 | * over limit cgroups |
@@ -2198,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2198 | */ | 2266 | */ |
2199 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; | 2267 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
2200 | if (total_scanned > writeback_threshold) { | 2268 | if (total_scanned > writeback_threshold) { |
2201 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 2269 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, |
2270 | WB_REASON_TRY_TO_FREE_PAGES); | ||
2202 | sc->may_writepage = 1; | 2271 | sc->may_writepage = 1; |
2203 | } | 2272 | } |
2204 | 2273 | ||
@@ -2268,10 +2337,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2268 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2337 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
2269 | 2338 | ||
2270 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2339 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
2271 | gfp_t gfp_mask, bool noswap, | 2340 | gfp_t gfp_mask, bool noswap, |
2272 | struct zone *zone, | 2341 | struct zone *zone, |
2273 | struct memcg_scanrecord *rec, | 2342 | unsigned long *nr_scanned) |
2274 | unsigned long *scanned) | ||
2275 | { | 2343 | { |
2276 | struct scan_control sc = { | 2344 | struct scan_control sc = { |
2277 | .nr_scanned = 0, | 2345 | .nr_scanned = 0, |
@@ -2281,9 +2349,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2281 | .may_swap = !noswap, | 2349 | .may_swap = !noswap, |
2282 | .order = 0, | 2350 | .order = 0, |
2283 | .mem_cgroup = mem, | 2351 | .mem_cgroup = mem, |
2284 | .memcg_record = rec, | ||
2285 | }; | 2352 | }; |
2286 | unsigned long start, end; | ||
2287 | 2353 | ||
2288 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2354 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2289 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2355 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
@@ -2292,7 +2358,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2292 | sc.may_writepage, | 2358 | sc.may_writepage, |
2293 | sc.gfp_mask); | 2359 | sc.gfp_mask); |
2294 | 2360 | ||
2295 | start = sched_clock(); | ||
2296 | /* | 2361 | /* |
2297 | * NOTE: Although we can get the priority field, using it | 2362 | * NOTE: Although we can get the priority field, using it |
2298 | * here is not a good idea, since it limits the pages we can scan. | 2363 | * here is not a good idea, since it limits the pages we can scan. |
@@ -2301,25 +2366,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2301 | * the priority and make it zero. | 2366 | * the priority and make it zero. |
2302 | */ | 2367 | */ |
2303 | shrink_zone(0, zone, &sc); | 2368 | shrink_zone(0, zone, &sc); |
2304 | end = sched_clock(); | ||
2305 | |||
2306 | if (rec) | ||
2307 | rec->elapsed += end - start; | ||
2308 | *scanned = sc.nr_scanned; | ||
2309 | 2369 | ||
2310 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2370 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2311 | 2371 | ||
2372 | *nr_scanned = sc.nr_scanned; | ||
2312 | return sc.nr_reclaimed; | 2373 | return sc.nr_reclaimed; |
2313 | } | 2374 | } |
2314 | 2375 | ||
2315 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 2376 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
2316 | gfp_t gfp_mask, | 2377 | gfp_t gfp_mask, |
2317 | bool noswap, | 2378 | bool noswap) |
2318 | struct memcg_scanrecord *rec) | ||
2319 | { | 2379 | { |
2320 | struct zonelist *zonelist; | 2380 | struct zonelist *zonelist; |
2321 | unsigned long nr_reclaimed; | 2381 | unsigned long nr_reclaimed; |
2322 | unsigned long start, end; | ||
2323 | int nid; | 2382 | int nid; |
2324 | struct scan_control sc = { | 2383 | struct scan_control sc = { |
2325 | .may_writepage = !laptop_mode, | 2384 | .may_writepage = !laptop_mode, |
@@ -2328,7 +2387,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2328 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2387 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2329 | .order = 0, | 2388 | .order = 0, |
2330 | .mem_cgroup = mem_cont, | 2389 | .mem_cgroup = mem_cont, |
2331 | .memcg_record = rec, | ||
2332 | .nodemask = NULL, /* we don't care the placement */ | 2390 | .nodemask = NULL, /* we don't care the placement */ |
2333 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2391 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2334 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 2392 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
@@ -2337,7 +2395,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2337 | .gfp_mask = sc.gfp_mask, | 2395 | .gfp_mask = sc.gfp_mask, |
2338 | }; | 2396 | }; |
2339 | 2397 | ||
2340 | start = sched_clock(); | ||
2341 | /* | 2398 | /* |
2342 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't | 2399 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't |
2343 | * take care of from where we get pages. So the node where we start the | 2400 | * take care of from where we get pages. So the node where we start the |
@@ -2352,9 +2409,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2352 | sc.gfp_mask); | 2409 | sc.gfp_mask); |
2353 | 2410 | ||
2354 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); | 2411 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
2355 | end = sched_clock(); | ||
2356 | if (rec) | ||
2357 | rec->elapsed += end - start; | ||
2358 | 2412 | ||
2359 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 2413 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
2360 | 2414 | ||
@@ -2529,6 +2583,9 @@ loop_again: | |||
2529 | high_wmark_pages(zone), 0, 0)) { | 2583 | high_wmark_pages(zone), 0, 0)) { |
2530 | end_zone = i; | 2584 | end_zone = i; |
2531 | break; | 2585 | break; |
2586 | } else { | ||
2587 | /* If balanced, clear the congested flag */ | ||
2588 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2532 | } | 2589 | } |
2533 | } | 2590 | } |
2534 | if (i < 0) | 2591 | if (i < 0) |
@@ -2719,6 +2776,8 @@ out: | |||
2719 | 2776 | ||
2720 | /* If balanced, clear the congested flag */ | 2777 | /* If balanced, clear the congested flag */ |
2721 | zone_clear_flag(zone, ZONE_CONGESTED); | 2778 | zone_clear_flag(zone, ZONE_CONGESTED); |
2779 | if (i <= *classzone_idx) | ||
2780 | balanced += zone->present_pages; | ||
2722 | } | 2781 | } |
2723 | } | 2782 | } |
2724 | 2783 | ||
@@ -2792,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2792 | static int kswapd(void *p) | 2851 | static int kswapd(void *p) |
2793 | { | 2852 | { |
2794 | unsigned long order, new_order; | 2853 | unsigned long order, new_order; |
2854 | unsigned balanced_order; | ||
2795 | int classzone_idx, new_classzone_idx; | 2855 | int classzone_idx, new_classzone_idx; |
2856 | int balanced_classzone_idx; | ||
2796 | pg_data_t *pgdat = (pg_data_t*)p; | 2857 | pg_data_t *pgdat = (pg_data_t*)p; |
2797 | struct task_struct *tsk = current; | 2858 | struct task_struct *tsk = current; |
2798 | 2859 | ||
@@ -2823,7 +2884,9 @@ static int kswapd(void *p) | |||
2823 | set_freezable(); | 2884 | set_freezable(); |
2824 | 2885 | ||
2825 | order = new_order = 0; | 2886 | order = new_order = 0; |
2887 | balanced_order = 0; | ||
2826 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 2888 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2889 | balanced_classzone_idx = classzone_idx; | ||
2827 | for ( ; ; ) { | 2890 | for ( ; ; ) { |
2828 | int ret; | 2891 | int ret; |
2829 | 2892 | ||
@@ -2832,7 +2895,8 @@ static int kswapd(void *p) | |||
2832 | * new request of a similar or harder type will succeed soon | 2895 | * new request of a similar or harder type will succeed soon |
2833 | * so consider going to sleep on the basis we reclaimed at | 2896 | * so consider going to sleep on the basis we reclaimed at |
2834 | */ | 2897 | */ |
2835 | if (classzone_idx >= new_classzone_idx && order == new_order) { | 2898 | if (balanced_classzone_idx >= new_classzone_idx && |
2899 | balanced_order == new_order) { | ||
2836 | new_order = pgdat->kswapd_max_order; | 2900 | new_order = pgdat->kswapd_max_order; |
2837 | new_classzone_idx = pgdat->classzone_idx; | 2901 | new_classzone_idx = pgdat->classzone_idx; |
2838 | pgdat->kswapd_max_order = 0; | 2902 | pgdat->kswapd_max_order = 0; |
@@ -2847,9 +2911,12 @@ static int kswapd(void *p) | |||
2847 | order = new_order; | 2911 | order = new_order; |
2848 | classzone_idx = new_classzone_idx; | 2912 | classzone_idx = new_classzone_idx; |
2849 | } else { | 2913 | } else { |
2850 | kswapd_try_to_sleep(pgdat, order, classzone_idx); | 2914 | kswapd_try_to_sleep(pgdat, balanced_order, |
2915 | balanced_classzone_idx); | ||
2851 | order = pgdat->kswapd_max_order; | 2916 | order = pgdat->kswapd_max_order; |
2852 | classzone_idx = pgdat->classzone_idx; | 2917 | classzone_idx = pgdat->classzone_idx; |
2918 | new_order = order; | ||
2919 | new_classzone_idx = classzone_idx; | ||
2853 | pgdat->kswapd_max_order = 0; | 2920 | pgdat->kswapd_max_order = 0; |
2854 | pgdat->classzone_idx = pgdat->nr_zones - 1; | 2921 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
2855 | } | 2922 | } |
@@ -2864,7 +2931,9 @@ static int kswapd(void *p) | |||
2864 | */ | 2931 | */ |
2865 | if (!ret) { | 2932 | if (!ret) { |
2866 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2933 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2867 | order = balance_pgdat(pgdat, order, &classzone_idx); | 2934 | balanced_classzone_idx = classzone_idx; |
2935 | balanced_order = balance_pgdat(pgdat, order, | ||
2936 | &balanced_classzone_idx); | ||
2868 | } | 2937 | } |
2869 | } | 2938 | } |
2870 | return 0; | 2939 | return 0; |
@@ -3376,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) | |||
3376 | 3445 | ||
3377 | } | 3446 | } |
3378 | 3447 | ||
3379 | /** | 3448 | static void warn_scan_unevictable_pages(void) |
3380 | * scan_zone_unevictable_pages - check unevictable list for evictable pages | ||
3381 | * @zone - zone of which to scan the unevictable list | ||
3382 | * | ||
3383 | * Scan @zone's unevictable LRU lists to check for pages that have become | ||
3384 | * evictable. Move those that have to @zone's inactive list where they | ||
3385 | * become candidates for reclaim, unless shrink_inactive_zone() decides | ||
3386 | * to reactivate them. Pages that are still unevictable are rotated | ||
3387 | * back onto @zone's unevictable list. | ||
3388 | */ | ||
3389 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ | ||
3390 | static void scan_zone_unevictable_pages(struct zone *zone) | ||
3391 | { | 3449 | { |
3392 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; | 3450 | printk_once(KERN_WARNING |
3393 | unsigned long scan; | 3451 | "The scan_unevictable_pages sysctl/node-interface has been " |
3394 | unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); | 3452 | "disabled for lack of a legitimate use case. If you have " |
3395 | 3453 | "one, please send an email to linux-mm@kvack.org.\n"); | |
3396 | while (nr_to_scan > 0) { | ||
3397 | unsigned long batch_size = min(nr_to_scan, | ||
3398 | SCAN_UNEVICTABLE_BATCH_SIZE); | ||
3399 | |||
3400 | spin_lock_irq(&zone->lru_lock); | ||
3401 | for (scan = 0; scan < batch_size; scan++) { | ||
3402 | struct page *page = lru_to_page(l_unevictable); | ||
3403 | |||
3404 | if (!trylock_page(page)) | ||
3405 | continue; | ||
3406 | |||
3407 | prefetchw_prev_lru_page(page, l_unevictable, flags); | ||
3408 | |||
3409 | if (likely(PageLRU(page) && PageUnevictable(page))) | ||
3410 | check_move_unevictable_page(page, zone); | ||
3411 | |||
3412 | unlock_page(page); | ||
3413 | } | ||
3414 | spin_unlock_irq(&zone->lru_lock); | ||
3415 | |||
3416 | nr_to_scan -= batch_size; | ||
3417 | } | ||
3418 | } | ||
3419 | |||
3420 | |||
3421 | /** | ||
3422 | * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages | ||
3423 | * | ||
3424 | * A really big hammer: scan all zones' unevictable LRU lists to check for | ||
3425 | * pages that have become evictable. Move those back to the zones' | ||
3426 | * inactive list where they become candidates for reclaim. | ||
3427 | * This occurs when, e.g., we have unswappable pages on the unevictable lists, | ||
3428 | * and we add swap to the system. As such, it runs in the context of a task | ||
3429 | * that has possibly/probably made some previously unevictable pages | ||
3430 | * evictable. | ||
3431 | */ | ||
3432 | static void scan_all_zones_unevictable_pages(void) | ||
3433 | { | ||
3434 | struct zone *zone; | ||
3435 | |||
3436 | for_each_zone(zone) { | ||
3437 | scan_zone_unevictable_pages(zone); | ||
3438 | } | ||
3439 | } | 3454 | } |
3440 | 3455 | ||
3441 | /* | 3456 | /* |
@@ -3448,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write, | |||
3448 | void __user *buffer, | 3463 | void __user *buffer, |
3449 | size_t *length, loff_t *ppos) | 3464 | size_t *length, loff_t *ppos) |
3450 | { | 3465 | { |
3466 | warn_scan_unevictable_pages(); | ||
3451 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 3467 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
3452 | |||
3453 | if (write && *(unsigned long *)table->data) | ||
3454 | scan_all_zones_unevictable_pages(); | ||
3455 | |||
3456 | scan_unevictable_pages = 0; | 3468 | scan_unevictable_pages = 0; |
3457 | return 0; | 3469 | return 0; |
3458 | } | 3470 | } |
@@ -3467,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev, | |||
3467 | struct sysdev_attribute *attr, | 3479 | struct sysdev_attribute *attr, |
3468 | char *buf) | 3480 | char *buf) |
3469 | { | 3481 | { |
3482 | warn_scan_unevictable_pages(); | ||
3470 | return sprintf(buf, "0\n"); /* always zero; should fit... */ | 3483 | return sprintf(buf, "0\n"); /* always zero; should fit... */ |
3471 | } | 3484 | } |
3472 | 3485 | ||
@@ -3474,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev, | |||
3474 | struct sysdev_attribute *attr, | 3487 | struct sysdev_attribute *attr, |
3475 | const char *buf, size_t count) | 3488 | const char *buf, size_t count) |
3476 | { | 3489 | { |
3477 | struct zone *node_zones = NODE_DATA(dev->id)->node_zones; | 3490 | warn_scan_unevictable_pages(); |
3478 | struct zone *zone; | ||
3479 | unsigned long res; | ||
3480 | unsigned long req = strict_strtoul(buf, 10, &res); | ||
3481 | |||
3482 | if (!req) | ||
3483 | return 1; /* zero is no-op */ | ||
3484 | |||
3485 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
3486 | if (!populated_zone(zone)) | ||
3487 | continue; | ||
3488 | scan_zone_unevictable_pages(zone); | ||
3489 | } | ||
3490 | return 1; | 3491 | return 1; |
3491 | } | 3492 | } |
3492 | 3493 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c18b7694b2..8fd603b1665e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu) | |||
78 | * | 78 | * |
79 | * vm_stat contains the global counters | 79 | * vm_stat contains the global counters |
80 | */ | 80 | */ |
81 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | 81 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; |
82 | EXPORT_SYMBOL(vm_stat); | 82 | EXPORT_SYMBOL(vm_stat); |
83 | 83 | ||
84 | #ifdef CONFIG_SMP | 84 | #ifdef CONFIG_SMP |
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
659 | } | 659 | } |
660 | #endif | 660 | #endif |
661 | 661 | ||
662 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) | 662 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) |
663 | #ifdef CONFIG_ZONE_DMA | 663 | #ifdef CONFIG_ZONE_DMA |
664 | #define TEXT_FOR_DMA(xx) xx "_dma", | 664 | #define TEXT_FOR_DMA(xx) xx "_dma", |
665 | #else | 665 | #else |
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = { | |||
702 | "nr_unstable", | 702 | "nr_unstable", |
703 | "nr_bounce", | 703 | "nr_bounce", |
704 | "nr_vmscan_write", | 704 | "nr_vmscan_write", |
705 | "nr_vmscan_immediate_reclaim", | ||
705 | "nr_writeback_temp", | 706 | "nr_writeback_temp", |
706 | "nr_isolated_anon", | 707 | "nr_isolated_anon", |
707 | "nr_isolated_file", | 708 | "nr_isolated_file", |
@@ -788,7 +789,7 @@ const char * const vmstat_text[] = { | |||
788 | 789 | ||
789 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 790 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
790 | }; | 791 | }; |
791 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ | 792 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
792 | 793 | ||
793 | 794 | ||
794 | #ifdef CONFIG_PROC_FS | 795 | #ifdef CONFIG_PROC_FS |