author     Luciano Coelho <coelho@ti.com>	2011-12-01 05:14:48 -0500
committer  Luciano Coelho <coelho@ti.com>	2011-12-01 05:14:48 -0500
commit     e4da3fbfbd1de56d2367653e3823e6445e49f8a9 (patch)
tree       f69f424f731b89a75f881967903ff2f38f4b6a92 /mm
parent     b693289406f0b8ca70ab77e745be6196d5740eb0 (diff)
parent     ba5736a5e9ac20c378ae4179e8a0ed3cc4b44351 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next into wl12xx-next
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             |    3
-rw-r--r--  mm/Makefile            |    3
-rw-r--r--  mm/backing-dev.c       |   36
-rw-r--r--  mm/bootmem.c           |    2
-rw-r--r--  mm/bounce.c            |   11
-rw-r--r--  mm/compaction.c        |   26
-rw-r--r--  mm/debug-pagealloc.c   |   56
-rw-r--r--  mm/dmapool.c           |    3
-rw-r--r--  mm/failslab.c          |   14
-rw-r--r--  mm/filemap.c           |  117
-rw-r--r--  mm/filemap_xip.c       |    2
-rw-r--r--  mm/fremap.c            |    1
-rw-r--r--  mm/highmem.c           |    8
-rw-r--r--  mm/huge_memory.c       |   91
-rw-r--r--  mm/internal.h          |   46
-rw-r--r--  mm/kmemleak.c          |    2
-rw-r--r--  mm/ksm.c               |    3
-rw-r--r--  mm/maccess.c           |    2
-rw-r--r--  mm/memblock.c          |   11
-rw-r--r--  mm/memcontrol.c        | 1286
-rw-r--r--  mm/memory-failure.c    |  105
-rw-r--r--  mm/memory.c            |    4
-rw-r--r--  mm/memory_hotplug.c    |    2
-rw-r--r--  mm/mempolicy.c         |   13
-rw-r--r--  mm/mempool.c           |    2
-rw-r--r--  mm/migrate.c           |   85
-rw-r--r--  mm/mincore.c           |   11
-rw-r--r--  mm/mlock.c             |   15
-rw-r--r--  mm/mm_init.c           |    2
-rw-r--r--  mm/mmap.c              |   11
-rw-r--r--  mm/mmu_context.c       |    2
-rw-r--r--  mm/mmu_notifier.c      |    2
-rw-r--r--  mm/mmzone.c            |    1
-rw-r--r--  mm/mremap.c            |   42
-rw-r--r--  mm/nobootmem.c         |    2
-rw-r--r--  mm/nommu.c             |    2
-rw-r--r--  mm/oom_kill.c          |   59
-rw-r--r--  mm/page-writeback.c    |  723
-rw-r--r--  mm/page_alloc.c        |   30
-rw-r--r--  mm/page_cgroup.c       |   12
-rw-r--r--  mm/process_vm_access.c |  496
-rw-r--r--  mm/quicklist.c         |    1
-rw-r--r--  mm/readahead.c         |    2
-rw-r--r--  mm/rmap.c              |    4
-rw-r--r--  mm/shmem.c             | 1507
-rw-r--r--  mm/slab.c              |  118
-rw-r--r--  mm/slob.c              |    2
-rw-r--r--  mm/slub.c              | 1239
-rw-r--r--  mm/sparse-vmemmap.c    |    1
-rw-r--r--  mm/sparse.c            |    2
-rw-r--r--  mm/swap.c              |   85
-rw-r--r--  mm/swap_state.c        |    1
-rw-r--r--  mm/swapfile.c          |   23
-rw-r--r--  mm/thrash.c            |    2
-rw-r--r--  mm/truncate.c          |   10
-rw-r--r--  mm/util.c              |    2
-rw-r--r--  mm/vmalloc.c           |   95
-rw-r--r--  mm/vmscan.c            |  399
-rw-r--r--  mm/vmstat.c            |    7
59 files changed, 3934 insertions, 2910 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f2f1ca19ed53..011b110365c8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -131,6 +131,9 @@ config SPARSEMEM_VMEMMAP
 config HAVE_MEMBLOCK
 	boolean
 
+config NO_BOOTMEM
+	boolean
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1bf..50ec00ef2a0e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,8 @@
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o pagewalk.o pgtable-generic.o
+			   vmalloc.o pagewalk.o pgtable-generic.o \
+			   process_vm_access.o
 
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d6edf8d14f9c..a0860640378d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -97,6 +97,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "BdiDirtyThresh:     %10lu kB\n"
 		   "DirtyThresh:        %10lu kB\n"
 		   "BackgroundThresh:   %10lu kB\n"
+		   "BdiDirtied:         %10lu kB\n"
 		   "BdiWritten:         %10lu kB\n"
 		   "BdiWriteBandwidth:  %10lu kBps\n"
 		   "b_dirty:            %10lu\n"
@@ -109,6 +110,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   K(bdi_thresh),
 		   K(dirty_thresh),
 		   K(background_thresh),
+		   (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
 		   (unsigned long) K(bdi->write_bandwidth),
 		   nr_dirty,
@@ -359,6 +361,17 @@ static unsigned long bdi_longest_inactive(void)
 	return max(5UL * 60 * HZ, interval);
 }
 
+/*
+ * Clear pending bit and wakeup anybody waiting for flusher thread creation or
+ * shutdown
+ */
+static void bdi_clear_pending(struct backing_dev_info *bdi)
+{
+	clear_bit(BDI_pending, &bdi->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&bdi->state, BDI_pending);
+}
+
 static int bdi_forker_thread(void *ptr)
 {
 	struct bdi_writeback *me = ptr;
@@ -390,6 +403,12 @@ static int bdi_forker_thread(void *ptr)
 		}
 
 		spin_lock_bh(&bdi_lock);
+		/*
+		 * In the following loop we are going to check whether we have
+		 * some work to do without any synchronization with tasks
+		 * waking us up to do work for them. Set the task state here
+		 * so that we don't miss wakeups after verifying conditions.
+		 */
 		set_current_state(TASK_INTERRUPTIBLE);
 
 		list_for_each_entry(bdi, &bdi_list, bdi_list) {
@@ -456,7 +475,8 @@ static int bdi_forker_thread(void *ptr)
 				 * the bdi from the thread. Hopefully 1024 is
 				 * large enough for efficient IO.
 				 */
-				writeback_inodes_wb(&bdi->wb, 1024);
+				writeback_inodes_wb(&bdi->wb, 1024,
+						    WB_REASON_FORKER_THREAD);
 			} else {
 				/*
 				 * The spinlock makes sure we do not lose
@@ -469,11 +489,13 @@ static int bdi_forker_thread(void *ptr)
 				spin_unlock_bh(&bdi->wb_lock);
 				wake_up_process(task);
 			}
+			bdi_clear_pending(bdi);
 			break;
 
 		case KILL_THREAD:
 			__set_current_state(TASK_RUNNING);
 			kthread_stop(task);
+			bdi_clear_pending(bdi);
 			break;
 
 		case NO_ACTION:
@@ -489,16 +511,8 @@ static int bdi_forker_thread(void *ptr)
 			else
 				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
 			try_to_freeze();
-			/* Back to the main loop */
-			continue;
+			break;
 		}
-
-		/*
-		 * Clear pending bit and wakeup anybody waiting to tear us down.
-		 */
-		clear_bit(BDI_pending, &bdi->state);
-		smp_mb__after_clear_bit();
-		wake_up_bit(&bdi->state, BDI_pending);
 	}
 
 	return 0;
@@ -672,6 +686,8 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->bw_time_stamp = jiffies;
 	bdi->written_stamp = 0;
 
+	bdi->balanced_dirty_ratelimit = INIT_BW;
+	bdi->dirty_ratelimit = INIT_BW;
 	bdi->write_bandwidth = INIT_BW;
 	bdi->avg_write_bandwidth = INIT_BW;
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 01d5a4b3dd0c..1a77012ecdb3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,7 +12,7 @@
 #include <linux/pfn.h>
 #include <linux/slab.h>
 #include <linux/bootmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kmemleak.h>
 #include <linux/range.h>
 #include <linux/memblock.h>
diff --git a/mm/bounce.c b/mm/bounce.c
index 1481de68184b..4e9ae722af83 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -4,7 +4,7 @@
  */
 
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/swap.h>
 #include <linux/gfp.h>
 #include <linux/bio.h>
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/bootmem.h>
 #include <asm/tlbflush.h>
 
 #include <trace/events/block.h>
@@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool;
 #ifdef CONFIG_HIGHMEM
 static __init int init_emergency_pool(void)
 {
-	struct sysinfo i;
-	si_meminfo(&i);
-	si_swapinfo(&i);
-
-	if (!i.totalhigh)
+#ifndef CONFIG_MEMORY_HOTPLUG
+	if (max_pfn <= max_low_pfn)
 		return 0;
+#endif
 
 	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
 	BUG_ON(!page_pool);
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc604bd5649..899d95638586 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	bool sync;			/* Synchronous migration */
 
-	/* Account for isolated anon and file pages */
-	unsigned long nr_anon;
-	unsigned long nr_file;
-
 	unsigned int order;		/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
 static void acct_isolated(struct zone *zone, struct compact_control *cc)
 {
 	struct page *page;
-	unsigned int count[NR_LRU_LISTS] = { 0, };
+	unsigned int count[2] = { 0, };
 
-	list_for_each_entry(page, &cc->migratepages, lru) {
-		int lru = page_lru_base_type(page);
-		count[lru]++;
-	}
+	list_for_each_entry(page, &cc->migratepages, lru)
+		count[!!page_is_file_cache(page)]++;
 
-	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
-	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	unsigned long last_pageblock_nr = 0, pageblock_nr;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct list_head *migratelist = &cc->migratepages;
+	isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
 
 	/* Do not scan outside zone boundaries */
 	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -356,8 +349,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 			continue;
 		}
 
+		if (!cc->sync)
+			mode |= ISOLATE_CLEAN;
+
 		/* Try isolate the page */
-		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+		if (__isolate_lru_page(page, mode, 0) != 0)
 			continue;
 
 		VM_BUG_ON(PageTransCompound(page));
@@ -586,7 +582,7 @@ out:
 	return ret;
 }
 
-unsigned long compact_zone_order(struct zone *zone,
+static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
 				 bool sync)
 {
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index a1e3324de2b5..7cea557407f4 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -1,7 +1,10 @@
 #include <linux/kernel.h>
+#include <linux/string.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/page-debug-flags.h>
 #include <linux/poison.h>
+#include <linux/ratelimit.h>
 
 static inline void set_page_poison(struct page *page)
 {
@@ -18,28 +21,13 @@ static inline bool page_poison(struct page *page)
 	return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
 }
 
-static void poison_highpage(struct page *page)
-{
-	/*
-	 * Page poisoning for highmem pages is not implemented.
-	 *
-	 * This can be called from interrupt contexts.
-	 * So we need to create a new kmap_atomic slot for this
-	 * application and it will need interrupt protection.
-	 */
-}
-
 static void poison_page(struct page *page)
 {
-	void *addr;
+	void *addr = kmap_atomic(page);
 
-	if (PageHighMem(page)) {
-		poison_highpage(page);
-		return;
-	}
 	set_page_poison(page);
-	addr = page_address(page);
 	memset(addr, PAGE_POISON, PAGE_SIZE);
+	kunmap_atomic(addr);
 }
 
 static void poison_pages(struct page *page, int n)
@@ -59,14 +47,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
 
 static void check_poison_mem(unsigned char *mem, size_t bytes)
 {
+	static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
 	unsigned char *start;
 	unsigned char *end;
 
-	for (start = mem; start < mem + bytes; start++) {
-		if (*start != PAGE_POISON)
-			break;
-	}
-	if (start == mem + bytes)
+	start = memchr_inv(mem, PAGE_POISON, bytes);
+	if (!start)
 		return;
 
 	for (end = mem + bytes - 1; end > start; end--) {
@@ -74,7 +60,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
 			break;
 	}
 
-	if (!printk_ratelimit())
+	if (!__ratelimit(&ratelimit))
 		return;
 	else if (start == end && single_bit_flip(*start, PAGE_POISON))
 		printk(KERN_ERR "pagealloc: single bit error\n");
@@ -86,27 +72,17 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
 	dump_stack();
 }
 
-static void unpoison_highpage(struct page *page)
-{
-	/*
-	 * See comment in poison_highpage().
-	 * Highmem pages should not be poisoned for now
-	 */
-	BUG_ON(page_poison(page));
-}
-
 static void unpoison_page(struct page *page)
 {
-	if (PageHighMem(page)) {
-		unpoison_highpage(page);
+	void *addr;
+
+	if (!page_poison(page))
 		return;
-	}
-	if (page_poison(page)) {
-		void *addr = page_address(page);
 
-		check_poison_mem(addr, PAGE_SIZE);
-		clear_page_poison(page);
-	}
+	addr = kmap_atomic(page);
+	check_poison_mem(addr, PAGE_SIZE);
+	clear_page_poison(page);
+	kunmap_atomic(addr);
 }
 
 static void unpoison_pages(struct page *page, int n)
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fbb58e346888..c5ab33bca0a8 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -27,11 +27,12 @@
 #include <linux/dmapool.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/poison.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/stat.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/types.h>
diff --git a/mm/failslab.c b/mm/failslab.c
index 1ce58c201dca..0dd7b8fec71c 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -34,23 +34,23 @@ __setup("failslab=", setup_failslab);
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 static int __init failslab_debugfs_init(void)
 {
+	struct dentry *dir;
 	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
-	int err;
 
-	err = init_fault_attr_dentries(&failslab.attr, "failslab");
-	if (err)
-		return err;
+	dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
 
-	if (!debugfs_create_bool("ignore-gfp-wait", mode, failslab.attr.dir,
+	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
 				&failslab.ignore_gfp_wait))
 		goto fail;
-	if (!debugfs_create_bool("cache-filter", mode, failslab.attr.dir,
+	if (!debugfs_create_bool("cache-filter", mode, dir,
 				&failslab.cache_filter))
 		goto fail;
 
 	return 0;
 fail:
-	cleanup_fault_attr_dentries(&failslab.attr);
+	debugfs_remove_recursive(dir);
 
 	return -ENOMEM;
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 867d40222ec7..c0018f2d50e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,7 @@
  * most "normal" filesystems (but you don't /have/ to use this:
  * the NFS filesystem used to do this differently, for example)
  */
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
@@ -33,7 +33,6 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include <linux/cleancache.h>
 #include "internal.h"
 
@@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	int error;
 
 	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(PageSwapBacked(page));
 
 	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & GFP_RECLAIM_MASK);
@@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	if (likely(!error)) {
 		mapping->nrpages++;
 		__inc_zone_page_state(page, NR_FILE_PAGES);
-		if (PageSwapBacked(page))
-			__inc_zone_page_state(page, NR_SHMEM);
 		spin_unlock_irq(&mapping->tree_lock);
 	} else {
 		page->mapping = NULL;
@@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 {
 	int ret;
 
-	/*
-	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
-	 * before shmem_readpage has a chance to mark them as SwapBacked: they
-	 * need to go on the anon lru below, and mem_cgroup_cache_charge
-	 * (called in add_to_page_cache) needs to know where they're going too.
-	 */
-	if (mapping_cap_swap_backed(mapping))
-		SetPageSwapBacked(page);
-
 	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0) {
-		if (page_is_file_cache(page))
-			lru_cache_add_file(page);
-		else
-			lru_cache_add_anon(page);
-	}
+	if (ret == 0)
+		lru_cache_add_file(page);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -714,9 +699,16 @@ repeat:
 		page = radix_tree_deref_slot(pagep);
 		if (unlikely(!page))
 			goto out;
-		if (radix_tree_deref_retry(page))
-			goto repeat;
-
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto repeat;
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so return it without
+			 * attempting to raise page count.
+			 */
+			goto out;
+		}
 		if (!page_cache_get_speculative(page))
 			goto repeat;
 
@@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 
 repeat:
 	page = find_get_page(mapping, offset);
-	if (page) {
+	if (page && !radix_tree_exception(page)) {
 		lock_page(page);
 		/* Has the page been truncated? */
 		if (unlikely(page->mapping != mapping)) {
@@ -835,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
 	unsigned int i;
 	unsigned int ret;
-	unsigned int nr_found;
+	unsigned int nr_found, nr_skip;
 
 	rcu_read_lock();
 restart:
 	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-				(void ***)pages, start, nr_pages);
+				(void ***)pages, NULL, start, nr_pages);
 	ret = 0;
+	nr_skip = 0;
 	for (i = 0; i < nr_found; i++) {
 		struct page *page;
 repeat:
@@ -849,13 +842,23 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page)) {
-			WARN_ON(start | i);
-			goto restart;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				WARN_ON(start | i);
+				goto restart;
+			}
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so skip over it -
+			 * we only reach this from invalidate_mapping_pages().
+			 */
+			nr_skip++;
+			continue;
 		}
 
 		if (!page_cache_get_speculative(page))
@@ -875,7 +878,7 @@ repeat:
 	 * If all entries were removed before we could secure them,
 	 * try again, because callers stop trying once 0 is returned.
 	 */
-	if (unlikely(!ret && nr_found))
+	if (unlikely(!ret && nr_found > nr_skip))
 		goto restart;
 	rcu_read_unlock();
 	return ret;
@@ -903,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 	rcu_read_lock();
 restart:
 	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-				(void ***)pages, index, nr_pages);
+				(void ***)pages, NULL, index, nr_pages);
 	ret = 0;
 	for (i = 0; i < nr_found; i++) {
 		struct page *page;
@@ -912,12 +915,22 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page))
-			goto restart;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so stop looking for
+			 * contiguous pages.
+			 */
+			break;
+		}
 
 		if (!page_cache_get_speculative(page))
 			goto repeat;
@@ -977,12 +990,21 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page))
-			goto restart;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
+			/*
+			 * This function is never used on a shmem/tmpfs
+			 * mapping, so a swap entry won't be found here.
+			 */
+			BUG();
+		}
 
 		if (!page_cache_get_speculative(page))
 			goto repeat;
@@ -2093,6 +2115,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 	} else {
 		const struct iovec *iov = i->iov;
 		size_t base = i->iov_offset;
+		unsigned long nr_segs = i->nr_segs;
 
 		/*
 		 * The !iov->iov_len check ensures we skip over unlikely
@@ -2108,11 +2131,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 			base += copy;
 			if (iov->iov_len == base) {
 				iov++;
+				nr_segs--;
 				base = 0;
 			}
 		}
 		i->iov = iov;
 		i->iov_offset = base;
+		i->nr_segs = nr_segs;
 	}
 }
 EXPORT_SYMBOL(iov_iter_advance);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 93356cd12828..f91b2f687343 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -10,7 +10,7 @@
 
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/uio.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
diff --git a/mm/fremap.c b/mm/fremap.c
index b8e0e2d468af..9ed4fd432467 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -13,7 +13,6 @@
 #include <linux/pagemap.h>
 #include <linux/swapops.h>
 #include <linux/rmap.h>
-#include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/mmu_notifier.h>
 
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2ed..57d82c6250c3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -17,7 +17,7 @@
  */
 
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
@@ -250,7 +250,7 @@ void *kmap_high_get(struct page *page)
 #endif
 
 /**
- * kunmap_high - map a highmem page into memory
+ * kunmap_high - unmap a highmem page into memory
 * @page: &struct page to unmap
 *
 * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
@@ -326,7 +326,7 @@ static struct page_address_slot {
 	spinlock_t lock;		/* Protect this bucket's list */
 } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
 
-static struct page_address_slot *page_slot(struct page *page)
+static struct page_address_slot *page_slot(const struct page *page)
 {
 	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
 }
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page)
 *
 * Returns the page's virtual address.
 */
-void *page_address(struct page *page)
+void *page_address(const struct page *page)
 {
 	unsigned long flags;
 	void *ret;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2d1587be269..4298abaae153 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,7 +89,8 @@ struct khugepaged_scan {
 	struct list_head mm_head;
 	struct mm_slot *mm_slot;
 	unsigned long address;
-} khugepaged_scan = {
+};
+static struct khugepaged_scan khugepaged_scan = {
 	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
@@ -829,7 +830,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
 		copy_user_highpage(pages[i], page + i,
-				   haddr + PAGE_SHIFT*i, vma);
+				   haddr + PAGE_SIZE * i, vma);
 		__SetPageUptodate(pages[i]);
 		cond_resched();
 	}
@@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON(!PageCompound(page));
 	if (flags & FOLL_GET)
-		get_page(page);
+		get_page_foll(page);
 
 out:
 	return page;
@@ -1052,6 +1053,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	return ret;
 }
 
+int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+		  unsigned long old_addr,
+		  unsigned long new_addr, unsigned long old_end,
+		  pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	int ret = 0;
+	pmd_t pmd;
+
+	struct mm_struct *mm = vma->vm_mm;
+
+	if ((old_addr & ~HPAGE_PMD_MASK) ||
+	    (new_addr & ~HPAGE_PMD_MASK) ||
+	    old_end - old_addr < HPAGE_PMD_SIZE ||
+	    (new_vma->vm_flags & VM_NOHUGEPAGE))
+		goto out;
+
+	/*
+	 * The destination pmd shouldn't be established, free_pgtables()
+	 * should have release it.
+	 */
+	if (WARN_ON(!pmd_none(*new_pmd))) {
+		VM_BUG_ON(pmd_trans_huge(*new_pmd));
+		goto out;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	if (likely(pmd_trans_huge(*old_pmd))) {
+		if (pmd_trans_splitting(*old_pmd)) {
+			spin_unlock(&mm->page_table_lock);
+			wait_split_huge_page(vma->anon_vma, old_pmd);
+			ret = -1;
+		} else {
+			pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+			VM_BUG_ON(!pmd_none(*new_pmd));
+			set_pmd_at(mm, new_addr, new_pmd, pmd);
+			spin_unlock(&mm->page_table_lock);
+			ret = 1;
+		}
+	} else {
+		spin_unlock(&mm->page_table_lock);
+	}
+out:
+	return ret;
+}
+
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, pgprot_t newprot)
 {
@@ -1156,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page)
 	unsigned long head_index = page->index;
 	struct zone *zone = page_zone(page);
 	int zonestat;
+	int tail_count = 0;
 
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
 	spin_lock_irq(&zone->lru_lock);
@@ -1164,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page)
 	for (i = 1; i < HPAGE_PMD_NR; i++) {
 		struct page *page_tail = page + i;
 
-		/* tail_page->_count cannot change */
-		atomic_sub(atomic_read(&page_tail->_count), &page->_count);
-		BUG_ON(page_count(page) <= 0);
-		atomic_add(page_mapcount(page) + 1, &page_tail->_count);
-		BUG_ON(atomic_read(&page_tail->_count) <= 0);
+		/* tail_page->_mapcount cannot change */
+		BUG_ON(page_mapcount(page_tail) < 0);
+		tail_count += page_mapcount(page_tail);
+		/* check for overflow */
+		BUG_ON(tail_count < 0);
+		BUG_ON(atomic_read(&page_tail->_count) != 0);
+		/*
+		 * tail_page->_count is zero and not changing from
+		 * under us. But get_page_unless_zero() may be running
+		 * from under us on the tail_page. If we used
+		 * atomic_set() below instead of atomic_add(), we
+		 * would then run atomic_set() concurrently with
+		 * get_page_unless_zero(), and atomic_set() is
+		 * implemented in C not using locked ops. spin_unlock
+		 * on x86 sometime uses locked ops because of PPro
+		 * errata 66, 92, so unless somebody can guarantee
+		 * atomic_set() here would be safe on all archs (and
+		 * not only on x86), it's safer to use atomic_add().
+		 */
+		atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+			   &page_tail->_count);
 
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb();
@@ -1186,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page)
 				      (1L << PG_uptodate)));
 		page_tail->flags |= (1L << PG_dirty);
 
-		/*
-		 * 1) clear PageTail before overwriting first_page
-		 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
-		 */
+		/* clear PageTail before overwriting first_page */
 		smp_wmb();
 
 		/*
@@ -1206,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page)
 		 * status is achieved setting a reserved bit in the
 		 * pmd, not by clearing the present bit.
 		 */
-		BUG_ON(page_mapcount(page_tail));
 		page_tail->_mapcount = page->_mapcount;
 
 		BUG_ON(page_tail->mapping);
@@ -1223,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page)
 
 		lru_add_page_tail(zone, page, page_tail);
 	}
+	atomic_sub(tail_count, &page->_count);
+	BUG_ON(atomic_read(&page->_count) <= 0);
 
 	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
 	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
@@ -1906,7 +1967,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
 	set_pmd_at(mm, address, pmd, _pmd);
-	update_mmu_cache(vma, address, entry);
+	update_mmu_cache(vma, address, _pmd);
 	prepare_pmd_huge_pte(pgtable, mm);
 	mm->nr_ptes--;
 	spin_unlock(&mm->page_table_lock);
@@ -2024,6 +2085,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 					    struct page **hpage)
+	__releases(&khugepaged_mm_lock)
+	__acquires(&khugepaged_mm_lock)
 {
 	struct mm_slot *mm_slot;
 	struct mm_struct *mm;
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb49..2189af491783 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+static inline void __get_page_tail_foll(struct page *page,
+					bool get_page_head)
+{
+	/*
+	 * If we're getting a tail page, the elevated page->_count is
+	 * required only in the head page and we will elevate the head
+	 * page->_count and tail page->_mapcount.
+	 *
+	 * We elevate page_tail->_mapcount for tail pages to force
+	 * page_tail->_count to be zero at all times to avoid getting
+	 * false positives from get_page_unless_zero() with
+	 * speculative page access (like in
+	 * page_cache_get_speculative()) on tail pages.
+	 */
+	VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	if (get_page_head)
+		atomic_inc(&page->first_page->_count);
+	atomic_inc(&page->_mapcount);
+}
+
+/*
+ * This is meant to be called as the FOLL_GET operation of
+ * follow_page() and it must be called while holding the proper PT
+ * lock while the pte (or pmd_trans_huge) is still mapping the page.
+ */
+static inline void get_page_foll(struct page *page)
+{
+	if (unlikely(PageTail(page)))
+		/*
+		 * This is safe only because
+		 * __split_huge_page_refcount() can't run under
+		 * get_page_foll() because we hold the proper PT lock.
+		 */
+		__get_page_tail_foll(page, true);
+	else {
+		/*
+		 * Getting a normal page or the head of a compound page
+		 * requires to already have an elevated page->_count.
+		 */
+		VM_BUG_ON(atomic_read(&page->_count) <= 0);
+		atomic_inc(&page->_count);
+	}
+}
+
 extern unsigned long highest_memmap_pfn;
 
 /*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d6880f542f95..f3b2a00fe9c1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -69,7 +69,7 @@
 #include <linux/sched.h>
 #include <linux/jiffies.h>
 #include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kthread.h>
 #include <linux/prio_tree.h>
 #include <linux/fs.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index 9a68b0cf0a1c..310544a379ae 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1905,7 +1905,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 
 		oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
 		err = unmerge_and_remove_all_rmap_items();
-		test_set_oom_score_adj(oom_score_adj);
+		compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
+							oom_score_adj);
 		if (err) {
 			ksm_run = KSM_RUN_STOP;
 			count = err;
diff --git a/mm/maccess.c b/mm/maccess.c
index 4cee182ab5f3..d53adf9ba84b 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,7 +1,7 @@
 /*
  * Access kernel memory without faulting.
  */
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 
diff --git a/mm/memblock.c b/mm/memblock.c
index ccbf97339592..84bec4969ed5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -58,7 +58,8 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
 	return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
 }
 
-long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+					phys_addr_t base, phys_addr_t size)
 {
 	unsigned long i;
 
@@ -267,7 +268,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	return 0;
 }
 
-extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
+int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
 					  phys_addr_t addr2, phys_addr_t size2)
 {
 	return 1;
@@ -626,6 +627,12 @@ phys_addr_t __init memblock_phys_mem_size(void)
 	return memblock.memory_size;
 }
 
+/* lowest address */
+phys_addr_t __init_memblock memblock_start_of_DRAM(void)
+{
+	return memblock.memory.regions[0].base;
+}
+
 phys_addr_t __init_memblock memblock_end_of_DRAM(void)
 {
 	int idx = memblock.memory.cnt - 1;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5f84d2351ddb..6aff93c98aca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,9 +33,9 @@
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h> 34#include <linux/rcupdate.h>
35#include <linux/limits.h> 35#include <linux/limits.h>
36#include <linux/export.h>
36#include <linux/mutex.h> 37#include <linux/mutex.h>
37#include <linux/rbtree.h> 38#include <linux/rbtree.h>
38#include <linux/shmem_fs.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/swap.h> 40#include <linux/swap.h>
41#include <linux/swapops.h> 41#include <linux/swapops.h>
@@ -202,52 +202,8 @@ struct mem_cgroup_eventfd_list {
202 struct eventfd_ctx *eventfd; 202 struct eventfd_ctx *eventfd;
203}; 203};
204 204
205static void mem_cgroup_threshold(struct mem_cgroup *mem); 205static void mem_cgroup_threshold(struct mem_cgroup *memcg);
206static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 206static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
207
208enum {
209 SCAN_BY_LIMIT,
210 SCAN_BY_SYSTEM,
211 NR_SCAN_CONTEXT,
212 SCAN_BY_SHRINK, /* not recorded now */
213};
214
215enum {
216 SCAN,
217 SCAN_ANON,
218 SCAN_FILE,
219 ROTATE,
220 ROTATE_ANON,
221 ROTATE_FILE,
222 FREED,
223 FREED_ANON,
224 FREED_FILE,
225 ELAPSED,
226 NR_SCANSTATS,
227};
228
229struct scanstat {
230 spinlock_t lock;
231 unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
232 unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
233};
234
235const char *scanstat_string[NR_SCANSTATS] = {
236 "scanned_pages",
237 "scanned_anon_pages",
238 "scanned_file_pages",
239 "rotated_pages",
240 "rotated_anon_pages",
241 "rotated_file_pages",
242 "freed_pages",
243 "freed_anon_pages",
244 "freed_file_pages",
245 "elapsed_ns",
246};
247#define SCANSTAT_WORD_LIMIT "_by_limit"
248#define SCANSTAT_WORD_SYSTEM "_by_system"
249#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"
250
251 207
252/* 208/*
253 * The memory controller data structure. The memory controller controls both 209 * The memory controller data structure. The memory controller controls both
@@ -314,8 +270,7 @@ struct mem_cgroup {
314 270
315 /* For oom notifier event fd */ 271 /* For oom notifier event fd */
316 struct list_head oom_notify; 272 struct list_head oom_notify;
317 /* For recording LRU-scan statistics */ 273
318 struct scanstat scanstat;
319 /* 274 /*
320 * Should we move charges of a task when a task is moved into this 275 * Should we move charges of a task when a task is moved into this
321 * mem_cgroup ? And what type of charges should we move ? 276 * mem_cgroup ? And what type of charges should we move ?
@@ -408,29 +363,29 @@ enum charge_type {
408#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 363#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
409#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 364#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
410 365
411static void mem_cgroup_get(struct mem_cgroup *mem); 366static void mem_cgroup_get(struct mem_cgroup *memcg);
412static void mem_cgroup_put(struct mem_cgroup *mem); 367static void mem_cgroup_put(struct mem_cgroup *memcg);
413static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 368static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
414static void drain_all_stock_async(struct mem_cgroup *mem); 369static void drain_all_stock_async(struct mem_cgroup *memcg);
415 370
416static struct mem_cgroup_per_zone * 371static struct mem_cgroup_per_zone *
417mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 372mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
418{ 373{
419 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 374 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
420} 375}
421 376
422struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 377struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
423{ 378{
424 return &mem->css; 379 return &memcg->css;
425} 380}
426 381
427static struct mem_cgroup_per_zone * 382static struct mem_cgroup_per_zone *
428page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) 383page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
429{ 384{
430 int nid = page_to_nid(page); 385 int nid = page_to_nid(page);
431 int zid = page_zonenum(page); 386 int zid = page_zonenum(page);
432 387
433 return mem_cgroup_zoneinfo(mem, nid, zid); 388 return mem_cgroup_zoneinfo(memcg, nid, zid);
434} 389}
435 390
436static struct mem_cgroup_tree_per_zone * 391static struct mem_cgroup_tree_per_zone *
@@ -449,7 +404,7 @@ soft_limit_tree_from_page(struct page *page)
449} 404}
450 405
451static void 406static void
452__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 407__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
453 struct mem_cgroup_per_zone *mz, 408 struct mem_cgroup_per_zone *mz,
454 struct mem_cgroup_tree_per_zone *mctz, 409 struct mem_cgroup_tree_per_zone *mctz,
455 unsigned long long new_usage_in_excess) 410 unsigned long long new_usage_in_excess)
@@ -483,7 +438,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
483} 438}
484 439
485static void 440static void
486__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 441__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
487 struct mem_cgroup_per_zone *mz, 442 struct mem_cgroup_per_zone *mz,
488 struct mem_cgroup_tree_per_zone *mctz) 443 struct mem_cgroup_tree_per_zone *mctz)
489{ 444{
@@ -494,17 +449,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
494} 449}
495 450
496static void 451static void
497mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 452mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
498 struct mem_cgroup_per_zone *mz, 453 struct mem_cgroup_per_zone *mz,
499 struct mem_cgroup_tree_per_zone *mctz) 454 struct mem_cgroup_tree_per_zone *mctz)
500{ 455{
501 spin_lock(&mctz->lock); 456 spin_lock(&mctz->lock);
502 __mem_cgroup_remove_exceeded(mem, mz, mctz); 457 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
503 spin_unlock(&mctz->lock); 458 spin_unlock(&mctz->lock);
504} 459}
505 460
506 461
507static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 462static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
508{ 463{
509 unsigned long long excess; 464 unsigned long long excess;
510 struct mem_cgroup_per_zone *mz; 465 struct mem_cgroup_per_zone *mz;
@@ -517,9 +472,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
517 * Necessary to update all ancestors when hierarchy is used. 472 * Necessary to update all ancestors when hierarchy is used.
518 * because their event counter is not touched. 473 * because their event counter is not touched.
519 */ 474 */
520 for (; mem; mem = parent_mem_cgroup(mem)) { 475 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
521 mz = mem_cgroup_zoneinfo(mem, nid, zid); 476 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
522 excess = res_counter_soft_limit_excess(&mem->res); 477 excess = res_counter_soft_limit_excess(&memcg->res);
523 /* 478 /*
524 * We have to update the tree if mz is on RB-tree or 479 * We have to update the tree if mz is on RB-tree or
525 * mem is over its softlimit. 480 * mem is over its softlimit.
@@ -528,18 +483,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
528 spin_lock(&mctz->lock); 483 spin_lock(&mctz->lock);
529 /* if on-tree, remove it */ 484 /* if on-tree, remove it */
530 if (mz->on_tree) 485 if (mz->on_tree)
531 __mem_cgroup_remove_exceeded(mem, mz, mctz); 486 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
532 /* 487 /*
533 * Insert again. mz->usage_in_excess will be updated. 488 * Insert again. mz->usage_in_excess will be updated.
534 * If excess is 0, no tree ops. 489 * If excess is 0, no tree ops.
535 */ 490 */
536 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 491 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
537 spin_unlock(&mctz->lock); 492 spin_unlock(&mctz->lock);
538 } 493 }
539 } 494 }
540} 495}
541 496
542static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 497static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
543{ 498{
544 int node, zone; 499 int node, zone;
545 struct mem_cgroup_per_zone *mz; 500 struct mem_cgroup_per_zone *mz;
@@ -547,9 +502,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
547 502
548 for_each_node_state(node, N_POSSIBLE) { 503 for_each_node_state(node, N_POSSIBLE) {
549 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 504 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
550 mz = mem_cgroup_zoneinfo(mem, node, zone); 505 mz = mem_cgroup_zoneinfo(memcg, node, zone);
551 mctz = soft_limit_tree_node_zone(node, zone); 506 mctz = soft_limit_tree_node_zone(node, zone);
552 mem_cgroup_remove_exceeded(mem, mz, mctz); 507 mem_cgroup_remove_exceeded(memcg, mz, mctz);
553 } 508 }
554 } 509 }
555} 510}
@@ -610,7 +565,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
610 * common workload, threashold and synchonization as vmstat[] should be 565 * common workload, threashold and synchonization as vmstat[] should be
611 * implemented. 566 * implemented.
612 */ 567 */
613static long mem_cgroup_read_stat(struct mem_cgroup *mem, 568static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
614 enum mem_cgroup_stat_index idx) 569 enum mem_cgroup_stat_index idx)
615{ 570{
616 long val = 0; 571 long val = 0;
@@ -618,81 +573,83 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
618 573
619 get_online_cpus(); 574 get_online_cpus();
620 for_each_online_cpu(cpu) 575 for_each_online_cpu(cpu)
621 val += per_cpu(mem->stat->count[idx], cpu); 576 val += per_cpu(memcg->stat->count[idx], cpu);
622#ifdef CONFIG_HOTPLUG_CPU 577#ifdef CONFIG_HOTPLUG_CPU
623 spin_lock(&mem->pcp_counter_lock); 578 spin_lock(&memcg->pcp_counter_lock);
624 val += mem->nocpu_base.count[idx]; 579 val += memcg->nocpu_base.count[idx];
625 spin_unlock(&mem->pcp_counter_lock); 580 spin_unlock(&memcg->pcp_counter_lock);
626#endif 581#endif
627 put_online_cpus(); 582 put_online_cpus();
628 return val; 583 return val;
629} 584}
630 585
631static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 586static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
632 bool charge) 587 bool charge)
633{ 588{
634 int val = (charge) ? 1 : -1; 589 int val = (charge) ? 1 : -1;
635 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 590 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
636} 591}
637 592
638void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) 593void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
639{ 594{
640 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); 595 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
641} 596}
642 597
643void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) 598void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
644{ 599{
645 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); 600 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
646} 601}
647 602
648static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, 603static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
649 enum mem_cgroup_events_index idx) 604 enum mem_cgroup_events_index idx)
650{ 605{
651 unsigned long val = 0; 606 unsigned long val = 0;
652 int cpu; 607 int cpu;
653 608
654 for_each_online_cpu(cpu) 609 for_each_online_cpu(cpu)
655 val += per_cpu(mem->stat->events[idx], cpu); 610 val += per_cpu(memcg->stat->events[idx], cpu);
656#ifdef CONFIG_HOTPLUG_CPU 611#ifdef CONFIG_HOTPLUG_CPU
657 spin_lock(&mem->pcp_counter_lock); 612 spin_lock(&memcg->pcp_counter_lock);
658 val += mem->nocpu_base.events[idx]; 613 val += memcg->nocpu_base.events[idx];
659 spin_unlock(&mem->pcp_counter_lock); 614 spin_unlock(&memcg->pcp_counter_lock);
660#endif 615#endif
661 return val; 616 return val;
662} 617}
663 618
664static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 619static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
665 bool file, int nr_pages) 620 bool file, int nr_pages)
666{ 621{
667 preempt_disable(); 622 preempt_disable();
668 623
669 if (file) 624 if (file)
670 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); 625 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
626 nr_pages);
671 else 627 else
672 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); 628 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
629 nr_pages);
673 630
674 /* pagein of a big page is an event. So, ignore page size */ 631 /* pagein of a big page is an event. So, ignore page size */
675 if (nr_pages > 0) 632 if (nr_pages > 0)
676 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 633 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
677 else { 634 else {
678 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 635 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
679 nr_pages = -nr_pages; /* for event */ 636 nr_pages = -nr_pages; /* for event */
680 } 637 }
681 638
682 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 639 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
683 640
684 preempt_enable(); 641 preempt_enable();
685} 642}
686 643
687unsigned long 644unsigned long
688mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, 645mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
689 unsigned int lru_mask) 646 unsigned int lru_mask)
690{ 647{
691 struct mem_cgroup_per_zone *mz; 648 struct mem_cgroup_per_zone *mz;
692 enum lru_list l; 649 enum lru_list l;
693 unsigned long ret = 0; 650 unsigned long ret = 0;
694 651
695 mz = mem_cgroup_zoneinfo(mem, nid, zid); 652 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
696 653
697 for_each_lru(l) { 654 for_each_lru(l) {
698 if (BIT(l) & lru_mask) 655 if (BIT(l) & lru_mask)
@@ -702,44 +659,45 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
702} 659}
703 660
704static unsigned long 661static unsigned long
705mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, 662mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
706 int nid, unsigned int lru_mask) 663 int nid, unsigned int lru_mask)
707{ 664{
708 u64 total = 0; 665 u64 total = 0;
709 int zid; 666 int zid;
710 667
711 for (zid = 0; zid < MAX_NR_ZONES; zid++) 668 for (zid = 0; zid < MAX_NR_ZONES; zid++)
712 total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); 669 total += mem_cgroup_zone_nr_lru_pages(memcg,
670 nid, zid, lru_mask);
713 671
714 return total; 672 return total;
715} 673}
716 674
717static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, 675static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
718 unsigned int lru_mask) 676 unsigned int lru_mask)
719{ 677{
720 int nid; 678 int nid;
721 u64 total = 0; 679 u64 total = 0;
722 680
723 for_each_node_state(nid, N_HIGH_MEMORY) 681 for_each_node_state(nid, N_HIGH_MEMORY)
724 total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); 682 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
725 return total; 683 return total;
726} 684}
727 685
728static bool __memcg_event_check(struct mem_cgroup *mem, int target) 686static bool __memcg_event_check(struct mem_cgroup *memcg, int target)
729{ 687{
730 unsigned long val, next; 688 unsigned long val, next;
731 689
732 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 690 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
733 next = this_cpu_read(mem->stat->targets[target]); 691 next = __this_cpu_read(memcg->stat->targets[target]);
734 /* from time_after() in jiffies.h */ 692 /* from time_after() in jiffies.h */
735 return ((long)next - (long)val < 0); 693 return ((long)next - (long)val < 0);
736} 694}
737 695
738static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) 696static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
739{ 697{
740 unsigned long val, next; 698 unsigned long val, next;
741 699
742 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 700 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
743 701
744 switch (target) { 702 switch (target) {
745 case MEM_CGROUP_TARGET_THRESH: 703 case MEM_CGROUP_TARGET_THRESH:
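
The event check above reuses the time_after() idiom named in its comment: rather than comparing two free-running counters directly, it looks at the sign of their signed difference, which stays correct even when the counters wrap around the top of the unsigned range. A small standalone demonstration of that comparison (the counter values are made up):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* True when "val" has passed "next", even across unsigned wraparound. */
static bool event_target_passed(unsigned long val, unsigned long next)
{
	return (long)next - (long)val < 0;
}

int main(void)
{
	unsigned long val, next;

	/* Ordinary case: the target has not been reached yet. */
	val = 100; next = 228;
	printf("%d\n", event_target_passed(val, next));   /* 0 */

	/* Counter wrapped past ULONG_MAX; a naive "val >= next" would fail. */
	next = ULONG_MAX - 10;
	val = 5;                    /* i.e. ULONG_MAX - 10 + 16, wrapped */
	printf("%d\n", event_target_passed(val, next));   /* 1 */

	return 0;
}
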
@@ -755,34 +713,36 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
755 return; 713 return;
756 } 714 }
757 715
758 this_cpu_write(mem->stat->targets[target], next); 716 __this_cpu_write(memcg->stat->targets[target], next);
759} 717}
760 718
761/* 719/*
762 * Check events in order. 720 * Check events in order.
763 * 721 *
764 */ 722 */
765static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 723static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
766{ 724{
725 preempt_disable();
767 /* threshold event is triggered in finer grain than soft limit */ 726 /* threshold event is triggered in finer grain than soft limit */
768 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { 727 if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) {
769 mem_cgroup_threshold(mem); 728 mem_cgroup_threshold(memcg);
770 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); 729 __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH);
771 if (unlikely(__memcg_event_check(mem, 730 if (unlikely(__memcg_event_check(memcg,
772 MEM_CGROUP_TARGET_SOFTLIMIT))) { 731 MEM_CGROUP_TARGET_SOFTLIMIT))) {
773 mem_cgroup_update_tree(mem, page); 732 mem_cgroup_update_tree(memcg, page);
774 __mem_cgroup_target_update(mem, 733 __mem_cgroup_target_update(memcg,
775 MEM_CGROUP_TARGET_SOFTLIMIT); 734 MEM_CGROUP_TARGET_SOFTLIMIT);
776 } 735 }
777#if MAX_NUMNODES > 1 736#if MAX_NUMNODES > 1
778 if (unlikely(__memcg_event_check(mem, 737 if (unlikely(__memcg_event_check(memcg,
779 MEM_CGROUP_TARGET_NUMAINFO))) { 738 MEM_CGROUP_TARGET_NUMAINFO))) {
780 atomic_inc(&mem->numainfo_events); 739 atomic_inc(&memcg->numainfo_events);
781 __mem_cgroup_target_update(mem, 740 __mem_cgroup_target_update(memcg,
782 MEM_CGROUP_TARGET_NUMAINFO); 741 MEM_CGROUP_TARGET_NUMAINFO);
783 } 742 }
784#endif 743#endif
785 } 744 }
745 preempt_enable();
786} 746}
787 747
788static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 748static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
@@ -808,7 +768,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
808 768
809struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 769struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
810{ 770{
811 struct mem_cgroup *mem = NULL; 771 struct mem_cgroup *memcg = NULL;
812 772
813 if (!mm) 773 if (!mm)
814 return NULL; 774 return NULL;
@@ -819,25 +779,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
819 */ 779 */
820 rcu_read_lock(); 780 rcu_read_lock();
821 do { 781 do {
822 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 782 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
823 if (unlikely(!mem)) 783 if (unlikely(!memcg))
824 break; 784 break;
825 } while (!css_tryget(&mem->css)); 785 } while (!css_tryget(&memcg->css));
826 rcu_read_unlock(); 786 rcu_read_unlock();
827 return mem; 787 return memcg;
828} 788}
829 789
830/* The caller has to guarantee "mem" exists before calling this */ 790/* The caller has to guarantee "mem" exists before calling this */
831static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) 791static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg)
832{ 792{
833 struct cgroup_subsys_state *css; 793 struct cgroup_subsys_state *css;
834 int found; 794 int found;
835 795
836 if (!mem) /* ROOT cgroup has the smallest ID */ 796 if (!memcg) /* ROOT cgroup has the smallest ID */
837 return root_mem_cgroup; /*css_put/get against root is ignored*/ 797 return root_mem_cgroup; /*css_put/get against root is ignored*/
838 if (!mem->use_hierarchy) { 798 if (!memcg->use_hierarchy) {
839 if (css_tryget(&mem->css)) 799 if (css_tryget(&memcg->css))
840 return mem; 800 return memcg;
841 return NULL; 801 return NULL;
842 } 802 }
843 rcu_read_lock(); 803 rcu_read_lock();
@@ -845,13 +805,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
845 * searching a memory cgroup which has the smallest ID under given 805 * searching a memory cgroup which has the smallest ID under given
846 * ROOT cgroup. (ID >= 1) 806 * ROOT cgroup. (ID >= 1)
847 */ 807 */
848 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); 808 css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
849 if (css && css_tryget(css)) 809 if (css && css_tryget(css))
850 mem = container_of(css, struct mem_cgroup, css); 810 memcg = container_of(css, struct mem_cgroup, css);
851 else 811 else
852 mem = NULL; 812 memcg = NULL;
853 rcu_read_unlock(); 813 rcu_read_unlock();
854 return mem; 814 return memcg;
855} 815}
856 816
857static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 817static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
@@ -905,29 +865,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
905 for_each_mem_cgroup_tree_cond(iter, NULL, true) 865 for_each_mem_cgroup_tree_cond(iter, NULL, true)
906 866
907 867
908static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 868static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
909{ 869{
910 return (mem == root_mem_cgroup); 870 return (memcg == root_mem_cgroup);
911} 871}
912 872
913void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 873void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
914{ 874{
915 struct mem_cgroup *mem; 875 struct mem_cgroup *memcg;
916 876
917 if (!mm) 877 if (!mm)
918 return; 878 return;
919 879
920 rcu_read_lock(); 880 rcu_read_lock();
921 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 881 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
922 if (unlikely(!mem)) 882 if (unlikely(!memcg))
923 goto out; 883 goto out;
924 884
925 switch (idx) { 885 switch (idx) {
926 case PGMAJFAULT: 886 case PGMAJFAULT:
927 mem_cgroup_pgmajfault(mem, 1); 887 mem_cgroup_pgmajfault(memcg, 1);
928 break; 888 break;
929 case PGFAULT: 889 case PGFAULT:
930 mem_cgroup_pgfault(mem, 1); 890 mem_cgroup_pgfault(memcg, 1);
931 break; 891 break;
932 default: 892 default:
933 BUG(); 893 BUG();
@@ -1036,6 +996,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
1036 return; 996 return;
1037 pc = lookup_page_cgroup(page); 997 pc = lookup_page_cgroup(page);
1038 VM_BUG_ON(PageCgroupAcctLRU(pc)); 998 VM_BUG_ON(PageCgroupAcctLRU(pc));
999 /*
1000 * putback: charge:
1001 * SetPageLRU SetPageCgroupUsed
1002 * smp_mb smp_mb
1003 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1004 *
1005 * Ensure that one of the two sides adds the page to the memcg
1006 * LRU during a race.
1007 */
1008 smp_mb();
1039 if (!PageCgroupUsed(pc)) 1009 if (!PageCgroupUsed(pc))
1040 return; 1010 return;
1041 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1011 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
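
The new comment and smp_mb() describe a classic store/fence/load pairing: each side publishes its own flag (PageLRU on putback, PageCgroupUsed on charge), issues a full barrier, then tests the other side's flag, so at least one of the racing paths is guaranteed to see the other and link the page to the memcg LRU. A hedged C11 sketch of the same pairing, using atomics and seq_cst fences in place of the kernel's page flags and smp_mb() (everything here is illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Each side sets its own flag, fences, then checks the peer's flag.
 * With a full fence between the store and the load, at least one of
 * the two threads must observe the other's flag as set. */
static atomic_int page_lru;        /* stands in for PageLRU        */
static atomic_int page_used;       /* stands in for PageCgroupUsed */
static atomic_int added_to_lru;    /* how many sides linked the page */

static void *putback_side(void *arg)
{
	atomic_store_explicit(&page_lru, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
	if (atomic_load_explicit(&page_used, memory_order_relaxed))
		atomic_fetch_add(&added_to_lru, 1);
	return NULL;
}

static void *charge_side(void *arg)
{
	atomic_store_explicit(&page_used, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
	if (atomic_load_explicit(&page_lru, memory_order_relaxed))
		atomic_fetch_add(&added_to_lru, 1);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, putback_side, NULL);
	pthread_create(&b, NULL, charge_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* Never 0: the fences forbid both loads missing both stores. */
	printf("sides that added the page: %d\n", atomic_load(&added_to_lru));
	return 0;
}
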
@@ -1087,7 +1057,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page)
1087 unsigned long flags; 1057 unsigned long flags;
1088 struct zone *zone = page_zone(page); 1058 struct zone *zone = page_zone(page);
1089 struct page_cgroup *pc = lookup_page_cgroup(page); 1059 struct page_cgroup *pc = lookup_page_cgroup(page);
1090 1060 /*
1061 * putback: charge:
1062 * SetPageLRU SetPageCgroupUsed
1063 * smp_mb smp_mb
1064 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1065 *
1066 * Ensure that one of the two sides adds the page to the memcg
1067 * LRU during a race.
1068 */
1069 smp_mb();
1091 /* take care that the page is added to LRU while we commit it */ 1070
1092 if (likely(!PageLRU(page))) 1071 if (likely(!PageLRU(page)))
1093 return; 1072 return;
@@ -1109,21 +1088,21 @@ void mem_cgroup_move_lists(struct page *page,
1109} 1088}
1110 1089
1111/* 1090/*
1112 * Checks whether given mem is same or in the root_mem's 1091 * Checks whether given mem is same or in the root_mem_cgroup's
1113 * hierarchy subtree 1092 * hierarchy subtree
1114 */ 1093 */
1115static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, 1094static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1116 struct mem_cgroup *mem) 1095 struct mem_cgroup *memcg)
1117{ 1096{
1118 if (root_mem != mem) { 1097 if (root_memcg != memcg) {
1119 return (root_mem->use_hierarchy && 1098 return (root_memcg->use_hierarchy &&
1120 css_is_ancestor(&mem->css, &root_mem->css)); 1099 css_is_ancestor(&memcg->css, &root_memcg->css));
1121 } 1100 }
1122 1101
1123 return true; 1102 return true;
1124} 1103}
1125 1104
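
The renamed helper above answers one question: is memcg the same group as root_memcg, or a descendant of it when root_memcg has hierarchical accounting enabled? A toy sketch of that test with explicit parent pointers standing in for css_is_ancestor() (the struct and field names are invented for illustration):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy cgroup: just a parent pointer and a hierarchy flag. */
struct cg {
	struct cg *parent;
	bool use_hierarchy;
};

/* True if cg is root itself, or sits below root while root has
 * hierarchical accounting enabled. */
static bool same_or_subtree(const struct cg *root, const struct cg *cg)
{
	if (root == cg)
		return true;
	if (!root->use_hierarchy)
		return false;
	for (cg = cg->parent; cg; cg = cg->parent)
		if (cg == root)
			return true;
	return false;
}

int main(void)
{
	struct cg root  = { .parent = NULL,  .use_hierarchy = true };
	struct cg child = { .parent = &root, .use_hierarchy = true };
	struct cg other = { .parent = NULL,  .use_hierarchy = true };

	printf("%d %d %d\n",
	       same_or_subtree(&root, &child),   /* 1 */
	       same_or_subtree(&root, &root),    /* 1 */
	       same_or_subtree(&root, &other));  /* 0 */
	return 0;
}
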
1126int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1105int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1127{ 1106{
1128 int ret; 1107 int ret;
1129 struct mem_cgroup *curr = NULL; 1108 struct mem_cgroup *curr = NULL;
@@ -1137,25 +1116,29 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1137 if (!curr) 1116 if (!curr)
1138 return 0; 1117 return 0;
1139 /* 1118 /*
1140 * We should check use_hierarchy of "mem" not "curr". Because checking 1119 * We should check use_hierarchy of "memcg" not "curr". Because checking
1141 * use_hierarchy of "curr" here makes this function true if hierarchy is 1120
1142 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 1121 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
1143 * hierarchy(even if use_hierarchy is disabled in "mem"). 1122 * hierarchy(even if use_hierarchy is disabled in "memcg").
1144 */ 1123 */
1145 ret = mem_cgroup_same_or_subtree(mem, curr); 1124 ret = mem_cgroup_same_or_subtree(memcg, curr);
1146 css_put(&curr->css); 1125 css_put(&curr->css);
1147 return ret; 1126 return ret;
1148} 1127}
1149 1128
1150static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 1129int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1151{ 1130{
1152 unsigned long active; 1131 unsigned long inactive_ratio;
1132 int nid = zone_to_nid(zone);
1133 int zid = zone_idx(zone);
1153 unsigned long inactive; 1134 unsigned long inactive;
1135 unsigned long active;
1154 unsigned long gb; 1136 unsigned long gb;
1155 unsigned long inactive_ratio;
1156 1137
1157 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); 1138 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1158 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); 1139 BIT(LRU_INACTIVE_ANON));
1140 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1141 BIT(LRU_ACTIVE_ANON));
1159 1142
1160 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1143 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1161 if (gb) 1144 if (gb)
@@ -1163,39 +1146,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
1163 else 1146 else
1164 inactive_ratio = 1; 1147 inactive_ratio = 1;
1165 1148
1166 if (present_pages) { 1149 return inactive * inactive_ratio < active;
1167 present_pages[0] = inactive;
1168 present_pages[1] = active;
1169 }
1170
1171 return inactive_ratio;
1172}
1173
1174int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
1175{
1176 unsigned long active;
1177 unsigned long inactive;
1178 unsigned long present_pages[2];
1179 unsigned long inactive_ratio;
1180
1181 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
1182
1183 inactive = present_pages[0];
1184 active = present_pages[1];
1185
1186 if (inactive * inactive_ratio < active)
1187 return 1;
1188
1189 return 0;
1190} 1150}
1191 1151
1192int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 1152int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
1193{ 1153{
1194 unsigned long active; 1154 unsigned long active;
1195 unsigned long inactive; 1155 unsigned long inactive;
1156 int zid = zone_idx(zone);
1157 int nid = zone_to_nid(zone);
1196 1158
1197 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); 1159 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1198 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); 1160 BIT(LRU_INACTIVE_FILE));
1161 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1162 BIT(LRU_ACTIVE_FILE));
1199 1163
1200 return (active > inactive); 1164 return (active > inactive);
1201} 1165}
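
The reworked mem_cgroup_inactive_anon_is_low() now counts pages per zone rather than per cgroup and ends with the comparison "inactive * inactive_ratio < active". The derivation of inactive_ratio from the working-set size sits in lines elided from this hunk, so the sketch below simply assumes a ratio of 3 to show how the final test behaves (all numbers are made up):

#include <stdbool.h>
#include <stdio.h>

/* Final test from mem_cgroup_inactive_anon_is_low(): anon pages are
 * "inactive-low" when the inactive list, scaled by the size-derived
 * ratio, is still smaller than the active list. */
static bool inactive_anon_is_low(unsigned long inactive,
				 unsigned long active,
				 unsigned long inactive_ratio)
{
	return inactive * inactive_ratio < active;
}

int main(void)
{
	/* Assumed ratio of 3; the real ratio computation is outside
	 * this hunk. */
	printf("%d\n", inactive_anon_is_low(40000, 200000, 3)); /* 1: deactivate more */
	printf("%d\n", inactive_anon_is_low(90000, 200000, 3)); /* 0: balance is fine */
	return 0;
}
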
@@ -1231,7 +1195,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1231unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1195unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1232 struct list_head *dst, 1196 struct list_head *dst,
1233 unsigned long *scanned, int order, 1197 unsigned long *scanned, int order,
1234 int mode, struct zone *z, 1198 isolate_mode_t mode,
1199 struct zone *z,
1235 struct mem_cgroup *mem_cont, 1200 struct mem_cgroup *mem_cont,
1236 int active, int file) 1201 int active, int file)
1237{ 1202{
@@ -1299,13 +1264,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1299 * Returns the maximum amount of memory @mem can be charged with, in 1264 * Returns the maximum amount of memory @mem can be charged with, in
1300 * pages. 1265 * pages.
1301 */ 1266 */
1302static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) 1267static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1303{ 1268{
1304 unsigned long long margin; 1269 unsigned long long margin;
1305 1270
1306 margin = res_counter_margin(&mem->res); 1271 margin = res_counter_margin(&memcg->res);
1307 if (do_swap_account) 1272 if (do_swap_account)
1308 margin = min(margin, res_counter_margin(&mem->memsw)); 1273 margin = min(margin, res_counter_margin(&memcg->memsw));
1309 return margin >> PAGE_SHIFT; 1274 return margin >> PAGE_SHIFT;
1310} 1275}
1311 1276
@@ -1320,33 +1285,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1320 return memcg->swappiness; 1285 return memcg->swappiness;
1321} 1286}
1322 1287
1323static void mem_cgroup_start_move(struct mem_cgroup *mem) 1288static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1324{ 1289{
1325 int cpu; 1290 int cpu;
1326 1291
1327 get_online_cpus(); 1292 get_online_cpus();
1328 spin_lock(&mem->pcp_counter_lock); 1293 spin_lock(&memcg->pcp_counter_lock);
1329 for_each_online_cpu(cpu) 1294 for_each_online_cpu(cpu)
1330 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1295 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1331 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1296 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1332 spin_unlock(&mem->pcp_counter_lock); 1297 spin_unlock(&memcg->pcp_counter_lock);
1333 put_online_cpus(); 1298 put_online_cpus();
1334 1299
1335 synchronize_rcu(); 1300 synchronize_rcu();
1336} 1301}
1337 1302
1338static void mem_cgroup_end_move(struct mem_cgroup *mem) 1303static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1339{ 1304{
1340 int cpu; 1305 int cpu;
1341 1306
1342 if (!mem) 1307 if (!memcg)
1343 return; 1308 return;
1344 get_online_cpus(); 1309 get_online_cpus();
1345 spin_lock(&mem->pcp_counter_lock); 1310 spin_lock(&memcg->pcp_counter_lock);
1346 for_each_online_cpu(cpu) 1311 for_each_online_cpu(cpu)
1347 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1312 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1348 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1313 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1349 spin_unlock(&mem->pcp_counter_lock); 1314 spin_unlock(&memcg->pcp_counter_lock);
1350 put_online_cpus(); 1315 put_online_cpus();
1351} 1316}
1352/* 1317/*
@@ -1361,13 +1326,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem)
1361 * waiting at high-memory pressure caused by "move". 1326
1362 */ 1327 */
1363 1328
1364static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1329static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
1365{ 1330{
1366 VM_BUG_ON(!rcu_read_lock_held()); 1331 VM_BUG_ON(!rcu_read_lock_held());
1367 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1332 return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1368} 1333}
1369 1334
1370static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1335static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1371{ 1336{
1372 struct mem_cgroup *from; 1337 struct mem_cgroup *from;
1373 struct mem_cgroup *to; 1338 struct mem_cgroup *to;
@@ -1382,17 +1347,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1382 if (!from) 1347 if (!from)
1383 goto unlock; 1348 goto unlock;
1384 1349
1385 ret = mem_cgroup_same_or_subtree(mem, from) 1350 ret = mem_cgroup_same_or_subtree(memcg, from)
1386 || mem_cgroup_same_or_subtree(mem, to); 1351 || mem_cgroup_same_or_subtree(memcg, to);
1387unlock: 1352unlock:
1388 spin_unlock(&mc.lock); 1353 spin_unlock(&mc.lock);
1389 return ret; 1354 return ret;
1390} 1355}
1391 1356
1392static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1357static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1393{ 1358{
1394 if (mc.moving_task && current != mc.moving_task) { 1359 if (mc.moving_task && current != mc.moving_task) {
1395 if (mem_cgroup_under_move(mem)) { 1360 if (mem_cgroup_under_move(memcg)) {
1396 DEFINE_WAIT(wait); 1361 DEFINE_WAIT(wait);
1397 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1362 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1398 /* moving charge context might have finished. */ 1363 /* moving charge context might have finished. */
@@ -1476,12 +1441,12 @@ done:
1476 * This function returns the number of memcg under hierarchy tree. Returns 1441 * This function returns the number of memcg under hierarchy tree. Returns
1477 * 1(self count) if no children. 1442 * 1(self count) if no children.
1478 */ 1443 */
1479static int mem_cgroup_count_children(struct mem_cgroup *mem) 1444static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1480{ 1445{
1481 int num = 0; 1446 int num = 0;
1482 struct mem_cgroup *iter; 1447 struct mem_cgroup *iter;
1483 1448
1484 for_each_mem_cgroup_tree(iter, mem) 1449 for_each_mem_cgroup_tree(iter, memcg)
1485 num++; 1450 num++;
1486 return num; 1451 return num;
1487} 1452}
@@ -1511,21 +1476,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1511 * that to reclaim free pages from. 1476 * that to reclaim free pages from.
1512 */ 1477 */
1513static struct mem_cgroup * 1478static struct mem_cgroup *
1514mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1479mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
1515{ 1480{
1516 struct mem_cgroup *ret = NULL; 1481 struct mem_cgroup *ret = NULL;
1517 struct cgroup_subsys_state *css; 1482 struct cgroup_subsys_state *css;
1518 int nextid, found; 1483 int nextid, found;
1519 1484
1520 if (!root_mem->use_hierarchy) { 1485 if (!root_memcg->use_hierarchy) {
1521 css_get(&root_mem->css); 1486 css_get(&root_memcg->css);
1522 ret = root_mem; 1487 ret = root_memcg;
1523 } 1488 }
1524 1489
1525 while (!ret) { 1490 while (!ret) {
1526 rcu_read_lock(); 1491 rcu_read_lock();
1527 nextid = root_mem->last_scanned_child + 1; 1492 nextid = root_memcg->last_scanned_child + 1;
1528 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1493 css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
1529 &found); 1494 &found);
1530 if (css && css_tryget(css)) 1495 if (css && css_tryget(css))
1531 ret = container_of(css, struct mem_cgroup, css); 1496 ret = container_of(css, struct mem_cgroup, css);
@@ -1534,9 +1499,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1534 /* Updates scanning parameter */ 1499 /* Updates scanning parameter */
1535 if (!css) { 1500 if (!css) {
1536 /* this means start scan from ID:1 */ 1501 /* this means start scan from ID:1 */
1537 root_mem->last_scanned_child = 0; 1502 root_memcg->last_scanned_child = 0;
1538 } else 1503 } else
1539 root_mem->last_scanned_child = found; 1504 root_memcg->last_scanned_child = found;
1540 } 1505 }
1541 1506
1542 return ret; 1507 return ret;
@@ -1552,14 +1517,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1552 * reclaimable pages on a node. Returns true if there are any reclaimable 1517 * reclaimable pages on a node. Returns true if there are any reclaimable
1553 * pages in the node. 1518 * pages in the node.
1554 */ 1519 */
1555static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1520static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1556 int nid, bool noswap) 1521 int nid, bool noswap)
1557{ 1522{
1558 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) 1523 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1559 return true; 1524 return true;
1560 if (noswap || !total_swap_pages) 1525 if (noswap || !total_swap_pages)
1561 return false; 1526 return false;
1562 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) 1527 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1563 return true; 1528 return true;
1564 return false; 1529 return false;
1565 1530
@@ -1572,29 +1537,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1572 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1537 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1573 * 1538 *
1574 */ 1539 */
1575static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) 1540static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1576{ 1541{
1577 int nid; 1542 int nid;
1578 /* 1543 /*
1579 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1544 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1580 * pagein/pageout changes since the last update. 1545 * pagein/pageout changes since the last update.
1581 */ 1546 */
1582 if (!atomic_read(&mem->numainfo_events)) 1547 if (!atomic_read(&memcg->numainfo_events))
1583 return; 1548 return;
1584 if (atomic_inc_return(&mem->numainfo_updating) > 1) 1549 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1585 return; 1550 return;
1586 1551
1587 /* make a nodemask where this memcg uses memory from */ 1552 /* make a nodemask where this memcg uses memory from */
1588 mem->scan_nodes = node_states[N_HIGH_MEMORY]; 1553 memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1589 1554
1590 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1555 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1591 1556
1592 if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) 1557 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1593 node_clear(nid, mem->scan_nodes); 1558 node_clear(nid, memcg->scan_nodes);
1594 } 1559 }
1595 1560
1596 atomic_set(&mem->numainfo_events, 0); 1561 atomic_set(&memcg->numainfo_events, 0);
1597 atomic_set(&mem->numainfo_updating, 0); 1562 atomic_set(&memcg->numainfo_updating, 0);
1598} 1563}
1599 1564
1600/* 1565/*
@@ -1609,16 +1574,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1609 * 1574 *
1610 * Now, we use round-robin. Better algorithm is welcomed. 1575 * Now, we use round-robin. Better algorithm is welcomed.
1611 */ 1576 */
1612int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1577int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1613{ 1578{
1614 int node; 1579 int node;
1615 1580
1616 mem_cgroup_may_update_nodemask(mem); 1581 mem_cgroup_may_update_nodemask(memcg);
1617 node = mem->last_scanned_node; 1582 node = memcg->last_scanned_node;
1618 1583
1619 node = next_node(node, mem->scan_nodes); 1584 node = next_node(node, memcg->scan_nodes);
1620 if (node == MAX_NUMNODES) 1585 if (node == MAX_NUMNODES)
1621 node = first_node(mem->scan_nodes); 1586 node = first_node(memcg->scan_nodes);
1622 /* 1587 /*
1623 * We call this when we hit limit, not when pages are added to LRU. 1588 * We call this when we hit limit, not when pages are added to LRU.
1624 * No LRU may hold pages because all pages are UNEVICTABLE or 1589 * No LRU may hold pages because all pages are UNEVICTABLE or
@@ -1628,7 +1593,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1628 if (unlikely(node == MAX_NUMNODES)) 1593 if (unlikely(node == MAX_NUMNODES))
1629 node = numa_node_id(); 1594 node = numa_node_id();
1630 1595
1631 mem->last_scanned_node = node; 1596 memcg->last_scanned_node = node;
1632 return node; 1597 return node;
1633} 1598}
1634 1599
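
mem_cgroup_select_victim_node() above remembers the node it scanned last, advances to the next node that still has reclaimable pages, wraps to the first set node at the end of the mask, and falls back to the local node when the mask turns out to be empty. A sketch of that round-robin selection using a plain bitmask instead of nodemask_t (names and sizes are placeholders):

#include <stdio.h>

#define MAX_NODES 8

/* Advance round-robin through the set bits of "mask", starting just
 * after "last"; wrap around; fall back to "local_node" when the mask
 * is empty. */
static int select_victim_node(unsigned int mask, int last, int local_node)
{
	int i, node = -1;

	for (i = 1; i <= MAX_NODES; i++) {          /* next_node(), wrapping */
		int cand = (last + i) % MAX_NODES;
		if (mask & (1u << cand)) {
			node = cand;
			break;
		}
	}
	if (node < 0)                               /* nothing reclaimable */
		node = local_node;
	return node;
}

int main(void)
{
	unsigned int scan_nodes = (1u << 1) | (1u << 3);   /* nodes 1 and 3 */
	int last = 1;

	last = select_victim_node(scan_nodes, last, 0);
	printf("%d\n", last);                              /* 3 */
	last = select_victim_node(scan_nodes, last, 0);
	printf("%d\n", last);                              /* wraps back to 1 */
	printf("%d\n", select_victim_node(0, last, 0));    /* empty mask -> 0 */
	return 0;
}
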
@@ -1638,7 +1603,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1638 * unused nodes. But scan_nodes is lazily updated and may not contain 1603
1639 * enough new information. We need to do a double check. 1604
1640 */ 1605 */
1641bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1606bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1642{ 1607{
1643 int nid; 1608 int nid;
1644 1609
@@ -1646,12 +1611,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1646 * quick check...making use of scan_node. 1611 * quick check...making use of scan_node.
1647 * We can skip unused nodes. 1612 * We can skip unused nodes.
1648 */ 1613 */
1649 if (!nodes_empty(mem->scan_nodes)) { 1614 if (!nodes_empty(memcg->scan_nodes)) {
1650 for (nid = first_node(mem->scan_nodes); 1615 for (nid = first_node(memcg->scan_nodes);
1651 nid < MAX_NUMNODES; 1616 nid < MAX_NUMNODES;
1652 nid = next_node(nid, mem->scan_nodes)) { 1617 nid = next_node(nid, memcg->scan_nodes)) {
1653 1618
1654 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1619 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1655 return true; 1620 return true;
1656 } 1621 }
1657 } 1622 }
@@ -1659,77 +1624,39 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1659 * Check rest of nodes. 1624 * Check rest of nodes.
1660 */ 1625 */
1661 for_each_node_state(nid, N_HIGH_MEMORY) { 1626 for_each_node_state(nid, N_HIGH_MEMORY) {
1662 if (node_isset(nid, mem->scan_nodes)) 1627 if (node_isset(nid, memcg->scan_nodes))
1663 continue; 1628 continue;
1664 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1629 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1665 return true; 1630 return true;
1666 } 1631 }
1667 return false; 1632 return false;
1668} 1633}
1669 1634
1670#else 1635#else
1671int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1636int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1672{ 1637{
1673 return 0; 1638 return 0;
1674} 1639}
1675 1640
1676bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1641bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1677{ 1642{
1678 return test_mem_cgroup_node_reclaimable(mem, 0, noswap); 1643 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1679} 1644}
1680#endif 1645#endif
1681 1646
1682static void __mem_cgroup_record_scanstat(unsigned long *stats,
1683 struct memcg_scanrecord *rec)
1684{
1685
1686 stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
1687 stats[SCAN_ANON] += rec->nr_scanned[0];
1688 stats[SCAN_FILE] += rec->nr_scanned[1];
1689
1690 stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
1691 stats[ROTATE_ANON] += rec->nr_rotated[0];
1692 stats[ROTATE_FILE] += rec->nr_rotated[1];
1693
1694 stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
1695 stats[FREED_ANON] += rec->nr_freed[0];
1696 stats[FREED_FILE] += rec->nr_freed[1];
1697
1698 stats[ELAPSED] += rec->elapsed;
1699}
1700
1701static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
1702{
1703 struct mem_cgroup *mem;
1704 int context = rec->context;
1705
1706 if (context >= NR_SCAN_CONTEXT)
1707 return;
1708
1709 mem = rec->mem;
1710 spin_lock(&mem->scanstat.lock);
1711 __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
1712 spin_unlock(&mem->scanstat.lock);
1713
1714 mem = rec->root;
1715 spin_lock(&mem->scanstat.lock);
1716 __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
1717 spin_unlock(&mem->scanstat.lock);
1718}
1719
1720/* 1647/*
1721 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1648 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1722 * we reclaimed from, so that we don't end up penalizing one child extensively 1649 * we reclaimed from, so that we don't end up penalizing one child extensively
1723 * based on its position in the children list. 1650 * based on its position in the children list.
1724 * 1651 *
1725 * root_mem is the original ancestor that we've been reclaiming from. 1652 * root_memcg is the original ancestor that we've been reclaiming from.
1726 * 1653 *
1727 * We give up and return to the caller when we visit root_mem twice. 1654 * We give up and return to the caller when we visit root_memcg twice.
1728 * (other groups can be removed while we're walking....) 1655 * (other groups can be removed while we're walking....)
1729 * 1656 *
1730 * If shrink==true, to avoid freeing too much, this returns immediately. 1657
1731 */ 1658 */
1732static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1659static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1733 struct zone *zone, 1660 struct zone *zone,
1734 gfp_t gfp_mask, 1661 gfp_t gfp_mask,
1735 unsigned long reclaim_options, 1662 unsigned long reclaim_options,
@@ -1741,28 +1668,18 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1741 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1668 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1742 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1669 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1743 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1670 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1744 struct memcg_scanrecord rec;
1745 unsigned long excess; 1671 unsigned long excess;
1746 unsigned long scanned; 1672 unsigned long nr_scanned;
1747 1673
1748 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1674 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1749 1675
1750 /* If memsw_is_minimum==1, swap-out is of no use. */ 1676
1751 if (!check_soft && !shrink && root_mem->memsw_is_minimum) 1677 if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
1752 noswap = true; 1678 noswap = true;
1753 1679
1754 if (shrink)
1755 rec.context = SCAN_BY_SHRINK;
1756 else if (check_soft)
1757 rec.context = SCAN_BY_SYSTEM;
1758 else
1759 rec.context = SCAN_BY_LIMIT;
1760
1761 rec.root = root_mem;
1762
1763 while (1) { 1680 while (1) {
1764 victim = mem_cgroup_select_victim(root_mem); 1681 victim = mem_cgroup_select_victim(root_memcg);
1765 if (victim == root_mem) { 1682 if (victim == root_memcg) {
1766 loop++; 1683 loop++;
1767 /* 1684 /*
1768 * We are not draining per cpu cached charges during 1685 * We are not draining per cpu cached charges during
@@ -1771,7 +1688,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1771 * charges will not give any. 1688 * charges will not give any.
1772 */ 1689 */
1773 if (!check_soft && loop >= 1) 1690 if (!check_soft && loop >= 1)
1774 drain_all_stock_async(root_mem); 1691 drain_all_stock_async(root_memcg);
1775 if (loop >= 2) { 1692 if (loop >= 2) {
1776 /* 1693 /*
1777 * If we have not been able to reclaim 1694 * If we have not been able to reclaim
@@ -1800,23 +1717,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1800 css_put(&victim->css); 1717 css_put(&victim->css);
1801 continue; 1718 continue;
1802 } 1719 }
1803 rec.mem = victim;
1804 rec.nr_scanned[0] = 0;
1805 rec.nr_scanned[1] = 0;
1806 rec.nr_rotated[0] = 0;
1807 rec.nr_rotated[1] = 0;
1808 rec.nr_freed[0] = 0;
1809 rec.nr_freed[1] = 0;
1810 rec.elapsed = 0;
1811 /* we use swappiness of local cgroup */ 1720 /* we use swappiness of local cgroup */
1812 if (check_soft) { 1721 if (check_soft) {
1813 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1722 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1814 noswap, zone, &rec, &scanned); 1723 noswap, zone, &nr_scanned);
1815 *total_scanned += scanned; 1724 *total_scanned += nr_scanned;
1816 } else 1725 } else
1817 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1726 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1818 noswap, &rec); 1727 noswap);
1819 mem_cgroup_record_scanstat(&rec);
1820 css_put(&victim->css); 1728 css_put(&victim->css);
1821 /* 1729 /*
1822 * At shrinking usage, we can't check we should stop here or 1730 * At shrinking usage, we can't check we should stop here or
@@ -1827,9 +1735,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1827 return ret; 1735 return ret;
1828 total += ret; 1736 total += ret;
1829 if (check_soft) { 1737 if (check_soft) {
1830 if (!res_counter_soft_limit_excess(&root_mem->res)) 1738 if (!res_counter_soft_limit_excess(&root_memcg->res))
1831 return total; 1739 return total;
1832 } else if (mem_cgroup_margin(root_mem)) 1740 } else if (mem_cgroup_margin(root_memcg))
1833 return total; 1741 return total;
1834 } 1742 }
1835 return total; 1743 return total;
@@ -1840,69 +1748,62 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1840 * If someone is running, return false. 1748 * If someone is running, return false.
1841 * Has to be called with memcg_oom_lock 1749 * Has to be called with memcg_oom_lock
1842 */ 1750 */
1843static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1751static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1844{ 1752{
1845 int lock_count = -1;
1846 struct mem_cgroup *iter, *failed = NULL; 1753 struct mem_cgroup *iter, *failed = NULL;
1847 bool cond = true; 1754 bool cond = true;
1848 1755
1849 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1756 for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1850 bool locked = iter->oom_lock; 1757 if (iter->oom_lock) {
1851
1852 iter->oom_lock = true;
1853 if (lock_count == -1)
1854 lock_count = iter->oom_lock;
1855 else if (lock_count != locked) {
1856 /* 1758 /*
1857 * this subtree of our hierarchy is already locked 1759 * this subtree of our hierarchy is already locked
1858 * so we cannot give a lock. 1760 * so we cannot give a lock.
1859 */ 1761 */
1860 lock_count = 0;
1861 failed = iter; 1762 failed = iter;
1862 cond = false; 1763 cond = false;
1863 } 1764 } else
1765 iter->oom_lock = true;
1864 } 1766 }
1865 1767
1866 if (!failed) 1768 if (!failed)
1867 goto done; 1769 return true;
1868 1770
1869 /* 1771 /*
1870 * OK, we failed to lock the whole subtree so we have to clean up 1772 * OK, we failed to lock the whole subtree so we have to clean up
1871 * what we set up to the failing subtree 1773 * what we set up to the failing subtree
1872 */ 1774 */
1873 cond = true; 1775 cond = true;
1874 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1776 for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1875 if (iter == failed) { 1777 if (iter == failed) {
1876 cond = false; 1778 cond = false;
1877 continue; 1779 continue;
1878 } 1780 }
1879 iter->oom_lock = false; 1781 iter->oom_lock = false;
1880 } 1782 }
1881done: 1783 return false;
1882 return lock_count;
1883} 1784}
1884 1785
1885/* 1786/*
1886 * Has to be called with memcg_oom_lock 1787 * Has to be called with memcg_oom_lock
1887 */ 1788 */
1888static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1789static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1889{ 1790{
1890 struct mem_cgroup *iter; 1791 struct mem_cgroup *iter;
1891 1792
1892 for_each_mem_cgroup_tree(iter, mem) 1793 for_each_mem_cgroup_tree(iter, memcg)
1893 iter->oom_lock = false; 1794 iter->oom_lock = false;
1894 return 0; 1795 return 0;
1895} 1796}
1896 1797
1897static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) 1798static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1898{ 1799{
1899 struct mem_cgroup *iter; 1800 struct mem_cgroup *iter;
1900 1801
1901 for_each_mem_cgroup_tree(iter, mem) 1802 for_each_mem_cgroup_tree(iter, memcg)
1902 atomic_inc(&iter->under_oom); 1803 atomic_inc(&iter->under_oom);
1903} 1804}
1904 1805
1905static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) 1806static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1906{ 1807{
1907 struct mem_cgroup *iter; 1808 struct mem_cgroup *iter;
1908 1809
@@ -1911,7 +1812,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1911 * mem_cgroup_oom_lock() may not be called. We have to use 1812 * mem_cgroup_oom_lock() may not be called. We have to use
1912 * atomic_add_unless() here. 1813 * atomic_add_unless() here.
1913 */ 1814 */
1914 for_each_mem_cgroup_tree(iter, mem) 1815 for_each_mem_cgroup_tree(iter, memcg)
1915 atomic_add_unless(&iter->under_oom, -1, 0); 1816 atomic_add_unless(&iter->under_oom, -1, 0);
1916} 1817}
1917 1818
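
The rewritten mem_cgroup_oom_lock() above walks the subtree, takes the per-group oom_lock on each member, and, on meeting a group that is already locked, stops, remembers which group failed, and walks the subtree again to clear only the locks it had just taken. A flat-array sketch of that take-all-or-roll-back pattern, with the hierarchy walk reduced to a simple loop (assumed and simplified, not the kernel iterator):

#include <stdbool.h>
#include <stdio.h>

struct group {
	bool oom_lock;
};

/* Try to lock every group in the subtree (here: a flat array).
 * On the first group that is already locked, undo the locks taken
 * so far and report failure. */
static bool oom_lock_all(struct group *g, int n)
{
	int i, failed = -1;

	for (i = 0; i < n; i++) {
		if (g[i].oom_lock) {
			failed = i;          /* someone else holds this part */
			break;
		}
		g[i].oom_lock = true;
	}

	if (failed < 0)
		return true;

	for (i = 0; i < failed; i++)         /* roll back what we set */
		g[i].oom_lock = false;
	return false;
}

int main(void)
{
	struct group tree[3] = { { false }, { true }, { false } };

	printf("%d\n", oom_lock_all(tree, 3));   /* 0: group 1 already locked */
	printf("%d\n", tree[0].oom_lock);        /* 0: rolled back            */

	tree[1].oom_lock = false;
	printf("%d\n", oom_lock_all(tree, 3));   /* 1: whole subtree locked   */
	return 0;
}
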
@@ -1926,85 +1827,85 @@ struct oom_wait_info {
1926static int memcg_oom_wake_function(wait_queue_t *wait, 1827static int memcg_oom_wake_function(wait_queue_t *wait,
1927 unsigned mode, int sync, void *arg) 1828 unsigned mode, int sync, void *arg)
1928{ 1829{
1929 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, 1830 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
1930 *oom_wait_mem; 1831 *oom_wait_memcg;
1931 struct oom_wait_info *oom_wait_info; 1832 struct oom_wait_info *oom_wait_info;
1932 1833
1933 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1834 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1934 oom_wait_mem = oom_wait_info->mem; 1835 oom_wait_memcg = oom_wait_info->mem;
1935 1836
1936 /* 1837 /*
1937 * Both of oom_wait_info->mem and wake_mem are stable under us. 1838 * Both of oom_wait_info->mem and wake_mem are stable under us.
1938 * Then we can use css_is_ancestor without taking care of RCU. 1839 * Then we can use css_is_ancestor without taking care of RCU.
1939 */ 1840 */
1940 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) 1841 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1941 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) 1842 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1942 return 0; 1843 return 0;
1943 return autoremove_wake_function(wait, mode, sync, arg); 1844 return autoremove_wake_function(wait, mode, sync, arg);
1944} 1845}
1945 1846
1946static void memcg_wakeup_oom(struct mem_cgroup *mem) 1847static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1947{ 1848{
1948 /* for filtering, pass "mem" as argument. */ 1849 /* for filtering, pass "memcg" as argument. */
1949 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1850 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1950} 1851}
1951 1852
1952static void memcg_oom_recover(struct mem_cgroup *mem) 1853static void memcg_oom_recover(struct mem_cgroup *memcg)
1953{ 1854{
1954 if (mem && atomic_read(&mem->under_oom)) 1855 if (memcg && atomic_read(&memcg->under_oom))
1955 memcg_wakeup_oom(mem); 1856 memcg_wakeup_oom(memcg);
1956} 1857}
1957 1858
1958/* 1859/*
1959 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1860 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1960 */ 1861 */
1961bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1862bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1962{ 1863{
1963 struct oom_wait_info owait; 1864 struct oom_wait_info owait;
1964 bool locked, need_to_kill; 1865 bool locked, need_to_kill;
1965 1866
1966 owait.mem = mem; 1867 owait.mem = memcg;
1967 owait.wait.flags = 0; 1868 owait.wait.flags = 0;
1968 owait.wait.func = memcg_oom_wake_function; 1869 owait.wait.func = memcg_oom_wake_function;
1969 owait.wait.private = current; 1870 owait.wait.private = current;
1970 INIT_LIST_HEAD(&owait.wait.task_list); 1871 INIT_LIST_HEAD(&owait.wait.task_list);
1971 need_to_kill = true; 1872 need_to_kill = true;
1972 mem_cgroup_mark_under_oom(mem); 1873 mem_cgroup_mark_under_oom(memcg);
1973 1874
1974 /* At first, try to OOM lock hierarchy under mem.*/ 1875 /* At first, try to OOM lock hierarchy under memcg.*/
1975 spin_lock(&memcg_oom_lock); 1876 spin_lock(&memcg_oom_lock);
1976 locked = mem_cgroup_oom_lock(mem); 1877 locked = mem_cgroup_oom_lock(memcg);
1977 /* 1878 /*
1978 * Even if signal_pending(), we can't quit charge() loop without 1879 * Even if signal_pending(), we can't quit charge() loop without
1979 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1880 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1980 * under OOM is always welcomed, use TASK_KILLABLE here. 1881 * under OOM is always welcomed, use TASK_KILLABLE here.
1981 */ 1882 */
1982 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1883 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1983 if (!locked || mem->oom_kill_disable) 1884 if (!locked || memcg->oom_kill_disable)
1984 need_to_kill = false; 1885 need_to_kill = false;
1985 if (locked) 1886 if (locked)
1986 mem_cgroup_oom_notify(mem); 1887 mem_cgroup_oom_notify(memcg);
1987 spin_unlock(&memcg_oom_lock); 1888 spin_unlock(&memcg_oom_lock);
1988 1889
1989 if (need_to_kill) { 1890 if (need_to_kill) {
1990 finish_wait(&memcg_oom_waitq, &owait.wait); 1891 finish_wait(&memcg_oom_waitq, &owait.wait);
1991 mem_cgroup_out_of_memory(mem, mask); 1892 mem_cgroup_out_of_memory(memcg, mask);
1992 } else { 1893 } else {
1993 schedule(); 1894 schedule();
1994 finish_wait(&memcg_oom_waitq, &owait.wait); 1895 finish_wait(&memcg_oom_waitq, &owait.wait);
1995 } 1896 }
1996 spin_lock(&memcg_oom_lock); 1897 spin_lock(&memcg_oom_lock);
1997 if (locked) 1898 if (locked)
1998 mem_cgroup_oom_unlock(mem); 1899 mem_cgroup_oom_unlock(memcg);
1999 memcg_wakeup_oom(mem); 1900 memcg_wakeup_oom(memcg);
2000 spin_unlock(&memcg_oom_lock); 1901 spin_unlock(&memcg_oom_lock);
2001 1902
2002 mem_cgroup_unmark_under_oom(mem); 1903 mem_cgroup_unmark_under_oom(memcg);
2003 1904
2004 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1905 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
2005 return false; 1906 return false;
2006 /* Give chance to dying process */ 1907 /* Give chance to dying process */
2007 schedule_timeout(1); 1908 schedule_timeout_uninterruptible(1);
2008 return true; 1909 return true;
2009} 1910}
2010 1911
@@ -2035,7 +1936,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
2035void mem_cgroup_update_page_stat(struct page *page, 1936void mem_cgroup_update_page_stat(struct page *page,
2036 enum mem_cgroup_page_stat_item idx, int val) 1937 enum mem_cgroup_page_stat_item idx, int val)
2037{ 1938{
2038 struct mem_cgroup *mem; 1939 struct mem_cgroup *memcg;
2039 struct page_cgroup *pc = lookup_page_cgroup(page); 1940 struct page_cgroup *pc = lookup_page_cgroup(page);
2040 bool need_unlock = false; 1941 bool need_unlock = false;
2041 unsigned long uninitialized_var(flags); 1942 unsigned long uninitialized_var(flags);
@@ -2044,16 +1945,16 @@ void mem_cgroup_update_page_stat(struct page *page,
2044 return; 1945 return;
2045 1946
2046 rcu_read_lock(); 1947 rcu_read_lock();
2047 mem = pc->mem_cgroup; 1948 memcg = pc->mem_cgroup;
2048 if (unlikely(!mem || !PageCgroupUsed(pc))) 1949 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2049 goto out; 1950 goto out;
2050 /* pc->mem_cgroup is unstable ? */ 1951 /* pc->mem_cgroup is unstable ? */
2051 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 1952 if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
2052 /* take a lock against to access pc->mem_cgroup */ 1953 /* take a lock against to access pc->mem_cgroup */
2053 move_lock_page_cgroup(pc, &flags); 1954 move_lock_page_cgroup(pc, &flags);
2054 need_unlock = true; 1955 need_unlock = true;
2055 mem = pc->mem_cgroup; 1956 memcg = pc->mem_cgroup;
2056 if (!mem || !PageCgroupUsed(pc)) 1957 if (!memcg || !PageCgroupUsed(pc))
2057 goto out; 1958 goto out;
2058 } 1959 }
2059 1960
@@ -2069,7 +1970,7 @@ void mem_cgroup_update_page_stat(struct page *page,
2069 BUG(); 1970 BUG();
2070 } 1971 }
2071 1972
2072 this_cpu_add(mem->stat->count[idx], val); 1973 this_cpu_add(memcg->stat->count[idx], val);
2073 1974
2074out: 1975out:
2075 if (unlikely(need_unlock)) 1976 if (unlikely(need_unlock))
@@ -2092,6 +1993,7 @@ struct memcg_stock_pcp {
2092#define FLUSHING_CACHED_CHARGE (0) 1993#define FLUSHING_CACHED_CHARGE (0)
2093}; 1994};
2094static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1995static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1996static DEFINE_MUTEX(percpu_charge_mutex);
2095 1997
2096/* 1998/*
2097 * Try to consume stocked charge on this cpu. On success, one page is consumed 1999
@@ -2099,13 +2001,13 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2099 * cgroup which is not current target, returns false. This stock will be 2001 * cgroup which is not current target, returns false. This stock will be
2100 * refilled. 2002 * refilled.
2101 */ 2003 */
2102static bool consume_stock(struct mem_cgroup *mem) 2004static bool consume_stock(struct mem_cgroup *memcg)
2103{ 2005{
2104 struct memcg_stock_pcp *stock; 2006 struct memcg_stock_pcp *stock;
2105 bool ret = true; 2007 bool ret = true;
2106 2008
2107 stock = &get_cpu_var(memcg_stock); 2009 stock = &get_cpu_var(memcg_stock);
2108 if (mem == stock->cached && stock->nr_pages) 2010 if (memcg == stock->cached && stock->nr_pages)
2109 stock->nr_pages--; 2011 stock->nr_pages--;
2110 else /* need to call res_counter_charge */ 2012 else /* need to call res_counter_charge */
2111 ret = false; 2013 ret = false;
@@ -2146,44 +2048,38 @@ static void drain_local_stock(struct work_struct *dummy)
2146 * Cache charges(val) which is from res_counter, to local per_cpu area. 2048 * Cache charges(val) which is from res_counter, to local per_cpu area.
2147 * This will be consumed by consume_stock() function, later. 2049 * This will be consumed by consume_stock() function, later.
2148 */ 2050 */
2149static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 2051static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2150{ 2052{
2151 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2053 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2152 2054
2153 if (stock->cached != mem) { /* reset if necessary */ 2055 if (stock->cached != memcg) { /* reset if necessary */
2154 drain_stock(stock); 2056 drain_stock(stock);
2155 stock->cached = mem; 2057 stock->cached = memcg;
2156 } 2058 }
2157 stock->nr_pages += nr_pages; 2059 stock->nr_pages += nr_pages;
2158 put_cpu_var(memcg_stock); 2060 put_cpu_var(memcg_stock);
2159} 2061}
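
consume_stock() and refill_stock() keep a small per-CPU cache of pages that were already charged to one memcg, so the hot charging path can often skip the res_counter entirely. A single-"CPU" sketch of that owner-plus-count cache (the struct and identifiers are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

struct stock {
	int cached_id;           /* which cgroup the cached pages belong to */
	unsigned int nr_pages;   /* pre-charged pages waiting to be used    */
};

/* Fast path: spend one cached page, but only if the cache belongs
 * to the cgroup being charged right now. */
static bool consume_stock(struct stock *s, int id)
{
	if (s->cached_id == id && s->nr_pages) {
		s->nr_pages--;
		return true;
	}
	return false;            /* caller must charge the real counter */
}

/* Slow path already charged a batch; park the surplus locally,
 * dropping any leftover cache that belongs to a different cgroup. */
static void refill_stock(struct stock *s, int id, unsigned int nr)
{
	if (s->cached_id != id) {
		/* a real drain_stock() would return s->nr_pages to its owner */
		s->nr_pages = 0;
		s->cached_id = id;
	}
	s->nr_pages += nr;
}

int main(void)
{
	struct stock s = { .cached_id = -1, .nr_pages = 0 };

	printf("%d\n", consume_stock(&s, 1));   /* 0: cache empty             */
	refill_stock(&s, 1, 32);                /* batch charge left 32 over  */
	printf("%d\n", consume_stock(&s, 1));   /* 1: served from the cache   */
	printf("%d\n", consume_stock(&s, 2));   /* 0: cache owned by cgroup 1 */
	return 0;
}
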
2160 2062
2161/* 2063/*
2162 * Drains all per-CPU charge caches for given root_mem resp. subtree 2064 * Drains all per-CPU charge caches for given root_memcg resp. subtree
2163 * of the hierarchy under it. sync flag says whether we should block 2065 * of the hierarchy under it. sync flag says whether we should block
2164 * until the work is done. 2066 * until the work is done.
2165 */ 2067 */
2166static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) 2068static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2167{ 2069{
2168 int cpu, curcpu; 2070 int cpu, curcpu;
2169 2071
2170 /* Notify other cpus that system-wide "drain" is running */ 2072 /* Notify other cpus that system-wide "drain" is running */
2171 get_online_cpus(); 2073 get_online_cpus();
2172 /* 2074 curcpu = get_cpu();
2173 * Get a hint for avoiding draining charges on the current cpu,
2174 * which must be exhausted by our charging. It is not required that
2175 * this be a precise check, so we use raw_smp_processor_id() instead of
2176 * getcpu()/putcpu().
2177 */
2178 curcpu = raw_smp_processor_id();
2179 for_each_online_cpu(cpu) { 2075 for_each_online_cpu(cpu) {
2180 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2076 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2181 struct mem_cgroup *mem; 2077 struct mem_cgroup *memcg;
2182 2078
2183 mem = stock->cached; 2079 memcg = stock->cached;
2184 if (!mem || !stock->nr_pages) 2080 if (!memcg || !stock->nr_pages)
2185 continue; 2081 continue;
2186 if (!mem_cgroup_same_or_subtree(root_mem, mem)) 2082 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2187 continue; 2083 continue;
2188 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2084 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2189 if (cpu == curcpu) 2085 if (cpu == curcpu)
@@ -2192,14 +2088,14 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
2192 schedule_work_on(cpu, &stock->work); 2088 schedule_work_on(cpu, &stock->work);
2193 } 2089 }
2194 } 2090 }
2091 put_cpu();
2195 2092
2196 if (!sync) 2093 if (!sync)
2197 goto out; 2094 goto out;
2198 2095
2199 for_each_online_cpu(cpu) { 2096 for_each_online_cpu(cpu) {
2200 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2097 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2201 if (mem_cgroup_same_or_subtree(root_mem, stock->cached) && 2098 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2202 test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2203 flush_work(&stock->work); 2099 flush_work(&stock->work);
2204 } 2100 }
2205out: 2101out:
@@ -2212,51 +2108,59 @@ out:
2212 * expects some charges will be back to res_counter later but cannot wait for 2108 * expects some charges will be back to res_counter later but cannot wait for
2213 * it. 2109 * it.
2214 */ 2110 */
2215static void drain_all_stock_async(struct mem_cgroup *root_mem) 2111static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2216{ 2112{
2217 drain_all_stock(root_mem, false); 2113 /*
2114 * If someone calls draining, avoid adding more kworker runs.
2115 */
2116 if (!mutex_trylock(&percpu_charge_mutex))
2117 return;
2118 drain_all_stock(root_memcg, false);
2119 mutex_unlock(&percpu_charge_mutex);
2218} 2120}
2219 2121
2220/* This is a synchronous drain interface. */ 2122/* This is a synchronous drain interface. */
2221static void drain_all_stock_sync(struct mem_cgroup *root_mem) 2123static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2222{ 2124{
2223 /* called when force_empty is called */ 2125 /* called when force_empty is called */
2224 drain_all_stock(root_mem, true); 2126 mutex_lock(&percpu_charge_mutex);
2127 drain_all_stock(root_memcg, true);
2128 mutex_unlock(&percpu_charge_mutex);
2225} 2129}
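
The new percpu_charge_mutex serializes drains: the synchronous force_empty path waits for the mutex, while drain_all_stock_async() simply gives up when a drain is already in flight, so reclaim never queues a pile of redundant kworker jobs. A pthread sketch of that "skip if busy" pattern (function names mirror the diff but the bodies are placeholders):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t charge_mutex = PTHREAD_MUTEX_INITIALIZER;

static void drain_all_stock(bool sync)
{
	printf("draining (%s)\n", sync ? "sync" : "async");
}

/* Async path: if a drain is already running, another one would only
 * queue more work for the same result, so just return. */
static void drain_all_stock_async(void)
{
	if (pthread_mutex_trylock(&charge_mutex) != 0)
		return;
	drain_all_stock(false);
	pthread_mutex_unlock(&charge_mutex);
}

/* Sync path (force_empty): correctness needs the drain, so wait. */
static void drain_all_stock_sync(void)
{
	pthread_mutex_lock(&charge_mutex);
	drain_all_stock(true);
	pthread_mutex_unlock(&charge_mutex);
}

int main(void)
{
	drain_all_stock_async();   /* runs: nobody holds the mutex */
	drain_all_stock_sync();    /* always runs */
	return 0;
}
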
2226 2130
2227/* 2131/*
2228 * This function drains percpu counter value from DEAD cpu and 2132 * This function drains percpu counter value from DEAD cpu and
2229 * move it to local cpu. Note that this function can be preempted. 2133 * move it to local cpu. Note that this function can be preempted.
2230 */ 2134 */
2231static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 2135static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2232{ 2136{
2233 int i; 2137 int i;
2234 2138
2235 spin_lock(&mem->pcp_counter_lock); 2139 spin_lock(&memcg->pcp_counter_lock);
2236 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2140 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2237 long x = per_cpu(mem->stat->count[i], cpu); 2141 long x = per_cpu(memcg->stat->count[i], cpu);
2238 2142
2239 per_cpu(mem->stat->count[i], cpu) = 0; 2143 per_cpu(memcg->stat->count[i], cpu) = 0;
2240 mem->nocpu_base.count[i] += x; 2144 memcg->nocpu_base.count[i] += x;
2241 } 2145 }
2242 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2146 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2243 unsigned long x = per_cpu(mem->stat->events[i], cpu); 2147 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2244 2148
2245 per_cpu(mem->stat->events[i], cpu) = 0; 2149 per_cpu(memcg->stat->events[i], cpu) = 0;
2246 mem->nocpu_base.events[i] += x; 2150 memcg->nocpu_base.events[i] += x;
2247 } 2151 }
2248 /* need to clear ON_MOVE value, works as a kind of lock. */ 2152 /* need to clear ON_MOVE value, works as a kind of lock. */
2249 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 2153 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2250 spin_unlock(&mem->pcp_counter_lock); 2154 spin_unlock(&memcg->pcp_counter_lock);
2251} 2155}
2252 2156
2253static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 2157static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2254{ 2158{
2255 int idx = MEM_CGROUP_ON_MOVE; 2159 int idx = MEM_CGROUP_ON_MOVE;
2256 2160
2257 spin_lock(&mem->pcp_counter_lock); 2161 spin_lock(&memcg->pcp_counter_lock);
2258 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 2162 per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2259 spin_unlock(&mem->pcp_counter_lock); 2163 spin_unlock(&memcg->pcp_counter_lock);
2260} 2164}
2261 2165
2262static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2166static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
@@ -2294,7 +2198,7 @@ enum {
2294 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2198 CHARGE_OOM_DIE, /* the current is killed because of OOM */
2295}; 2199};
2296 2200
2297static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2201static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2298 unsigned int nr_pages, bool oom_check) 2202 unsigned int nr_pages, bool oom_check)
2299{ 2203{
2300 unsigned long csize = nr_pages * PAGE_SIZE; 2204 unsigned long csize = nr_pages * PAGE_SIZE;
@@ -2303,16 +2207,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
2303 unsigned long flags = 0; 2207 unsigned long flags = 0;
2304 int ret; 2208 int ret;
2305 2209
2306 ret = res_counter_charge(&mem->res, csize, &fail_res); 2210 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2307 2211
2308 if (likely(!ret)) { 2212 if (likely(!ret)) {
2309 if (!do_swap_account) 2213 if (!do_swap_account)
2310 return CHARGE_OK; 2214 return CHARGE_OK;
2311 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 2215 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2312 if (likely(!ret)) 2216 if (likely(!ret))
2313 return CHARGE_OK; 2217 return CHARGE_OK;
2314 2218
2315 res_counter_uncharge(&mem->res, csize); 2219 res_counter_uncharge(&memcg->res, csize);
2316 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2220 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2317 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2221 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2318 } else 2222 } else
@@ -2370,12 +2274,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
2370static int __mem_cgroup_try_charge(struct mm_struct *mm, 2274static int __mem_cgroup_try_charge(struct mm_struct *mm,
2371 gfp_t gfp_mask, 2275 gfp_t gfp_mask,
2372 unsigned int nr_pages, 2276 unsigned int nr_pages,
2373 struct mem_cgroup **memcg, 2277 struct mem_cgroup **ptr,
2374 bool oom) 2278 bool oom)
2375{ 2279{
2376 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2280 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2377 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2281 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2378 struct mem_cgroup *mem = NULL; 2282 struct mem_cgroup *memcg = NULL;
2379 int ret; 2283 int ret;
2380 2284
2381 /* 2285 /*
@@ -2393,17 +2297,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2393 * thread group leader migrates. It's possible that mm is not 2297 * thread group leader migrates. It's possible that mm is not
2394 * set, if so charge the init_mm (happens for pagecache usage). 2298 * set, if so charge the init_mm (happens for pagecache usage).
2395 */ 2299 */
2396 if (!*memcg && !mm) 2300 if (!*ptr && !mm)
2397 goto bypass; 2301 goto bypass;
2398again: 2302again:
2399 if (*memcg) { /* css should be a valid one */ 2303 if (*ptr) { /* css should be a valid one */
2400 mem = *memcg; 2304 memcg = *ptr;
2401 VM_BUG_ON(css_is_removed(&mem->css)); 2305 VM_BUG_ON(css_is_removed(&memcg->css));
2402 if (mem_cgroup_is_root(mem)) 2306 if (mem_cgroup_is_root(memcg))
2403 goto done; 2307 goto done;
2404 if (nr_pages == 1 && consume_stock(mem)) 2308 if (nr_pages == 1 && consume_stock(memcg))
2405 goto done; 2309 goto done;
2406 css_get(&mem->css); 2310 css_get(&memcg->css);
2407 } else { 2311 } else {
2408 struct task_struct *p; 2312 struct task_struct *p;
2409 2313
@@ -2411,7 +2315,7 @@ again:
2411 p = rcu_dereference(mm->owner); 2315 p = rcu_dereference(mm->owner);
2412 /* 2316 /*
2413 * Because we don't have task_lock(), "p" can exit. 2317 * Because we don't have task_lock(), "p" can exit.
2414 * In that case, "mem" can point to root or p can be NULL with 2318 * In that case, "memcg" can point to root or p can be NULL with
2415 * race with swapoff. Then, we have a small risk of mis-accounting. 2319 * race with swapoff. Then, we have a small risk of mis-accounting.
2416 * But this kind of mis-accounting due to races always happens because 2320 * But this kind of mis-accounting due to races always happens because
2417 * we don't have cgroup_mutex(). It's overkill and we allow that 2321 * we don't have cgroup_mutex(). It's overkill and we allow that
@@ -2419,12 +2323,12 @@ again:
2419 * (*) swapoff et al. will charge against mm-struct not against 2323 * (*) swapoff et al. will charge against mm-struct not against
2420 * task-struct. So, mm->owner can be NULL. 2324 * task-struct. So, mm->owner can be NULL.
2421 */ 2325 */
2422 mem = mem_cgroup_from_task(p); 2326 memcg = mem_cgroup_from_task(p);
2423 if (!mem || mem_cgroup_is_root(mem)) { 2327 if (!memcg || mem_cgroup_is_root(memcg)) {
2424 rcu_read_unlock(); 2328 rcu_read_unlock();
2425 goto done; 2329 goto done;
2426 } 2330 }
2427 if (nr_pages == 1 && consume_stock(mem)) { 2331 if (nr_pages == 1 && consume_stock(memcg)) {
2428 /* 2332 /*
2429 * It seems dangerous to access memcg without css_get(). 2333 * It seems dangerous to access memcg without css_get().
2430 * But considering how consume_stock works, it's not 2334 * But considering how consume_stock works, it's not
@@ -2437,7 +2341,7 @@ again:
2437 goto done; 2341 goto done;
2438 } 2342 }
2439 /* after here, we may be blocked. we need to get refcnt */ 2343 /* after here, we may be blocked. we need to get refcnt */
2440 if (!css_tryget(&mem->css)) { 2344 if (!css_tryget(&memcg->css)) {
2441 rcu_read_unlock(); 2345 rcu_read_unlock();
2442 goto again; 2346 goto again;
2443 } 2347 }
@@ -2449,7 +2353,7 @@ again:
2449 2353
2450 /* If killed, bypass charge */ 2354 /* If killed, bypass charge */
2451 if (fatal_signal_pending(current)) { 2355 if (fatal_signal_pending(current)) {
2452 css_put(&mem->css); 2356 css_put(&memcg->css);
2453 goto bypass; 2357 goto bypass;
2454 } 2358 }
2455 2359
@@ -2459,43 +2363,43 @@ again:
2459 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2363 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2460 } 2364 }
2461 2365
2462 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2366 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2463 switch (ret) { 2367 switch (ret) {
2464 case CHARGE_OK: 2368 case CHARGE_OK:
2465 break; 2369 break;
2466 case CHARGE_RETRY: /* not in OOM situation but retry */ 2370 case CHARGE_RETRY: /* not in OOM situation but retry */
2467 batch = nr_pages; 2371 batch = nr_pages;
2468 css_put(&mem->css); 2372 css_put(&memcg->css);
2469 mem = NULL; 2373 memcg = NULL;
2470 goto again; 2374 goto again;
2471 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2375 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2472 css_put(&mem->css); 2376 css_put(&memcg->css);
2473 goto nomem; 2377 goto nomem;
2474 case CHARGE_NOMEM: /* OOM routine works */ 2378 case CHARGE_NOMEM: /* OOM routine works */
2475 if (!oom) { 2379 if (!oom) {
2476 css_put(&mem->css); 2380 css_put(&memcg->css);
2477 goto nomem; 2381 goto nomem;
2478 } 2382 }
2479 /* If oom, we never return -ENOMEM */ 2383 /* If oom, we never return -ENOMEM */
2480 nr_oom_retries--; 2384 nr_oom_retries--;
2481 break; 2385 break;
2482 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2386 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2483 css_put(&mem->css); 2387 css_put(&memcg->css);
2484 goto bypass; 2388 goto bypass;
2485 } 2389 }
2486 } while (ret != CHARGE_OK); 2390 } while (ret != CHARGE_OK);
2487 2391
2488 if (batch > nr_pages) 2392 if (batch > nr_pages)
2489 refill_stock(mem, batch - nr_pages); 2393 refill_stock(memcg, batch - nr_pages);
2490 css_put(&mem->css); 2394 css_put(&memcg->css);
2491done: 2395done:
2492 *memcg = mem; 2396 *ptr = memcg;
2493 return 0; 2397 return 0;
2494nomem: 2398nomem:
2495 *memcg = NULL; 2399 *ptr = NULL;
2496 return -ENOMEM; 2400 return -ENOMEM;
2497bypass: 2401bypass:
2498 *memcg = NULL; 2402 *ptr = NULL;
2499 return 0; 2403 return 0;
2500} 2404}
2501 2405
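The loop above is driven by the result codes of mem_cgroup_do_charge(): success, drop back from the batched amount to the exact request and retry, fail fast when the caller cannot block, give the OOM path a bounded number of retries, or bypass the charge entirely when the task has been killed. A simplified, self-contained sketch of that control flow; the helper, the batch size and the retry count are invented for the example:

#include <stdbool.h>

enum charge_result {
	CHARGE_OK,
	CHARGE_RETRY,		/* not an OOM situation, just try again */
	CHARGE_WOULDBLOCK,	/* caller may not sleep */
	CHARGE_NOMEM,		/* let the OOM path retry a few times */
	CHARGE_OOM_DIE,		/* current task was killed by the OOM killer */
};

/* stand-in for mem_cgroup_do_charge(); always succeeds in this sketch */
static enum charge_result do_charge(unsigned int nr_pages)
{
	(void)nr_pages;
	return CHARGE_OK;
}

int try_charge(unsigned int nr_pages, bool oom)
{
	unsigned int batch = nr_pages > 32 ? nr_pages : 32;	/* charge in batches */
	int nr_oom_retries = 5;
	enum charge_result ret;

	do {
		ret = do_charge(batch);
		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY:
			batch = nr_pages;	/* retry with the exact request */
			break;
		case CHARGE_WOULDBLOCK:
			return -1;
		case CHARGE_NOMEM:
			if (!oom || !nr_oom_retries--)
				return -1;
			break;
		case CHARGE_OOM_DIE:
			return 0;		/* killed: bypass the charge */
		}
	} while (ret != CHARGE_OK);

	return 0;
}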
@@ -2504,15 +2408,15 @@ bypass:
2504 * This function is for that and does the uncharge, putting the css refcnt 2408 * This function is for that and does the uncharge, putting the css refcnt
2505 * gotten by try_charge(). 2409 * gotten by try_charge().
2506 */ 2410 */
2507static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2411static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2508 unsigned int nr_pages) 2412 unsigned int nr_pages)
2509{ 2413{
2510 if (!mem_cgroup_is_root(mem)) { 2414 if (!mem_cgroup_is_root(memcg)) {
2511 unsigned long bytes = nr_pages * PAGE_SIZE; 2415 unsigned long bytes = nr_pages * PAGE_SIZE;
2512 2416
2513 res_counter_uncharge(&mem->res, bytes); 2417 res_counter_uncharge(&memcg->res, bytes);
2514 if (do_swap_account) 2418 if (do_swap_account)
2515 res_counter_uncharge(&mem->memsw, bytes); 2419 res_counter_uncharge(&memcg->memsw, bytes);
2516 } 2420 }
2517} 2421}
2518 2422
@@ -2537,7 +2441,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2537 2441
2538struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2442struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2539{ 2443{
2540 struct mem_cgroup *mem = NULL; 2444 struct mem_cgroup *memcg = NULL;
2541 struct page_cgroup *pc; 2445 struct page_cgroup *pc;
2542 unsigned short id; 2446 unsigned short id;
2543 swp_entry_t ent; 2447 swp_entry_t ent;
@@ -2547,23 +2451,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2547 pc = lookup_page_cgroup(page); 2451 pc = lookup_page_cgroup(page);
2548 lock_page_cgroup(pc); 2452 lock_page_cgroup(pc);
2549 if (PageCgroupUsed(pc)) { 2453 if (PageCgroupUsed(pc)) {
2550 mem = pc->mem_cgroup; 2454 memcg = pc->mem_cgroup;
2551 if (mem && !css_tryget(&mem->css)) 2455 if (memcg && !css_tryget(&memcg->css))
2552 mem = NULL; 2456 memcg = NULL;
2553 } else if (PageSwapCache(page)) { 2457 } else if (PageSwapCache(page)) {
2554 ent.val = page_private(page); 2458 ent.val = page_private(page);
2555 id = lookup_swap_cgroup(ent); 2459 id = lookup_swap_cgroup(ent);
2556 rcu_read_lock(); 2460 rcu_read_lock();
2557 mem = mem_cgroup_lookup(id); 2461 memcg = mem_cgroup_lookup(id);
2558 if (mem && !css_tryget(&mem->css)) 2462 if (memcg && !css_tryget(&memcg->css))
2559 mem = NULL; 2463 memcg = NULL;
2560 rcu_read_unlock(); 2464 rcu_read_unlock();
2561 } 2465 }
2562 unlock_page_cgroup(pc); 2466 unlock_page_cgroup(pc);
2563 return mem; 2467 return memcg;
2564} 2468}
2565 2469
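try_get_mem_cgroup_from_page() above finds a candidate memcg (from the page_cgroup or, for swap cache, from the swap cgroup record) and only keeps it if css_tryget() still succeeds, i.e. if the group is not already being torn down. A small sketch of that lookup-then-tryget pattern using C11 atomics; the types and function names are invented for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct ref {
	atomic_int count;	/* 0 means the object is on its way out */
};

/* take a reference only if the object is still live */
static bool tryget(struct ref *r)
{
	int old = atomic_load(&r->count);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&r->count, &old, old + 1))
			return true;
	}
	return false;
}

struct ref *lookup_and_get(struct ref *candidate)
{
	if (candidate && !tryget(candidate))
		candidate = NULL;	/* found it, but it is already dying */
	return candidate;
}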
2566static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2470static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2567 struct page *page, 2471 struct page *page,
2568 unsigned int nr_pages, 2472 unsigned int nr_pages,
2569 struct page_cgroup *pc, 2473 struct page_cgroup *pc,
@@ -2572,14 +2476,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2572 lock_page_cgroup(pc); 2476 lock_page_cgroup(pc);
2573 if (unlikely(PageCgroupUsed(pc))) { 2477 if (unlikely(PageCgroupUsed(pc))) {
2574 unlock_page_cgroup(pc); 2478 unlock_page_cgroup(pc);
2575 __mem_cgroup_cancel_charge(mem, nr_pages); 2479 __mem_cgroup_cancel_charge(memcg, nr_pages);
2576 return; 2480 return;
2577 } 2481 }
2578 /* 2482 /*
2579 * we don't need page_cgroup_lock for tail pages, because they are not 2483 * we don't need page_cgroup_lock for tail pages, because they are not
2580 * accessed by any other context at this point. 2484 * accessed by any other context at this point.
2581 */ 2485 */
2582 pc->mem_cgroup = mem; 2486 pc->mem_cgroup = memcg;
2583 /* 2487 /*
2584 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2488 * We access a page_cgroup asynchronously without lock_page_cgroup().
2585 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2489 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
@@ -2602,14 +2506,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2602 break; 2506 break;
2603 } 2507 }
2604 2508
2605 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2509 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2606 unlock_page_cgroup(pc); 2510 unlock_page_cgroup(pc);
2607 /* 2511 /*
2608 * "charge_statistics" updated event counter. Then, check it. 2512 * "charge_statistics" updated event counter. Then, check it.
2609 * Insert the ancestor (and the ancestor's ancestors) into the softlimit RB-tree 2513 * Insert the ancestor (and the ancestor's ancestors) into the softlimit RB-tree
2610 * if they exceed the softlimit. 2514 * if they exceed the softlimit.
2611 */ 2515 */
2612 memcg_check_events(mem, page); 2516 memcg_check_events(memcg, page);
2613} 2517}
2614 2518
2615#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2519#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2796,7 +2700,7 @@ out:
2796static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2700static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2797 gfp_t gfp_mask, enum charge_type ctype) 2701 gfp_t gfp_mask, enum charge_type ctype)
2798{ 2702{
2799 struct mem_cgroup *mem = NULL; 2703 struct mem_cgroup *memcg = NULL;
2800 unsigned int nr_pages = 1; 2704 unsigned int nr_pages = 1;
2801 struct page_cgroup *pc; 2705 struct page_cgroup *pc;
2802 bool oom = true; 2706 bool oom = true;
@@ -2815,11 +2719,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2815 pc = lookup_page_cgroup(page); 2719 pc = lookup_page_cgroup(page);
2816 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ 2720 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2817 2721
2818 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); 2722 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2819 if (ret || !mem) 2723 if (ret || !memcg)
2820 return ret; 2724 return ret;
2821 2725
2822 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); 2726 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
2823 return 0; 2727 return 0;
2824} 2728}
2825 2729
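mem_cgroup_charge_common() above shows the two-phase structure used throughout this file: __mem_cgroup_try_charge() reserves the charge against the counters, and __mem_cgroup_commit_charge() binds it to the page_cgroup. A minimal sketch of that try/commit split with invented, user-space stand-ins for the kernel structures:

#include <stdbool.h>

struct group {
	unsigned long usage;	/* pages currently charged */
	unsigned long limit;	/* hard limit in pages */
};

struct page_info {
	struct group *owner;	/* set only once the charge is committed */
};

/* phase 1: reserve the charge against the group's limit */
static bool try_charge_group(struct group *g, unsigned long nr_pages)
{
	if (g->usage + nr_pages > g->limit)
		return false;
	g->usage += nr_pages;
	return true;
}

/* phase 2: bind the reserved charge to the page */
static void commit_charge(struct group *g, struct page_info *pi)
{
	pi->owner = g;
}

int charge_common(struct group *g, struct page_info *pi, unsigned long nr_pages)
{
	if (!try_charge_group(g, nr_pages))
		return -1;
	commit_charge(g, pi);
	return 0;
}

Splitting the reservation from the commit lets callers such as the swap-in path cancel a reservation that is never committed.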
@@ -2848,7 +2752,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2848 enum charge_type ctype); 2752 enum charge_type ctype);
2849 2753
2850static void 2754static void
2851__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, 2755__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2852 enum charge_type ctype) 2756 enum charge_type ctype)
2853{ 2757{
2854 struct page_cgroup *pc = lookup_page_cgroup(page); 2758 struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2858,7 +2762,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2858 * LRU. Take care of it. 2762 * LRU. Take care of it.
2859 */ 2763 */
2860 mem_cgroup_lru_del_before_commit(page); 2764 mem_cgroup_lru_del_before_commit(page);
2861 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 2765 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2862 mem_cgroup_lru_add_after_commit(page); 2766 mem_cgroup_lru_add_after_commit(page);
2863 return; 2767 return;
2864} 2768}
@@ -2866,44 +2770,20 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2866int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2770int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2867 gfp_t gfp_mask) 2771 gfp_t gfp_mask)
2868{ 2772{
2869 struct mem_cgroup *mem = NULL; 2773 struct mem_cgroup *memcg = NULL;
2870 int ret; 2774 int ret;
2871 2775
2872 if (mem_cgroup_disabled()) 2776 if (mem_cgroup_disabled())
2873 return 0; 2777 return 0;
2874 if (PageCompound(page)) 2778 if (PageCompound(page))
2875 return 0; 2779 return 0;
2876 /*
2877 * Corner case handling. This is called from add_to_page_cache()
2878 * in usual. But some FS (shmem) precharges this page before calling it
2879 * and call add_to_page_cache() with GFP_NOWAIT.
2880 *
2881 * For GFP_NOWAIT case, the page may be pre-charged before calling
2882 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
2883 * charge twice. (It works but has to pay a bit larger cost.)
2884 * And when the page is SwapCache, it should take swap information
2885 * into account. This is under lock_page() now.
2886 */
2887 if (!(gfp_mask & __GFP_WAIT)) {
2888 struct page_cgroup *pc;
2889
2890 pc = lookup_page_cgroup(page);
2891 if (!pc)
2892 return 0;
2893 lock_page_cgroup(pc);
2894 if (PageCgroupUsed(pc)) {
2895 unlock_page_cgroup(pc);
2896 return 0;
2897 }
2898 unlock_page_cgroup(pc);
2899 }
2900 2780
2901 if (unlikely(!mm)) 2781 if (unlikely(!mm))
2902 mm = &init_mm; 2782 mm = &init_mm;
2903 2783
2904 if (page_is_file_cache(page)) { 2784 if (page_is_file_cache(page)) {
2905 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); 2785 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
2906 if (ret || !mem) 2786 if (ret || !memcg)
2907 return ret; 2787 return ret;
2908 2788
2909 /* 2789 /*
@@ -2911,15 +2791,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2911 * put that would remove them from the LRU list, make 2791 * put that would remove them from the LRU list, make
2912 * sure that they get relinked properly. 2792 * sure that they get relinked properly.
2913 */ 2793 */
2914 __mem_cgroup_commit_charge_lrucare(page, mem, 2794 __mem_cgroup_commit_charge_lrucare(page, memcg,
2915 MEM_CGROUP_CHARGE_TYPE_CACHE); 2795 MEM_CGROUP_CHARGE_TYPE_CACHE);
2916 return ret; 2796 return ret;
2917 } 2797 }
2918 /* shmem */ 2798 /* shmem */
2919 if (PageSwapCache(page)) { 2799 if (PageSwapCache(page)) {
2920 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2800 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2921 if (!ret) 2801 if (!ret)
2922 __mem_cgroup_commit_charge_swapin(page, mem, 2802 __mem_cgroup_commit_charge_swapin(page, memcg,
2923 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2803 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2924 } else 2804 } else
2925 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2805 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
@@ -2938,7 +2818,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2938 struct page *page, 2818 struct page *page,
2939 gfp_t mask, struct mem_cgroup **ptr) 2819 gfp_t mask, struct mem_cgroup **ptr)
2940{ 2820{
2941 struct mem_cgroup *mem; 2821 struct mem_cgroup *memcg;
2942 int ret; 2822 int ret;
2943 2823
2944 *ptr = NULL; 2824 *ptr = NULL;
@@ -2956,12 +2836,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2956 */ 2836 */
2957 if (!PageSwapCache(page)) 2837 if (!PageSwapCache(page))
2958 goto charge_cur_mm; 2838 goto charge_cur_mm;
2959 mem = try_get_mem_cgroup_from_page(page); 2839 memcg = try_get_mem_cgroup_from_page(page);
2960 if (!mem) 2840 if (!memcg)
2961 goto charge_cur_mm; 2841 goto charge_cur_mm;
2962 *ptr = mem; 2842 *ptr = memcg;
2963 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2843 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2964 css_put(&mem->css); 2844 css_put(&memcg->css);
2965 return ret; 2845 return ret;
2966charge_cur_mm: 2846charge_cur_mm:
2967 if (unlikely(!mm)) 2847 if (unlikely(!mm))
@@ -3021,16 +2901,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
3021 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2901 MEM_CGROUP_CHARGE_TYPE_MAPPED);
3022} 2902}
3023 2903
3024void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2904void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
3025{ 2905{
3026 if (mem_cgroup_disabled()) 2906 if (mem_cgroup_disabled())
3027 return; 2907 return;
3028 if (!mem) 2908 if (!memcg)
3029 return; 2909 return;
3030 __mem_cgroup_cancel_charge(mem, 1); 2910 __mem_cgroup_cancel_charge(memcg, 1);
3031} 2911}
3032 2912
3033static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 2913static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3034 unsigned int nr_pages, 2914 unsigned int nr_pages,
3035 const enum charge_type ctype) 2915 const enum charge_type ctype)
3036{ 2916{
@@ -3048,7 +2928,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
3048 * uncharges. Then, it's ok to ignore memcg's refcnt. 2928 * uncharges. Then, it's ok to ignore memcg's refcnt.
3049 */ 2929 */
3050 if (!batch->memcg) 2930 if (!batch->memcg)
3051 batch->memcg = mem; 2931 batch->memcg = memcg;
3052 /* 2932 /*
3053 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2933 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
3054 * In those cases, all pages freed continuously can be expected to be in 2934 * In those cases, all pages freed continuously can be expected to be in
@@ -3068,7 +2948,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
3068 * merge a series of uncharges into a single uncharge of the res_counter. 2948 * merge a series of uncharges into a single uncharge of the res_counter.
3069 * If not, we uncharge the res_counter one by one. 2949 * If not, we uncharge the res_counter one by one.
3070 */ 2950 */
3071 if (batch->memcg != mem) 2951 if (batch->memcg != memcg)
3072 goto direct_uncharge; 2952 goto direct_uncharge;
3073 /* remember freed charge and uncharge it later */ 2953 /* remember freed charge and uncharge it later */
3074 batch->nr_pages++; 2954 batch->nr_pages++;
@@ -3076,11 +2956,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
3076 batch->memsw_nr_pages++; 2956 batch->memsw_nr_pages++;
3077 return; 2957 return;
3078direct_uncharge: 2958direct_uncharge:
3079 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 2959 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
3080 if (uncharge_memsw) 2960 if (uncharge_memsw)
3081 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 2961 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
3082 if (unlikely(batch->memcg != mem)) 2962 if (unlikely(batch->memcg != memcg))
3083 memcg_oom_recover(mem); 2963 memcg_oom_recover(memcg);
3084 return; 2964 return;
3085} 2965}
3086 2966
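mem_cgroup_do_uncharge() above only batches when the page belongs to the same memcg the current batch was opened for; any other group is uncharged directly, so a whole truncate or munmap collapses into a single res_counter update at the end. A short sketch of that accumulate-or-go-direct idea; the types are invented stand-ins, not the kernel structures:

#include <stddef.h>

struct group {
	unsigned long usage;		/* pages currently charged */
};

struct uncharge_batch {
	struct group *target;		/* group this batch accumulates for */
	unsigned long nr_pages;
};

void uncharge(struct group *g, unsigned long nr_pages,
	      struct uncharge_batch *batch)
{
	if (batch) {
		if (!batch->target)
			batch->target = g;	/* first uncharge opens the batch */
		if (batch->target == g) {
			batch->nr_pages += nr_pages;
			return;			/* flushed later in one go */
		}
	}
	g->usage -= nr_pages;			/* direct uncharge */
}

void uncharge_batch_flush(struct uncharge_batch *batch)
{
	if (batch->target && batch->nr_pages)
		batch->target->usage -= batch->nr_pages;
	batch->target = NULL;
	batch->nr_pages = 0;
}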
@@ -3090,7 +2970,7 @@ direct_uncharge:
3090static struct mem_cgroup * 2970static struct mem_cgroup *
3091__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2971__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3092{ 2972{
3093 struct mem_cgroup *mem = NULL; 2973 struct mem_cgroup *memcg = NULL;
3094 unsigned int nr_pages = 1; 2974 unsigned int nr_pages = 1;
3095 struct page_cgroup *pc; 2975 struct page_cgroup *pc;
3096 2976
@@ -3113,7 +2993,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3113 2993
3114 lock_page_cgroup(pc); 2994 lock_page_cgroup(pc);
3115 2995
3116 mem = pc->mem_cgroup; 2996 memcg = pc->mem_cgroup;
3117 2997
3118 if (!PageCgroupUsed(pc)) 2998 if (!PageCgroupUsed(pc))
3119 goto unlock_out; 2999 goto unlock_out;
@@ -3136,7 +3016,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3136 break; 3016 break;
3137 } 3017 }
3138 3018
3139 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 3019 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
3140 3020
3141 ClearPageCgroupUsed(pc); 3021 ClearPageCgroupUsed(pc);
3142 /* 3022 /*
@@ -3148,18 +3028,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3148 3028
3149 unlock_page_cgroup(pc); 3029 unlock_page_cgroup(pc);
3150 /* 3030 /*
3151 * even after unlock, we have mem->res.usage here and this memcg 3031 * even after unlock, we have memcg->res.usage here and this memcg
3152 * will never be freed. 3032 * will never be freed.
3153 */ 3033 */
3154 memcg_check_events(mem, page); 3034 memcg_check_events(memcg, page);
3155 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3035 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3156 mem_cgroup_swap_statistics(mem, true); 3036 mem_cgroup_swap_statistics(memcg, true);
3157 mem_cgroup_get(mem); 3037 mem_cgroup_get(memcg);
3158 } 3038 }
3159 if (!mem_cgroup_is_root(mem)) 3039 if (!mem_cgroup_is_root(memcg))
3160 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 3040 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3161 3041
3162 return mem; 3042 return memcg;
3163 3043
3164unlock_out: 3044unlock_out:
3165 unlock_page_cgroup(pc); 3045 unlock_page_cgroup(pc);
@@ -3349,7 +3229,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3349int mem_cgroup_prepare_migration(struct page *page, 3229int mem_cgroup_prepare_migration(struct page *page,
3350 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3230 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3351{ 3231{
3352 struct mem_cgroup *mem = NULL; 3232 struct mem_cgroup *memcg = NULL;
3353 struct page_cgroup *pc; 3233 struct page_cgroup *pc;
3354 enum charge_type ctype; 3234 enum charge_type ctype;
3355 int ret = 0; 3235 int ret = 0;
@@ -3363,8 +3243,8 @@ int mem_cgroup_prepare_migration(struct page *page,
3363 pc = lookup_page_cgroup(page); 3243 pc = lookup_page_cgroup(page);
3364 lock_page_cgroup(pc); 3244 lock_page_cgroup(pc);
3365 if (PageCgroupUsed(pc)) { 3245 if (PageCgroupUsed(pc)) {
3366 mem = pc->mem_cgroup; 3246 memcg = pc->mem_cgroup;
3367 css_get(&mem->css); 3247 css_get(&memcg->css);
3368 /* 3248 /*
3369 * At migrating an anonymous page, its mapcount goes down 3249 * At migrating an anonymous page, its mapcount goes down
3370 * to 0 and uncharge() will be called. But, even if it's fully 3250 * to 0 and uncharge() will be called. But, even if it's fully
@@ -3402,12 +3282,12 @@ int mem_cgroup_prepare_migration(struct page *page,
3402 * If the page is not charged at this point, 3282 * If the page is not charged at this point,
3403 * we return here. 3283 * we return here.
3404 */ 3284 */
3405 if (!mem) 3285 if (!memcg)
3406 return 0; 3286 return 0;
3407 3287
3408 *ptr = mem; 3288 *ptr = memcg;
3409 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3289 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3410 css_put(&mem->css);/* drop extra refcnt */ 3290 css_put(&memcg->css);/* drop extra refcnt */
3411 if (ret || *ptr == NULL) { 3291 if (ret || *ptr == NULL) {
3412 if (PageAnon(page)) { 3292 if (PageAnon(page)) {
3413 lock_page_cgroup(pc); 3293 lock_page_cgroup(pc);
@@ -3433,21 +3313,21 @@ int mem_cgroup_prepare_migration(struct page *page,
3433 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3313 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3434 else 3314 else
3435 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3315 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3436 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 3316 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
3437 return ret; 3317 return ret;
3438} 3318}
3439 3319
3440/* remove redundant charge if migration failed*/ 3320/* remove redundant charge if migration failed*/
3441void mem_cgroup_end_migration(struct mem_cgroup *mem, 3321void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3442 struct page *oldpage, struct page *newpage, bool migration_ok) 3322 struct page *oldpage, struct page *newpage, bool migration_ok)
3443{ 3323{
3444 struct page *used, *unused; 3324 struct page *used, *unused;
3445 struct page_cgroup *pc; 3325 struct page_cgroup *pc;
3446 3326
3447 if (!mem) 3327 if (!memcg)
3448 return; 3328 return;
3449 /* blocks rmdir() */ 3329 /* blocks rmdir() */
3450 cgroup_exclude_rmdir(&mem->css); 3330 cgroup_exclude_rmdir(&memcg->css);
3451 if (!migration_ok) { 3331 if (!migration_ok) {
3452 used = oldpage; 3332 used = oldpage;
3453 unused = newpage; 3333 unused = newpage;
@@ -3483,32 +3363,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
3483 * So, rmdir()->pre_destroy() can be called while we do this charge. 3363 * So, rmdir()->pre_destroy() can be called while we do this charge.
3484 * In that case, we need to call pre_destroy() again. check it here. 3364 * In that case, we need to call pre_destroy() again. check it here.
3485 */ 3365 */
3486 cgroup_release_and_wakeup_rmdir(&mem->css); 3366 cgroup_release_and_wakeup_rmdir(&memcg->css);
3487}
3488
3489/*
3490 * A call to try to shrink memory usage on charge failure at shmem's swapin.
3491 * Calling hierarchical_reclaim is not enough because we should update
3492 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
3493 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
3494 * not from the memcg which this page would be charged to.
3495 * try_charge_swapin does all of these works properly.
3496 */
3497int mem_cgroup_shmem_charge_fallback(struct page *page,
3498 struct mm_struct *mm,
3499 gfp_t gfp_mask)
3500{
3501 struct mem_cgroup *mem;
3502 int ret;
3503
3504 if (mem_cgroup_disabled())
3505 return 0;
3506
3507 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
3508 if (!ret)
3509 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
3510
3511 return ret;
3512} 3367}
3513 3368
3514#ifdef CONFIG_DEBUG_VM 3369#ifdef CONFIG_DEBUG_VM
@@ -3587,7 +3442,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3587 /* 3442 /*
3588 * Rather than hide all of this in some function, I do it in an 3443 * Rather than hide all of this in some function, I do it in an
3589 * open-coded manner so you can see what it really does. 3444 * open-coded manner so you can see what it really does.
3590 * We have to guarantee mem->res.limit < mem->memsw.limit. 3445 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3591 */ 3446 */
3592 mutex_lock(&set_limit_mutex); 3447 mutex_lock(&set_limit_mutex);
3593 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3448 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3649,7 +3504,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3649 /* 3504 /*
3650 * Rather than hide all of this in some function, I do it in an 3505 * Rather than hide all of this in some function, I do it in an
3651 * open-coded manner so you can see what it really does. 3506 * open-coded manner so you can see what it really does.
3652 * We have to guarantee mem->res.limit < mem->memsw.limit. 3507 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3653 */ 3508 */
3654 mutex_lock(&set_limit_mutex); 3509 mutex_lock(&set_limit_mutex);
3655 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3510 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3787,7 +3642,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3787 * This routine traverses the page_cgroups in the given list and drops them all. 3642 * This routine traverses the page_cgroups in the given list and drops them all.
3788 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. 3643 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
3789 */ 3644 */
3790static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3645static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3791 int node, int zid, enum lru_list lru) 3646 int node, int zid, enum lru_list lru)
3792{ 3647{
3793 struct zone *zone; 3648 struct zone *zone;
@@ -3798,7 +3653,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3798 int ret = 0; 3653 int ret = 0;
3799 3654
3800 zone = &NODE_DATA(node)->node_zones[zid]; 3655 zone = &NODE_DATA(node)->node_zones[zid];
3801 mz = mem_cgroup_zoneinfo(mem, node, zid); 3656 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3802 list = &mz->lists[lru]; 3657 list = &mz->lists[lru];
3803 3658
3804 loop = MEM_CGROUP_ZSTAT(mz, lru); 3659 loop = MEM_CGROUP_ZSTAT(mz, lru);
@@ -3825,7 +3680,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3825 3680
3826 page = lookup_cgroup_page(pc); 3681 page = lookup_cgroup_page(pc);
3827 3682
3828 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); 3683 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3829 if (ret == -ENOMEM) 3684 if (ret == -ENOMEM)
3830 break; 3685 break;
3831 3686
@@ -3846,14 +3701,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3846 * make the mem_cgroup's charge 0 if there is no task. 3701 * make the mem_cgroup's charge 0 if there is no task.
3847 * This enables deleting this mem_cgroup. 3702 * This enables deleting this mem_cgroup.
3848 */ 3703 */
3849static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3704static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
3850{ 3705{
3851 int ret; 3706 int ret;
3852 int node, zid, shrink; 3707 int node, zid, shrink;
3853 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3708 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3854 struct cgroup *cgrp = mem->css.cgroup; 3709 struct cgroup *cgrp = memcg->css.cgroup;
3855 3710
3856 css_get(&mem->css); 3711 css_get(&memcg->css);
3857 3712
3858 shrink = 0; 3713 shrink = 0;
3859 /* should free all ? */ 3714 /* should free all ? */
@@ -3869,14 +3724,14 @@ move_account:
3869 goto out; 3724 goto out;
3870 /* This is for making all *used* pages to be on LRU. */ 3725 /* This is for making all *used* pages to be on LRU. */
3871 lru_add_drain_all(); 3726 lru_add_drain_all();
3872 drain_all_stock_sync(mem); 3727 drain_all_stock_sync(memcg);
3873 ret = 0; 3728 ret = 0;
3874 mem_cgroup_start_move(mem); 3729 mem_cgroup_start_move(memcg);
3875 for_each_node_state(node, N_HIGH_MEMORY) { 3730 for_each_node_state(node, N_HIGH_MEMORY) {
3876 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3731 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3877 enum lru_list l; 3732 enum lru_list l;
3878 for_each_lru(l) { 3733 for_each_lru(l) {
3879 ret = mem_cgroup_force_empty_list(mem, 3734 ret = mem_cgroup_force_empty_list(memcg,
3880 node, zid, l); 3735 node, zid, l);
3881 if (ret) 3736 if (ret)
3882 break; 3737 break;
@@ -3885,16 +3740,16 @@ move_account:
3885 if (ret) 3740 if (ret)
3886 break; 3741 break;
3887 } 3742 }
3888 mem_cgroup_end_move(mem); 3743 mem_cgroup_end_move(memcg);
3889 memcg_oom_recover(mem); 3744 memcg_oom_recover(memcg);
3890 /* it seems parent cgroup doesn't have enough mem */ 3745 /* it seems parent cgroup doesn't have enough mem */
3891 if (ret == -ENOMEM) 3746 if (ret == -ENOMEM)
3892 goto try_to_free; 3747 goto try_to_free;
3893 cond_resched(); 3748 cond_resched();
3894 /* "ret" should also be checked to ensure all lists are empty. */ 3749 /* "ret" should also be checked to ensure all lists are empty. */
3895 } while (mem->res.usage > 0 || ret); 3750 } while (memcg->res.usage > 0 || ret);
3896out: 3751out:
3897 css_put(&mem->css); 3752 css_put(&memcg->css);
3898 return ret; 3753 return ret;
3899 3754
3900try_to_free: 3755try_to_free:
@@ -3907,19 +3762,15 @@ try_to_free:
3907 lru_add_drain_all(); 3762 lru_add_drain_all();
3908 /* try to free all pages in this cgroup */ 3763 /* try to free all pages in this cgroup */
3909 shrink = 1; 3764 shrink = 1;
3910 while (nr_retries && mem->res.usage > 0) { 3765 while (nr_retries && memcg->res.usage > 0) {
3911 struct memcg_scanrecord rec;
3912 int progress; 3766 int progress;
3913 3767
3914 if (signal_pending(current)) { 3768 if (signal_pending(current)) {
3915 ret = -EINTR; 3769 ret = -EINTR;
3916 goto out; 3770 goto out;
3917 } 3771 }
3918 rec.context = SCAN_BY_SHRINK; 3772 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3919 rec.mem = mem; 3773 false);
3920 rec.root = mem;
3921 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3922 false, &rec);
3923 if (!progress) { 3774 if (!progress) {
3924 nr_retries--; 3775 nr_retries--;
3925 /* maybe some writeback is necessary */ 3776 /* maybe some writeback is necessary */
@@ -3947,12 +3798,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3947 u64 val) 3798 u64 val)
3948{ 3799{
3949 int retval = 0; 3800 int retval = 0;
3950 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3801 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3951 struct cgroup *parent = cont->parent; 3802 struct cgroup *parent = cont->parent;
3952 struct mem_cgroup *parent_mem = NULL; 3803 struct mem_cgroup *parent_memcg = NULL;
3953 3804
3954 if (parent) 3805 if (parent)
3955 parent_mem = mem_cgroup_from_cont(parent); 3806 parent_memcg = mem_cgroup_from_cont(parent);
3956 3807
3957 cgroup_lock(); 3808 cgroup_lock();
3958 /* 3809 /*
@@ -3963,10 +3814,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3963 * For the root cgroup, parent_mem is NULL; we allow the value to be 3814 * For the root cgroup, parent_mem is NULL; we allow the value to be
3964 * set if there are no children. 3815 * set if there are no children.
3965 */ 3816 */
3966 if ((!parent_mem || !parent_mem->use_hierarchy) && 3817 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3967 (val == 1 || val == 0)) { 3818 (val == 1 || val == 0)) {
3968 if (list_empty(&cont->children)) 3819 if (list_empty(&cont->children))
3969 mem->use_hierarchy = val; 3820 memcg->use_hierarchy = val;
3970 else 3821 else
3971 retval = -EBUSY; 3822 retval = -EBUSY;
3972 } else 3823 } else
@@ -3977,14 +3828,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3977} 3828}
3978 3829
3979 3830
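mem_cgroup_hierarchy_write() above only lets use_hierarchy change at the top of a hierarchy (no parent, or a parent that does not use hierarchy itself) and only while the cgroup has no children. A compact sketch of that validation rule; the struct and field names are invented:

#include <stdbool.h>

struct cg {
	bool use_hierarchy;
	bool has_children;
	struct cg *parent;
};

int set_use_hierarchy(struct cg *cg, bool val)
{
	/* a parent that already uses hierarchy fixes the value for us */
	if (cg->parent && cg->parent->use_hierarchy)
		return -1;
	/* flipping the layout under existing children would be inconsistent */
	if (cg->has_children)
		return -1;
	cg->use_hierarchy = val;
	return 0;
}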
3980static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, 3831static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3981 enum mem_cgroup_stat_index idx) 3832 enum mem_cgroup_stat_index idx)
3982{ 3833{
3983 struct mem_cgroup *iter; 3834 struct mem_cgroup *iter;
3984 long val = 0; 3835 long val = 0;
3985 3836
3986 /* Per-cpu values can be negative, use a signed accumulator */ 3837 /* Per-cpu values can be negative, use a signed accumulator */
3987 for_each_mem_cgroup_tree(iter, mem) 3838 for_each_mem_cgroup_tree(iter, memcg)
3988 val += mem_cgroup_read_stat(iter, idx); 3839 val += mem_cgroup_read_stat(iter, idx);
3989 3840
3990 if (val < 0) /* race ? */ 3841 if (val < 0) /* race ? */
@@ -3992,29 +3843,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3992 return val; 3843 return val;
3993} 3844}
3994 3845
3995static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3846static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3996{ 3847{
3997 u64 val; 3848 u64 val;
3998 3849
3999 if (!mem_cgroup_is_root(mem)) { 3850 if (!mem_cgroup_is_root(memcg)) {
4000 if (!swap) 3851 if (!swap)
4001 return res_counter_read_u64(&mem->res, RES_USAGE); 3852 return res_counter_read_u64(&memcg->res, RES_USAGE);
4002 else 3853 else
4003 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3854 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4004 } 3855 }
4005 3856
4006 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); 3857 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4007 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); 3858 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4008 3859
4009 if (swap) 3860 if (swap)
4010 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3861 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4011 3862
4012 return val << PAGE_SHIFT; 3863 return val << PAGE_SHIFT;
4013} 3864}
4014 3865
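For the root cgroup the usage above cannot be read from a res_counter, so it is rebuilt from the recursive cache/RSS (and optionally swap) statistics and converted from pages to bytes with the PAGE_SHIFT shift. A tiny worked example of that conversion, assuming 4 KiB pages purely for illustration:

#define EXAMPLE_PAGE_SHIFT 12	/* 4 KiB pages, assumed for the example only */

unsigned long long usage_in_bytes(unsigned long long cache_pages,
				  unsigned long long rss_pages,
				  unsigned long long swap_pages,
				  int include_swap)
{
	unsigned long long pages = cache_pages + rss_pages;

	if (include_swap)
		pages += swap_pages;

	return pages << EXAMPLE_PAGE_SHIFT;	/* stats are in pages, report bytes */
}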
4015static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3866static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
4016{ 3867{
4017 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3868 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4018 u64 val; 3869 u64 val;
4019 int type, name; 3870 int type, name;
4020 3871
@@ -4023,15 +3874,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
4023 switch (type) { 3874 switch (type) {
4024 case _MEM: 3875 case _MEM:
4025 if (name == RES_USAGE) 3876 if (name == RES_USAGE)
4026 val = mem_cgroup_usage(mem, false); 3877 val = mem_cgroup_usage(memcg, false);
4027 else 3878 else
4028 val = res_counter_read_u64(&mem->res, name); 3879 val = res_counter_read_u64(&memcg->res, name);
4029 break; 3880 break;
4030 case _MEMSWAP: 3881 case _MEMSWAP:
4031 if (name == RES_USAGE) 3882 if (name == RES_USAGE)
4032 val = mem_cgroup_usage(mem, true); 3883 val = mem_cgroup_usage(memcg, true);
4033 else 3884 else
4034 val = res_counter_read_u64(&mem->memsw, name); 3885 val = res_counter_read_u64(&memcg->memsw, name);
4035 break; 3886 break;
4036 default: 3887 default:
4037 BUG(); 3888 BUG();
@@ -4119,24 +3970,24 @@ out:
4119 3970
4120static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3971static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4121{ 3972{
4122 struct mem_cgroup *mem; 3973 struct mem_cgroup *memcg;
4123 int type, name; 3974 int type, name;
4124 3975
4125 mem = mem_cgroup_from_cont(cont); 3976 memcg = mem_cgroup_from_cont(cont);
4126 type = MEMFILE_TYPE(event); 3977 type = MEMFILE_TYPE(event);
4127 name = MEMFILE_ATTR(event); 3978 name = MEMFILE_ATTR(event);
4128 switch (name) { 3979 switch (name) {
4129 case RES_MAX_USAGE: 3980 case RES_MAX_USAGE:
4130 if (type == _MEM) 3981 if (type == _MEM)
4131 res_counter_reset_max(&mem->res); 3982 res_counter_reset_max(&memcg->res);
4132 else 3983 else
4133 res_counter_reset_max(&mem->memsw); 3984 res_counter_reset_max(&memcg->memsw);
4134 break; 3985 break;
4135 case RES_FAILCNT: 3986 case RES_FAILCNT:
4136 if (type == _MEM) 3987 if (type == _MEM)
4137 res_counter_reset_failcnt(&mem->res); 3988 res_counter_reset_failcnt(&memcg->res);
4138 else 3989 else
4139 res_counter_reset_failcnt(&mem->memsw); 3990 res_counter_reset_failcnt(&memcg->memsw);
4140 break; 3991 break;
4141 } 3992 }
4142 3993
@@ -4153,7 +4004,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4153static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4004static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4154 struct cftype *cft, u64 val) 4005 struct cftype *cft, u64 val)
4155{ 4006{
4156 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4007 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4157 4008
4158 if (val >= (1 << NR_MOVE_TYPE)) 4009 if (val >= (1 << NR_MOVE_TYPE))
4159 return -EINVAL; 4010 return -EINVAL;
@@ -4163,7 +4014,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4163 * inconsistent. 4014 * inconsistent.
4164 */ 4015 */
4165 cgroup_lock(); 4016 cgroup_lock();
4166 mem->move_charge_at_immigrate = val; 4017 memcg->move_charge_at_immigrate = val;
4167 cgroup_unlock(); 4018 cgroup_unlock();
4168 4019
4169 return 0; 4020 return 0;
@@ -4220,49 +4071,49 @@ struct {
4220 4071
4221 4072
4222static void 4073static void
4223mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4074mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4224{ 4075{
4225 s64 val; 4076 s64 val;
4226 4077
4227 /* per cpu stat */ 4078 /* per cpu stat */
4228 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 4079 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4229 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4080 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4230 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 4081 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4231 s->stat[MCS_RSS] += val * PAGE_SIZE; 4082 s->stat[MCS_RSS] += val * PAGE_SIZE;
4232 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 4083 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4233 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4084 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4234 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); 4085 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4235 s->stat[MCS_PGPGIN] += val; 4086 s->stat[MCS_PGPGIN] += val;
4236 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); 4087 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4237 s->stat[MCS_PGPGOUT] += val; 4088 s->stat[MCS_PGPGOUT] += val;
4238 if (do_swap_account) { 4089 if (do_swap_account) {
4239 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4090 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4240 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4091 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4241 } 4092 }
4242 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); 4093 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4243 s->stat[MCS_PGFAULT] += val; 4094 s->stat[MCS_PGFAULT] += val;
4244 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); 4095 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4245 s->stat[MCS_PGMAJFAULT] += val; 4096 s->stat[MCS_PGMAJFAULT] += val;
4246 4097
4247 /* per zone stat */ 4098 /* per zone stat */
4248 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); 4099 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4249 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4100 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4250 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); 4101 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4251 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4102 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4252 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); 4103 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4253 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4104 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4254 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); 4105 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4255 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4106 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4256 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); 4107 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4257 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4108 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4258} 4109}
4259 4110
4260static void 4111static void
4261mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4112mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4262{ 4113{
4263 struct mem_cgroup *iter; 4114 struct mem_cgroup *iter;
4264 4115
4265 for_each_mem_cgroup_tree(iter, mem) 4116 for_each_mem_cgroup_tree(iter, memcg)
4266 mem_cgroup_get_local_stat(iter, s); 4117 mem_cgroup_get_local_stat(iter, s);
4267} 4118}
4268 4119
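Both mem_cgroup_recursive_stat() and mem_cgroup_get_total_stat() walk the whole memcg subtree with for_each_mem_cgroup_tree() and sum the per-group values, clamping a negative total (possible because per-cpu deltas race) to zero. A self-contained sketch of that traversal over an invented tree type:

struct node {
	long stat;			/* per-group value, may go negative */
	struct node *first_child;
	struct node *next_sibling;
};

/* sum one statistic over the subtree rooted at n */
static long subtree_stat(const struct node *n)
{
	long val = n->stat;
	const struct node *c;

	for (c = n->first_child; c; c = c->next_sibling)
		val += subtree_stat(c);

	return val;
}

long recursive_stat(const struct node *root)
{
	long val = subtree_stat(root);

	return val < 0 ? 0 : val;	/* racy per-cpu sums can dip below zero */
}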
@@ -4348,8 +4199,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4348 } 4199 }
4349 4200
4350#ifdef CONFIG_DEBUG_VM 4201#ifdef CONFIG_DEBUG_VM
4351 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
4352
4353 { 4202 {
4354 int nid, zid; 4203 int nid, zid;
4355 struct mem_cgroup_per_zone *mz; 4204 struct mem_cgroup_per_zone *mz;
@@ -4486,20 +4335,20 @@ static int compare_thresholds(const void *a, const void *b)
4486 return _a->threshold - _b->threshold; 4335 return _a->threshold - _b->threshold;
4487} 4336}
4488 4337
4489static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4338static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4490{ 4339{
4491 struct mem_cgroup_eventfd_list *ev; 4340 struct mem_cgroup_eventfd_list *ev;
4492 4341
4493 list_for_each_entry(ev, &mem->oom_notify, list) 4342 list_for_each_entry(ev, &memcg->oom_notify, list)
4494 eventfd_signal(ev->eventfd, 1); 4343 eventfd_signal(ev->eventfd, 1);
4495 return 0; 4344 return 0;
4496} 4345}
4497 4346
4498static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4347static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4499{ 4348{
4500 struct mem_cgroup *iter; 4349 struct mem_cgroup *iter;
4501 4350
4502 for_each_mem_cgroup_tree(iter, mem) 4351 for_each_mem_cgroup_tree(iter, memcg)
4503 mem_cgroup_oom_notify_cb(iter); 4352 mem_cgroup_oom_notify_cb(iter);
4504} 4353}
4505 4354
@@ -4689,7 +4538,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4689static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4538static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4690 struct cftype *cft, struct eventfd_ctx *eventfd) 4539 struct cftype *cft, struct eventfd_ctx *eventfd)
4691{ 4540{
4692 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4541 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4693 struct mem_cgroup_eventfd_list *ev, *tmp; 4542 struct mem_cgroup_eventfd_list *ev, *tmp;
4694 int type = MEMFILE_TYPE(cft->private); 4543 int type = MEMFILE_TYPE(cft->private);
4695 4544
@@ -4697,7 +4546,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4697 4546
4698 spin_lock(&memcg_oom_lock); 4547 spin_lock(&memcg_oom_lock);
4699 4548
4700 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4549 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4701 if (ev->eventfd == eventfd) { 4550 if (ev->eventfd == eventfd) {
4702 list_del(&ev->list); 4551 list_del(&ev->list);
4703 kfree(ev); 4552 kfree(ev);
@@ -4710,11 +4559,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4710static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4559static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4711 struct cftype *cft, struct cgroup_map_cb *cb) 4560 struct cftype *cft, struct cgroup_map_cb *cb)
4712{ 4561{
4713 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4562 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4714 4563
4715 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4564 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4716 4565
4717 if (atomic_read(&mem->under_oom)) 4566 if (atomic_read(&memcg->under_oom))
4718 cb->fill(cb, "under_oom", 1); 4567 cb->fill(cb, "under_oom", 1);
4719 else 4568 else
4720 cb->fill(cb, "under_oom", 0); 4569 cb->fill(cb, "under_oom", 0);
@@ -4724,7 +4573,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4724static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4573static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4725 struct cftype *cft, u64 val) 4574 struct cftype *cft, u64 val)
4726{ 4575{
4727 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4576 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4728 struct mem_cgroup *parent; 4577 struct mem_cgroup *parent;
4729 4578
4730 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4579 /* cannot set to root cgroup and only 0 and 1 are allowed */
@@ -4736,13 +4585,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4736 cgroup_lock(); 4585 cgroup_lock();
4737 /* oom-kill-disable is a flag for subhierarchy. */ 4586 /* oom-kill-disable is a flag for subhierarchy. */
4738 if ((parent->use_hierarchy) || 4587 if ((parent->use_hierarchy) ||
4739 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4588 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4740 cgroup_unlock(); 4589 cgroup_unlock();
4741 return -EINVAL; 4590 return -EINVAL;
4742 } 4591 }
4743 mem->oom_kill_disable = val; 4592 memcg->oom_kill_disable = val;
4744 if (!val) 4593 if (!val)
4745 memcg_oom_recover(mem); 4594 memcg_oom_recover(memcg);
4746 cgroup_unlock(); 4595 cgroup_unlock();
4747 return 0; 4596 return 0;
4748} 4597}
@@ -4763,54 +4612,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4763} 4612}
4764#endif /* CONFIG_NUMA */ 4613#endif /* CONFIG_NUMA */
4765 4614
4766static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
4767 struct cftype *cft,
4768 struct cgroup_map_cb *cb)
4769{
4770 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4771 char string[64];
4772 int i;
4773
4774 for (i = 0; i < NR_SCANSTATS; i++) {
4775 strcpy(string, scanstat_string[i]);
4776 strcat(string, SCANSTAT_WORD_LIMIT);
4777 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
4778 }
4779
4780 for (i = 0; i < NR_SCANSTATS; i++) {
4781 strcpy(string, scanstat_string[i]);
4782 strcat(string, SCANSTAT_WORD_SYSTEM);
4783 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
4784 }
4785
4786 for (i = 0; i < NR_SCANSTATS; i++) {
4787 strcpy(string, scanstat_string[i]);
4788 strcat(string, SCANSTAT_WORD_LIMIT);
4789 strcat(string, SCANSTAT_WORD_HIERARCHY);
4790 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
4791 }
4792 for (i = 0; i < NR_SCANSTATS; i++) {
4793 strcpy(string, scanstat_string[i]);
4794 strcat(string, SCANSTAT_WORD_SYSTEM);
4795 strcat(string, SCANSTAT_WORD_HIERARCHY);
4796 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
4797 }
4798 return 0;
4799}
4800
4801static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
4802 unsigned int event)
4803{
4804 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4805
4806 spin_lock(&mem->scanstat.lock);
4807 memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
4808 memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
4809 spin_unlock(&mem->scanstat.lock);
4810 return 0;
4811}
4812
4813
4814static struct cftype mem_cgroup_files[] = { 4615static struct cftype mem_cgroup_files[] = {
4815 { 4616 {
4816 .name = "usage_in_bytes", 4617 .name = "usage_in_bytes",
@@ -4881,11 +4682,6 @@ static struct cftype mem_cgroup_files[] = {
4881 .mode = S_IRUGO, 4682 .mode = S_IRUGO,
4882 }, 4683 },
4883#endif 4684#endif
4884 {
4885 .name = "vmscan_stat",
4886 .read_map = mem_cgroup_vmscan_stat_read,
4887 .trigger = mem_cgroup_reset_vmscan_stat,
4888 },
4889}; 4685};
4890 4686
4891#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4687#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4931,7 +4727,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4931} 4727}
4932#endif 4728#endif
4933 4729
4934static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4730static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4935{ 4731{
4936 struct mem_cgroup_per_node *pn; 4732 struct mem_cgroup_per_node *pn;
4937 struct mem_cgroup_per_zone *mz; 4733 struct mem_cgroup_per_zone *mz;
@@ -4951,21 +4747,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4951 if (!pn) 4747 if (!pn)
4952 return 1; 4748 return 1;
4953 4749
4954 mem->info.nodeinfo[node] = pn;
4955 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4750 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4956 mz = &pn->zoneinfo[zone]; 4751 mz = &pn->zoneinfo[zone];
4957 for_each_lru(l) 4752 for_each_lru(l)
4958 INIT_LIST_HEAD(&mz->lists[l]); 4753 INIT_LIST_HEAD(&mz->lists[l]);
4959 mz->usage_in_excess = 0; 4754 mz->usage_in_excess = 0;
4960 mz->on_tree = false; 4755 mz->on_tree = false;
4961 mz->mem = mem; 4756 mz->mem = memcg;
4962 } 4757 }
4758 memcg->info.nodeinfo[node] = pn;
4963 return 0; 4759 return 0;
4964} 4760}
4965 4761
4966static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4762static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4967{ 4763{
4968 kfree(mem->info.nodeinfo[node]); 4764 kfree(memcg->info.nodeinfo[node]);
4969} 4765}
4970 4766
4971static struct mem_cgroup *mem_cgroup_alloc(void) 4767static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -5007,51 +4803,51 @@ out_free:
5007 * Removal of cgroup itself succeeds regardless of refs from swap. 4803 * Removal of cgroup itself succeeds regardless of refs from swap.
5008 */ 4804 */
5009 4805
5010static void __mem_cgroup_free(struct mem_cgroup *mem) 4806static void __mem_cgroup_free(struct mem_cgroup *memcg)
5011{ 4807{
5012 int node; 4808 int node;
5013 4809
5014 mem_cgroup_remove_from_trees(mem); 4810 mem_cgroup_remove_from_trees(memcg);
5015 free_css_id(&mem_cgroup_subsys, &mem->css); 4811 free_css_id(&mem_cgroup_subsys, &memcg->css);
5016 4812
5017 for_each_node_state(node, N_POSSIBLE) 4813 for_each_node_state(node, N_POSSIBLE)
5018 free_mem_cgroup_per_zone_info(mem, node); 4814 free_mem_cgroup_per_zone_info(memcg, node);
5019 4815
5020 free_percpu(mem->stat); 4816 free_percpu(memcg->stat);
5021 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4817 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
5022 kfree(mem); 4818 kfree(memcg);
5023 else 4819 else
5024 vfree(mem); 4820 vfree(memcg);
5025} 4821}
5026 4822
5027static void mem_cgroup_get(struct mem_cgroup *mem) 4823static void mem_cgroup_get(struct mem_cgroup *memcg)
5028{ 4824{
5029 atomic_inc(&mem->refcnt); 4825 atomic_inc(&memcg->refcnt);
5030} 4826}
5031 4827
5032static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4828static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
5033{ 4829{
5034 if (atomic_sub_and_test(count, &mem->refcnt)) { 4830 if (atomic_sub_and_test(count, &memcg->refcnt)) {
5035 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4831 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5036 __mem_cgroup_free(mem); 4832 __mem_cgroup_free(memcg);
5037 if (parent) 4833 if (parent)
5038 mem_cgroup_put(parent); 4834 mem_cgroup_put(parent);
5039 } 4835 }
5040} 4836}
5041 4837
5042static void mem_cgroup_put(struct mem_cgroup *mem) 4838static void mem_cgroup_put(struct mem_cgroup *memcg)
5043{ 4839{
5044 __mem_cgroup_put(mem, 1); 4840 __mem_cgroup_put(memcg, 1);
5045} 4841}
5046 4842
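mem_cgroup_put() above frees the group when the last reference is dropped and then releases the reference the group held on its parent, so an idle chain of ancestors can unwind. A sketch of the same pattern with C11 atomics in place of the kernel's atomic_t helpers, written iteratively instead of via the recursive mem_cgroup_put(parent) call; everything here is an illustrative stand-in:

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcnt;
	struct obj *parent;	/* each object holds one reference on its parent */
};

void obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcnt, 1);
}

void obj_put(struct obj *o)
{
	while (o) {
		/* only the caller that drops the last reference frees */
		if (atomic_fetch_sub(&o->refcnt, 1) != 1)
			break;

		struct obj *parent = o->parent;

		free(o);	/* freeing also drops our reference on the parent */
		o = parent;
	}
}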
5047/* 4843/*
5048 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4844 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
5049 */ 4845 */
5050static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4846static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
5051{ 4847{
5052 if (!mem->res.parent) 4848 if (!memcg->res.parent)
5053 return NULL; 4849 return NULL;
5054 return mem_cgroup_from_res_counter(mem->res.parent, res); 4850 return mem_cgroup_from_res_counter(memcg->res.parent, res);
5055} 4851}
5056 4852
5057#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4853#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -5094,16 +4890,16 @@ static int mem_cgroup_soft_limit_tree_init(void)
5094static struct cgroup_subsys_state * __ref 4890static struct cgroup_subsys_state * __ref
5095mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4891mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5096{ 4892{
5097 struct mem_cgroup *mem, *parent; 4893 struct mem_cgroup *memcg, *parent;
5098 long error = -ENOMEM; 4894 long error = -ENOMEM;
5099 int node; 4895 int node;
5100 4896
5101 mem = mem_cgroup_alloc(); 4897 memcg = mem_cgroup_alloc();
5102 if (!mem) 4898 if (!memcg)
5103 return ERR_PTR(error); 4899 return ERR_PTR(error);
5104 4900
5105 for_each_node_state(node, N_POSSIBLE) 4901 for_each_node_state(node, N_POSSIBLE)
5106 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4902 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5107 goto free_out; 4903 goto free_out;
5108 4904
5109 /* root ? */ 4905 /* root ? */
@@ -5111,7 +4907,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5111 int cpu; 4907 int cpu;
5112 enable_swap_cgroup(); 4908 enable_swap_cgroup();
5113 parent = NULL; 4909 parent = NULL;
5114 root_mem_cgroup = mem; 4910 root_mem_cgroup = memcg;
5115 if (mem_cgroup_soft_limit_tree_init()) 4911 if (mem_cgroup_soft_limit_tree_init())
5116 goto free_out; 4912 goto free_out;
5117 for_each_possible_cpu(cpu) { 4913 for_each_possible_cpu(cpu) {
@@ -5122,13 +4918,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5122 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4918 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5123 } else { 4919 } else {
5124 parent = mem_cgroup_from_cont(cont->parent); 4920 parent = mem_cgroup_from_cont(cont->parent);
5125 mem->use_hierarchy = parent->use_hierarchy; 4921 memcg->use_hierarchy = parent->use_hierarchy;
5126 mem->oom_kill_disable = parent->oom_kill_disable; 4922 memcg->oom_kill_disable = parent->oom_kill_disable;
5127 } 4923 }
5128 4924
5129 if (parent && parent->use_hierarchy) { 4925 if (parent && parent->use_hierarchy) {
5130 res_counter_init(&mem->res, &parent->res); 4926 res_counter_init(&memcg->res, &parent->res);
5131 res_counter_init(&mem->memsw, &parent->memsw); 4927 res_counter_init(&memcg->memsw, &parent->memsw);
5132 /* 4928 /*
5133 * We increment refcnt of the parent to ensure that we can 4929 * We increment refcnt of the parent to ensure that we can
5134 * safely access it on res_counter_charge/uncharge. 4930 * safely access it on res_counter_charge/uncharge.
@@ -5137,22 +4933,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5137 */ 4933 */
5138 mem_cgroup_get(parent); 4934 mem_cgroup_get(parent);
5139 } else { 4935 } else {
5140 res_counter_init(&mem->res, NULL); 4936 res_counter_init(&memcg->res, NULL);
5141 res_counter_init(&mem->memsw, NULL); 4937 res_counter_init(&memcg->memsw, NULL);
5142 } 4938 }
5143 mem->last_scanned_child = 0; 4939 memcg->last_scanned_child = 0;
5144 mem->last_scanned_node = MAX_NUMNODES; 4940 memcg->last_scanned_node = MAX_NUMNODES;
5145 INIT_LIST_HEAD(&mem->oom_notify); 4941 INIT_LIST_HEAD(&memcg->oom_notify);
5146 4942
5147 if (parent) 4943 if (parent)
5148 mem->swappiness = mem_cgroup_swappiness(parent); 4944 memcg->swappiness = mem_cgroup_swappiness(parent);
5149 atomic_set(&mem->refcnt, 1); 4945 atomic_set(&memcg->refcnt, 1);
5150 mem->move_charge_at_immigrate = 0; 4946 memcg->move_charge_at_immigrate = 0;
5151 mutex_init(&mem->thresholds_lock); 4947 mutex_init(&memcg->thresholds_lock);
5152 spin_lock_init(&mem->scanstat.lock); 4948 return &memcg->css;
5153 return &mem->css;
5154free_out: 4949free_out:
5155 __mem_cgroup_free(mem); 4950 __mem_cgroup_free(memcg);
5156 root_mem_cgroup = NULL; 4951 root_mem_cgroup = NULL;
5157 return ERR_PTR(error); 4952 return ERR_PTR(error);
5158} 4953}
@@ -5160,17 +4955,17 @@ free_out:
5160static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4955static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
5161 struct cgroup *cont) 4956 struct cgroup *cont)
5162{ 4957{
5163 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4958 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5164 4959
5165 return mem_cgroup_force_empty(mem, false); 4960 return mem_cgroup_force_empty(memcg, false);
5166} 4961}
5167 4962
5168static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4963static void mem_cgroup_destroy(struct cgroup_subsys *ss,
5169 struct cgroup *cont) 4964 struct cgroup *cont)
5170{ 4965{
5171 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4966 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5172 4967
5173 mem_cgroup_put(mem); 4968 mem_cgroup_put(memcg);
5174} 4969}
5175 4970
5176static int mem_cgroup_populate(struct cgroup_subsys *ss, 4971static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -5193,9 +4988,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
5193{ 4988{
5194 int ret = 0; 4989 int ret = 0;
5195 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4990 int batch_count = PRECHARGE_COUNT_AT_ONCE;
5196 struct mem_cgroup *mem = mc.to; 4991 struct mem_cgroup *memcg = mc.to;
5197 4992
5198 if (mem_cgroup_is_root(mem)) { 4993 if (mem_cgroup_is_root(memcg)) {
5199 mc.precharge += count; 4994 mc.precharge += count;
5200 /* we don't need css_get for root */ 4995 /* we don't need css_get for root */
5201 return ret; 4996 return ret;
@@ -5204,16 +4999,16 @@ static int mem_cgroup_do_precharge(unsigned long count)
5204 if (count > 1) { 4999 if (count > 1) {
5205 struct res_counter *dummy; 5000 struct res_counter *dummy;
5206 /* 5001 /*
5207 * "mem" cannot be under rmdir() because we've already checked 5002 * "memcg" cannot be under rmdir() because we've already checked
5208 * by cgroup_lock_live_cgroup() that it is not removed and we 5003 * by cgroup_lock_live_cgroup() that it is not removed and we
5209 * are still under the same cgroup_mutex. So we can postpone 5004 * are still under the same cgroup_mutex. So we can postpone
5210 * css_get(). 5005 * css_get().
5211 */ 5006 */
5212 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 5007 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
5213 goto one_by_one; 5008 goto one_by_one;
5214 if (do_swap_account && res_counter_charge(&mem->memsw, 5009 if (do_swap_account && res_counter_charge(&memcg->memsw,
5215 PAGE_SIZE * count, &dummy)) { 5010 PAGE_SIZE * count, &dummy)) {
5216 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 5011 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
5217 goto one_by_one; 5012 goto one_by_one;
5218 } 5013 }
5219 mc.precharge += count; 5014 mc.precharge += count;
@@ -5230,8 +5025,9 @@ one_by_one:
5230 batch_count = PRECHARGE_COUNT_AT_ONCE; 5025 batch_count = PRECHARGE_COUNT_AT_ONCE;
5231 cond_resched(); 5026 cond_resched();
5232 } 5027 }
5233 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 5028 ret = __mem_cgroup_try_charge(NULL,
5234 if (ret || !mem) 5029 GFP_KERNEL, 1, &memcg, false);
5030 if (ret || !memcg)
5235 /* mem_cgroup_clear_mc() will do uncharge later */ 5031 /* mem_cgroup_clear_mc() will do uncharge later */
5236 return -ENOMEM; 5032 return -ENOMEM;
5237 mc.precharge++; 5033 mc.precharge++;
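mem_cgroup_do_precharge() above first tries to charge PAGE_SIZE * count against the res_counter in one batched call and only falls back to charging page by page (with periodic cond_resched()) when the batch is refused. A rough userspace sketch of that try-batch-then-fall-back shape, with an invented reserve()/budget pair standing in for the res_counter calls:

#include <stdbool.h>
#include <stdio.h>

static long budget = 100;		/* stands in for the res_counter limit */

/* reserve n units, all or nothing */
static bool reserve(long n)
{
	if (budget < n)
		return false;
	budget -= n;
	return true;
}

static long precharge(long count)
{
	long done = 0;

	if (reserve(count))		/* fast path: one batched charge */
		return count;

	while (done < count) {		/* slow path: one unit at a time */
		if (!reserve(1))
			break;		/* caller later unwinds what was charged */
		done++;
		/* the kernel also cond_resched()es every few iterations here */
	}
	return done;
}

int main(void)
{
	printf("charged %ld of 60\n", precharge(60));	/* batch fits */
	printf("charged %ld of 60\n", precharge(60));	/* falls back, gets 40 */
	return 0;
}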
@@ -5330,15 +5126,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5330 pgoff = pte_to_pgoff(ptent); 5126 pgoff = pte_to_pgoff(ptent);
5331 5127
5332 /* page is moved even if it's not RSS of this task(page-faulted). */ 5128 /* page is moved even if it's not RSS of this task(page-faulted). */
5333 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 5129 page = find_get_page(mapping, pgoff);
5334 page = find_get_page(mapping, pgoff); 5130
5335 } else { /* shmem/tmpfs file. we should take account of swap too. */ 5131#ifdef CONFIG_SWAP
5336 swp_entry_t ent; 5132 /* shmem/tmpfs may report page out on swap: account for that too. */
5337 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 5133 if (radix_tree_exceptional_entry(page)) {
5134 swp_entry_t swap = radix_to_swp_entry(page);
5338 if (do_swap_account) 5135 if (do_swap_account)
5339 entry->val = ent.val; 5136 *entry = swap;
5137 page = find_get_page(&swapper_space, swap.val);
5340 } 5138 }
5341 5139#endif
5342 return page; 5140 return page;
5343} 5141}
5344 5142
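find_get_page() on shmem can now hand back an "exceptional" radix-tree entry instead of a page pointer when the object has gone out to swap; radix_tree_exceptional_entry() detects that case and radix_to_swp_entry() recovers the swap entry, which mc_handle_file_pte() then looks up in swapper_space. The underlying trick is a tagged value: a bit that can never be set in a properly aligned pointer marks the slot as an encoded swap entry. A self-contained illustration of that encoding (the tag bit and shift below are illustrative, not the kernel's exact layout):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT		1UL	/* real pointers are at least 4-byte aligned */
#define EXCEPTIONAL_SHIFT	2

/* encode a swap slot number as a value that cannot be a valid pointer */
static void *swp_to_entry(unsigned long swap_slot)
{
	return (void *)((swap_slot << EXCEPTIONAL_SHIFT) | EXCEPTIONAL_BIT);
}

static int entry_is_exceptional(const void *entry)
{
	return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long entry_to_swp(const void *entry)
{
	return (uintptr_t)entry >> EXCEPTIONAL_SHIFT;
}

int main(void)
{
	int page = 42;			/* stands in for a struct page */
	void *slots[2] = { &page, swp_to_entry(12345) };

	for (int i = 0; i < 2; i++) {
		if (entry_is_exceptional(slots[i]))
			printf("slot %d: swapped out, swap slot %lu\n",
			       i, entry_to_swp(slots[i]));
		else
			printf("slot %d: in core, page value %d\n",
			       i, *(int *)slots[i]);
	}
	assert(!entry_is_exceptional(&page));
	return 0;
}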
@@ -5503,13 +5301,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5503 struct task_struct *p) 5301 struct task_struct *p)
5504{ 5302{
5505 int ret = 0; 5303 int ret = 0;
5506 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5304 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
5507 5305
5508 if (mem->move_charge_at_immigrate) { 5306 if (memcg->move_charge_at_immigrate) {
5509 struct mm_struct *mm; 5307 struct mm_struct *mm;
5510 struct mem_cgroup *from = mem_cgroup_from_task(p); 5308 struct mem_cgroup *from = mem_cgroup_from_task(p);
5511 5309
5512 VM_BUG_ON(from == mem); 5310 VM_BUG_ON(from == memcg);
5513 5311
5514 mm = get_task_mm(p); 5312 mm = get_task_mm(p);
5515 if (!mm) 5313 if (!mm)
@@ -5524,7 +5322,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5524 mem_cgroup_start_move(from); 5322 mem_cgroup_start_move(from);
5525 spin_lock(&mc.lock); 5323 spin_lock(&mc.lock);
5526 mc.from = from; 5324 mc.from = from;
5527 mc.to = mem; 5325 mc.to = memcg;
5528 spin_unlock(&mc.lock); 5326 spin_unlock(&mc.lock);
5529 /* We set mc.moving_task later */ 5327 /* We set mc.moving_task later */
5530 5328
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059c..06d3479513aa 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -42,6 +42,7 @@
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/ksm.h> 43#include <linux/ksm.h>
44#include <linux/rmap.h> 44#include <linux/rmap.h>
45#include <linux/export.h>
45#include <linux/pagemap.h> 46#include <linux/pagemap.h>
46#include <linux/swap.h> 47#include <linux/swap.h>
47#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
@@ -53,6 +54,7 @@
53#include <linux/hugetlb.h> 54#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h> 55#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h> 56#include <linux/mm_inline.h>
57#include <linux/kfifo.h>
56#include "internal.h" 58#include "internal.h"
57 59
58int sysctl_memory_failure_early_kill __read_mostly = 0; 60int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1178,6 +1180,97 @@ void memory_failure(unsigned long pfn, int trapno)
1178 __memory_failure(pfn, trapno, 0); 1180 __memory_failure(pfn, trapno, 0);
1179} 1181}
1180 1182
1183#define MEMORY_FAILURE_FIFO_ORDER 4
1184#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1185
1186struct memory_failure_entry {
1187 unsigned long pfn;
1188 int trapno;
1189 int flags;
1190};
1191
1192struct memory_failure_cpu {
1193 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1194 MEMORY_FAILURE_FIFO_SIZE);
1195 spinlock_t lock;
1196 struct work_struct work;
1197};
1198
1199static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1200
1201/**
1202 * memory_failure_queue - Schedule handling memory failure of a page.
1203 * @pfn: Page Number of the corrupted page
1204 * @trapno: Trap number reported in the signal to user space.
1205 * @flags: Flags for memory failure handling
1206 *
1207 * This function is called by the low level hardware error handler
1208 * when it detects hardware memory corruption of a page. It schedules
1209 * the recovering of error page, including dropping pages, killing
1210 * processes etc.
1211 *
1212 * The function is primarily of use for corruptions that
1213 * happen outside the current execution context (e.g. when
1214 * detected by a background scrubber)
1215 *
1216 * Can run in IRQ context.
1217 */
1218void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1219{
1220 struct memory_failure_cpu *mf_cpu;
1221 unsigned long proc_flags;
1222 struct memory_failure_entry entry = {
1223 .pfn = pfn,
1224 .trapno = trapno,
1225 .flags = flags,
1226 };
1227
1228 mf_cpu = &get_cpu_var(memory_failure_cpu);
1229 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1230 if (kfifo_put(&mf_cpu->fifo, &entry))
1231 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1232 else
 1233		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",

1234 pfn);
1235 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1236 put_cpu_var(memory_failure_cpu);
1237}
1238EXPORT_SYMBOL_GPL(memory_failure_queue);
1239
1240static void memory_failure_work_func(struct work_struct *work)
1241{
1242 struct memory_failure_cpu *mf_cpu;
1243 struct memory_failure_entry entry = { 0, };
1244 unsigned long proc_flags;
1245 int gotten;
1246
1247 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1248 for (;;) {
1249 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1250 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1251 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1252 if (!gotten)
1253 break;
1254 __memory_failure(entry.pfn, entry.trapno, entry.flags);
1255 }
1256}
1257
1258static int __init memory_failure_init(void)
1259{
1260 struct memory_failure_cpu *mf_cpu;
1261 int cpu;
1262
1263 for_each_possible_cpu(cpu) {
1264 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1265 spin_lock_init(&mf_cpu->lock);
1266 INIT_KFIFO(mf_cpu->fifo);
1267 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1268 }
1269
1270 return 0;
1271}
1272core_initcall(memory_failure_init);
1273
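memory_failure_queue() above may be called from IRQ context, so it only records {pfn, trapno, flags} in a small per-CPU kfifo under a spinlock and schedules a work item; memory_failure_work_func() later drains the fifo and runs __memory_failure() for each entry outside the critical context. A minimal single-threaded userspace analogue of that produce-now/handle-later shape, using a plain power-of-two ring instead of a kfifo and an explicit drain call instead of a workqueue (all names below are invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

#define FIFO_SIZE 16			/* must be a power of two */

struct failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

static struct failure_entry fifo[FIFO_SIZE];
static unsigned int head, tail;		/* head == tail means empty */

/* fast path: just record the event; fails when the ring is full */
static bool failure_queue(unsigned long pfn, int trapno, int flags)
{
	if (head - tail == FIFO_SIZE)
		return false;
	fifo[head++ & (FIFO_SIZE - 1)] =
		(struct failure_entry){ pfn, trapno, flags };
	return true;
}

/* slow path: what the work item would do, at leisure */
static void handle_failure(const struct failure_entry *e)
{
	printf("handling pfn %#lx (trap %d, flags %#x)\n",
	       e->pfn, e->trapno, e->flags);
}

static void failure_drain(void)
{
	while (tail != head)
		handle_failure(&fifo[tail++ & (FIFO_SIZE - 1)]);
}

int main(void)
{
	failure_queue(0x1234, 28, 0);
	failure_queue(0x5678, 28, 0);
	failure_drain();		/* the kernel does this from the work item */
	return 0;
}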
1181/** 1274/**
1182 * unpoison_memory - Unpoison a previously poisoned page 1275 * unpoison_memory - Unpoison a previously poisoned page
1183 * @pfn: Page number of the to be unpoisoned page 1276 * @pfn: Page number of the to be unpoisoned page
@@ -1218,7 +1311,7 @@ int unpoison_memory(unsigned long pfn)
1218 * to the end. 1311 * to the end.
1219 */ 1312 */
1220 if (PageHuge(page)) { 1313 if (PageHuge(page)) {
1221 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); 1314 pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1222 return 0; 1315 return 0;
1223 } 1316 }
1224 if (TestClearPageHWPoison(p)) 1317 if (TestClearPageHWPoison(p))
@@ -1327,7 +1420,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1327 1420
1328 if (PageHWPoison(hpage)) { 1421 if (PageHWPoison(hpage)) {
1329 put_page(hpage); 1422 put_page(hpage);
1330 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); 1423 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1331 return -EBUSY; 1424 return -EBUSY;
1332 } 1425 }
1333 1426
@@ -1341,8 +1434,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
1341 list_for_each_entry_safe(page1, page2, &pagelist, lru) 1434 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1342 put_page(page1); 1435 put_page(page1);
1343 1436
1344 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1345 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1346 if (ret > 0) 1439 if (ret > 0)
1347 ret = -EIO; 1440 ret = -EIO;
1348 return ret; 1441 return ret;
@@ -1413,7 +1506,7 @@ int soft_offline_page(struct page *page, int flags)
1413 } 1506 }
1414 if (!PageLRU(page)) { 1507 if (!PageLRU(page)) {
1415 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", 1508 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1416 pfn, page->flags); 1509 pfn, page->flags);
1417 return -EIO; 1510 return -EIO;
1418 } 1511 }
1419 1512
@@ -1474,7 +1567,7 @@ int soft_offline_page(struct page *page, int flags)
1474 } 1567 }
1475 } else { 1568 } else {
1476 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1569 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1477 pfn, ret, page_count(page), page->flags); 1570 pfn, ret, page_count(page), page->flags);
1478 } 1571 }
1479 if (ret) 1572 if (ret)
1480 return ret; 1573 return ret;
diff --git a/mm/memory.c b/mm/memory.c
index a56e3ba816b2..829d43735402 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,7 +47,7 @@
47#include <linux/pagemap.h> 47#include <linux/pagemap.h>
48#include <linux/ksm.h> 48#include <linux/ksm.h>
49#include <linux/rmap.h> 49#include <linux/rmap.h>
50#include <linux/module.h> 50#include <linux/export.h>
51#include <linux/delayacct.h> 51#include <linux/delayacct.h>
52#include <linux/init.h> 52#include <linux/init.h>
53#include <linux/writeback.h> 53#include <linux/writeback.h>
@@ -1503,7 +1503,7 @@ split_fallthrough:
1503 } 1503 }
1504 1504
1505 if (flags & FOLL_GET) 1505 if (flags & FOLL_GET)
1506 get_page(page); 1506 get_page_foll(page);
1507 if (flags & FOLL_TOUCH) { 1507 if (flags & FOLL_TOUCH) {
1508 if ((flags & FOLL_WRITE) && 1508 if ((flags & FOLL_WRITE) &&
1509 !pte_dirty(pte) && !PageDirty(page)) 1509 !pte_dirty(pte) && !PageDirty(page))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6e7d8b21dbfa..2168489c0bc9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -11,7 +11,7 @@
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/bootmem.h> 12#include <linux/bootmem.h>
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/writeback.h> 16#include <linux/writeback.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8b57173c1dd5..adc395481813 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -75,7 +75,7 @@
75#include <linux/cpuset.h> 75#include <linux/cpuset.h>
76#include <linux/slab.h> 76#include <linux/slab.h>
77#include <linux/string.h> 77#include <linux/string.h>
78#include <linux/module.h> 78#include <linux/export.h>
79#include <linux/nsproxy.h> 79#include <linux/nsproxy.h>
80#include <linux/interrupt.h> 80#include <linux/interrupt.h>
81#include <linux/init.h> 81#include <linux/init.h>
@@ -111,7 +111,7 @@ enum zone_type policy_zone = 0;
111/* 111/*
112 * run-time system-wide default policy => local allocation 112 * run-time system-wide default policy => local allocation
113 */ 113 */
114struct mempolicy default_policy = { 114static struct mempolicy default_policy = {
115 .refcnt = ATOMIC_INIT(1), /* never free it */ 115 .refcnt = ATOMIC_INIT(1), /* never free it */
116 .mode = MPOL_PREFERRED, 116 .mode = MPOL_PREFERRED,
117 .flags = MPOL_F_LOCAL, 117 .flags = MPOL_F_LOCAL,
@@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
636 struct vm_area_struct *prev; 636 struct vm_area_struct *prev;
637 struct vm_area_struct *vma; 637 struct vm_area_struct *vma;
638 int err = 0; 638 int err = 0;
639 pgoff_t pgoff;
640 unsigned long vmstart; 639 unsigned long vmstart;
641 unsigned long vmend; 640 unsigned long vmend;
642 641
@@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
649 vmstart = max(start, vma->vm_start); 648 vmstart = max(start, vma->vm_start);
650 vmend = min(end, vma->vm_end); 649 vmend = min(end, vma->vm_end);
651 650
652 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
653 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 651 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
654 vma->anon_vma, vma->vm_file, pgoff, new_pol); 652 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
653 new_pol);
655 if (prev) { 654 if (prev) {
656 vma = prev; 655 vma = prev;
657 next = vma->vm_next; 656 next = vma->vm_next;
@@ -1412,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1412 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 1411 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1413 1412
1414 if (!err && nmask) { 1413 if (!err && nmask) {
1415 err = copy_from_user(bm, nm, alloc_size); 1414 unsigned long copy_size;
1415 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1416 err = copy_from_user(bm, nm, copy_size);
1416 /* ensure entire bitmap is zeroed */ 1417 /* ensure entire bitmap is zeroed */
1417 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 1418 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1418 err |= compat_put_bitmap(nmask, bm, nr_bits); 1419 err |= compat_put_bitmap(nmask, bm, nr_bits);
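The change above clamps the copy length to the size of the on-stack bm[] bitmap via min_t(unsigned long, sizeof(bm), alloc_size), so an oversized alloc_size derived from user-supplied maxnode can no longer overrun the buffer. The same clamp-before-copy idiom in a standalone sketch, with an invented copy_bitmap() helper standing in for copy_from_user():

#include <stdio.h>
#include <string.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* dst must never receive more than dst_size bytes, whatever was requested */
static void copy_bitmap(unsigned long *dst, size_t dst_size,
			const unsigned long *src, size_t requested)
{
	size_t copy_size = MIN(dst_size, requested);	/* the clamp the patch adds */

	memcpy(dst, src, copy_size);
}

int main(void)
{
	unsigned long src[32] = { 0xff };
	unsigned long bm[4];		/* small fixed-size destination */

	/* the requested size exceeds bm[], but only sizeof(bm) bytes move */
	copy_bitmap(bm, sizeof(bm), src, sizeof(src));
	printf("bm[0] = %#lx\n", bm[0]);
	return 0;
}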
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a3bc3d4d554..e73641b79bb5 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -10,7 +10,7 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/mempool.h> 14#include <linux/mempool.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/writeback.h> 16#include <linux/writeback.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 666e4e677414..578e29174fa6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -13,7 +13,7 @@
13 */ 13 */
14 14
15#include <linux/migrate.h> 15#include <linux/migrate.h>
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/swapops.h> 18#include <linux/swapops.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 120
121 ptep = pte_offset_map(pmd, addr); 121 ptep = pte_offset_map(pmd, addr);
122 122
123 if (!is_swap_pte(*ptep)) { 123 /*
124 pte_unmap(ptep); 124 * Peek to check is_swap_pte() before taking ptlock? No, we
125 goto out; 125 * can race mremap's move_ptes(), which skips anon_vma lock.
126 } 126 */
127 127
128 ptl = pte_lockptr(mm, pmd); 128 ptl = pte_lockptr(mm, pmd);
129 } 129 }
@@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
621 return rc; 621 return rc;
622} 622}
623 623
624/* 624static int __unmap_and_move(struct page *page, struct page *newpage,
625 * Obtain the lock on page, remove all ptes and migrate the page 625 int force, bool offlining, bool sync)
626 * to the newly allocated page in newpage.
627 */
628static int unmap_and_move(new_page_t get_new_page, unsigned long private,
629 struct page *page, int force, bool offlining, bool sync)
630{ 626{
631 int rc = 0; 627 int rc = -EAGAIN;
632 int *result = NULL;
633 struct page *newpage = get_new_page(page, private, &result);
634 int remap_swapcache = 1; 628 int remap_swapcache = 1;
635 int charge = 0; 629 int charge = 0;
636 struct mem_cgroup *mem; 630 struct mem_cgroup *mem;
637 struct anon_vma *anon_vma = NULL; 631 struct anon_vma *anon_vma = NULL;
638 632
639 if (!newpage)
640 return -ENOMEM;
641
642 if (page_count(page) == 1) {
643 /* page was freed from under us. So we are done. */
644 goto move_newpage;
645 }
646 if (unlikely(PageTransHuge(page)))
647 if (unlikely(split_huge_page(page)))
648 goto move_newpage;
649
650 /* prepare cgroup just returns 0 or -ENOMEM */
651 rc = -EAGAIN;
652
653 if (!trylock_page(page)) { 633 if (!trylock_page(page)) {
654 if (!force || !sync) 634 if (!force || !sync)
655 goto move_newpage; 635 goto out;
656 636
657 /* 637 /*
658 * It's not safe for direct compaction to call lock_page. 638 * It's not safe for direct compaction to call lock_page.
@@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
668 * altogether. 648 * altogether.
669 */ 649 */
670 if (current->flags & PF_MEMALLOC) 650 if (current->flags & PF_MEMALLOC)
671 goto move_newpage; 651 goto out;
672 652
673 lock_page(page); 653 lock_page(page);
674 } 654 }
@@ -785,27 +765,52 @@ uncharge:
785 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 765 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
786unlock: 766unlock:
787 unlock_page(page); 767 unlock_page(page);
768out:
769 return rc;
770}
788 771
789move_newpage: 772/*
773 * Obtain the lock on page, remove all ptes and migrate the page
774 * to the newly allocated page in newpage.
775 */
776static int unmap_and_move(new_page_t get_new_page, unsigned long private,
777 struct page *page, int force, bool offlining, bool sync)
778{
779 int rc = 0;
780 int *result = NULL;
781 struct page *newpage = get_new_page(page, private, &result);
782
783 if (!newpage)
784 return -ENOMEM;
785
786 if (page_count(page) == 1) {
787 /* page was freed from under us. So we are done. */
788 goto out;
789 }
790
791 if (unlikely(PageTransHuge(page)))
792 if (unlikely(split_huge_page(page)))
793 goto out;
794
795 rc = __unmap_and_move(page, newpage, force, offlining, sync);
796out:
790 if (rc != -EAGAIN) { 797 if (rc != -EAGAIN) {
791 /* 798 /*
792 * A page that has been migrated has all references 799 * A page that has been migrated has all references
793 * removed and will be freed. A page that has not been 800 * removed and will be freed. A page that has not been
794 * migrated will have kepts its references and be 801 * migrated will have kepts its references and be
795 * restored. 802 * restored.
796 */ 803 */
797 list_del(&page->lru); 804 list_del(&page->lru);
798 dec_zone_page_state(page, NR_ISOLATED_ANON + 805 dec_zone_page_state(page, NR_ISOLATED_ANON +
799 page_is_file_cache(page)); 806 page_is_file_cache(page));
800 putback_lru_page(page); 807 putback_lru_page(page);
801 } 808 }
802
803 /* 809 /*
804 * Move the new page to the LRU. If migration was not successful 810 * Move the new page to the LRU. If migration was not successful
805 * then this will free the page. 811 * then this will free the page.
806 */ 812 */
807 putback_lru_page(newpage); 813 putback_lru_page(newpage);
808
809 if (result) { 814 if (result) {
810 if (rc) 815 if (rc)
811 *result = rc; 816 *result = rc;
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c76..636a86876ff2 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
69 * file will not get a swp_entry_t in its pte, but rather it is like 69 * file will not get a swp_entry_t in its pte, but rather it is like
70 * any other file mapping (ie. marked !present and faulted in with 70 * any other file mapping (ie. marked !present and faulted in with
71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
72 *
73 * However when tmpfs moves the page from pagecache and into swapcache,
74 * it is still in core, but the find_get_page below won't find it.
75 * No big deal, but make a note of it.
76 */ 72 */
77 page = find_get_page(mapping, pgoff); 73 page = find_get_page(mapping, pgoff);
74#ifdef CONFIG_SWAP
75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val);
79 }
80#endif
78 if (page) { 81 if (page) {
79 present = PageUptodate(page); 82 present = PageUptodate(page);
80 page_cache_release(page); 83 page_cache_release(page);
diff --git a/mm/mlock.c b/mm/mlock.c
index 048260c4e02e..4f4f53bdc65d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,7 +14,7 @@
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/rmap.h> 18#include <linux/rmap.h>
19#include <linux/mmzone.h> 19#include <linux/mmzone.h>
20#include <linux/hugetlb.h> 20#include <linux/hugetlb.h>
@@ -110,7 +110,15 @@ void munlock_vma_page(struct page *page)
110 if (TestClearPageMlocked(page)) { 110 if (TestClearPageMlocked(page)) {
111 dec_zone_page_state(page, NR_MLOCK); 111 dec_zone_page_state(page, NR_MLOCK);
112 if (!isolate_lru_page(page)) { 112 if (!isolate_lru_page(page)) {
113 int ret = try_to_munlock(page); 113 int ret = SWAP_AGAIN;
114
115 /*
116 * Optimization: if the page was mapped just once,
117 * that's our mapping and we don't need to check all the
118 * other vmas.
119 */
120 if (page_mapcount(page) > 1)
121 ret = try_to_munlock(page);
114 /* 122 /*
115 * did try_to_unlock() succeed or punt? 123 * did try_to_unlock() succeed or punt?
116 */ 124 */
@@ -549,7 +557,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
549 if (!can_do_mlock()) 557 if (!can_do_mlock())
550 goto out; 558 goto out;
551 559
552 lru_add_drain_all(); /* flush pagevec */ 560 if (flags & MCL_CURRENT)
561 lru_add_drain_all(); /* flush pagevec */
553 562
554 down_write(&current->mm->mmap_sem); 563 down_write(&current->mm->mmap_sem);
555 564
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4e0e26591dfa..1ffd97ae26d7 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -8,7 +8,7 @@
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/module.h> 11#include <linux/export.h>
12#include "internal.h" 12#include "internal.h"
13 13
14#ifdef CONFIG_DEBUG_MEMORY_INIT 14#ifdef CONFIG_DEBUG_MEMORY_INIT
diff --git a/mm/mmap.c b/mm/mmap.c
index a65efd4db3e1..eae90af60ea6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -22,7 +22,7 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/hugetlb.h> 23#include <linux/hugetlb.h>
24#include <linux/profile.h> 24#include <linux/profile.h>
25#include <linux/module.h> 25#include <linux/export.h>
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
@@ -2558,7 +2558,6 @@ int mm_take_all_locks(struct mm_struct *mm)
2558{ 2558{
2559 struct vm_area_struct *vma; 2559 struct vm_area_struct *vma;
2560 struct anon_vma_chain *avc; 2560 struct anon_vma_chain *avc;
2561 int ret = -EINTR;
2562 2561
2563 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2562 BUG_ON(down_read_trylock(&mm->mmap_sem));
2564 2563
@@ -2579,13 +2578,11 @@ int mm_take_all_locks(struct mm_struct *mm)
2579 vm_lock_anon_vma(mm, avc->anon_vma); 2578 vm_lock_anon_vma(mm, avc->anon_vma);
2580 } 2579 }
2581 2580
2582 ret = 0; 2581 return 0;
2583 2582
2584out_unlock: 2583out_unlock:
2585 if (ret) 2584 mm_drop_all_locks(mm);
2586 mm_drop_all_locks(mm); 2585 return -EINTR;
2587
2588 return ret;
2589} 2586}
2590 2587
2591static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2588static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 9e82e937000e..cf332bc0080a 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,7 +5,7 @@
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmu_context.h> 7#include <linux/mmu_context.h>
8#include <linux/module.h> 8#include <linux/export.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10 10
11#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de4088e..9a611d3a1848 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -11,7 +11,7 @@
11 11
12#include <linux/rculist.h> 12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h> 13#include <linux/mmu_notifier.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d1760213..7cf7b7ddc7c5 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -8,7 +8,6 @@
8#include <linux/stddef.h> 8#include <linux/stddef.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/module.h>
12 11
13struct pglist_data *first_online_pgdat(void) 12struct pglist_data *first_online_pgdat(void)
14{ 13{
diff --git a/mm/mremap.c b/mm/mremap.c
index 506fa44403df..d6959cb4df58 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd); 44 if (pmd_none(*pmd))
45 if (pmd_none_or_clear_bad(pmd))
46 return NULL; 45 return NULL;
47 46
48 return pmd; 47 return pmd;
@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
65 return NULL; 64 return NULL;
66 65
67 VM_BUG_ON(pmd_trans_huge(*pmd)); 66 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
69 return NULL;
70 67
71 return pmd; 68 return pmd;
72} 69}
@@ -80,11 +77,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
80 struct mm_struct *mm = vma->vm_mm; 77 struct mm_struct *mm = vma->vm_mm;
81 pte_t *old_pte, *new_pte, pte; 78 pte_t *old_pte, *new_pte, pte;
82 spinlock_t *old_ptl, *new_ptl; 79 spinlock_t *old_ptl, *new_ptl;
83 unsigned long old_start;
84 80
85 old_start = old_addr;
86 mmu_notifier_invalidate_range_start(vma->vm_mm,
87 old_start, old_end);
88 if (vma->vm_file) { 81 if (vma->vm_file) {
89 /* 82 /*
90 * Subtle point from Rajesh Venkatasubramanian: before 83 * Subtle point from Rajesh Venkatasubramanian: before
@@ -111,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
111 new_pte++, new_addr += PAGE_SIZE) { 104 new_pte++, new_addr += PAGE_SIZE) {
112 if (pte_none(*old_pte)) 105 if (pte_none(*old_pte))
113 continue; 106 continue;
114 pte = ptep_clear_flush(vma, old_addr, old_pte); 107 pte = ptep_get_and_clear(mm, old_addr, old_pte);
115 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); 108 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
116 set_pte_at(mm, new_addr, new_pte, pte); 109 set_pte_at(mm, new_addr, new_pte, pte);
117 } 110 }
@@ -123,7 +116,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
123 pte_unmap_unlock(old_pte - 1, old_ptl); 116 pte_unmap_unlock(old_pte - 1, old_ptl);
124 if (mapping) 117 if (mapping)
125 mutex_unlock(&mapping->i_mmap_mutex); 118 mutex_unlock(&mapping->i_mmap_mutex);
126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
127} 119}
128 120
129#define LATENCY_LIMIT (64 * PAGE_SIZE) 121#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -134,22 +126,43 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
134{ 126{
135 unsigned long extent, next, old_end; 127 unsigned long extent, next, old_end;
136 pmd_t *old_pmd, *new_pmd; 128 pmd_t *old_pmd, *new_pmd;
129 bool need_flush = false;
137 130
138 old_end = old_addr + len; 131 old_end = old_addr + len;
139 flush_cache_range(vma, old_addr, old_end); 132 flush_cache_range(vma, old_addr, old_end);
140 133
134 mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
135
141 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 136 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
142 cond_resched(); 137 cond_resched();
143 next = (old_addr + PMD_SIZE) & PMD_MASK; 138 next = (old_addr + PMD_SIZE) & PMD_MASK;
144 if (next - 1 > old_end) 139 /* even if next overflowed, extent below will be ok */
145 next = old_end;
146 extent = next - old_addr; 140 extent = next - old_addr;
141 if (extent > old_end - old_addr)
142 extent = old_end - old_addr;
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 143 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 144 if (!old_pmd)
149 continue; 145 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); 146 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 147 if (!new_pmd)
152 break; 148 break;
149 if (pmd_trans_huge(*old_pmd)) {
150 int err = 0;
151 if (extent == HPAGE_PMD_SIZE)
152 err = move_huge_pmd(vma, new_vma, old_addr,
153 new_addr, old_end,
154 old_pmd, new_pmd);
155 if (err > 0) {
156 need_flush = true;
157 continue;
158 } else if (!err) {
159 split_huge_page_pmd(vma->vm_mm, old_pmd);
160 }
161 VM_BUG_ON(pmd_trans_huge(*old_pmd));
162 }
163 if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
164 new_pmd, new_addr))
165 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 166 next = (new_addr + PMD_SIZE) & PMD_MASK;
154 if (extent > next - new_addr) 167 if (extent > next - new_addr)
155 extent = next - new_addr; 168 extent = next - new_addr;
@@ -157,7 +170,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
157 extent = LATENCY_LIMIT; 170 extent = LATENCY_LIMIT;
158 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 171 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
159 new_vma, new_pmd, new_addr); 172 new_vma, new_pmd, new_addr);
173 need_flush = true;
160 } 174 }
175 if (likely(need_flush))
176 flush_tlb_range(vma, old_end-len, old_addr);
177
178 mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
161 179
162 return len + old_addr - old_end; /* how much done */ 180 return len + old_addr - old_end; /* how much done */
163} 181}
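The reworked loop above advances one PMD at a time: each iteration's extent is the distance from old_addr to its next PMD boundary, clamped to what remains of the range and then to the distance from new_addr to its next PMD boundary, so neither side's PTE walk ever crosses a PMD. A small userspace sketch of just that extent arithmetic, assuming a 2 MiB PMD_SIZE and ignoring the huge-pmd path, allocation failures and the LATENCY_LIMIT cap:

#include <stdio.h>

#define PMD_SIZE	(2UL << 20)
#define PMD_MASK	(~(PMD_SIZE - 1))

int main(void)
{
	unsigned long old_addr = 0x00705000;	/* deliberately unaligned */
	unsigned long new_addr = 0x12340000;
	unsigned long old_end  = old_addr + (3UL << 20);
	unsigned long extent, next;

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		/* distance to old_addr's next PMD boundary ... */
		next = (old_addr + PMD_SIZE) & PMD_MASK;
		extent = next - old_addr;
		/* ... clamped to the remaining length ... */
		if (extent > old_end - old_addr)
			extent = old_end - old_addr;
		/* ... and to new_addr's next PMD boundary */
		next = (new_addr + PMD_SIZE) & PMD_MASK;
		if (extent > next - new_addr)
			extent = next - new_addr;
		printf("move %#9lx -> %#9lx, %7lu bytes\n",
		       old_addr, new_addr, extent);
	}
	return 0;
}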
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7f2586..7fa41b4a07bf 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -12,7 +12,7 @@
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h> 18#include <linux/memblock.h>
diff --git a/mm/nommu.c b/mm/nommu.c
index 4358032566e9..73419c55eda6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -13,7 +13,7 @@
13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/mman.h> 18#include <linux/mman.h>
19#include <linux/swap.h> 19#include <linux/swap.h>
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eafff89b3dd6..471dedb463ab 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,18 +26,38 @@
26#include <linux/timex.h> 26#include <linux/timex.h>
27#include <linux/jiffies.h> 27#include <linux/jiffies.h>
28#include <linux/cpuset.h> 28#include <linux/cpuset.h>
29#include <linux/module.h> 29#include <linux/export.h>
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/ptrace.h> 34#include <linux/ptrace.h>
35#include <linux/freezer.h>
35 36
36int sysctl_panic_on_oom; 37int sysctl_panic_on_oom;
37int sysctl_oom_kill_allocating_task; 38int sysctl_oom_kill_allocating_task;
38int sysctl_oom_dump_tasks = 1; 39int sysctl_oom_dump_tasks = 1;
39static DEFINE_SPINLOCK(zone_scan_lock); 40static DEFINE_SPINLOCK(zone_scan_lock);
40 41
42/*
43 * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
44 * @old_val: old oom_score_adj for compare
45 * @new_val: new oom_score_adj for swap
46 *
47 * Sets the oom_score_adj value for current to @new_val iff its present value is
48 * @old_val. Usually used to reinstate a previous value to prevent racing with
 49 * userspace tuning the value in the interim.
50 */
51void compare_swap_oom_score_adj(int old_val, int new_val)
52{
53 struct sighand_struct *sighand = current->sighand;
54
55 spin_lock_irq(&sighand->siglock);
56 if (current->signal->oom_score_adj == old_val)
57 current->signal->oom_score_adj = new_val;
58 spin_unlock_irq(&sighand->siglock);
59}
60
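compare_swap_oom_score_adj() only writes @new_val if the value still equals @old_val, all under siglock, so restoring a saved oom_score_adj cannot silently clobber a change userspace made in the meantime. The same compare-and-swap-under-a-lock idiom in a standalone sketch, with a pthread mutex standing in for siglock and invented names throughout:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int oom_score_adj;		/* the value being protected */

/* restore new_val only if nobody changed the value behind our back */
static void compare_swap_score(int old_val, int new_val)
{
	pthread_mutex_lock(&lock);
	if (oom_score_adj == old_val)
		oom_score_adj = new_val;
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	oom_score_adj = 1000;		/* temporary value set by some in-kernel user */

	/* userspace tunes the value in the interim ... */
	oom_score_adj = 300;

	/* ... so restoring the saved value 0 must back off */
	compare_swap_score(1000, 0);
	printf("oom_score_adj = %d\n", oom_score_adj);	/* prints 300 */
	return 0;
}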
41/** 61/**
42 * test_set_oom_score_adj() - set current's oom_score_adj and return old value 62 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
43 * @new_val: new oom_score_adj value 63 * @new_val: new oom_score_adj value
@@ -53,13 +73,7 @@ int test_set_oom_score_adj(int new_val)
53 73
54 spin_lock_irq(&sighand->siglock); 74 spin_lock_irq(&sighand->siglock);
55 old_val = current->signal->oom_score_adj; 75 old_val = current->signal->oom_score_adj;
56 if (new_val != old_val) { 76 current->signal->oom_score_adj = new_val;
57 if (new_val == OOM_SCORE_ADJ_MIN)
58 atomic_inc(&current->mm->oom_disable_count);
59 else if (old_val == OOM_SCORE_ADJ_MIN)
60 atomic_dec(&current->mm->oom_disable_count);
61 current->signal->oom_score_adj = new_val;
62 }
63 spin_unlock_irq(&sighand->siglock); 77 spin_unlock_irq(&sighand->siglock);
64 78
65 return old_val; 79 return old_val;
@@ -172,16 +186,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
172 return 0; 186 return 0;
173 187
174 /* 188 /*
175 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
176 * so the entire heuristic doesn't need to be executed for something
177 * that cannot be killed.
178 */
179 if (atomic_read(&p->mm->oom_disable_count)) {
180 task_unlock(p);
181 return 0;
182 }
183
184 /*
185 * The memory controller may have a limit of 0 bytes, so avoid a divide 189 * The memory controller may have a limit of 0 bytes, so avoid a divide
186 * by zero, if necessary. 190 * by zero, if necessary.
187 */ 191 */
@@ -303,7 +307,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
303 do_each_thread(g, p) { 307 do_each_thread(g, p) {
304 unsigned int points; 308 unsigned int points;
305 309
306 if (!p->mm) 310 if (p->exit_state)
307 continue; 311 continue;
308 if (oom_unkillable_task(p, mem, nodemask)) 312 if (oom_unkillable_task(p, mem, nodemask))
309 continue; 313 continue;
@@ -317,8 +321,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
317 * blocked waiting for another task which itself is waiting 321 * blocked waiting for another task which itself is waiting
318 * for memory. Is there a better alternative? 322 * for memory. Is there a better alternative?
319 */ 323 */
320 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 324 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
325 if (unlikely(frozen(p)))
326 thaw_process(p);
321 return ERR_PTR(-1UL); 327 return ERR_PTR(-1UL);
328 }
329 if (!p->mm)
330 continue;
322 331
323 if (p->flags & PF_EXITING) { 332 if (p->flags & PF_EXITING) {
324 /* 333 /*
@@ -433,7 +442,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
433 task_unlock(p); 442 task_unlock(p);
434 443
435 /* 444 /*
436 * Kill all processes sharing p->mm in other thread groups, if any. 445 * Kill all user processes sharing p->mm in other thread groups, if any.
437 * They don't get access to memory reserves or a higher scheduler 446 * They don't get access to memory reserves or a higher scheduler
438 * priority, though, to avoid depletion of all memory or task 447 * priority, though, to avoid depletion of all memory or task
439 * starvation. This prevents mm->mmap_sem livelock when an oom killed 448 * starvation. This prevents mm->mmap_sem livelock when an oom killed
@@ -443,7 +452,11 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
443 * signal. 452 * signal.
444 */ 453 */
445 for_each_process(q) 454 for_each_process(q)
446 if (q->mm == mm && !same_thread_group(q, p)) { 455 if (q->mm == mm && !same_thread_group(q, p) &&
456 !(q->flags & PF_KTHREAD)) {
457 if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
458 continue;
459
447 task_lock(q); /* Protect ->comm from prctl() */ 460 task_lock(q); /* Protect ->comm from prctl() */
448 pr_err("Kill process %d (%s) sharing same memory\n", 461 pr_err("Kill process %d (%s) sharing same memory\n",
449 task_pid_nr(q), q->comm); 462 task_pid_nr(q), q->comm);
@@ -720,7 +733,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
720 read_lock(&tasklist_lock); 733 read_lock(&tasklist_lock);
721 if (sysctl_oom_kill_allocating_task && 734 if (sysctl_oom_kill_allocating_task &&
722 !oom_unkillable_task(current, NULL, nodemask) && 735 !oom_unkillable_task(current, NULL, nodemask) &&
723 current->mm && !atomic_read(&current->mm->oom_disable_count)) { 736 current->mm) {
724 /* 737 /*
725 * oom_kill_process() needs tasklist_lock held. If it returns 738 * oom_kill_process() needs tasklist_lock held. If it returns
726 * non-zero, current could not be killed so we must fallback to 739 * non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d1960744f881..a3278f005230 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -12,7 +12,7 @@
12 */ 12 */
13 13
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
@@ -46,26 +46,14 @@
46 */ 46 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1) 47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
48 48
49#define RATELIMIT_CALC_SHIFT 10
50
49/* 51/*
50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 52 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
51 * will look to see if it needs to force writeback or throttling. 53 * will look to see if it needs to force writeback or throttling.
52 */ 54 */
53static long ratelimit_pages = 32; 55static long ratelimit_pages = 32;
54 56
55/*
56 * When balance_dirty_pages decides that the caller needs to perform some
57 * non-background writeback, this is how many pages it will attempt to write.
58 * It should be somewhat larger than dirtied pages to ensure that reasonably
59 * large amounts of I/O are submitted.
60 */
61static inline long sync_writeback_pages(unsigned long dirtied)
62{
63 if (dirtied < ratelimit_pages)
64 dirtied = ratelimit_pages;
65
66 return dirtied + dirtied / 2;
67}
68
69/* The following parameters are exported via /proc/sys/vm */ 57/* The following parameters are exported via /proc/sys/vm */
70 58
71/* 59/*
@@ -167,6 +155,8 @@ static void update_completion_period(void)
167 int shift = calc_period_shift(); 155 int shift = calc_period_shift();
168 prop_change_shift(&vm_completions, shift); 156 prop_change_shift(&vm_completions, shift);
169 prop_change_shift(&vm_dirties, shift); 157 prop_change_shift(&vm_dirties, shift);
158
159 writeback_set_ratelimit();
170} 160}
171 161
172int dirty_background_ratio_handler(struct ctl_table *table, int write, 162int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -260,52 +250,10 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
260 numerator, denominator); 250 numerator, denominator);
261} 251}
262 252
263static inline void task_dirties_fraction(struct task_struct *tsk,
264 long *numerator, long *denominator)
265{
266 prop_fraction_single(&vm_dirties, &tsk->dirties,
267 numerator, denominator);
268}
269
270/* 253/*
271 * task_dirty_limit - scale down dirty throttling threshold for one task 254 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
272 * 255 * registered backing devices, which, for obvious reasons, can not
273 * task specific dirty limit: 256 * exceed 100%.
274 *
275 * dirty -= (dirty/8) * p_{t}
276 *
277 * To protect light/slow dirtying tasks from heavier/fast ones, we start
278 * throttling individual tasks before reaching the bdi dirty limit.
279 * Relatively low thresholds will be allocated to heavy dirtiers. So when
280 * dirty pages grow large, heavy dirtiers will be throttled first, which will
281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
282 * dirty threshold may never get throttled.
283 */
284#define TASK_LIMIT_FRACTION 8
285static unsigned long task_dirty_limit(struct task_struct *tsk,
286 unsigned long bdi_dirty)
287{
288 long numerator, denominator;
289 unsigned long dirty = bdi_dirty;
290 u64 inv = dirty / TASK_LIMIT_FRACTION;
291
292 task_dirties_fraction(tsk, &numerator, &denominator);
293 inv *= numerator;
294 do_div(inv, denominator);
295
296 dirty -= inv;
297
298 return max(dirty, bdi_dirty/2);
299}
300
301/* Minimum limit for any task */
302static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
303{
304 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
305}
306
307/*
308 *
309 */ 257 */
310static unsigned int bdi_min_ratio; 258static unsigned int bdi_min_ratio;
311 259
@@ -411,6 +359,12 @@ unsigned long determine_dirtyable_memory(void)
411 return x + 1; /* Ensure that we never return 0 */ 359 return x + 1; /* Ensure that we never return 0 */
412} 360}
413 361
362static unsigned long dirty_freerun_ceiling(unsigned long thresh,
363 unsigned long bg_thresh)
364{
365 return (thresh + bg_thresh) / 2;
366}
367
414static unsigned long hard_dirty_limit(unsigned long thresh) 368static unsigned long hard_dirty_limit(unsigned long thresh)
415{ 369{
416 return max(thresh, global_dirty_limit); 370 return max(thresh, global_dirty_limit);
@@ -495,6 +449,198 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
495 return bdi_dirty; 449 return bdi_dirty;
496} 450}
497 451
452/*
453 * Dirty position control.
454 *
455 * (o) global/bdi setpoints
456 *
457 * We want the dirty pages be balanced around the global/bdi setpoints.
458 * When the number of dirty pages is higher/lower than the setpoint, the
459 * dirty position control ratio (and hence task dirty ratelimit) will be
460 * decreased/increased to bring the dirty pages back to the setpoint.
461 *
462 * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
463 *
464 * if (dirty < setpoint) scale up pos_ratio
465 * if (dirty > setpoint) scale down pos_ratio
466 *
467 * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
468 * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
469 *
470 * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
471 *
472 * (o) global control line
473 *
474 * ^ pos_ratio
475 * |
476 * | |<===== global dirty control scope ======>|
477 * 2.0 .............*
478 * | .*
479 * | . *
480 * | . *
481 * | . *
482 * | . *
483 * | . *
484 * 1.0 ................................*
485 * | . . *
486 * | . . *
487 * | . . *
488 * | . . *
489 * | . . *
490 * 0 +------------.------------------.----------------------*------------->
491 * freerun^ setpoint^ limit^ dirty pages
492 *
493 * (o) bdi control line
494 *
495 * ^ pos_ratio
496 * |
497 * | *
498 * | *
499 * | *
500 * | *
501 * | * |<=========== span ============>|
502 * 1.0 .......................*
503 * | . *
504 * | . *
505 * | . *
506 * | . *
507 * | . *
508 * | . *
509 * | . *
510 * | . *
511 * | . *
512 * | . *
513 * | . *
514 * 1/4 ...............................................* * * * * * * * * * * *
515 * | . .
516 * | . .
517 * | . .
518 * 0 +----------------------.-------------------------------.------------->
519 * bdi_setpoint^ x_intercept^
520 *
521 * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
522 * be smoothly throttled down to normal if it starts high in situations like
523 * - start writing to a slow SD card and a fast disk at the same time. The SD
524 * card's bdi_dirty may rush to many times higher than bdi_setpoint.
525 * - the bdi dirty thresh drops quickly due to change of JBOD workload
526 */
527static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
528 unsigned long thresh,
529 unsigned long bg_thresh,
530 unsigned long dirty,
531 unsigned long bdi_thresh,
532 unsigned long bdi_dirty)
533{
534 unsigned long write_bw = bdi->avg_write_bandwidth;
535 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
536 unsigned long limit = hard_dirty_limit(thresh);
537 unsigned long x_intercept;
538 unsigned long setpoint; /* dirty pages' target balance point */
539 unsigned long bdi_setpoint;
540 unsigned long span;
541 long long pos_ratio; /* for scaling up/down the rate limit */
542 long x;
543
544 if (unlikely(dirty >= limit))
545 return 0;
546
547 /*
548 * global setpoint
549 *
550 * setpoint - dirty 3
551 * f(dirty) := 1.0 + (----------------)
552 * limit - setpoint
553 *
554 * it's a 3rd order polynomial that subjects to
555 *
 556 * (1) f(freerun) = 2.0 => ramp up dirty_ratelimit reasonably fast
557 * (2) f(setpoint) = 1.0 => the balance point
558 * (3) f(limit) = 0 => the hard limit
559 * (4) df/dx <= 0 => negative feedback control
560 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
561 * => fast response on large errors; small oscillation near setpoint
562 */
563 setpoint = (freerun + limit) / 2;
564 x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
565 limit - setpoint + 1);
566 pos_ratio = x;
567 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
568 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
569 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
570
571 /*
572 * We have computed basic pos_ratio above based on global situation. If
573 * the bdi is over/under its share of dirty pages, we want to scale
574 * pos_ratio further down/up. That is done by the following mechanism.
575 */
576
577 /*
578 * bdi setpoint
579 *
580 * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
581 *
582 * x_intercept - bdi_dirty
583 * := --------------------------
584 * x_intercept - bdi_setpoint
585 *
586 * The main bdi control line is a linear function that subjects to
587 *
588 * (1) f(bdi_setpoint) = 1.0
589 * (2) k = - 1 / (8 * write_bw) (in single bdi case)
590 * or equally: x_intercept = bdi_setpoint + 8 * write_bw
591 *
592 * For single bdi case, the dirty pages are observed to fluctuate
593 * regularly within range
594 * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
 595 * for various filesystems, where (2) can yield a reasonable 12.5%
596 * fluctuation range for pos_ratio.
597 *
598 * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
599 * own size, so move the slope over accordingly and choose a slope that
600 * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
601 */
602 if (unlikely(bdi_thresh > thresh))
603 bdi_thresh = thresh;
604 bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
605 /*
606 * scale global setpoint to bdi's:
607 * bdi_setpoint = setpoint * bdi_thresh / thresh
608 */
609 x = div_u64((u64)bdi_thresh << 16, thresh + 1);
610 bdi_setpoint = setpoint * (u64)x >> 16;
611 /*
612 * Use span=(8*write_bw) in single bdi case as indicated by
613 * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
614 *
615 * bdi_thresh thresh - bdi_thresh
616 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
617 * thresh thresh
618 */
619 span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
620 x_intercept = bdi_setpoint + span;
621
622 if (bdi_dirty < x_intercept - span / 4) {
623 pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
624 x_intercept - bdi_setpoint + 1);
625 } else
626 pos_ratio /= 4;
627
628 /*
629 * bdi reserve area, safeguard against dirty pool underrun and disk idle
630 * It may push the desired control point of global dirty pages higher
631 * than setpoint.
632 */
633 x_intercept = bdi_thresh / 2;
634 if (bdi_dirty < x_intercept) {
635 if (bdi_dirty > x_intercept / 8)
636 pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
637 else
638 pos_ratio *= 8;
639 }
640
641 return pos_ratio;
642}
643
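The global control line above is the cubic pos_ratio = 1.0 + ((setpoint - dirty) / (limit - setpoint))^3, evaluated in fixed point with RATELIMIT_CALC_SHIFT = 10: roughly 2.0 at the freerun ceiling, exactly 1.0 at the setpoint, 0 at the hard limit, and flattest near the setpoint so small errors produce small corrections. A fixed-point sketch of just that global curve, with made-up page counts and the per-bdi scaling and reserve area left out:

#include <stdio.h>

#define RATELIMIT_CALC_SHIFT 10

/* pos_ratio = 1.0 + ((setpoint - dirty) / (limit - setpoint))^3, fixed point */
static long long pos_ratio(unsigned long dirty, unsigned long freerun,
			   unsigned long limit)
{
	unsigned long setpoint = (freerun + limit) / 2;
	long long x, ratio;

	if (dirty >= limit)
		return 0;

	x = ((long long)setpoint - (long long)dirty) << RATELIMIT_CALC_SHIFT;
	x /= (long long)(limit - setpoint + 1);
	ratio = x;
	ratio = ratio * x >> RATELIMIT_CALC_SHIFT;	/* x^2 */
	ratio = ratio * x >> RATELIMIT_CALC_SHIFT;	/* x^3 */
	ratio += 1 << RATELIMIT_CALC_SHIFT;		/* + 1.0 */
	return ratio;
}

int main(void)
{
	unsigned long freerun = 1000, limit = 2000;	/* pages, made up */

	for (unsigned long dirty = 1000; dirty <= 2000; dirty += 250)
		printf("dirty %4lu -> pos_ratio %.3f\n", dirty,
		       (double)pos_ratio(dirty, freerun, limit) /
		       (1 << RATELIMIT_CALC_SHIFT));
	return 0;
}

With these numbers the output runs from about 1.99 at 1000 dirty pages down through 1.00 at the 1500-page setpoint to 0 at the 2000-page limit, mirroring properties (1)-(3) in the comment block.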
498static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, 644static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed, 645 unsigned long elapsed,
500 unsigned long written) 646 unsigned long written)
@@ -591,8 +737,153 @@ static void global_update_bandwidth(unsigned long thresh,
591 spin_unlock(&dirty_lock); 737 spin_unlock(&dirty_lock);
592} 738}
593 739
740/*
741 * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
742 *
743 * Normal bdi tasks will be curbed at or below it in long term.
744 * Obviously it should be around (write_bw / N) when there are N dd tasks.
745 */
746static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
747 unsigned long thresh,
748 unsigned long bg_thresh,
749 unsigned long dirty,
750 unsigned long bdi_thresh,
751 unsigned long bdi_dirty,
752 unsigned long dirtied,
753 unsigned long elapsed)
754{
755 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
756 unsigned long limit = hard_dirty_limit(thresh);
757 unsigned long setpoint = (freerun + limit) / 2;
758 unsigned long write_bw = bdi->avg_write_bandwidth;
759 unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
760 unsigned long dirty_rate;
761 unsigned long task_ratelimit;
762 unsigned long balanced_dirty_ratelimit;
763 unsigned long pos_ratio;
764 unsigned long step;
765 unsigned long x;
766
767 /*
768 * The dirty rate will match the writeout rate in long term, except
769 * when dirty pages are truncated by userspace or re-dirtied by FS.
770 */
771 dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
772
773 pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
774 bdi_thresh, bdi_dirty);
775 /*
776 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
777 */
778 task_ratelimit = (u64)dirty_ratelimit *
779 pos_ratio >> RATELIMIT_CALC_SHIFT;
 780	task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
781
782 /*
783 * A linear estimation of the "balanced" throttle rate. The theory is,
784 * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
785 * dirty_rate will be measured to be (N * task_ratelimit). So the below
786 * formula will yield the balanced rate limit (write_bw / N).
787 *
788 * Note that the expanded form is not a pure rate feedback:
789 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
790 * but also takes pos_ratio into account:
791 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
792 *
793 * (1) is not realistic because pos_ratio also takes part in balancing
794 * the dirty rate. Consider the state
795 * pos_ratio = 0.5 (3)
796 * rate = 2 * (write_bw / N) (4)
 797 * If (1) is used, it will get stuck in that state, because each dd will
798 * be throttled at
799 * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
800 * yielding
801 * dirty_rate = N * task_ratelimit = write_bw (6)
802 * put (6) into (1) we get
803 * rate_(i+1) = rate_(i) (7)
804 *
805 * So we end up using (2) to always keep
806 * rate_(i+1) ~= (write_bw / N) (8)
807 * regardless of the value of pos_ratio. As long as (8) is satisfied,
808 * pos_ratio is able to drive itself to 1.0, which is not only where
 809 * the dirty count meets the setpoint, but also where the slope of
 810 * pos_ratio is flattest and hence task_ratelimit fluctuates the least.
811 */
812 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
813 dirty_rate | 1);
814
815 /*
816 * We could safely do this and return immediately:
817 *
818 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
819 *
820 * However to get a more stable dirty_ratelimit, the below elaborated
 821 * code makes use of task_ratelimit to filter out singular points and
822 * limit the step size.
823 *
824 * The below code essentially only uses the relative value of
825 *
826 * task_ratelimit - dirty_ratelimit
827 * = (pos_ratio - 1) * dirty_ratelimit
828 *
829 * which reflects the direction and size of dirty position error.
830 */
831
832 /*
833 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
834 * task_ratelimit is on the same side of dirty_ratelimit, too.
835 * For example, when
836 * - dirty_ratelimit > balanced_dirty_ratelimit
837 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
838 * lowering dirty_ratelimit will help meet both the position and rate
839 * control targets. Otherwise, don't update dirty_ratelimit if it will
840 * only help meet the rate target. After all, what the users ultimately
841 * feel and care are stable dirty rate and small position error.
842 *
843 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
 844 * and filter out the singular points of balanced_dirty_ratelimit, which
 845 * keeps jumping around randomly and can even leap far away at times
846 * due to the small 200ms estimation period of dirty_rate (we want to
847 * keep that period small to reduce time lags).
848 */
849 step = 0;
850 if (dirty < setpoint) {
851 x = min(bdi->balanced_dirty_ratelimit,
852 min(balanced_dirty_ratelimit, task_ratelimit));
853 if (dirty_ratelimit < x)
854 step = x - dirty_ratelimit;
855 } else {
856 x = max(bdi->balanced_dirty_ratelimit,
857 max(balanced_dirty_ratelimit, task_ratelimit));
858 if (dirty_ratelimit > x)
859 step = dirty_ratelimit - x;
860 }
861
862 /*
863 * Don't pursue 100% rate matching. It's impossible since the balanced
 864 * rate itself is constantly fluctuating. So decrease the tracking speed
865 * when it gets close to the target. Helps eliminate pointless tremors.
866 */
867 step >>= dirty_ratelimit / (2 * step + 1);
868 /*
869 * Limit the tracking speed to avoid overshooting.
870 */
871 step = (step + 7) / 8;
872
873 if (dirty_ratelimit < balanced_dirty_ratelimit)
874 dirty_ratelimit += step;
875 else
876 dirty_ratelimit -= step;
877
878 bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
879 bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
880
881 trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
882}
883
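/*
 * Illustration only, not part of the patch: a toy userspace run of the two
 * feedback formulas discussed in the comment block above.  With N dd tasks
 * each throttled at task_ratelimit = rate * pos_ratio, the measured
 * dirty_rate is N * rate * pos_ratio, so the pure rate feedback (1) gets
 * stuck at write_bw / (N * pos_ratio), while formula (2) lands on the
 * balanced write_bw / N.  All numbers are made up.
 */
#include <stdio.h>

int main(void)
{
	double write_bw = 100.0;	/* MB/s, illustrative */
	double pos_ratio = 0.5;		/* held away from 1.0 on purpose */
	int N = 4;			/* concurrent dirtier tasks */
	double rate1 = 10.0, rate2 = 10.0;

	for (int i = 0; i < 5; i++) {
		double dirty_rate1 = N * rate1 * pos_ratio;
		double dirty_rate2 = N * rate2 * pos_ratio;

		rate1 = rate1 * write_bw / dirty_rate1;			/* (1) */
		rate2 = rate2 * write_bw / dirty_rate2 * pos_ratio;	/* (2) */
		printf("iter %d: formula(1)=%.1f formula(2)=%.1f target=%.1f\n",
		       i, rate1, rate2, write_bw / N);
	}
	return 0;
}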
594void __bdi_update_bandwidth(struct backing_dev_info *bdi, 884void __bdi_update_bandwidth(struct backing_dev_info *bdi,
595 unsigned long thresh, 885 unsigned long thresh,
886 unsigned long bg_thresh,
596 unsigned long dirty, 887 unsigned long dirty,
597 unsigned long bdi_thresh, 888 unsigned long bdi_thresh,
598 unsigned long bdi_dirty, 889 unsigned long bdi_dirty,
@@ -600,6 +891,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
600{ 891{
601 unsigned long now = jiffies; 892 unsigned long now = jiffies;
602 unsigned long elapsed = now - bdi->bw_time_stamp; 893 unsigned long elapsed = now - bdi->bw_time_stamp;
894 unsigned long dirtied;
603 unsigned long written; 895 unsigned long written;
604 896
605 /* 897 /*
@@ -608,6 +900,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
608 if (elapsed < BANDWIDTH_INTERVAL) 900 if (elapsed < BANDWIDTH_INTERVAL)
609 return; 901 return;
610 902
903 dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
611 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); 904 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
612 905
613 /* 906 /*
@@ -617,18 +910,23 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
617 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) 910 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
618 goto snapshot; 911 goto snapshot;
619 912
620 if (thresh) 913 if (thresh) {
621 global_update_bandwidth(thresh, dirty, now); 914 global_update_bandwidth(thresh, dirty, now);
622 915 bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
916 bdi_thresh, bdi_dirty,
917 dirtied, elapsed);
918 }
623 bdi_update_write_bandwidth(bdi, elapsed, written); 919 bdi_update_write_bandwidth(bdi, elapsed, written);
624 920
625snapshot: 921snapshot:
922 bdi->dirtied_stamp = dirtied;
626 bdi->written_stamp = written; 923 bdi->written_stamp = written;
627 bdi->bw_time_stamp = now; 924 bdi->bw_time_stamp = now;
628} 925}
629 926
630static void bdi_update_bandwidth(struct backing_dev_info *bdi, 927static void bdi_update_bandwidth(struct backing_dev_info *bdi,
631 unsigned long thresh, 928 unsigned long thresh,
929 unsigned long bg_thresh,
632 unsigned long dirty, 930 unsigned long dirty,
633 unsigned long bdi_thresh, 931 unsigned long bdi_thresh,
634 unsigned long bdi_dirty, 932 unsigned long bdi_dirty,
@@ -637,37 +935,99 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
637 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) 935 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
638 return; 936 return;
639 spin_lock(&bdi->wb.list_lock); 937 spin_lock(&bdi->wb.list_lock);
640 __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, 938 __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
641 start_time); 939 bdi_thresh, bdi_dirty, start_time);
642 spin_unlock(&bdi->wb.list_lock); 940 spin_unlock(&bdi->wb.list_lock);
643} 941}
644 942
645/* 943/*
 944 * After a task has dirtied this many pages, balance_dirty_pages_ratelimited_nr()
945 * will look to see if it needs to start dirty throttling.
946 *
947 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
948 * global_page_state() too often. So scale it near-sqrt to the safety margin
949 * (the number of pages we may dirty without exceeding the dirty limits).
950 */
951static unsigned long dirty_poll_interval(unsigned long dirty,
952 unsigned long thresh)
953{
954 if (thresh > dirty)
955 return 1UL << (ilog2(thresh - dirty) >> 1);
956
957 return 1;
958}
959
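/*
 * Illustration only, not part of the patch: the near-sqrt poll interval in
 * plain userspace C, with the kernel's ilog2() modeled by __builtin_clzl().
 * For a safety margin of m pages the task may dirty roughly sqrt(m) pages
 * before polling the global counters again.  Sample numbers are made up.
 */
#include <stdio.h>

static unsigned long model_ilog2(unsigned long x)
{
	return 8 * sizeof(long) - 1 - __builtin_clzl(x);
}

static unsigned long model_poll_interval(unsigned long dirty,
					 unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (model_ilog2(thresh - dirty) >> 1);
	return 1;
}

int main(void)
{
	unsigned long thresh = 1UL << 20;	/* 1M pages, illustrative */

	for (unsigned long dirty = 0; dirty < thresh; dirty += thresh / 4)
		printf("margin=%7lu pages -> poll every %lu dirtied pages\n",
		       thresh - dirty, model_poll_interval(dirty, thresh));
	return 0;
}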
960static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
961 unsigned long bdi_dirty)
962{
963 unsigned long bw = bdi->avg_write_bandwidth;
964 unsigned long hi = ilog2(bw);
965 unsigned long lo = ilog2(bdi->dirty_ratelimit);
966 unsigned long t;
967
968 /* target for 20ms max pause on 1-dd case */
969 t = HZ / 50;
970
971 /*
972 * Scale up pause time for concurrent dirtiers in order to reduce CPU
973 * overheads.
974 *
975 * (N * 20ms) on 2^N concurrent tasks.
976 */
977 if (hi > lo)
978 t += (hi - lo) * (20 * HZ) / 1024;
979
980 /*
 981 * Limit the pause time on small-memory systems. If we sleep for too
 982 * long, a small pool of dirty/writeback pages may go empty and the disk
 983 * may go idle.
984 *
985 * 8 serves as the safety ratio.
986 */
987 if (bdi_dirty)
988 t = min(t, bdi_dirty * HZ / (8 * bw + 1));
989
990 /*
 991 * The pause time will settle within the range (max_pause/4, max_pause).
 992 * Apply a minimum value of 4 so that max_pause/4 is non-zero.
993 */
994 return clamp_val(t, 4, MAX_PAUSE);
995}
996
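/*
 * Illustration only, not part of the patch: rough userspace arithmetic for
 * the max pause above, in jiffies.  hi - lo approximates log2(number of
 * concurrent dirtiers), since dirty_ratelimit ~= write_bw / N.  HZ and the
 * sample bandwidth/ratelimit values are invented for the example.
 */
#include <stdio.h>

#define HZ 1000

static unsigned long model_ilog2(unsigned long x)
{
	return 8 * sizeof(long) - 1 - __builtin_clzl(x);
}

int main(void)
{
	unsigned long write_bw = 25600;		/* pages/s, ~100MB/s */

	for (int n = 1; n <= 64; n *= 4) {	/* n concurrent dirtiers */
		unsigned long ratelimit = write_bw / n;
		unsigned long t = HZ / 50;	/* 20ms base pause */

		if (model_ilog2(write_bw) > model_ilog2(ratelimit))
			t += (model_ilog2(write_bw) - model_ilog2(ratelimit)) *
			     (20 * HZ) / 1024;
		printf("%2d dirtiers -> max pause ~%lu ms\n", n, t * 1000 / HZ);
	}
	return 0;
}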
997/*
646 * balance_dirty_pages() must be called by processes which are generating dirty 998 * balance_dirty_pages() must be called by processes which are generating dirty
647 * data. It looks at the number of dirty pages in the machine and will force 999 * data. It looks at the number of dirty pages in the machine and will force
648 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 1000 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
649 * If we're over `background_thresh' then the writeback threads are woken to 1001 * If we're over `background_thresh' then the writeback threads are woken to
650 * perform some writeout. 1002 * perform some writeout.
651 */ 1003 */
652static void balance_dirty_pages(struct address_space *mapping, 1004static void balance_dirty_pages(struct address_space *mapping,
653 unsigned long write_chunk) 1005 unsigned long pages_dirtied)
654{ 1006{
655 unsigned long nr_reclaimable, bdi_nr_reclaimable; 1007 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
1008 unsigned long bdi_reclaimable;
656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ 1009 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty; 1010 unsigned long bdi_dirty;
1011 unsigned long freerun;
658 unsigned long background_thresh; 1012 unsigned long background_thresh;
659 unsigned long dirty_thresh; 1013 unsigned long dirty_thresh;
660 unsigned long bdi_thresh; 1014 unsigned long bdi_thresh;
661 unsigned long task_bdi_thresh; 1015 long pause = 0;
662 unsigned long min_task_bdi_thresh; 1016 long uninitialized_var(max_pause);
663 unsigned long pages_written = 0;
664 unsigned long pause = 1;
665 bool dirty_exceeded = false; 1017 bool dirty_exceeded = false;
666 bool clear_dirty_exceeded = true; 1018 unsigned long task_ratelimit;
1019 unsigned long uninitialized_var(dirty_ratelimit);
1020 unsigned long pos_ratio;
667 struct backing_dev_info *bdi = mapping->backing_dev_info; 1021 struct backing_dev_info *bdi = mapping->backing_dev_info;
668 unsigned long start_time = jiffies; 1022 unsigned long start_time = jiffies;
669 1023
670 for (;;) { 1024 for (;;) {
1025 /*
1026 * Unstable writes are a feature of certain networked
 1027 * filesystems (e.g. NFS) in which data may have been
1028 * written to the server's write cache, but has not yet
1029 * been flushed to permanent storage.
1030 */
671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 1031 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
672 global_page_state(NR_UNSTABLE_NFS); 1032 global_page_state(NR_UNSTABLE_NFS);
673 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); 1033 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
@@ -679,12 +1039,28 @@ static void balance_dirty_pages(struct address_space *mapping,
679 * catch-up. This avoids (excessively) small writeouts 1039 * catch-up. This avoids (excessively) small writeouts
680 * when the bdi limits are ramping up. 1040 * when the bdi limits are ramping up.
681 */ 1041 */
682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2) 1042 freerun = dirty_freerun_ceiling(dirty_thresh,
1043 background_thresh);
1044 if (nr_dirty <= freerun)
683 break; 1045 break;
684 1046
1047 if (unlikely(!writeback_in_progress(bdi)))
1048 bdi_start_background_writeback(bdi);
1049
1050 /*
 1051 * bdi_thresh is not treated as a hard limiting factor the way
 1052 * dirty_thresh is, for two reasons:
1053 * - in JBOD setup, bdi_thresh can fluctuate a lot
1054 * - in a system with HDD and USB key, the USB key may somehow
1055 * go into state (bdi_dirty >> bdi_thresh) either because
1056 * bdi_dirty starts high, or because bdi_thresh drops low.
1057 * In this case we don't want to hard throttle the USB key
1058 * dirtiers for 100 seconds until bdi_dirty drops under
1059 * bdi_thresh. Instead the auxiliary bdi control line in
1060 * bdi_position_ratio() will let the dirtier task progress
1061 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1062 */
685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 1063 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
686 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
687 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
688 1064
689 /* 1065 /*
690 * In order to avoid the stacked BDI deadlock we need 1066 * In order to avoid the stacked BDI deadlock we need
@@ -696,56 +1072,69 @@ static void balance_dirty_pages(struct address_space *mapping,
696 * actually dirty; with m+n sitting in the percpu 1072 * actually dirty; with m+n sitting in the percpu
697 * deltas. 1073 * deltas.
698 */ 1074 */
699 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { 1075 if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 1076 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
701 bdi_dirty = bdi_nr_reclaimable + 1077 bdi_dirty = bdi_reclaimable +
702 bdi_stat_sum(bdi, BDI_WRITEBACK); 1078 bdi_stat_sum(bdi, BDI_WRITEBACK);
703 } else { 1079 } else {
704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 1080 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
705 bdi_dirty = bdi_nr_reclaimable + 1081 bdi_dirty = bdi_reclaimable +
706 bdi_stat(bdi, BDI_WRITEBACK); 1082 bdi_stat(bdi, BDI_WRITEBACK);
707 } 1083 }
708 1084
709 /* 1085 dirty_exceeded = (bdi_dirty > bdi_thresh) ||
710 * The bdi thresh is somehow "soft" limit derived from the
711 * global "hard" limit. The former helps to prevent heavy IO
712 * bdi or process from holding back light ones; The latter is
713 * the last resort safeguard.
714 */
715 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
716 (nr_dirty > dirty_thresh); 1086 (nr_dirty > dirty_thresh);
717 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && 1087 if (dirty_exceeded && !bdi->dirty_exceeded)
718 (nr_dirty <= dirty_thresh);
719
720 if (!dirty_exceeded)
721 break;
722
723 if (!bdi->dirty_exceeded)
724 bdi->dirty_exceeded = 1; 1088 bdi->dirty_exceeded = 1;
725 1089
726 bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, 1090 bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
727 bdi_thresh, bdi_dirty, start_time); 1091 nr_dirty, bdi_thresh, bdi_dirty,
728 1092 start_time);
729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 1093
730 * Unstable writes are a feature of certain networked 1094 max_pause = bdi_max_pause(bdi, bdi_dirty);
731 * filesystems (i.e. NFS) in which data may have been 1095
732 * written to the server's write cache, but has not yet 1096 dirty_ratelimit = bdi->dirty_ratelimit;
733 * been flushed to permanent storage. 1097 pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
734 * Only move pages to writeback if this bdi is over its 1098 background_thresh, nr_dirty,
735 * threshold otherwise wait until the disk writes catch 1099 bdi_thresh, bdi_dirty);
736 * up. 1100 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
737 */ 1101 RATELIMIT_CALC_SHIFT;
738 trace_balance_dirty_start(bdi); 1102 if (unlikely(task_ratelimit == 0)) {
739 if (bdi_nr_reclaimable > task_bdi_thresh) { 1103 pause = max_pause;
740 pages_written += writeback_inodes_wb(&bdi->wb, 1104 goto pause;
741 write_chunk); 1105 }
742 trace_balance_dirty_written(bdi, pages_written); 1106 pause = HZ * pages_dirtied / task_ratelimit;
743 if (pages_written >= write_chunk) 1107 if (unlikely(pause <= 0)) {
744 break; /* We've done our duty */ 1108 trace_balance_dirty_pages(bdi,
1109 dirty_thresh,
1110 background_thresh,
1111 nr_dirty,
1112 bdi_thresh,
1113 bdi_dirty,
1114 dirty_ratelimit,
1115 task_ratelimit,
1116 pages_dirtied,
1117 pause,
1118 start_time);
1119 pause = 1; /* avoid resetting nr_dirtied_pause below */
1120 break;
745 } 1121 }
1122 pause = min(pause, max_pause);
1123
1124pause:
1125 trace_balance_dirty_pages(bdi,
1126 dirty_thresh,
1127 background_thresh,
1128 nr_dirty,
1129 bdi_thresh,
1130 bdi_dirty,
1131 dirty_ratelimit,
1132 task_ratelimit,
1133 pages_dirtied,
1134 pause,
1135 start_time);
746 __set_current_state(TASK_UNINTERRUPTIBLE); 1136 __set_current_state(TASK_UNINTERRUPTIBLE);
747 io_schedule_timeout(pause); 1137 io_schedule_timeout(pause);
748 trace_balance_dirty_wait(bdi);
749 1138
750 dirty_thresh = hard_dirty_limit(dirty_thresh); 1139 dirty_thresh = hard_dirty_limit(dirty_thresh);
751 /* 1140 /*
@@ -754,35 +1143,30 @@ static void balance_dirty_pages(struct address_space *mapping,
754 * 200ms is typically more than enough to curb heavy dirtiers; 1143 * 200ms is typically more than enough to curb heavy dirtiers;
755 * (b) the pause time limit makes the dirtiers more responsive. 1144 * (b) the pause time limit makes the dirtiers more responsive.
756 */ 1145 */
757 if (nr_dirty < dirty_thresh + 1146 if (nr_dirty < dirty_thresh)
758 dirty_thresh / DIRTY_MAXPAUSE_AREA &&
759 time_after(jiffies, start_time + MAX_PAUSE))
760 break;
761 /*
762 * pass-good area. When some bdi gets blocked (eg. NFS server
763 * not responding), or write bandwidth dropped dramatically due
764 * to concurrent reads, or dirty threshold suddenly dropped and
765 * the dirty pages cannot be brought down anytime soon (eg. on
766 * slow USB stick), at least let go of the good bdi's.
767 */
768 if (nr_dirty < dirty_thresh +
769 dirty_thresh / DIRTY_PASSGOOD_AREA &&
770 bdi_dirty < bdi_thresh)
771 break; 1147 break;
772
773 /*
774 * Increase the delay for each loop, up to our previous
775 * default of taking a 100ms nap.
776 */
777 pause <<= 1;
778 if (pause > HZ / 10)
779 pause = HZ / 10;
780 } 1148 }
781 1149
782 /* Clear dirty_exceeded flag only when no task can exceed the limit */ 1150 if (!dirty_exceeded && bdi->dirty_exceeded)
783 if (clear_dirty_exceeded && bdi->dirty_exceeded)
784 bdi->dirty_exceeded = 0; 1151 bdi->dirty_exceeded = 0;
785 1152
1153 current->nr_dirtied = 0;
1154 if (pause == 0) { /* in freerun area */
1155 current->nr_dirtied_pause =
1156 dirty_poll_interval(nr_dirty, dirty_thresh);
1157 } else if (pause <= max_pause / 4 &&
1158 pages_dirtied >= current->nr_dirtied_pause) {
1159 current->nr_dirtied_pause = clamp_val(
1160 dirty_ratelimit * (max_pause / 2) / HZ,
1161 pages_dirtied + pages_dirtied / 8,
1162 pages_dirtied * 4);
1163 } else if (pause >= max_pause) {
1164 current->nr_dirtied_pause = 1 | clamp_val(
1165 dirty_ratelimit * (max_pause / 2) / HZ,
1166 pages_dirtied / 4,
1167 pages_dirtied - pages_dirtied / 8);
1168 }
1169
786 if (writeback_in_progress(bdi)) 1170 if (writeback_in_progress(bdi))
787 return; 1171 return;
788 1172
@@ -794,8 +1178,10 @@ static void balance_dirty_pages(struct address_space *mapping,
794 * In normal mode, we start background writeout at the lower 1178 * In normal mode, we start background writeout at the lower
795 * background_thresh, to keep the amount of dirty memory low. 1179 * background_thresh, to keep the amount of dirty memory low.
796 */ 1180 */
797 if ((laptop_mode && pages_written) || 1181 if (laptop_mode)
798 (!laptop_mode && (nr_reclaimable > background_thresh))) 1182 return;
1183
1184 if (nr_reclaimable > background_thresh)
799 bdi_start_background_writeback(bdi); 1185 bdi_start_background_writeback(bdi);
800} 1186}
801 1187
@@ -809,7 +1195,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
809 } 1195 }
810} 1196}
811 1197
812static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; 1198static DEFINE_PER_CPU(int, bdp_ratelimits);
813 1199
814/** 1200/**
815 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1201 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -829,31 +1215,39 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
829 unsigned long nr_pages_dirtied) 1215 unsigned long nr_pages_dirtied)
830{ 1216{
831 struct backing_dev_info *bdi = mapping->backing_dev_info; 1217 struct backing_dev_info *bdi = mapping->backing_dev_info;
832 unsigned long ratelimit; 1218 int ratelimit;
833 unsigned long *p; 1219 int *p;
834 1220
835 if (!bdi_cap_account_dirty(bdi)) 1221 if (!bdi_cap_account_dirty(bdi))
836 return; 1222 return;
837 1223
838 ratelimit = ratelimit_pages; 1224 ratelimit = current->nr_dirtied_pause;
839 if (mapping->backing_dev_info->dirty_exceeded) 1225 if (bdi->dirty_exceeded)
840 ratelimit = 8; 1226 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
1227
1228 current->nr_dirtied += nr_pages_dirtied;
841 1229
1230 preempt_disable();
842 /* 1231 /*
 843 * Check the rate limiting. Also, we do not want to throttle real-time 1232 * This prevents one CPU from accumulating too many dirtied pages without
844 * tasks in balance_dirty_pages(). Period. 1233 * calling into balance_dirty_pages(), which can happen when there are
 1234 * 1000+ tasks that all start dirtying pages at exactly the same
 1235 * time, and hence all honour a too-large initial task->nr_dirtied_pause.
845 */ 1236 */
846 preempt_disable();
847 p = &__get_cpu_var(bdp_ratelimits); 1237 p = &__get_cpu_var(bdp_ratelimits);
848 *p += nr_pages_dirtied; 1238 if (unlikely(current->nr_dirtied >= ratelimit))
849 if (unlikely(*p >= ratelimit)) {
850 ratelimit = sync_writeback_pages(*p);
851 *p = 0; 1239 *p = 0;
852 preempt_enable(); 1240 else {
853 balance_dirty_pages(mapping, ratelimit); 1241 *p += nr_pages_dirtied;
854 return; 1242 if (unlikely(*p >= ratelimit_pages)) {
1243 *p = 0;
1244 ratelimit = 0;
1245 }
855 } 1246 }
856 preempt_enable(); 1247 preempt_enable();
1248
1249 if (unlikely(current->nr_dirtied >= ratelimit))
1250 balance_dirty_pages(mapping, current->nr_dirtied);
857} 1251}
858EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1252EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
859 1253
@@ -909,7 +1303,8 @@ void laptop_mode_timer_fn(unsigned long data)
909 * threshold 1303 * threshold
910 */ 1304 */
911 if (bdi_has_dirty_io(&q->backing_dev_info)) 1305 if (bdi_has_dirty_io(&q->backing_dev_info))
912 bdi_start_writeback(&q->backing_dev_info, nr_pages); 1306 bdi_start_writeback(&q->backing_dev_info, nr_pages,
1307 WB_REASON_LAPTOP_TIMER);
913} 1308}
914 1309
915/* 1310/*
@@ -948,22 +1343,17 @@ void laptop_sync_completion(void)
948 * 1343 *
949 * Here we set ratelimit_pages to a level which ensures that when all CPUs are 1344 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
950 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory 1345 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
951 * thresholds before writeback cuts in. 1346 * thresholds.
952 *
953 * But the limit should not be set too high. Because it also controls the
954 * amount of memory which the balance_dirty_pages() caller has to write back.
955 * If this is too large then the caller will block on the IO queue all the
956 * time. So limit it to four megabytes - the balance_dirty_pages() caller
957 * will write six megabyte chunks, max.
958 */ 1347 */
959 1348
960void writeback_set_ratelimit(void) 1349void writeback_set_ratelimit(void)
961{ 1350{
962 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); 1351 unsigned long background_thresh;
1352 unsigned long dirty_thresh;
1353 global_dirty_limits(&background_thresh, &dirty_thresh);
1354 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
963 if (ratelimit_pages < 16) 1355 if (ratelimit_pages < 16)
964 ratelimit_pages = 16; 1356 ratelimit_pages = 16;
965 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
966 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
967} 1357}
968 1358
969static int __cpuinit 1359static int __cpuinit
@@ -1333,6 +1723,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1333 __inc_zone_page_state(page, NR_FILE_DIRTY); 1723 __inc_zone_page_state(page, NR_FILE_DIRTY);
1334 __inc_zone_page_state(page, NR_DIRTIED); 1724 __inc_zone_page_state(page, NR_DIRTIED);
1335 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1725 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1726 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
1336 task_dirty_inc(current); 1727 task_dirty_inc(current);
1337 task_io_account_write(PAGE_CACHE_SIZE); 1728 task_io_account_write(PAGE_CACHE_SIZE);
1338 } 1729 }
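For reference, the per-task sleep introduced above boils down to pause = HZ * pages_dirtied / task_ratelimit, clamped to bdi_max_pause(). The fragment below is an illustration only, not part of the patch; HZ and all sample values are invented, and pos_ratio is modeled in 1/1024 units as a stand-in for RATELIMIT_CALC_SHIFT.

#include <stdio.h>

#define HZ 1000

int main(void)
{
	unsigned long dirty_ratelimit = 2560;	/* pages/s granted to this task */
	unsigned long max_pause = HZ / 5;	/* 200ms cap, illustrative */
	unsigned long pages_dirtied = 64;

	for (unsigned long pos_ratio = 256; pos_ratio <= 1024; pos_ratio += 256) {
		unsigned long task_ratelimit =
			dirty_ratelimit * pos_ratio / 1024 + 1;
		unsigned long pause = HZ * pages_dirtied / task_ratelimit;

		if (pause > max_pause)
			pause = max_pause;
		printf("pos_ratio=%4lu/1024 -> sleep %3lu ms per %lu pages\n",
		       pos_ratio, pause * 1000 / HZ, pages_dirtied);
	}
	return 0;
}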
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1dbcf8888f14..9dd443d89d8b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -318,6 +318,7 @@ static void bad_page(struct page *page)
318 current->comm, page_to_pfn(page)); 318 current->comm, page_to_pfn(page));
319 dump_page(page); 319 dump_page(page);
320 320
321 print_modules();
321 dump_stack(); 322 dump_stack();
322out: 323out:
323 /* Leave bad fields for debug, except PageBuddy could make trouble */ 324 /* Leave bad fields for debug, except PageBuddy could make trouble */
@@ -1409,14 +1410,11 @@ static int __init fail_page_alloc_debugfs(void)
1409{ 1410{
1410 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1411 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1411 struct dentry *dir; 1412 struct dentry *dir;
1412 int err;
1413 1413
1414 err = init_fault_attr_dentries(&fail_page_alloc.attr, 1414 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1415 "fail_page_alloc"); 1415 &fail_page_alloc.attr);
1416 if (err) 1416 if (IS_ERR(dir))
1417 return err; 1417 return PTR_ERR(dir);
1418
1419 dir = fail_page_alloc.attr.dir;
1420 1418
1421 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1419 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1422 &fail_page_alloc.ignore_gfp_wait)) 1420 &fail_page_alloc.ignore_gfp_wait))
@@ -1430,7 +1428,7 @@ static int __init fail_page_alloc_debugfs(void)
1430 1428
1431 return 0; 1429 return 0;
1432fail: 1430fail:
1433 cleanup_fault_attr_dentries(&fail_page_alloc.attr); 1431 debugfs_remove_recursive(dir);
1434 1432
1435 return -ENOMEM; 1433 return -ENOMEM;
1436} 1434}
@@ -1756,7 +1754,6 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
1756 1754
1757void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 1755void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1758{ 1756{
1759 va_list args;
1760 unsigned int filter = SHOW_MEM_FILTER_NODES; 1757 unsigned int filter = SHOW_MEM_FILTER_NODES;
1761 1758
1762 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 1759 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
@@ -1775,14 +1772,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1775 filter &= ~SHOW_MEM_FILTER_NODES; 1772 filter &= ~SHOW_MEM_FILTER_NODES;
1776 1773
1777 if (fmt) { 1774 if (fmt) {
1778 printk(KERN_WARNING); 1775 struct va_format vaf;
1776 va_list args;
1777
1779 va_start(args, fmt); 1778 va_start(args, fmt);
1780 vprintk(fmt, args); 1779
1780 vaf.fmt = fmt;
1781 vaf.va = &args;
1782
1783 pr_warn("%pV", &vaf);
1784
1781 va_end(args); 1785 va_end(args);
1782 } 1786 }
1783 1787
1784 pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", 1788 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
1785 current->comm, order, gfp_mask); 1789 current->comm, order, gfp_mask);
1786 1790
1787 dump_stack(); 1791 dump_stack();
1788 if (!should_suppress_show_mem()) 1792 if (!should_suppress_show_mem())
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 39d216d535ea..2d123f94a8df 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc)
133static void *__meminit alloc_page_cgroup(size_t size, int nid) 133static void *__meminit alloc_page_cgroup(size_t size, int nid)
134{ 134{
135 void *addr = NULL; 135 void *addr = NULL;
136 gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
136 137
137 addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); 138 addr = alloc_pages_exact_nid(nid, size, flags);
138 if (addr) 139 if (addr) {
140 kmemleak_alloc(addr, size, 1, flags);
139 return addr; 141 return addr;
142 }
140 143
141 if (node_state(nid, N_HIGH_MEMORY)) 144 if (node_state(nid, N_HIGH_MEMORY))
142 addr = vmalloc_node(size, nid); 145 addr = vmalloc_node(size, nid);
@@ -357,7 +360,7 @@ struct swap_cgroup_ctrl {
357 spinlock_t lock; 360 spinlock_t lock;
358}; 361};
359 362
360struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 363static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
361 364
362struct swap_cgroup { 365struct swap_cgroup {
363 unsigned short id; 366 unsigned short id;
@@ -513,11 +516,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
513 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); 516 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
514 array_size = length * sizeof(void *); 517 array_size = length * sizeof(void *);
515 518
516 array = vmalloc(array_size); 519 array = vzalloc(array_size);
517 if (!array) 520 if (!array)
518 goto nomem; 521 goto nomem;
519 522
520 memset(array, 0, array_size);
521 ctrl = &swap_cgroup_ctrl[type]; 523 ctrl = &swap_cgroup_ctrl[type];
522 mutex_lock(&swap_cgroup_mutex); 524 mutex_lock(&swap_cgroup_mutex);
523 ctrl->length = length; 525 ctrl->length = length;
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
new file mode 100644
index 000000000000..e920aa3ce104
--- /dev/null
+++ b/mm/process_vm_access.c
@@ -0,0 +1,496 @@
1/*
2 * linux/mm/process_vm_access.c
3 *
4 * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/uio.h>
14#include <linux/sched.h>
15#include <linux/highmem.h>
16#include <linux/ptrace.h>
17#include <linux/slab.h>
18#include <linux/syscalls.h>
19
20#ifdef CONFIG_COMPAT
21#include <linux/compat.h>
22#endif
23
24/**
25 * process_vm_rw_pages - read/write pages from task specified
26 * @task: task to read/write from
27 * @mm: mm for task
28 * @process_pages: struct pages area that can store at least
29 * nr_pages_to_copy struct page pointers
30 * @pa: address of page in task to start copying from/to
31 * @start_offset: offset in page to start copying from/to
32 * @len: number of bytes to copy
33 * @lvec: iovec array specifying where to copy to/from
34 * @lvec_cnt: number of elements in iovec array
35 * @lvec_current: index in iovec array we are up to
36 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
37 * @vm_write: 0 means copy from, 1 means copy to
38 * @nr_pages_to_copy: number of pages to copy
39 * @bytes_copied: returns number of bytes successfully copied
40 * Returns 0 on success, error code otherwise
41 */
42static int process_vm_rw_pages(struct task_struct *task,
43 struct mm_struct *mm,
44 struct page **process_pages,
45 unsigned long pa,
46 unsigned long start_offset,
47 unsigned long len,
48 const struct iovec *lvec,
49 unsigned long lvec_cnt,
50 unsigned long *lvec_current,
51 size_t *lvec_offset,
52 int vm_write,
53 unsigned int nr_pages_to_copy,
54 ssize_t *bytes_copied)
55{
56 int pages_pinned;
57 void *target_kaddr;
58 int pgs_copied = 0;
59 int j;
60 int ret;
61 ssize_t bytes_to_copy;
62 ssize_t rc = 0;
63
64 *bytes_copied = 0;
65
66 /* Get the pages we're interested in */
67 down_read(&mm->mmap_sem);
68 pages_pinned = get_user_pages(task, mm, pa,
69 nr_pages_to_copy,
70 vm_write, 0, process_pages, NULL);
71 up_read(&mm->mmap_sem);
72
73 if (pages_pinned != nr_pages_to_copy) {
74 rc = -EFAULT;
75 goto end;
76 }
77
78 /* Do the copy for each page */
79 for (pgs_copied = 0;
80 (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
81 pgs_copied++) {
82 /* Make sure we have a non zero length iovec */
83 while (*lvec_current < lvec_cnt
84 && lvec[*lvec_current].iov_len == 0)
85 (*lvec_current)++;
86 if (*lvec_current == lvec_cnt)
87 break;
88
89 /*
90 * Will copy smallest of:
91 * - bytes remaining in page
92 * - bytes remaining in destination iovec
93 */
94 bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
95 len - *bytes_copied);
96 bytes_to_copy = min_t(ssize_t, bytes_to_copy,
97 lvec[*lvec_current].iov_len
98 - *lvec_offset);
99
100 target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
101
102 if (vm_write)
103 ret = copy_from_user(target_kaddr,
104 lvec[*lvec_current].iov_base
105 + *lvec_offset,
106 bytes_to_copy);
107 else
108 ret = copy_to_user(lvec[*lvec_current].iov_base
109 + *lvec_offset,
110 target_kaddr, bytes_to_copy);
111 kunmap(process_pages[pgs_copied]);
112 if (ret) {
113 *bytes_copied += bytes_to_copy - ret;
114 pgs_copied++;
115 rc = -EFAULT;
116 goto end;
117 }
118 *bytes_copied += bytes_to_copy;
119 *lvec_offset += bytes_to_copy;
120 if (*lvec_offset == lvec[*lvec_current].iov_len) {
121 /*
122 * Need to copy remaining part of page into the
123 * next iovec if there are any bytes left in page
124 */
125 (*lvec_current)++;
126 *lvec_offset = 0;
127 start_offset = (start_offset + bytes_to_copy)
128 % PAGE_SIZE;
129 if (start_offset)
130 pgs_copied--;
131 } else {
132 start_offset = 0;
133 }
134 }
135
136end:
137 if (vm_write) {
138 for (j = 0; j < pages_pinned; j++) {
139 if (j < pgs_copied)
140 set_page_dirty_lock(process_pages[j]);
141 put_page(process_pages[j]);
142 }
143 } else {
144 for (j = 0; j < pages_pinned; j++)
145 put_page(process_pages[j]);
146 }
147
148 return rc;
149}
150
 151/* Maximum size (in bytes, two pages) of the kmalloc'd area holding struct page pointers during copy */
152#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
153
154/**
155 * process_vm_rw_single_vec - read/write pages from task specified
156 * @addr: start memory address of target process
157 * @len: size of area to copy to/from
158 * @lvec: iovec array specifying where to copy to/from locally
159 * @lvec_cnt: number of elements in iovec array
160 * @lvec_current: index in iovec array we are up to
161 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
162 * @process_pages: struct pages area that can store at least
163 * nr_pages_to_copy struct page pointers
164 * @mm: mm for task
165 * @task: task to read/write from
166 * @vm_write: 0 means copy from, 1 means copy to
167 * @bytes_copied: returns number of bytes successfully copied
168 * Returns 0 on success or on failure error code
169 */
170static int process_vm_rw_single_vec(unsigned long addr,
171 unsigned long len,
172 const struct iovec *lvec,
173 unsigned long lvec_cnt,
174 unsigned long *lvec_current,
175 size_t *lvec_offset,
176 struct page **process_pages,
177 struct mm_struct *mm,
178 struct task_struct *task,
179 int vm_write,
180 ssize_t *bytes_copied)
181{
182 unsigned long pa = addr & PAGE_MASK;
183 unsigned long start_offset = addr - pa;
184 unsigned long nr_pages;
185 ssize_t bytes_copied_loop;
186 ssize_t rc = 0;
187 unsigned long nr_pages_copied = 0;
188 unsigned long nr_pages_to_copy;
189 unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
190 / sizeof(struct pages *);
191
192 *bytes_copied = 0;
193
194 /* Work out address and page range required */
195 if (len == 0)
196 return 0;
197 nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
198
199 while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
200 nr_pages_to_copy = min(nr_pages - nr_pages_copied,
201 max_pages_per_loop);
202
203 rc = process_vm_rw_pages(task, mm, process_pages, pa,
204 start_offset, len,
205 lvec, lvec_cnt,
206 lvec_current, lvec_offset,
207 vm_write, nr_pages_to_copy,
208 &bytes_copied_loop);
209 start_offset = 0;
210 *bytes_copied += bytes_copied_loop;
211
212 if (rc < 0) {
213 return rc;
214 } else {
215 len -= bytes_copied_loop;
216 nr_pages_copied += nr_pages_to_copy;
217 pa += nr_pages_to_copy * PAGE_SIZE;
218 }
219 }
220
221 return rc;
222}
223
224/* Maximum number of entries for process pages array
225 which lives on stack */
226#define PVM_MAX_PP_ARRAY_COUNT 16
227
228/**
229 * process_vm_rw_core - core of reading/writing pages from task specified
230 * @pid: PID of process to read/write from/to
231 * @lvec: iovec array specifying where to copy to/from locally
232 * @liovcnt: size of lvec array
233 * @rvec: iovec array specifying where to copy to/from in the other process
234 * @riovcnt: size of rvec array
235 * @flags: currently unused
236 * @vm_write: 0 if reading from other process, 1 if writing to other process
237 * Returns the number of bytes read/written or error code. May
238 * return less bytes than expected if an error occurs during the copying
239 * process.
240 */
241static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
242 unsigned long liovcnt,
243 const struct iovec *rvec,
244 unsigned long riovcnt,
245 unsigned long flags, int vm_write)
246{
247 struct task_struct *task;
248 struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
249 struct page **process_pages = pp_stack;
250 struct mm_struct *mm;
251 unsigned long i;
252 ssize_t rc = 0;
253 ssize_t bytes_copied_loop;
254 ssize_t bytes_copied = 0;
255 unsigned long nr_pages = 0;
256 unsigned long nr_pages_iov;
257 unsigned long iov_l_curr_idx = 0;
258 size_t iov_l_curr_offset = 0;
259 ssize_t iov_len;
260
261 /*
262 * Work out how many pages of struct pages we're going to need
263 * when eventually calling get_user_pages
264 */
265 for (i = 0; i < riovcnt; i++) {
266 iov_len = rvec[i].iov_len;
267 if (iov_len > 0) {
268 nr_pages_iov = ((unsigned long)rvec[i].iov_base
269 + iov_len)
270 / PAGE_SIZE - (unsigned long)rvec[i].iov_base
271 / PAGE_SIZE + 1;
272 nr_pages = max(nr_pages, nr_pages_iov);
273 }
274 }
275
276 if (nr_pages == 0)
277 return 0;
278
279 if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
280 /* For reliability don't try to kmalloc more than
281 2 pages worth */
282 process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
283 sizeof(struct pages *)*nr_pages),
284 GFP_KERNEL);
285
286 if (!process_pages)
287 return -ENOMEM;
288 }
289
290 /* Get process information */
291 rcu_read_lock();
292 task = find_task_by_vpid(pid);
293 if (task)
294 get_task_struct(task);
295 rcu_read_unlock();
296 if (!task) {
297 rc = -ESRCH;
298 goto free_proc_pages;
299 }
300
301 task_lock(task);
302 if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
303 task_unlock(task);
304 rc = -EPERM;
305 goto put_task_struct;
306 }
307 mm = task->mm;
308
309 if (!mm || (task->flags & PF_KTHREAD)) {
310 task_unlock(task);
311 rc = -EINVAL;
312 goto put_task_struct;
313 }
314
315 atomic_inc(&mm->mm_users);
316 task_unlock(task);
317
318 for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
319 rc = process_vm_rw_single_vec(
320 (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
321 lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
322 process_pages, mm, task, vm_write, &bytes_copied_loop);
323 bytes_copied += bytes_copied_loop;
324 if (rc != 0) {
325 /* If we have managed to copy any data at all then
326 we return the number of bytes copied. Otherwise
327 we return the error code */
328 if (bytes_copied)
329 rc = bytes_copied;
330 goto put_mm;
331 }
332 }
333
334 rc = bytes_copied;
335put_mm:
336 mmput(mm);
337
338put_task_struct:
339 put_task_struct(task);
340
341free_proc_pages:
342 if (process_pages != pp_stack)
343 kfree(process_pages);
344 return rc;
345}
346
347/**
348 * process_vm_rw - check iovecs before calling core routine
349 * @pid: PID of process to read/write from/to
350 * @lvec: iovec array specifying where to copy to/from locally
351 * @liovcnt: size of lvec array
352 * @rvec: iovec array specifying where to copy to/from in the other process
353 * @riovcnt: size of rvec array
354 * @flags: currently unused
355 * @vm_write: 0 if reading from other process, 1 if writing to other process
356 * Returns the number of bytes read/written or error code. May
357 * return less bytes than expected if an error occurs during the copying
358 * process.
359 */
360static ssize_t process_vm_rw(pid_t pid,
361 const struct iovec __user *lvec,
362 unsigned long liovcnt,
363 const struct iovec __user *rvec,
364 unsigned long riovcnt,
365 unsigned long flags, int vm_write)
366{
367 struct iovec iovstack_l[UIO_FASTIOV];
368 struct iovec iovstack_r[UIO_FASTIOV];
369 struct iovec *iov_l = iovstack_l;
370 struct iovec *iov_r = iovstack_r;
371 ssize_t rc;
372
373 if (flags != 0)
374 return -EINVAL;
375
376 /* Check iovecs */
377 if (vm_write)
378 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
379 iovstack_l, &iov_l, 1);
380 else
381 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
382 iovstack_l, &iov_l, 1);
383 if (rc <= 0)
384 goto free_iovecs;
385
386 rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV,
387 iovstack_r, &iov_r, 0);
388 if (rc <= 0)
389 goto free_iovecs;
390
391 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
392 vm_write);
393
394free_iovecs:
395 if (iov_r != iovstack_r)
396 kfree(iov_r);
397 if (iov_l != iovstack_l)
398 kfree(iov_l);
399
400 return rc;
401}
402
403SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
404 unsigned long, liovcnt, const struct iovec __user *, rvec,
405 unsigned long, riovcnt, unsigned long, flags)
406{
407 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
408}
409
410SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
411 const struct iovec __user *, lvec,
412 unsigned long, liovcnt, const struct iovec __user *, rvec,
413 unsigned long, riovcnt, unsigned long, flags)
414{
415 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
416}
417
418#ifdef CONFIG_COMPAT
419
420asmlinkage ssize_t
421compat_process_vm_rw(compat_pid_t pid,
422 const struct compat_iovec __user *lvec,
423 unsigned long liovcnt,
424 const struct compat_iovec __user *rvec,
425 unsigned long riovcnt,
426 unsigned long flags, int vm_write)
427{
428 struct iovec iovstack_l[UIO_FASTIOV];
429 struct iovec iovstack_r[UIO_FASTIOV];
430 struct iovec *iov_l = iovstack_l;
431 struct iovec *iov_r = iovstack_r;
432 ssize_t rc = -EFAULT;
433
434 if (flags != 0)
435 return -EINVAL;
436
437 if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
438 goto out;
439
440 if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
441 goto out;
442
443 if (vm_write)
444 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
445 UIO_FASTIOV, iovstack_l,
446 &iov_l, 1);
447 else
448 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
449 UIO_FASTIOV, iovstack_l,
450 &iov_l, 1);
451 if (rc <= 0)
452 goto free_iovecs;
453 rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt,
454 UIO_FASTIOV, iovstack_r,
455 &iov_r, 0);
456 if (rc <= 0)
457 goto free_iovecs;
458
459 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
460 vm_write);
461
462free_iovecs:
463 if (iov_r != iovstack_r)
464 kfree(iov_r);
465 if (iov_l != iovstack_l)
466 kfree(iov_l);
467
468out:
469 return rc;
470}
471
472asmlinkage ssize_t
473compat_sys_process_vm_readv(compat_pid_t pid,
474 const struct compat_iovec __user *lvec,
475 unsigned long liovcnt,
476 const struct compat_iovec __user *rvec,
477 unsigned long riovcnt,
478 unsigned long flags)
479{
480 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
481 riovcnt, flags, 0);
482}
483
484asmlinkage ssize_t
485compat_sys_process_vm_writev(compat_pid_t pid,
486 const struct compat_iovec __user *lvec,
487 unsigned long liovcnt,
488 const struct compat_iovec __user *rvec,
489 unsigned long riovcnt,
490 unsigned long flags)
491{
492 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
493 riovcnt, flags, 1);
494}
495
496#endif
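The two syscalls added above can be exercised from userspace as follows. This is an illustration only, not part of the patch; it assumes the glibc wrappers for process_vm_readv()/process_vm_writev() (available in glibc >= 2.15), and reads from the calling process itself so no extra ptrace permission is needed. Without the wrappers, syscall(__NR_process_vm_readv, ...) would be used instead.

/* build: cc -o pvm_demo pvm_demo.c */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char src[32] = "hello from the remote iovec";
	char dst[32] = { 0 };
	struct iovec local = { .iov_base = dst, .iov_len = sizeof(dst) };
	struct iovec remote = { .iov_base = src, .iov_len = sizeof(src) };

	/* copy sizeof(src) bytes out of our own address space */
	ssize_t n = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
	if (n < 0) {
		perror("process_vm_readv");
		return 1;
	}
	printf("copied %zd bytes: %s\n", n, dst);
	return 0;
}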
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 2876349339a7..942212970529 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -17,7 +17,6 @@
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/mmzone.h> 19#include <linux/mmzone.h>
20#include <linux/module.h>
21#include <linux/quicklist.h> 20#include <linux/quicklist.h>
22 21
23DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); 22DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
diff --git a/mm/readahead.c b/mm/readahead.c
index 867f9dd82dcd..cbcbb02f3e28 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -11,7 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index 8005080fb9e3..a4fd3680038b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -51,7 +51,7 @@
51#include <linux/ksm.h> 51#include <linux/ksm.h>
52#include <linux/rmap.h> 52#include <linux/rmap.h>
53#include <linux/rcupdate.h> 53#include <linux/rcupdate.h>
54#include <linux/module.h> 54#include <linux/export.h>
55#include <linux/memcontrol.h> 55#include <linux/memcontrol.h>
56#include <linux/mmu_notifier.h> 56#include <linux/mmu_notifier.h>
57#include <linux/migrate.h> 57#include <linux/migrate.h>
@@ -1164,7 +1164,7 @@ void page_remove_rmap(struct page *page)
1164 1164
1165/* 1165/*
1166 * Subfunctions of try_to_unmap: try_to_unmap_one called 1166 * Subfunctions of try_to_unmap: try_to_unmap_one called
1167 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 1167 * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
1168 */ 1168 */
1169int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1169int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1170 unsigned long address, enum ttu_flags flags) 1170 unsigned long address, enum ttu_flags flags)
diff --git a/mm/shmem.c b/mm/shmem.c
index 5cc21f8b4cd3..d6722506d2da 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins. 9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 * 13 *
@@ -27,8 +28,7 @@
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
30#include <linux/module.h> 31#include <linux/export.h>
31#include <linux/percpu_counter.h>
32#include <linux/swap.h> 32#include <linux/swap.h>
33 33
34static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -51,6 +51,8 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h>
54#include <linux/splice.h> 56#include <linux/splice.h>
55#include <linux/security.h> 57#include <linux/security.h>
56#include <linux/swapops.h> 58#include <linux/swapops.h>
@@ -63,43 +65,17 @@ static struct vfsmount *shm_mnt;
63#include <linux/magic.h> 65#include <linux/magic.h>
64 66
65#include <asm/uaccess.h> 67#include <asm/uaccess.h>
66#include <asm/div64.h>
67#include <asm/pgtable.h> 68#include <asm/pgtable.h>
68 69
69/*
70 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
71 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
72 *
73 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
74 * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
75 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
76 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
77 *
78 * We use / and * instead of shifts in the definitions below, so that the swap
79 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
80 */
81#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
82#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
83
84#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
85#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
86
87#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
88#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
89
90#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 70#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
91#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 71#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
92 72
93/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
94#define SHMEM_PAGEIN VM_READ
95#define SHMEM_TRUNCATE VM_WRITE
96
97/* Definition to limit shmem_truncate's steps between cond_rescheds */
98#define LATENCY_LIMIT 64
99
100/* Pretend that each entry is of this size in directory's i_size */ 73/* Pretend that each entry is of this size in directory's i_size */
101#define BOGO_DIRENT_SIZE 20 74#define BOGO_DIRENT_SIZE 20
102 75
76/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
77#define SHORT_SYMLINK_LEN 128
78
103struct shmem_xattr { 79struct shmem_xattr {
104 struct list_head list; /* anchored by shmem_inode_info->xattr_list */ 80 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
105 char *name; /* xattr name */ 81 char *name; /* xattr name */
@@ -107,7 +83,7 @@ struct shmem_xattr {
107 char value[0]; 83 char value[0];
108}; 84};
109 85
110/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 86/* Flag allocation requirements to shmem_getpage */
111enum sgp_type { 87enum sgp_type {
112 SGP_READ, /* don't exceed i_size, don't allocate page */ 88 SGP_READ, /* don't exceed i_size, don't allocate page */
113 SGP_CACHE, /* don't exceed i_size, may allocate page */ 89 SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -137,56 +113,6 @@ static inline int shmem_getpage(struct inode *inode, pgoff_t index,
137 mapping_gfp_mask(inode->i_mapping), fault_type); 113 mapping_gfp_mask(inode->i_mapping), fault_type);
138} 114}
139 115
140static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
141{
142 /*
143 * The above definition of ENTRIES_PER_PAGE, and the use of
144 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
145 * might be reconsidered if it ever diverges from PAGE_SIZE.
146 *
147 * Mobility flags are masked out as swap vectors cannot move
148 */
149 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
150 PAGE_CACHE_SHIFT-PAGE_SHIFT);
151}
152
153static inline void shmem_dir_free(struct page *page)
154{
155 __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
156}
157
158static struct page **shmem_dir_map(struct page *page)
159{
160 return (struct page **)kmap_atomic(page, KM_USER0);
161}
162
163static inline void shmem_dir_unmap(struct page **dir)
164{
165 kunmap_atomic(dir, KM_USER0);
166}
167
168static swp_entry_t *shmem_swp_map(struct page *page)
169{
170 return (swp_entry_t *)kmap_atomic(page, KM_USER1);
171}
172
173static inline void shmem_swp_balance_unmap(void)
174{
175 /*
176 * When passing a pointer to an i_direct entry, to code which
177 * also handles indirect entries and so will shmem_swp_unmap,
178 * we must arrange for the preempt count to remain in balance.
179 * What kmap_atomic of a lowmem page does depends on config
180 * and architecture, so pretend to kmap_atomic some lowmem page.
181 */
182 (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
183}
184
185static inline void shmem_swp_unmap(swp_entry_t *entry)
186{
187 kunmap_atomic(entry, KM_USER1);
188}
189
190static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 116static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
191{ 117{
192 return sb->s_fs_info; 118 return sb->s_fs_info;
@@ -244,15 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
244static LIST_HEAD(shmem_swaplist); 170static LIST_HEAD(shmem_swaplist);
245static DEFINE_MUTEX(shmem_swaplist_mutex); 171static DEFINE_MUTEX(shmem_swaplist_mutex);
246 172
247static void shmem_free_blocks(struct inode *inode, long pages)
248{
249 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
250 if (sbinfo->max_blocks) {
251 percpu_counter_add(&sbinfo->used_blocks, -pages);
252 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
253 }
254}
255
256static int shmem_reserve_inode(struct super_block *sb) 173static int shmem_reserve_inode(struct super_block *sb)
257{ 174{
258 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 175 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -279,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)
279} 196}
280 197
281/** 198/**
282 * shmem_recalc_inode - recalculate the size of an inode 199 * shmem_recalc_inode - recalculate the block usage of an inode
283 * @inode: inode to recalc 200 * @inode: inode to recalc
284 * 201 *
285 * We have to calculate the free blocks since the mm can drop 202 * We have to calculate the free blocks since the mm can drop
@@ -297,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode)
297 214
298 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 215 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
299 if (freed > 0) { 216 if (freed > 0) {
217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
218 if (sbinfo->max_blocks)
219 percpu_counter_add(&sbinfo->used_blocks, -freed);
300 info->alloced -= freed; 220 info->alloced -= freed;
221 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
301 shmem_unacct_blocks(info->flags, freed); 222 shmem_unacct_blocks(info->flags, freed);
302 shmem_free_blocks(inode, freed);
303 } 223 }
304} 224}
305 225
306/** 226/*
307 * shmem_swp_entry - find the swap vector position in the info structure 227 * Replace item expected in radix tree by a new item, while holding tree lock.
308 * @info: info structure for the inode
309 * @index: index of the page to find
310 * @page: optional page to add to the structure. Has to be preset to
311 * all zeros
312 *
313 * If there is no space allocated yet it will return NULL when
314 * page is NULL, else it will use the page for the needed block,
315 * setting it to NULL on return to indicate that it has been used.
316 *
317 * The swap vector is organized the following way:
318 *
319 * There are SHMEM_NR_DIRECT entries directly stored in the
 320 * shmem_inode_info structure. So small files do not need an additional
321 * allocation.
322 *
323 * For pages with index > SHMEM_NR_DIRECT there is the pointer
324 * i_indirect which points to a page which holds in the first half
325 * doubly indirect blocks, in the second half triple indirect blocks:
326 *
327 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
328 * following layout (for SHMEM_NR_DIRECT == 16):
329 *
330 * i_indirect -> dir --> 16-19
331 * | +-> 20-23
332 * |
333 * +-->dir2 --> 24-27
334 * | +-> 28-31
335 * | +-> 32-35
336 * | +-> 36-39
337 * |
338 * +-->dir3 --> 40-43
339 * +-> 44-47
340 * +-> 48-51
341 * +-> 52-55
342 */ 228 */
343static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) 229static int shmem_radix_tree_replace(struct address_space *mapping,
344{ 230 pgoff_t index, void *expected, void *replacement)
345 unsigned long offset; 231{
346 struct page **dir; 232 void **pslot;
347 struct page *subdir; 233 void *item = NULL;
348 234
349 if (index < SHMEM_NR_DIRECT) { 235 VM_BUG_ON(!expected);
350 shmem_swp_balance_unmap(); 236 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
351 return info->i_direct+index; 237 if (pslot)
352 } 238 item = radix_tree_deref_slot_protected(pslot,
353 if (!info->i_indirect) { 239 &mapping->tree_lock);
354 if (page) { 240 if (item != expected)
355 info->i_indirect = *page; 241 return -ENOENT;
356 *page = NULL; 242 if (replacement)
357 } 243 radix_tree_replace_slot(pslot, replacement);
358 return NULL; /* need another page */ 244 else
359 } 245 radix_tree_delete(&mapping->page_tree, index);
360 246 return 0;
361 index -= SHMEM_NR_DIRECT; 247}
362 offset = index % ENTRIES_PER_PAGE;
363 index /= ENTRIES_PER_PAGE;
364 dir = shmem_dir_map(info->i_indirect);
365
366 if (index >= ENTRIES_PER_PAGE/2) {
367 index -= ENTRIES_PER_PAGE/2;
368 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
369 index %= ENTRIES_PER_PAGE;
370 subdir = *dir;
371 if (!subdir) {
372 if (page) {
373 *dir = *page;
374 *page = NULL;
375 }
376 shmem_dir_unmap(dir);
377 return NULL; /* need another page */
378 }
379 shmem_dir_unmap(dir);
380 dir = shmem_dir_map(subdir);
381 }
382 248
383 dir += index; 249/*
384 subdir = *dir; 250 * Like add_to_page_cache_locked, but error if expected item has gone.
385 if (!subdir) { 251 */
386 if (!page || !(subdir = *page)) { 252static int shmem_add_to_page_cache(struct page *page,
387 shmem_dir_unmap(dir); 253 struct address_space *mapping,
388 return NULL; /* need a page */ 254 pgoff_t index, gfp_t gfp, void *expected)
255{
256 int error = 0;
257
258 VM_BUG_ON(!PageLocked(page));
259 VM_BUG_ON(!PageSwapBacked(page));
260
261 if (!expected)
262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
263 if (!error) {
264 page_cache_get(page);
265 page->mapping = mapping;
266 page->index = index;
267
268 spin_lock_irq(&mapping->tree_lock);
269 if (!expected)
270 error = radix_tree_insert(&mapping->page_tree,
271 index, page);
272 else
273 error = shmem_radix_tree_replace(mapping, index,
274 expected, page);
275 if (!error) {
276 mapping->nrpages++;
277 __inc_zone_page_state(page, NR_FILE_PAGES);
278 __inc_zone_page_state(page, NR_SHMEM);
279 spin_unlock_irq(&mapping->tree_lock);
280 } else {
281 page->mapping = NULL;
282 spin_unlock_irq(&mapping->tree_lock);
283 page_cache_release(page);
389 } 284 }
390 *dir = subdir; 285 if (!expected)
391 *page = NULL; 286 radix_tree_preload_end();
392 } 287 }
393 shmem_dir_unmap(dir); 288 if (error)
394 return shmem_swp_map(subdir) + offset; 289 mem_cgroup_uncharge_cache_page(page);
290 return error;
395} 291}
396 292
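The two helpers above carry the central idea of this rewrite: a swapped-out page no longer lives in a private swap-vector tree. Its swap entry is stored directly in the page-cache radix tree as an exceptional entry, and shmem_radix_tree_replace() exchanges page pointers and swap entries in place under mapping->tree_lock, with shmem_add_to_page_cache() using the same check to insert a page only where the expected entry still sits. An exceptional entry is a small integer shifted up and tagged in the low pointer bits so it can never collide with a real page pointer. The standalone sketch below models that encoding; the shift and tag values only approximate the kernel's RADIX_TREE_EXCEPTIONAL_* constants and the swp_to_radix_entry()/radix_to_swp_entry() helpers, so treat them as assumptions.

/* Userspace model of radix-tree "exceptional" entries: a swap value is
 * encoded as a tagged pointer so it can share a slot with real page
 * pointers.  EXC_SHIFT and EXC_TAG only approximate the kernel's
 * RADIX_TREE_EXCEPTIONAL_* constants and are assumptions of this sketch.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXC_TAG   0x2UL	/* low bit pattern marking a non-page entry */
#define EXC_SHIFT 2

static void *swp_to_entry(unsigned long swap_val)
{
	return (void *)((swap_val << EXC_SHIFT) | EXC_TAG);
}

static int entry_is_exceptional(const void *entry)
{
	return ((uintptr_t)entry & EXC_TAG) != 0;
}

static unsigned long entry_to_swp(const void *entry)
{
	return (uintptr_t)entry >> EXC_SHIFT;
}

int main(void)
{
	int page;			/* stands in for struct page */
	void *slot = &page;		/* a normal page pointer ... */

	assert(!entry_is_exceptional(slot));

	slot = swp_to_entry(12345);	/* ... replaced by a swap entry */
	assert(entry_is_exceptional(slot));
	printf("swap value recovered: %lu\n", entry_to_swp(slot));
	return 0;
}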
397static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) 293/*
294 * Like delete_from_page_cache, but substitutes swap for page.
295 */
296static void shmem_delete_from_page_cache(struct page *page, void *radswap)
398{ 297{
399 long incdec = value? 1: -1; 298 struct address_space *mapping = page->mapping;
299 int error;
400 300
401 entry->val = value; 301 spin_lock_irq(&mapping->tree_lock);
402 info->swapped += incdec; 302 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
403 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { 303 page->mapping = NULL;
404 struct page *page = kmap_atomic_to_page(entry); 304 mapping->nrpages--;
405 set_page_private(page, page_private(page) + incdec); 305 __dec_zone_page_state(page, NR_FILE_PAGES);
406 } 306 __dec_zone_page_state(page, NR_SHMEM);
307 spin_unlock_irq(&mapping->tree_lock);
308 page_cache_release(page);
309 BUG_ON(error);
407} 310}
408 311
409/** 312/*
410 * shmem_swp_alloc - get the position of the swap entry for the page. 313 * Like find_get_pages, but collecting swap entries as well as pages.
411 * @info: info structure for the inode
412 * @index: index of the page to find
413 * @sgp: check and recheck i_size? skip allocation?
414 * @gfp: gfp mask to use for any page allocation
415 *
416 * If the entry does not exist, allocate it.
417 */ 314 */
418static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, 315static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
419 unsigned long index, enum sgp_type sgp, gfp_t gfp) 316 pgoff_t start, unsigned int nr_pages,
420{ 317 struct page **pages, pgoff_t *indices)
421 struct inode *inode = &info->vfs_inode; 318{
422 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 319 unsigned int i;
423 struct page *page = NULL; 320 unsigned int ret;
424 swp_entry_t *entry; 321 unsigned int nr_found;
425 322
426 if (sgp != SGP_WRITE && 323 rcu_read_lock();
427 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 324restart:
428 return ERR_PTR(-EINVAL); 325 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
429 326 (void ***)pages, indices, start, nr_pages);
430 while (!(entry = shmem_swp_entry(info, index, &page))) { 327 ret = 0;
431 if (sgp == SGP_READ) 328 for (i = 0; i < nr_found; i++) {
432 return shmem_swp_map(ZERO_PAGE(0)); 329 struct page *page;
433 /* 330repeat:
434 * Test used_blocks against 1 less max_blocks, since we have 1 data 331 page = radix_tree_deref_slot((void **)pages[i]);
435 * page (and perhaps indirect index pages) yet to allocate: 332 if (unlikely(!page))
436 * a waste to allocate index if we cannot allocate data. 333 continue;
437 */ 334 if (radix_tree_exception(page)) {
438 if (sbinfo->max_blocks) { 335 if (radix_tree_deref_retry(page))
439 if (percpu_counter_compare(&sbinfo->used_blocks, 336 goto restart;
440 sbinfo->max_blocks - 1) >= 0) 337 /*
441 return ERR_PTR(-ENOSPC); 338 * Otherwise, we must be storing a swap entry
442 percpu_counter_inc(&sbinfo->used_blocks); 339 * here as an exceptional entry: so return it
443 inode->i_blocks += BLOCKS_PER_PAGE; 340 * without attempting to raise page count.
341 */
342 goto export;
444 } 343 }
344 if (!page_cache_get_speculative(page))
345 goto repeat;
445 346
446 spin_unlock(&info->lock); 347 /* Has the page moved? */
447 page = shmem_dir_alloc(gfp); 348 if (unlikely(page != *((void **)pages[i]))) {
448 spin_lock(&info->lock); 349 page_cache_release(page);
449 350 goto repeat;
450 if (!page) {
451 shmem_free_blocks(inode, 1);
452 return ERR_PTR(-ENOMEM);
453 }
454 if (sgp != SGP_WRITE &&
455 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
456 entry = ERR_PTR(-EINVAL);
457 break;
458 } 351 }
459 if (info->next_index <= index) 352export:
460 info->next_index = index + 1; 353 indices[ret] = indices[i];
461 } 354 pages[ret] = page;
462 if (page) { 355 ret++;
463 /* another task gave its page, or truncated the file */ 356 }
464 shmem_free_blocks(inode, 1); 357 if (unlikely(!ret && nr_found))
465 shmem_dir_free(page); 358 goto restart;
466 } 359 rcu_read_unlock();
467 if (info->next_index <= index && !IS_ERR(entry)) 360 return ret;
468 info->next_index = index + 1;
469 return entry;
470} 361}
471 362
472/** 363/*
473 * shmem_free_swp - free some swap entries in a directory 364 * Remove swap entry from radix tree, free the swap and its page cache.
474 * @dir: pointer to the directory
475 * @edir: pointer after last entry of the directory
476 * @punch_lock: pointer to spinlock when needed for the holepunch case
477 */ 365 */
478static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, 366static int shmem_free_swap(struct address_space *mapping,
479 spinlock_t *punch_lock) 367 pgoff_t index, void *radswap)
480{ 368{
481 spinlock_t *punch_unlock = NULL; 369 int error;
482 swp_entry_t *ptr; 370
483 int freed = 0; 371 spin_lock_irq(&mapping->tree_lock);
484 372 error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
485 for (ptr = dir; ptr < edir; ptr++) { 373 spin_unlock_irq(&mapping->tree_lock);
486 if (ptr->val) { 374 if (!error)
487 if (unlikely(punch_lock)) { 375 free_swap_and_cache(radix_to_swp_entry(radswap));
488 punch_unlock = punch_lock; 376 return error;
489 punch_lock = NULL;
490 spin_lock(punch_unlock);
491 if (!ptr->val)
492 continue;
493 }
494 free_swap_and_cache(*ptr);
495 *ptr = (swp_entry_t){0};
496 freed++;
497 }
498 }
499 if (punch_unlock)
500 spin_unlock(punch_unlock);
501 return freed;
502}
503
504static int shmem_map_and_free_swp(struct page *subdir, int offset,
505 int limit, struct page ***dir, spinlock_t *punch_lock)
506{
507 swp_entry_t *ptr;
508 int freed = 0;
509
510 ptr = shmem_swp_map(subdir);
511 for (; offset < limit; offset += LATENCY_LIMIT) {
512 int size = limit - offset;
513 if (size > LATENCY_LIMIT)
514 size = LATENCY_LIMIT;
515 freed += shmem_free_swp(ptr+offset, ptr+offset+size,
516 punch_lock);
517 if (need_resched()) {
518 shmem_swp_unmap(ptr);
519 if (*dir) {
520 shmem_dir_unmap(*dir);
521 *dir = NULL;
522 }
523 cond_resched();
524 ptr = shmem_swp_map(subdir);
525 }
526 }
527 shmem_swp_unmap(ptr);
528 return freed;
529} 377}
530 378
531static void shmem_free_pages(struct list_head *next) 379/*
380 * Pagevec may contain swap entries, so shuffle up pages before releasing.
381 */
382static void shmem_pagevec_release(struct pagevec *pvec)
532{ 383{
533 struct page *page; 384 int i, j;
534 int freed = 0; 385
535 386 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
536 do { 387 struct page *page = pvec->pages[i];
537 page = container_of(next, struct page, lru); 388 if (!radix_tree_exceptional_entry(page))
538 next = next->next; 389 pvec->pages[j++] = page;
539 shmem_dir_free(page); 390 }
540 freed++; 391 pvec->nr = j;
541 if (freed >= LATENCY_LIMIT) { 392 pagevec_release(pvec);
542 cond_resched();
543 freed = 0;
544 }
545 } while (next);
546} 393}
547 394
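Since shmem_find_get_pages_and_swap() above hands back swap entries interleaved with page pointers, shmem_pagevec_release() compacts the pagevec in place so that only real pages reach pagevec_release(). The same two-index, in-place filter as a runnable sketch; the low-bit tag test stands in for radix_tree_exceptional_entry() and reuses the assumed encoding from the earlier sketch.

/* In-place compaction of an array, keeping only "real" entries, which is the
 * same i/j walk shmem_pagevec_release() does over pvec->pages[].  The low-bit
 * test stands in for radix_tree_exceptional_entry().
 */
#include <stdint.h>
#include <stdio.h>

static int is_exceptional(void *p)
{
	return ((uintptr_t)p & 0x2) != 0;	/* assumed tag, as in the sketch above */
}

static unsigned compact(void **vec, unsigned nr)
{
	unsigned i, j;

	for (i = 0, j = 0; i < nr; i++) {
		if (!is_exceptional(vec[i]))
			vec[j++] = vec[i];	/* keep pages, drop swap entries */
	}
	return j;				/* new element count */
}

int main(void)
{
	int a, b;
	void *vec[4] = { &a, (void *)0x32UL, &b, (void *)0x1aUL };
	unsigned nr = compact(vec, 4);

	printf("%u real pages kept\n", nr);	/* prints 2 */
	return 0;
}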
548void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 395/*
396 * Remove range of pages and swap entries from radix tree, and free them.
397 */
398void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
549{ 399{
400 struct address_space *mapping = inode->i_mapping;
550 struct shmem_inode_info *info = SHMEM_I(inode); 401 struct shmem_inode_info *info = SHMEM_I(inode);
551 unsigned long idx; 402 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
552 unsigned long size; 403 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
553 unsigned long limit; 404 pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
554 unsigned long stage; 405 struct pagevec pvec;
555 unsigned long diroff; 406 pgoff_t indices[PAGEVEC_SIZE];
556 struct page **dir;
557 struct page *topdir;
558 struct page *middir;
559 struct page *subdir;
560 swp_entry_t *ptr;
561 LIST_HEAD(pages_to_free);
562 long nr_pages_to_free = 0;
563 long nr_swaps_freed = 0; 407 long nr_swaps_freed = 0;
564 int offset; 408 pgoff_t index;
565 int freed; 409 int i;
566 int punch_hole;
567 spinlock_t *needs_lock;
568 spinlock_t *punch_lock;
569 unsigned long upper_limit;
570 410
571 truncate_inode_pages_range(inode->i_mapping, start, end); 411 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
572 412
573 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 413 pagevec_init(&pvec, 0);
574 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 414 index = start;
575 if (idx >= info->next_index) 415 while (index <= end) {
576 return; 416 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
417 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
418 pvec.pages, indices);
419 if (!pvec.nr)
420 break;
421 mem_cgroup_uncharge_start();
422 for (i = 0; i < pagevec_count(&pvec); i++) {
423 struct page *page = pvec.pages[i];
577 424
578 spin_lock(&info->lock); 425 index = indices[i];
579 info->flags |= SHMEM_TRUNCATE; 426 if (index > end)
580 if (likely(end == (loff_t) -1)) { 427 break;
581 limit = info->next_index; 428
582 upper_limit = SHMEM_MAX_INDEX; 429 if (radix_tree_exceptional_entry(page)) {
583 info->next_index = idx; 430 nr_swaps_freed += !shmem_free_swap(mapping,
584 needs_lock = NULL; 431 index, page);
585 punch_hole = 0; 432 continue;
586 } else { 433 }
587 if (end + 1 >= inode->i_size) { /* we may free a little more */
588 limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
589 PAGE_CACHE_SHIFT;
590 upper_limit = SHMEM_MAX_INDEX;
591 } else {
592 limit = (end + 1) >> PAGE_CACHE_SHIFT;
593 upper_limit = limit;
594 }
595 needs_lock = &info->lock;
596 punch_hole = 1;
597 }
598 434
599 topdir = info->i_indirect; 435 if (!trylock_page(page))
600 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { 436 continue;
601 info->i_indirect = NULL; 437 if (page->mapping == mapping) {
602 nr_pages_to_free++; 438 VM_BUG_ON(PageWriteback(page));
603 list_add(&topdir->lru, &pages_to_free); 439 truncate_inode_page(mapping, page);
440 }
441 unlock_page(page);
442 }
443 shmem_pagevec_release(&pvec);
444 mem_cgroup_uncharge_end();
445 cond_resched();
446 index++;
604 } 447 }
605 spin_unlock(&info->lock);
606 448
607 if (info->swapped && idx < SHMEM_NR_DIRECT) { 449 if (partial) {
608 ptr = info->i_direct; 450 struct page *page = NULL;
609 size = limit; 451 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
610 if (size > SHMEM_NR_DIRECT) 452 if (page) {
611 size = SHMEM_NR_DIRECT; 453 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
612 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); 454 set_page_dirty(page);
455 unlock_page(page);
456 page_cache_release(page);
457 }
613 } 458 }
614 459
615 /* 460 index = start;
616 * If there are no indirect blocks or we are punching a hole 461 for ( ; ; ) {
617 * below indirect blocks, nothing to be done. 462 cond_resched();
618 */ 463 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
619 if (!topdir || limit <= SHMEM_NR_DIRECT) 464 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
620 goto done2; 465 pvec.pages, indices);
466 if (!pvec.nr) {
467 if (index == start)
468 break;
469 index = start;
470 continue;
471 }
472 if (index == start && indices[0] > end) {
473 shmem_pagevec_release(&pvec);
474 break;
475 }
476 mem_cgroup_uncharge_start();
477 for (i = 0; i < pagevec_count(&pvec); i++) {
478 struct page *page = pvec.pages[i];
621 479
622 /* 480 index = indices[i];
623 * The truncation case has already dropped info->lock, and we're safe 481 if (index > end)
624 * because i_size and next_index have already been lowered, preventing 482 break;
625 * access beyond. But in the punch_hole case, we still need to take
626 * the lock when updating the swap directory, because there might be
627 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
628 * shmem_writepage. However, whenever we find we can remove a whole
629 * directory page (not at the misaligned start or end of the range),
630 * we first NULLify its pointer in the level above, and then have no
631 * need to take the lock when updating its contents: needs_lock and
632 * punch_lock (either pointing to info->lock or NULL) manage this.
633 */
634 483
635 upper_limit -= SHMEM_NR_DIRECT; 484 if (radix_tree_exceptional_entry(page)) {
636 limit -= SHMEM_NR_DIRECT; 485 nr_swaps_freed += !shmem_free_swap(mapping,
637 idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; 486 index, page);
638 offset = idx % ENTRIES_PER_PAGE; 487 continue;
639 idx -= offset;
640
641 dir = shmem_dir_map(topdir);
642 stage = ENTRIES_PER_PAGEPAGE/2;
643 if (idx < ENTRIES_PER_PAGEPAGE/2) {
644 middir = topdir;
645 diroff = idx/ENTRIES_PER_PAGE;
646 } else {
647 dir += ENTRIES_PER_PAGE/2;
648 dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
649 while (stage <= idx)
650 stage += ENTRIES_PER_PAGEPAGE;
651 middir = *dir;
652 if (*dir) {
653 diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
654 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
655 if (!diroff && !offset && upper_limit >= stage) {
656 if (needs_lock) {
657 spin_lock(needs_lock);
658 *dir = NULL;
659 spin_unlock(needs_lock);
660 needs_lock = NULL;
661 } else
662 *dir = NULL;
663 nr_pages_to_free++;
664 list_add(&middir->lru, &pages_to_free);
665 } 488 }
666 shmem_dir_unmap(dir);
667 dir = shmem_dir_map(middir);
668 } else {
669 diroff = 0;
670 offset = 0;
671 idx = stage;
672 }
673 }
674 489
675 for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { 490 lock_page(page);
676 if (unlikely(idx == stage)) { 491 if (page->mapping == mapping) {
677 shmem_dir_unmap(dir); 492 VM_BUG_ON(PageWriteback(page));
678 dir = shmem_dir_map(topdir) + 493 truncate_inode_page(mapping, page);
679 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
680 while (!*dir) {
681 dir++;
682 idx += ENTRIES_PER_PAGEPAGE;
683 if (idx >= limit)
684 goto done1;
685 } 494 }
686 stage = idx + ENTRIES_PER_PAGEPAGE; 495 unlock_page(page);
687 middir = *dir;
688 if (punch_hole)
689 needs_lock = &info->lock;
690 if (upper_limit >= stage) {
691 if (needs_lock) {
692 spin_lock(needs_lock);
693 *dir = NULL;
694 spin_unlock(needs_lock);
695 needs_lock = NULL;
696 } else
697 *dir = NULL;
698 nr_pages_to_free++;
699 list_add(&middir->lru, &pages_to_free);
700 }
701 shmem_dir_unmap(dir);
702 cond_resched();
703 dir = shmem_dir_map(middir);
704 diroff = 0;
705 }
706 punch_lock = needs_lock;
707 subdir = dir[diroff];
708 if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
709 if (needs_lock) {
710 spin_lock(needs_lock);
711 dir[diroff] = NULL;
712 spin_unlock(needs_lock);
713 punch_lock = NULL;
714 } else
715 dir[diroff] = NULL;
716 nr_pages_to_free++;
717 list_add(&subdir->lru, &pages_to_free);
718 }
719 if (subdir && page_private(subdir) /* has swap entries */) {
720 size = limit - idx;
721 if (size > ENTRIES_PER_PAGE)
722 size = ENTRIES_PER_PAGE;
723 freed = shmem_map_and_free_swp(subdir,
724 offset, size, &dir, punch_lock);
725 if (!dir)
726 dir = shmem_dir_map(middir);
727 nr_swaps_freed += freed;
728 if (offset || punch_lock) {
729 spin_lock(&info->lock);
730 set_page_private(subdir,
731 page_private(subdir) - freed);
732 spin_unlock(&info->lock);
733 } else
734 BUG_ON(page_private(subdir) != freed);
735 } 496 }
736 offset = 0; 497 shmem_pagevec_release(&pvec);
737 } 498 mem_cgroup_uncharge_end();
738done1: 499 index++;
739 shmem_dir_unmap(dir);
740done2:
741 if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
742 /*
743 * Call truncate_inode_pages again: racing shmem_unuse_inode
744 * may have swizzled a page in from swap since
745 * truncate_pagecache or generic_delete_inode did it, before we
746 * lowered next_index. Also, though shmem_getpage checks
747 * i_size before adding to cache, no recheck after: so fix the
748 * narrow window there too.
749 */
750 truncate_inode_pages_range(inode->i_mapping, start, end);
751 } 500 }
752 501
753 spin_lock(&info->lock); 502 spin_lock(&info->lock);
754 info->flags &= ~SHMEM_TRUNCATE;
755 info->swapped -= nr_swaps_freed; 503 info->swapped -= nr_swaps_freed;
756 if (nr_pages_to_free)
757 shmem_free_blocks(inode, nr_pages_to_free);
758 shmem_recalc_inode(inode); 504 shmem_recalc_inode(inode);
759 spin_unlock(&info->lock); 505 spin_unlock(&info->lock);
760 506
761 /* 507 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
762 * Empty swap vector directory pages to be freed?
763 */
764 if (!list_empty(&pages_to_free)) {
765 pages_to_free.prev->next = NULL;
766 shmem_free_pages(pages_to_free.next);
767 }
768} 508}
769EXPORT_SYMBOL_GPL(shmem_truncate_range); 509EXPORT_SYMBOL_GPL(shmem_truncate_range);
770 510
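The new shmem_truncate_range() above works purely against the mapping: a first pass walks the range with trylock_page() and frees swap entries as it finds them, the page straddling lstart is zeroed from the partial offset via shmem_getpage(SGP_READ), and a second pass retries with lock_page(), restarting from the beginning until a scan at start comes back empty. The index arithmetic that splits a byte range into a partial head page plus whole pages is easy to check in isolation; the sketch below uses a 4K page size as an assumption in place of PAGE_CACHE_SIZE.

/* The page-index arithmetic used at the top of the new shmem_truncate_range():
 * lstart/lend are byte offsets, PAGE_SIZE stands in for PAGE_CACHE_SIZE.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

int main(void)
{
	unsigned long long lstart = 5000;	/* e.g. truncate from byte 5000 ... */
	unsigned long long lend = ~0ULL;	/* ... to EOF (lend ends on a page boundary - 1) */

	/* first whole page to remove */
	unsigned long long start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* offset within the straddling page from which it must be zeroed, if any */
	unsigned long partial = lstart & (PAGE_SIZE - 1);
	/* last page index covered by the range */
	unsigned long long end = lend >> PAGE_SHIFT;

	printf("start=%llu partial=%lu end=%llu\n", start, partial, end);
	/* start=2, partial=904: page 1 keeps its first 904 bytes, pages >= 2 go */
	return 0;
}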
@@ -780,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
780 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 520 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
781 loff_t oldsize = inode->i_size; 521 loff_t oldsize = inode->i_size;
782 loff_t newsize = attr->ia_size; 522 loff_t newsize = attr->ia_size;
783 struct page *page = NULL;
784 523
785 if (newsize < oldsize) {
786 /*
787 * If truncating down to a partial page, then
788 * if that page is already allocated, hold it
789 * in memory until the truncation is over, so
790 * truncate_partial_page cannot miss it were
791 * it assigned to swap.
792 */
793 if (newsize & (PAGE_CACHE_SIZE-1)) {
794 (void) shmem_getpage(inode,
795 newsize >> PAGE_CACHE_SHIFT,
796 &page, SGP_READ, NULL);
797 if (page)
798 unlock_page(page);
799 }
800 /*
801 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
802 * detect if any pages might have been added to cache
803 * after truncate_inode_pages. But we needn't bother
804 * if it's being fully truncated to zero-length: the
805 * nrpages check is efficient enough in that case.
806 */
807 if (newsize) {
808 struct shmem_inode_info *info = SHMEM_I(inode);
809 spin_lock(&info->lock);
810 info->flags &= ~SHMEM_PAGEIN;
811 spin_unlock(&info->lock);
812 }
813 }
814 if (newsize != oldsize) { 524 if (newsize != oldsize) {
815 i_size_write(inode, newsize); 525 i_size_write(inode, newsize);
816 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 526 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -822,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
822 /* unmap again to remove racily COWed private pages */ 532 /* unmap again to remove racily COWed private pages */
823 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 533 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
824 } 534 }
825 if (page)
826 page_cache_release(page);
827 } 535 }
828 536
829 setattr_copy(inode, attr); 537 setattr_copy(inode, attr);
@@ -848,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)
848 list_del_init(&info->swaplist); 556 list_del_init(&info->swaplist);
849 mutex_unlock(&shmem_swaplist_mutex); 557 mutex_unlock(&shmem_swaplist_mutex);
850 } 558 }
851 } 559 } else
560 kfree(info->symlink);
852 561
853 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 562 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
854 kfree(xattr->name); 563 kfree(xattr->name);
@@ -859,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode)
859 end_writeback(inode); 568 end_writeback(inode);
860} 569}
861 570
862static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 571/*
863{ 572 * If swap found in inode, free it and move page from swapcache to filecache.
864 swp_entry_t *ptr; 573 */
865 574static int shmem_unuse_inode(struct shmem_inode_info *info,
866 for (ptr = dir; ptr < edir; ptr++) { 575 swp_entry_t swap, struct page *page)
867 if (ptr->val == entry.val)
868 return ptr - dir;
869 }
870 return -1;
871}
872
873static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
874{ 576{
875 struct address_space *mapping; 577 struct address_space *mapping = info->vfs_inode.i_mapping;
876 unsigned long idx; 578 void *radswap;
877 unsigned long size; 579 pgoff_t index;
878 unsigned long limit;
879 unsigned long stage;
880 struct page **dir;
881 struct page *subdir;
882 swp_entry_t *ptr;
883 int offset;
884 int error; 580 int error;
885 581
886 idx = 0; 582 radswap = swp_to_radix_entry(swap);
887 ptr = info->i_direct; 583 index = radix_tree_locate_item(&mapping->page_tree, radswap);
888 spin_lock(&info->lock); 584 if (index == -1)
889 if (!info->swapped) { 585 return 0;
890 list_del_init(&info->swaplist);
891 goto lost2;
892 }
893 limit = info->next_index;
894 size = limit;
895 if (size > SHMEM_NR_DIRECT)
896 size = SHMEM_NR_DIRECT;
897 offset = shmem_find_swp(entry, ptr, ptr+size);
898 if (offset >= 0) {
899 shmem_swp_balance_unmap();
900 goto found;
901 }
902 if (!info->i_indirect)
903 goto lost2;
904
905 dir = shmem_dir_map(info->i_indirect);
906 stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
907
908 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
909 if (unlikely(idx == stage)) {
910 shmem_dir_unmap(dir-1);
911 if (cond_resched_lock(&info->lock)) {
912 /* check it has not been truncated */
913 if (limit > info->next_index) {
914 limit = info->next_index;
915 if (idx >= limit)
916 goto lost2;
917 }
918 }
919 dir = shmem_dir_map(info->i_indirect) +
920 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
921 while (!*dir) {
922 dir++;
923 idx += ENTRIES_PER_PAGEPAGE;
924 if (idx >= limit)
925 goto lost1;
926 }
927 stage = idx + ENTRIES_PER_PAGEPAGE;
928 subdir = *dir;
929 shmem_dir_unmap(dir);
930 dir = shmem_dir_map(subdir);
931 }
932 subdir = *dir;
933 if (subdir && page_private(subdir)) {
934 ptr = shmem_swp_map(subdir);
935 size = limit - idx;
936 if (size > ENTRIES_PER_PAGE)
937 size = ENTRIES_PER_PAGE;
938 offset = shmem_find_swp(entry, ptr, ptr+size);
939 shmem_swp_unmap(ptr);
940 if (offset >= 0) {
941 shmem_dir_unmap(dir);
942 ptr = shmem_swp_map(subdir);
943 goto found;
944 }
945 }
946 }
947lost1:
948 shmem_dir_unmap(dir-1);
949lost2:
950 spin_unlock(&info->lock);
951 return 0;
952found:
953 idx += offset;
954 ptr += offset;
955 586
956 /* 587 /*
957 * Move _head_ to start search for next from here. 588 * Move _head_ to start search for next from here.
958 * But be careful: shmem_evict_inode checks list_empty without taking 589 * But be careful: shmem_evict_inode checks list_empty without taking
959 * mutex, and there's an instant in list_move_tail when info->swaplist 590 * mutex, and there's an instant in list_move_tail when info->swaplist
960 * would appear empty, if it were the only one on shmem_swaplist. We 591 * would appear empty, if it were the only one on shmem_swaplist.
961 * could avoid doing it if inode NULL; or use this minor optimization.
962 */ 592 */
963 if (shmem_swaplist.next != &info->swaplist) 593 if (shmem_swaplist.next != &info->swaplist)
964 list_move_tail(&shmem_swaplist, &info->swaplist); 594 list_move_tail(&shmem_swaplist, &info->swaplist);
@@ -968,29 +598,34 @@ found:
968 * but also to hold up shmem_evict_inode(): so inode cannot be freed 598 * but also to hold up shmem_evict_inode(): so inode cannot be freed
969 * beneath us (pagelock doesn't help until the page is in pagecache). 599 * beneath us (pagelock doesn't help until the page is in pagecache).
970 */ 600 */
971 mapping = info->vfs_inode.i_mapping; 601 error = shmem_add_to_page_cache(page, mapping, index,
972 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 602 GFP_NOWAIT, radswap);
973 /* which does mem_cgroup_uncharge_cache_page on error */ 603 /* which does mem_cgroup_uncharge_cache_page on error */
974 604
975 if (error != -ENOMEM) { 605 if (error != -ENOMEM) {
606 /*
607 * Truncation and eviction use free_swap_and_cache(), which
608 * only does trylock page: if we raced, best clean up here.
609 */
976 delete_from_swap_cache(page); 610 delete_from_swap_cache(page);
977 set_page_dirty(page); 611 set_page_dirty(page);
978 info->flags |= SHMEM_PAGEIN; 612 if (!error) {
979 shmem_swp_set(info, ptr, 0); 613 spin_lock(&info->lock);
980 swap_free(entry); 614 info->swapped--;
615 spin_unlock(&info->lock);
616 swap_free(swap);
617 }
981 error = 1; /* not an error, but entry was found */ 618 error = 1; /* not an error, but entry was found */
982 } 619 }
983 shmem_swp_unmap(ptr);
984 spin_unlock(&info->lock);
985 return error; 620 return error;
986} 621}
987 622
988/* 623/*
989 * shmem_unuse() search for an eventually swapped out shmem page. 624 * Search through swapped inodes to find and replace swap by page.
990 */ 625 */
991int shmem_unuse(swp_entry_t entry, struct page *page) 626int shmem_unuse(swp_entry_t swap, struct page *page)
992{ 627{
993 struct list_head *p, *next; 628 struct list_head *this, *next;
994 struct shmem_inode_info *info; 629 struct shmem_inode_info *info;
995 int found = 0; 630 int found = 0;
996 int error; 631 int error;
@@ -999,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
999 * Charge page using GFP_KERNEL while we can wait, before taking 634 * Charge page using GFP_KERNEL while we can wait, before taking
1000 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 635 * the shmem_swaplist_mutex which might hold up shmem_writepage().
1001 * Charged back to the user (not to caller) when swap account is used. 636 * Charged back to the user (not to caller) when swap account is used.
1002 * add_to_page_cache() will be called with GFP_NOWAIT.
1003 */ 637 */
1004 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 638 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1005 if (error) 639 if (error)
1006 goto out; 640 goto out;
1007 /* 641 /* No radix_tree_preload: swap entry keeps a place for page in tree */
1008 * Try to preload while we can wait, to not make a habit of
1009 * draining atomic reserves; but don't latch on to this cpu,
1010 * it's okay if sometimes we get rescheduled after this.
1011 */
1012 error = radix_tree_preload(GFP_KERNEL);
1013 if (error)
1014 goto uncharge;
1015 radix_tree_preload_end();
1016 642
1017 mutex_lock(&shmem_swaplist_mutex); 643 mutex_lock(&shmem_swaplist_mutex);
1018 list_for_each_safe(p, next, &shmem_swaplist) { 644 list_for_each_safe(this, next, &shmem_swaplist) {
1019 info = list_entry(p, struct shmem_inode_info, swaplist); 645 info = list_entry(this, struct shmem_inode_info, swaplist);
1020 found = shmem_unuse_inode(info, entry, page); 646 if (info->swapped)
647 found = shmem_unuse_inode(info, swap, page);
648 else
649 list_del_init(&info->swaplist);
1021 cond_resched(); 650 cond_resched();
1022 if (found) 651 if (found)
1023 break; 652 break;
1024 } 653 }
1025 mutex_unlock(&shmem_swaplist_mutex); 654 mutex_unlock(&shmem_swaplist_mutex);
1026 655
1027uncharge:
1028 if (!found) 656 if (!found)
1029 mem_cgroup_uncharge_cache_page(page); 657 mem_cgroup_uncharge_cache_page(page);
1030 if (found < 0) 658 if (found < 0)
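In the swapoff path above, shmem_unuse_inode() no longer walks a per-inode swap vector: radix_tree_locate_item() reports which index of the mapping holds the radswap entry, and shmem_add_to_page_cache() installs the page only if that entry is still present, the expected check done under tree_lock. The sketch below models that locate-then-conditionally-replace step with a plain array standing in for the radix tree; all names in it are illustrative.

/* Userspace model of the swapoff path: find which index of a mapping holds a
 * given swap entry, then replace it with the page only if it is still there
 * (the "expected" check shmem_radix_tree_replace() performs under tree_lock).
 * A plain array stands in for the radix tree.
 */
#include <stdio.h>
#include <string.h>

#define NSLOTS 8

static long locate_item(void **slots, void *item)
{
	for (long i = 0; i < NSLOTS; i++)
		if (slots[i] == item)
			return i;
	return -1;				/* not found: nothing to do */
}

static int replace_expected(void **slots, long index, void *expected, void *new)
{
	if (slots[index] != expected)
		return -1;			/* raced: entry changed under us */
	slots[index] = new;
	return 0;
}

int main(void)
{
	void *slots[NSLOTS];
	void *radswap = (void *)0x4a2UL;	/* stands in for swp_to_radix_entry(swap) */
	int page;				/* the page read back from swap */

	memset(slots, 0, sizeof(slots));
	slots[5] = radswap;

	long index = locate_item(slots, radswap);
	if (index >= 0 && !replace_expected(slots, index, radswap, &page))
		printf("swap entry at index %ld replaced by page\n", index);
	return 0;
}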
@@ -1041,10 +669,10 @@ out:
1041static int shmem_writepage(struct page *page, struct writeback_control *wbc) 669static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1042{ 670{
1043 struct shmem_inode_info *info; 671 struct shmem_inode_info *info;
1044 swp_entry_t *entry, swap;
1045 struct address_space *mapping; 672 struct address_space *mapping;
1046 unsigned long index;
1047 struct inode *inode; 673 struct inode *inode;
674 swp_entry_t swap;
675 pgoff_t index;
1048 676
1049 BUG_ON(!PageLocked(page)); 677 BUG_ON(!PageLocked(page));
1050 mapping = page->mapping; 678 mapping = page->mapping;
@@ -1073,50 +701,32 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1073 701
1074 /* 702 /*
1075 * Add inode to shmem_unuse()'s list of swapped-out inodes, 703 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1076 * if it's not already there. Do it now because we cannot take 704 * if it's not already there. Do it now before the page is
1077 * mutex while holding spinlock, and must do so before the page 705 * moved to swap cache, when its pagelock no longer protects
1078 * is moved to swap cache, when its pagelock no longer protects
1079 * the inode from eviction. But don't unlock the mutex until 706 * the inode from eviction. But don't unlock the mutex until
1080 * we've taken the spinlock, because shmem_unuse_inode() will 707 * we've incremented swapped, because shmem_unuse_inode() will
1081 * prune a !swapped inode from the swaplist under both locks. 708 * prune a !swapped inode from the swaplist under this mutex.
1082 */ 709 */
1083 mutex_lock(&shmem_swaplist_mutex); 710 mutex_lock(&shmem_swaplist_mutex);
1084 if (list_empty(&info->swaplist)) 711 if (list_empty(&info->swaplist))
1085 list_add_tail(&info->swaplist, &shmem_swaplist); 712 list_add_tail(&info->swaplist, &shmem_swaplist);
1086 713
1087 spin_lock(&info->lock);
1088 mutex_unlock(&shmem_swaplist_mutex);
1089
1090 if (index >= info->next_index) {
1091 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1092 goto unlock;
1093 }
1094 entry = shmem_swp_entry(info, index, NULL);
1095 if (entry->val) {
1096 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1097 free_swap_and_cache(*entry);
1098 shmem_swp_set(info, entry, 0);
1099 }
1100 shmem_recalc_inode(inode);
1101
1102 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 714 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1103 delete_from_page_cache(page);
1104 shmem_swp_set(info, entry, swap.val);
1105 shmem_swp_unmap(entry);
1106 swap_shmem_alloc(swap); 715 swap_shmem_alloc(swap);
716 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
717
718 spin_lock(&info->lock);
719 info->swapped++;
720 shmem_recalc_inode(inode);
1107 spin_unlock(&info->lock); 721 spin_unlock(&info->lock);
722
723 mutex_unlock(&shmem_swaplist_mutex);
1108 BUG_ON(page_mapped(page)); 724 BUG_ON(page_mapped(page));
1109 swap_writepage(page, wbc); 725 swap_writepage(page, wbc);
1110 return 0; 726 return 0;
1111 } 727 }
1112 728
1113 shmem_swp_unmap(entry); 729 mutex_unlock(&shmem_swaplist_mutex);
1114unlock:
1115 spin_unlock(&info->lock);
1116 /*
1117 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1118 * clear SWAP_HAS_CACHE flag.
1119 */
1120 swapcache_free(swap, NULL); 730 swapcache_free(swap, NULL);
1121redirty: 731redirty:
1122 set_page_dirty(page); 732 set_page_dirty(page);
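The writeout path above installs the swap entry into the very slot the page is leaving, via shmem_delete_from_page_cache(), and bumps info->swapped before the swaplist mutex is released, so shmem_unuse_inode() cannot see the inode as unswapped while the entry is in flight. A small model of that slot substitution, with an array and a counter standing in for the radix tree and mapping->nrpages; the tagged value is an arbitrary stand-in for swp_to_radix_entry().

/* Model of shmem_delete_from_page_cache(): when a page is written to swap,
 * its page-cache slot is not emptied but overwritten with the tagged swap
 * entry, so a later lookup still finds something at that index.  The array
 * and counter stand in for the radix tree and mapping->nrpages.
 */
#include <assert.h>
#include <stdio.h>

struct mapping_model {
	void *slots[4];
	unsigned long nrpages;
};

static void delete_from_cache(struct mapping_model *m, unsigned index,
			      void *page, void *radswap)
{
	assert(m->slots[index] == page);	/* shmem_radix_tree_replace() check */
	m->slots[index] = radswap;		/* substitute swap for page */
	m->nrpages--;				/* only real pages are counted */
}

int main(void)
{
	struct mapping_model m = { .nrpages = 1 };
	int page;

	m.slots[2] = &page;
	delete_from_cache(&m, 2, &page, (void *)0xe6UL);
	printf("nrpages=%lu slot2=%p\n", m.nrpages, m.slots[2]);
	return 0;
}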
@@ -1153,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1153} 763}
1154#endif /* CONFIG_TMPFS */ 764#endif /* CONFIG_TMPFS */
1155 765
1156static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 766static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1157 struct shmem_inode_info *info, unsigned long idx) 767 struct shmem_inode_info *info, pgoff_t index)
1158{ 768{
1159 struct mempolicy mpol, *spol; 769 struct mempolicy mpol, *spol;
1160 struct vm_area_struct pvma; 770 struct vm_area_struct pvma;
1161 struct page *page;
1162 771
1163 spol = mpol_cond_copy(&mpol, 772 spol = mpol_cond_copy(&mpol,
1164 mpol_shared_policy_lookup(&info->policy, idx)); 773 mpol_shared_policy_lookup(&info->policy, index));
1165 774
1166 /* Create a pseudo vma that just contains the policy */ 775 /* Create a pseudo vma that just contains the policy */
1167 pvma.vm_start = 0; 776 pvma.vm_start = 0;
1168 pvma.vm_pgoff = idx; 777 pvma.vm_pgoff = index;
1169 pvma.vm_ops = NULL; 778 pvma.vm_ops = NULL;
1170 pvma.vm_policy = spol; 779 pvma.vm_policy = spol;
1171 page = swapin_readahead(entry, gfp, &pvma, 0); 780 return swapin_readahead(swap, gfp, &pvma, 0);
1172 return page;
1173} 781}
1174 782
1175static struct page *shmem_alloc_page(gfp_t gfp, 783static struct page *shmem_alloc_page(gfp_t gfp,
1176 struct shmem_inode_info *info, unsigned long idx) 784 struct shmem_inode_info *info, pgoff_t index)
1177{ 785{
1178 struct vm_area_struct pvma; 786 struct vm_area_struct pvma;
1179 787
1180 /* Create a pseudo vma that just contains the policy */ 788 /* Create a pseudo vma that just contains the policy */
1181 pvma.vm_start = 0; 789 pvma.vm_start = 0;
1182 pvma.vm_pgoff = idx; 790 pvma.vm_pgoff = index;
1183 pvma.vm_ops = NULL; 791 pvma.vm_ops = NULL;
1184 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 792 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1185 793
1186 /* 794 /*
1187 * alloc_page_vma() will drop the shared policy reference 795 * alloc_page_vma() will drop the shared policy reference
@@ -1190,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1190} 798}
1191#else /* !CONFIG_NUMA */ 799#else /* !CONFIG_NUMA */
1192#ifdef CONFIG_TMPFS 800#ifdef CONFIG_TMPFS
1193static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) 801static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1194{ 802{
1195} 803}
1196#endif /* CONFIG_TMPFS */ 804#endif /* CONFIG_TMPFS */
1197 805
1198static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 806static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1199 struct shmem_inode_info *info, unsigned long idx) 807 struct shmem_inode_info *info, pgoff_t index)
1200{ 808{
1201 return swapin_readahead(entry, gfp, NULL, 0); 809 return swapin_readahead(swap, gfp, NULL, 0);
1202} 810}
1203 811
1204static inline struct page *shmem_alloc_page(gfp_t gfp, 812static inline struct page *shmem_alloc_page(gfp_t gfp,
1205 struct shmem_inode_info *info, unsigned long idx) 813 struct shmem_inode_info *info, pgoff_t index)
1206{ 814{
1207 return alloc_page(gfp); 815 return alloc_page(gfp);
1208} 816}
@@ -1222,243 +830,190 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1222 * vm. If we swap it in we mark it dirty since we also free the swap 830 * vm. If we swap it in we mark it dirty since we also free the swap
1223 * entry since a page cannot live in both the swap and page cache 831 * entry since a page cannot live in both the swap and page cache
1224 */ 832 */
1225static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, 833static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1226 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) 834 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1227{ 835{
1228 struct address_space *mapping = inode->i_mapping; 836 struct address_space *mapping = inode->i_mapping;
1229 struct shmem_inode_info *info = SHMEM_I(inode); 837 struct shmem_inode_info *info;
1230 struct shmem_sb_info *sbinfo; 838 struct shmem_sb_info *sbinfo;
1231 struct page *page; 839 struct page *page;
1232 struct page *prealloc_page = NULL;
1233 swp_entry_t *entry;
1234 swp_entry_t swap; 840 swp_entry_t swap;
1235 int error; 841 int error;
1236 int ret; 842 int once = 0;
1237 843
1238 if (idx >= SHMEM_MAX_INDEX) 844 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1239 return -EFBIG; 845 return -EFBIG;
1240repeat: 846repeat:
1241 page = find_lock_page(mapping, idx); 847 swap.val = 0;
1242 if (page) { 848 page = find_lock_page(mapping, index);
849 if (radix_tree_exceptional_entry(page)) {
850 swap = radix_to_swp_entry(page);
851 page = NULL;
852 }
853
854 if (sgp != SGP_WRITE &&
855 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
856 error = -EINVAL;
857 goto failed;
858 }
859
860 if (page || (sgp == SGP_READ && !swap.val)) {
1243 /* 861 /*
1244 * Once we can get the page lock, it must be uptodate: 862 * Once we can get the page lock, it must be uptodate:
1245 * if there were an error in reading back from swap, 863 * if there were an error in reading back from swap,
1246 * the page would not be inserted into the filecache. 864 * the page would not be inserted into the filecache.
1247 */ 865 */
1248 BUG_ON(!PageUptodate(page)); 866 BUG_ON(page && !PageUptodate(page));
1249 goto done; 867 *pagep = page;
868 return 0;
1250 } 869 }
1251 870
1252 /* 871 /*
1253 * Try to preload while we can wait, to not make a habit of 872 * Fast cache lookup did not find it:
1254 * draining atomic reserves; but don't latch on to this cpu. 873 * bring it back from swap or allocate.
1255 */ 874 */
1256 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 875 info = SHMEM_I(inode);
1257 if (error) 876 sbinfo = SHMEM_SB(inode->i_sb);
1258 goto out;
1259 radix_tree_preload_end();
1260
1261 if (sgp != SGP_READ && !prealloc_page) {
1262 prealloc_page = shmem_alloc_page(gfp, info, idx);
1263 if (prealloc_page) {
1264 SetPageSwapBacked(prealloc_page);
1265 if (mem_cgroup_cache_charge(prealloc_page,
1266 current->mm, GFP_KERNEL)) {
1267 page_cache_release(prealloc_page);
1268 prealloc_page = NULL;
1269 }
1270 }
1271 }
1272
1273 spin_lock(&info->lock);
1274 shmem_recalc_inode(inode);
1275 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1276 if (IS_ERR(entry)) {
1277 spin_unlock(&info->lock);
1278 error = PTR_ERR(entry);
1279 goto out;
1280 }
1281 swap = *entry;
1282 877
1283 if (swap.val) { 878 if (swap.val) {
1284 /* Look it up and read it in.. */ 879 /* Look it up and read it in.. */
1285 page = lookup_swap_cache(swap); 880 page = lookup_swap_cache(swap);
1286 if (!page) { 881 if (!page) {
1287 shmem_swp_unmap(entry);
1288 spin_unlock(&info->lock);
1289 /* here we actually do the io */ 882 /* here we actually do the io */
1290 if (fault_type) 883 if (fault_type)
1291 *fault_type |= VM_FAULT_MAJOR; 884 *fault_type |= VM_FAULT_MAJOR;
1292 page = shmem_swapin(swap, gfp, info, idx); 885 page = shmem_swapin(swap, gfp, info, index);
1293 if (!page) { 886 if (!page) {
1294 spin_lock(&info->lock); 887 error = -ENOMEM;
1295 entry = shmem_swp_alloc(info, idx, sgp, gfp); 888 goto failed;
1296 if (IS_ERR(entry))
1297 error = PTR_ERR(entry);
1298 else {
1299 if (entry->val == swap.val)
1300 error = -ENOMEM;
1301 shmem_swp_unmap(entry);
1302 }
1303 spin_unlock(&info->lock);
1304 if (error)
1305 goto out;
1306 goto repeat;
1307 } 889 }
1308 wait_on_page_locked(page);
1309 page_cache_release(page);
1310 goto repeat;
1311 } 890 }
1312 891
1313 /* We have to do this with page locked to prevent races */ 892 /* We have to do this with page locked to prevent races */
1314 if (!trylock_page(page)) { 893 lock_page(page);
1315 shmem_swp_unmap(entry);
1316 spin_unlock(&info->lock);
1317 wait_on_page_locked(page);
1318 page_cache_release(page);
1319 goto repeat;
1320 }
1321 if (PageWriteback(page)) {
1322 shmem_swp_unmap(entry);
1323 spin_unlock(&info->lock);
1324 wait_on_page_writeback(page);
1325 unlock_page(page);
1326 page_cache_release(page);
1327 goto repeat;
1328 }
1329 if (!PageUptodate(page)) { 894 if (!PageUptodate(page)) {
1330 shmem_swp_unmap(entry);
1331 spin_unlock(&info->lock);
1332 unlock_page(page);
1333 page_cache_release(page);
1334 error = -EIO; 895 error = -EIO;
1335 goto out; 896 goto failed;
1336 } 897 }
1337 898 wait_on_page_writeback(page);
1338 error = add_to_page_cache_locked(page, mapping, 899
1339 idx, GFP_NOWAIT); 900 /* Someone may have already done it for us */
1340 if (error) { 901 if (page->mapping) {
1341 shmem_swp_unmap(entry); 902 if (page->mapping == mapping &&
1342 spin_unlock(&info->lock); 903 page->index == index)
1343 if (error == -ENOMEM) { 904 goto done;
1344 /* 905 error = -EEXIST;
1345 * reclaim from proper memory cgroup and 906 goto failed;
1346 * call memcg's OOM if needed.
1347 */
1348 error = mem_cgroup_shmem_charge_fallback(
1349 page, current->mm, gfp);
1350 if (error) {
1351 unlock_page(page);
1352 page_cache_release(page);
1353 goto out;
1354 }
1355 }
1356 unlock_page(page);
1357 page_cache_release(page);
1358 goto repeat;
1359 } 907 }
1360 908
1361 info->flags |= SHMEM_PAGEIN; 909 error = mem_cgroup_cache_charge(page, current->mm,
1362 shmem_swp_set(info, entry, 0); 910 gfp & GFP_RECLAIM_MASK);
1363 shmem_swp_unmap(entry); 911 if (!error)
1364 delete_from_swap_cache(page); 912 error = shmem_add_to_page_cache(page, mapping, index,
913 gfp, swp_to_radix_entry(swap));
914 if (error)
915 goto failed;
916
917 spin_lock(&info->lock);
918 info->swapped--;
919 shmem_recalc_inode(inode);
1365 spin_unlock(&info->lock); 920 spin_unlock(&info->lock);
921
922 delete_from_swap_cache(page);
1366 set_page_dirty(page); 923 set_page_dirty(page);
1367 swap_free(swap); 924 swap_free(swap);
1368 925
1369 } else if (sgp == SGP_READ) { 926 } else {
1370 shmem_swp_unmap(entry); 927 if (shmem_acct_block(info->flags)) {
1371 page = find_get_page(mapping, idx); 928 error = -ENOSPC;
1372 if (page && !trylock_page(page)) { 929 goto failed;
1373 spin_unlock(&info->lock);
1374 wait_on_page_locked(page);
1375 page_cache_release(page);
1376 goto repeat;
1377 } 930 }
1378 spin_unlock(&info->lock);
1379
1380 } else if (prealloc_page) {
1381 shmem_swp_unmap(entry);
1382 sbinfo = SHMEM_SB(inode->i_sb);
1383 if (sbinfo->max_blocks) { 931 if (sbinfo->max_blocks) {
1384 if (percpu_counter_compare(&sbinfo->used_blocks, 932 if (percpu_counter_compare(&sbinfo->used_blocks,
1385 sbinfo->max_blocks) >= 0 || 933 sbinfo->max_blocks) >= 0) {
1386 shmem_acct_block(info->flags)) 934 error = -ENOSPC;
1387 goto nospace; 935 goto unacct;
936 }
1388 percpu_counter_inc(&sbinfo->used_blocks); 937 percpu_counter_inc(&sbinfo->used_blocks);
1389 inode->i_blocks += BLOCKS_PER_PAGE;
1390 } else if (shmem_acct_block(info->flags))
1391 goto nospace;
1392
1393 page = prealloc_page;
1394 prealloc_page = NULL;
1395
1396 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1397 if (IS_ERR(entry))
1398 error = PTR_ERR(entry);
1399 else {
1400 swap = *entry;
1401 shmem_swp_unmap(entry);
1402 } 938 }
1403 ret = error || swap.val; 939
1404 if (ret) 940 page = shmem_alloc_page(gfp, info, index);
1405 mem_cgroup_uncharge_cache_page(page); 941 if (!page) {
1406 else 942 error = -ENOMEM;
1407 ret = add_to_page_cache_lru(page, mapping, 943 goto decused;
1408 idx, GFP_NOWAIT);
1409 /*
1410 * At add_to_page_cache_lru() failure,
1411 * uncharge will be done automatically.
1412 */
1413 if (ret) {
1414 shmem_unacct_blocks(info->flags, 1);
1415 shmem_free_blocks(inode, 1);
1416 spin_unlock(&info->lock);
1417 page_cache_release(page);
1418 if (error)
1419 goto out;
1420 goto repeat;
1421 } 944 }
1422 945
1423 info->flags |= SHMEM_PAGEIN; 946 SetPageSwapBacked(page);
947 __set_page_locked(page);
948 error = mem_cgroup_cache_charge(page, current->mm,
949 gfp & GFP_RECLAIM_MASK);
950 if (!error)
951 error = shmem_add_to_page_cache(page, mapping, index,
952 gfp, NULL);
953 if (error)
954 goto decused;
955 lru_cache_add_anon(page);
956
957 spin_lock(&info->lock);
1424 info->alloced++; 958 info->alloced++;
959 inode->i_blocks += BLOCKS_PER_PAGE;
960 shmem_recalc_inode(inode);
1425 spin_unlock(&info->lock); 961 spin_unlock(&info->lock);
962
1426 clear_highpage(page); 963 clear_highpage(page);
1427 flush_dcache_page(page); 964 flush_dcache_page(page);
1428 SetPageUptodate(page); 965 SetPageUptodate(page);
1429 if (sgp == SGP_DIRTY) 966 if (sgp == SGP_DIRTY)
1430 set_page_dirty(page); 967 set_page_dirty(page);
1431
1432 } else {
1433 spin_unlock(&info->lock);
1434 error = -ENOMEM;
1435 goto out;
1436 } 968 }
1437done: 969done:
1438 *pagep = page; 970 /* Perhaps the file has been truncated since we checked */
1439 error = 0; 971 if (sgp != SGP_WRITE &&
1440out: 972 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1441 if (prealloc_page) { 973 error = -EINVAL;
1442 mem_cgroup_uncharge_cache_page(prealloc_page); 974 goto trunc;
1443 page_cache_release(prealloc_page);
1444 } 975 }
1445 return error; 976 *pagep = page;
977 return 0;
1446 978
1447nospace:
1448 /* 979 /*
1449 * Perhaps the page was brought in from swap between find_lock_page 980 * Error recovery.
1450 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1451 * but must also avoid reporting a spurious ENOSPC while working on a
1452 * full tmpfs.
1453 */ 981 */
1454 page = find_get_page(mapping, idx); 982trunc:
983 ClearPageDirty(page);
984 delete_from_page_cache(page);
985 spin_lock(&info->lock);
986 info->alloced--;
987 inode->i_blocks -= BLOCKS_PER_PAGE;
1455 spin_unlock(&info->lock); 988 spin_unlock(&info->lock);
989decused:
990 if (sbinfo->max_blocks)
991 percpu_counter_add(&sbinfo->used_blocks, -1);
992unacct:
993 shmem_unacct_blocks(info->flags, 1);
994failed:
995 if (swap.val && error != -EINVAL) {
996 struct page *test = find_get_page(mapping, index);
997 if (test && !radix_tree_exceptional_entry(test))
998 page_cache_release(test);
999 /* Have another try if the entry has changed */
1000 if (test != swp_to_radix_entry(swap))
1001 error = -EEXIST;
1002 }
1456 if (page) { 1003 if (page) {
1004 unlock_page(page);
1457 page_cache_release(page); 1005 page_cache_release(page);
1006 }
1007 if (error == -ENOSPC && !once++) {
1008 info = SHMEM_I(inode);
1009 spin_lock(&info->lock);
1010 shmem_recalc_inode(inode);
1011 spin_unlock(&info->lock);
1458 goto repeat; 1012 goto repeat;
1459 } 1013 }
1460 error = -ENOSPC; 1014 if (error == -EEXIST)
1461 goto out; 1015 goto repeat;
1016 return error;
1462} 1017}
1463 1018
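shmem_getpage_gfp() above now starts from a single find_lock_page(): a real page is returned as-is, an exceptional entry means the page must be brought back from swap and re-inserted over that entry, and an empty slot means allocate, charge and insert, with the trunc/decused/unacct/failed labels unwinding each step on error, retries on -EEXIST, and a single retry on the first -ENOSPC. The old SHMEM_MAX_INDEX cap is gone as well; only MAX_LFS_FILESIZE limits the index. The sketch below models just the three-way dispatch on what the lookup returned, reusing the assumed low-bit tag from the earlier sketch.

/* Model of the new shmem_getpage_gfp() fast path: one lookup, then dispatch
 * on what the slot held: a page, a swap entry, or nothing.  Tag handling
 * follows the earlier exceptional-entry sketch.
 */
#include <stdint.h>
#include <stdio.h>

static int is_swap_entry(void *slot)
{
	return slot && (((uintptr_t)slot & 0x2) != 0);
}

static const char *getpage_path(void *slot)
{
	if (is_swap_entry(slot))
		return "swap-in: read page back, replace entry in cache";
	if (slot)
		return "hit: page already in cache, just lock and return it";
	return "miss: allocate page, charge it, insert into cache";
}

int main(void)
{
	int page;

	printf("%s\n", getpage_path(&page));
	printf("%s\n", getpage_path((void *)0x1c82UL));	/* tagged swap entry */
	printf("%s\n", getpage_path(NULL));
	return 0;
}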
1464static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1019static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -1467,9 +1022,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1467 int error; 1022 int error;
1468 int ret = VM_FAULT_LOCKED; 1023 int ret = VM_FAULT_LOCKED;
1469 1024
1470 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1471 return VM_FAULT_SIGBUS;
1472
1473 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1025 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1474 if (error) 1026 if (error)
1475 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1027 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1482,20 +1034,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1482} 1034}
1483 1035
1484#ifdef CONFIG_NUMA 1036#ifdef CONFIG_NUMA
1485static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1037static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1486{ 1038{
1487 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1039 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1488 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1040 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1489} 1041}
1490 1042
1491static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1043static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1492 unsigned long addr) 1044 unsigned long addr)
1493{ 1045{
1494 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1046 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1495 unsigned long idx; 1047 pgoff_t index;
1496 1048
1497 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1049 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1498 return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); 1050 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1499} 1051}
1500#endif 1052#endif
1501 1053
@@ -1516,6 +1068,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
1516 user_shm_unlock(inode->i_size, user); 1068 user_shm_unlock(inode->i_size, user);
1517 info->flags &= ~VM_LOCKED; 1069 info->flags &= ~VM_LOCKED;
1518 mapping_clear_unevictable(file->f_mapping); 1070 mapping_clear_unevictable(file->f_mapping);
1071 /*
1072 * Ensure that a racing putback_lru_page() can see
1073 * the pages of this mapping are evictable when we
1074 * skip them due to !PageLRU during the scan.
1075 */
1076 smp_mb__after_clear_bit();
1519 scan_mapping_unevictable_pages(file->f_mapping); 1077 scan_mapping_unevictable_pages(file->f_mapping);
1520 } 1078 }
1521 retval = 0; 1079 retval = 0;
@@ -1593,7 +1151,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1593 1151
1594#ifdef CONFIG_TMPFS 1152#ifdef CONFIG_TMPFS
1595static const struct inode_operations shmem_symlink_inode_operations; 1153static const struct inode_operations shmem_symlink_inode_operations;
1596static const struct inode_operations shmem_symlink_inline_operations; 1154static const struct inode_operations shmem_short_symlink_operations;
1597 1155
1598static int 1156static int
1599shmem_write_begin(struct file *file, struct address_space *mapping, 1157shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1626,7 +1184,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1626{ 1184{
1627 struct inode *inode = filp->f_path.dentry->d_inode; 1185 struct inode *inode = filp->f_path.dentry->d_inode;
1628 struct address_space *mapping = inode->i_mapping; 1186 struct address_space *mapping = inode->i_mapping;
1629 unsigned long index, offset; 1187 pgoff_t index;
1188 unsigned long offset;
1630 enum sgp_type sgp = SGP_READ; 1189 enum sgp_type sgp = SGP_READ;
1631 1190
1632 /* 1191 /*
@@ -1642,7 +1201,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1642 1201
1643 for (;;) { 1202 for (;;) {
1644 struct page *page = NULL; 1203 struct page *page = NULL;
1645 unsigned long end_index, nr, ret; 1204 pgoff_t end_index;
1205 unsigned long nr, ret;
1646 loff_t i_size = i_size_read(inode); 1206 loff_t i_size = i_size_read(inode);
1647 1207
1648 end_index = i_size >> PAGE_CACHE_SHIFT; 1208 end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1880,8 +1440,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1880 buf->f_namelen = NAME_MAX; 1440 buf->f_namelen = NAME_MAX;
1881 if (sbinfo->max_blocks) { 1441 if (sbinfo->max_blocks) {
1882 buf->f_blocks = sbinfo->max_blocks; 1442 buf->f_blocks = sbinfo->max_blocks;
1883 buf->f_bavail = buf->f_bfree = 1443 buf->f_bavail =
1884 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); 1444 buf->f_bfree = sbinfo->max_blocks -
1445 percpu_counter_sum(&sbinfo->used_blocks);
1885 } 1446 }
1886 if (sbinfo->max_inodes) { 1447 if (sbinfo->max_inodes) {
1887 buf->f_files = sbinfo->max_inodes; 1448 buf->f_files = sbinfo->max_inodes;
@@ -1903,7 +1464,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1903 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1464 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1904 if (inode) { 1465 if (inode) {
1905 error = security_inode_init_security(inode, dir, 1466 error = security_inode_init_security(inode, dir,
1906 &dentry->d_name, NULL, 1467 &dentry->d_name,
1907 NULL, NULL); 1468 NULL, NULL);
1908 if (error) { 1469 if (error) {
1909 if (error != -EOPNOTSUPP) { 1470 if (error != -EOPNOTSUPP) {
@@ -2043,7 +1604,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2043 if (!inode) 1604 if (!inode)
2044 return -ENOSPC; 1605 return -ENOSPC;
2045 1606
2046 error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, 1607 error = security_inode_init_security(inode, dir, &dentry->d_name,
2047 NULL, NULL); 1608 NULL, NULL);
2048 if (error) { 1609 if (error) {
2049 if (error != -EOPNOTSUPP) { 1610 if (error != -EOPNOTSUPP) {
@@ -2055,10 +1616,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2055 1616
2056 info = SHMEM_I(inode); 1617 info = SHMEM_I(inode);
2057 inode->i_size = len-1; 1618 inode->i_size = len-1;
2058 if (len <= SHMEM_SYMLINK_INLINE_LEN) { 1619 if (len <= SHORT_SYMLINK_LEN) {
2059 /* do it inline */ 1620 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2060 memcpy(info->inline_symlink, symname, len); 1621 if (!info->symlink) {
2061 inode->i_op = &shmem_symlink_inline_operations; 1622 iput(inode);
1623 return -ENOMEM;
1624 }
1625 inode->i_op = &shmem_short_symlink_operations;
2062 } else { 1626 } else {
2063 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 1627 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2064 if (error) { 1628 if (error) {
@@ -2081,17 +1645,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2081 return 0; 1645 return 0;
2082} 1646}
2083 1647
2084static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1648static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2085{ 1649{
2086 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); 1650 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2087 return NULL; 1651 return NULL;
2088} 1652}
2089 1653
2090static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1654static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2091{ 1655{
2092 struct page *page = NULL; 1656 struct page *page = NULL;
2093 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1657 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2094 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1658 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2095 if (page) 1659 if (page)
2096 unlock_page(page); 1660 unlock_page(page);
2097 return page; 1661 return page;
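Short symlink targets are now kmemdup()'d into info->symlink and freed in shmem_evict_inode(), replacing the old inline buffer inside the inode, while longer targets still go through the first page of the file as before. A userspace model of the length split follows; SHORT_SYMLINK_LEN is not defined in the hunks shown here, so the 128-byte cutoff is an assumption of the sketch.

/* Model of the short-vs-long symlink split: targets up to a cutoff are
 * duplicated into a small heap buffer (kmemdup in the kernel), longer ones
 * would go through page-backed storage.  SHORT_SYMLINK_LEN's real value is
 * not shown in these hunks, so the cutoff below is an assumption.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SHORT_SYMLINK_LEN 128	/* assumed cutoff for this sketch */

struct inode_model {
	char *symlink;		/* set only for short symlinks */
};

static int set_symlink(struct inode_model *inode, const char *symname)
{
	size_t len = strlen(symname) + 1;

	if (len <= SHORT_SYMLINK_LEN) {
		inode->symlink = malloc(len);		/* kmemdup() */
		if (!inode->symlink)
			return -1;
		memcpy(inode->symlink, symname, len);
		return 0;
	}
	/* long target: would be written into the first page of the file */
	return 1;
}

int main(void)
{
	struct inode_model inode = { 0 };

	if (set_symlink(&inode, "/tmp/target") == 0)
		printf("short symlink stored: %s\n", inode.symlink);
	free(inode.symlink);				/* kfree() in evict_inode */
	return 0;
}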
@@ -2202,7 +1766,6 @@ out:
2202 return err; 1766 return err;
2203} 1767}
2204 1768
2205
2206static const struct xattr_handler *shmem_xattr_handlers[] = { 1769static const struct xattr_handler *shmem_xattr_handlers[] = {
2207#ifdef CONFIG_TMPFS_POSIX_ACL 1770#ifdef CONFIG_TMPFS_POSIX_ACL
2208 &generic_acl_access_handler, 1771 &generic_acl_access_handler,
@@ -2332,9 +1895,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2332} 1895}
2333#endif /* CONFIG_TMPFS_XATTR */ 1896#endif /* CONFIG_TMPFS_XATTR */
2334 1897
2335static const struct inode_operations shmem_symlink_inline_operations = { 1898static const struct inode_operations shmem_short_symlink_operations = {
2336 .readlink = generic_readlink, 1899 .readlink = generic_readlink,
2337 .follow_link = shmem_follow_link_inline, 1900 .follow_link = shmem_follow_short_symlink,
2338#ifdef CONFIG_TMPFS_XATTR 1901#ifdef CONFIG_TMPFS_XATTR
2339 .setxattr = shmem_setxattr, 1902 .setxattr = shmem_setxattr,
2340 .getxattr = shmem_getxattr, 1903 .getxattr = shmem_getxattr,
@@ -2534,8 +2097,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2534 if (config.max_inodes < inodes) 2097 if (config.max_inodes < inodes)
2535 goto out; 2098 goto out;
2536 /* 2099 /*
2537 * Those tests also disallow limited->unlimited while any are in 2100 * Those tests disallow limited->unlimited while any are in use;
2538 * use, so i_blocks will always be zero when max_blocks is zero;
2539 * but we must separately disallow unlimited->limited, because 2101 * but we must separately disallow unlimited->limited, because
2540 * in that case we have no record of how much is already in use. 2102 * in that case we have no record of how much is already in use.
2541 */ 2103 */
@@ -2627,7 +2189,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2627 goto failed; 2189 goto failed;
2628 sbinfo->free_inodes = sbinfo->max_inodes; 2190 sbinfo->free_inodes = sbinfo->max_inodes;
2629 2191
2630 sb->s_maxbytes = SHMEM_MAX_BYTES; 2192 sb->s_maxbytes = MAX_LFS_FILESIZE;
2631 sb->s_blocksize = PAGE_CACHE_SIZE; 2193 sb->s_blocksize = PAGE_CACHE_SIZE;
2632 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2194 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2633 sb->s_magic = TMPFS_MAGIC; 2195 sb->s_magic = TMPFS_MAGIC;
@@ -2662,14 +2224,14 @@ static struct kmem_cache *shmem_inode_cachep;
2662 2224
2663static struct inode *shmem_alloc_inode(struct super_block *sb) 2225static struct inode *shmem_alloc_inode(struct super_block *sb)
2664{ 2226{
2665 struct shmem_inode_info *p; 2227 struct shmem_inode_info *info;
2666 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2228 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2667 if (!p) 2229 if (!info)
2668 return NULL; 2230 return NULL;
2669 return &p->vfs_inode; 2231 return &info->vfs_inode;
2670} 2232}
2671 2233
2672static void shmem_i_callback(struct rcu_head *head) 2234static void shmem_destroy_callback(struct rcu_head *head)
2673{ 2235{
2674 struct inode *inode = container_of(head, struct inode, i_rcu); 2236 struct inode *inode = container_of(head, struct inode, i_rcu);
2675 INIT_LIST_HEAD(&inode->i_dentry); 2237 INIT_LIST_HEAD(&inode->i_dentry);
@@ -2678,29 +2240,26 @@ static void shmem_i_callback(struct rcu_head *head)
2678 2240
2679static void shmem_destroy_inode(struct inode *inode) 2241static void shmem_destroy_inode(struct inode *inode)
2680{ 2242{
2681 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2243 if ((inode->i_mode & S_IFMT) == S_IFREG)
2682 /* only struct inode is valid if it's an inline symlink */
2683 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2244 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2684 } 2245 call_rcu(&inode->i_rcu, shmem_destroy_callback);
2685 call_rcu(&inode->i_rcu, shmem_i_callback);
2686} 2246}
2687 2247
2688static void init_once(void *foo) 2248static void shmem_init_inode(void *foo)
2689{ 2249{
2690 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2250 struct shmem_inode_info *info = foo;
2691 2251 inode_init_once(&info->vfs_inode);
2692 inode_init_once(&p->vfs_inode);
2693} 2252}
2694 2253
2695static int init_inodecache(void) 2254static int shmem_init_inodecache(void)
2696{ 2255{
2697 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2256 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2698 sizeof(struct shmem_inode_info), 2257 sizeof(struct shmem_inode_info),
2699 0, SLAB_PANIC, init_once); 2258 0, SLAB_PANIC, shmem_init_inode);
2700 return 0; 2259 return 0;
2701} 2260}
2702 2261
2703static void destroy_inodecache(void) 2262static void shmem_destroy_inodecache(void)
2704{ 2263{
2705 kmem_cache_destroy(shmem_inode_cachep); 2264 kmem_cache_destroy(shmem_inode_cachep);
2706} 2265}
@@ -2797,21 +2356,20 @@ static const struct vm_operations_struct shmem_vm_ops = {
2797#endif 2356#endif
2798}; 2357};
2799 2358
2800
2801static struct dentry *shmem_mount(struct file_system_type *fs_type, 2359static struct dentry *shmem_mount(struct file_system_type *fs_type,
2802 int flags, const char *dev_name, void *data) 2360 int flags, const char *dev_name, void *data)
2803{ 2361{
2804 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2362 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2805} 2363}
2806 2364
2807static struct file_system_type tmpfs_fs_type = { 2365static struct file_system_type shmem_fs_type = {
2808 .owner = THIS_MODULE, 2366 .owner = THIS_MODULE,
2809 .name = "tmpfs", 2367 .name = "tmpfs",
2810 .mount = shmem_mount, 2368 .mount = shmem_mount,
2811 .kill_sb = kill_litter_super, 2369 .kill_sb = kill_litter_super,
2812}; 2370};
2813 2371
2814int __init init_tmpfs(void) 2372int __init shmem_init(void)
2815{ 2373{
2816 int error; 2374 int error;
2817 2375
@@ -2819,18 +2377,18 @@ int __init init_tmpfs(void)
2819 if (error) 2377 if (error)
2820 goto out4; 2378 goto out4;
2821 2379
2822 error = init_inodecache(); 2380 error = shmem_init_inodecache();
2823 if (error) 2381 if (error)
2824 goto out3; 2382 goto out3;
2825 2383
2826 error = register_filesystem(&tmpfs_fs_type); 2384 error = register_filesystem(&shmem_fs_type);
2827 if (error) { 2385 if (error) {
2828 printk(KERN_ERR "Could not register tmpfs\n"); 2386 printk(KERN_ERR "Could not register tmpfs\n");
2829 goto out2; 2387 goto out2;
2830 } 2388 }
2831 2389
2832 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, 2390 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
2833 tmpfs_fs_type.name, NULL); 2391 shmem_fs_type.name, NULL);
2834 if (IS_ERR(shm_mnt)) { 2392 if (IS_ERR(shm_mnt)) {
2835 error = PTR_ERR(shm_mnt); 2393 error = PTR_ERR(shm_mnt);
2836 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2394 printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2839,9 +2397,9 @@ int __init init_tmpfs(void)
2839 return 0; 2397 return 0;
2840 2398
2841out1: 2399out1:
2842 unregister_filesystem(&tmpfs_fs_type); 2400 unregister_filesystem(&shmem_fs_type);
2843out2: 2401out2:
2844 destroy_inodecache(); 2402 shmem_destroy_inodecache();
2845out3: 2403out3:
2846 bdi_destroy(&shmem_backing_dev_info); 2404 bdi_destroy(&shmem_backing_dev_info);
2847out4: 2405out4:
@@ -2849,45 +2407,6 @@ out4:
2849 return error; 2407 return error;
2850} 2408}
2851 2409
2852#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2853/**
2854 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2855 * @inode: the inode to be searched
2856 * @pgoff: the offset to be searched
2857 * @pagep: the pointer for the found page to be stored
2858 * @ent: the pointer for the found swap entry to be stored
2859 *
2860 * If a page is found, refcount of it is incremented. Callers should handle
2861 * these refcount.
2862 */
2863void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2864 struct page **pagep, swp_entry_t *ent)
2865{
2866 swp_entry_t entry = { .val = 0 }, *ptr;
2867 struct page *page = NULL;
2868 struct shmem_inode_info *info = SHMEM_I(inode);
2869
2870 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2871 goto out;
2872
2873 spin_lock(&info->lock);
2874 ptr = shmem_swp_entry(info, pgoff, NULL);
2875#ifdef CONFIG_SWAP
2876 if (ptr && ptr->val) {
2877 entry.val = ptr->val;
2878 page = find_get_page(&swapper_space, entry.val);
2879 } else
2880#endif
2881 page = find_get_page(inode->i_mapping, pgoff);
2882 if (ptr)
2883 shmem_swp_unmap(ptr);
2884 spin_unlock(&info->lock);
2885out:
2886 *pagep = page;
2887 *ent = entry;
2888}
2889#endif
2890
2891#else /* !CONFIG_SHMEM */ 2410#else /* !CONFIG_SHMEM */
2892 2411
2893/* 2412/*
@@ -2901,23 +2420,23 @@ out:
2901 2420
2902#include <linux/ramfs.h> 2421#include <linux/ramfs.h>
2903 2422
2904static struct file_system_type tmpfs_fs_type = { 2423static struct file_system_type shmem_fs_type = {
2905 .name = "tmpfs", 2424 .name = "tmpfs",
2906 .mount = ramfs_mount, 2425 .mount = ramfs_mount,
2907 .kill_sb = kill_litter_super, 2426 .kill_sb = kill_litter_super,
2908}; 2427};
2909 2428
2910int __init init_tmpfs(void) 2429int __init shmem_init(void)
2911{ 2430{
2912 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2431 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2913 2432
2914 shm_mnt = kern_mount(&tmpfs_fs_type); 2433 shm_mnt = kern_mount(&shmem_fs_type);
2915 BUG_ON(IS_ERR(shm_mnt)); 2434 BUG_ON(IS_ERR(shm_mnt));
2916 2435
2917 return 0; 2436 return 0;
2918} 2437}
2919 2438
2920int shmem_unuse(swp_entry_t entry, struct page *page) 2439int shmem_unuse(swp_entry_t swap, struct page *page)
2921{ 2440{
2922 return 0; 2441 return 0;
2923} 2442}
@@ -2927,43 +2446,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2927 return 0; 2446 return 0;
2928} 2447}
2929 2448
2930void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 2449void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2931{ 2450{
2932 truncate_inode_pages_range(inode->i_mapping, start, end); 2451 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2933} 2452}
2934EXPORT_SYMBOL_GPL(shmem_truncate_range); 2453EXPORT_SYMBOL_GPL(shmem_truncate_range);
2935 2454
2936#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2937/**
2938 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2939 * @inode: the inode to be searched
2940 * @pgoff: the offset to be searched
2941 * @pagep: the pointer for the found page to be stored
2942 * @ent: the pointer for the found swap entry to be stored
2943 *
2944 * If a page is found, refcount of it is incremented. Callers should handle
2945 * these refcount.
2946 */
2947void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2948 struct page **pagep, swp_entry_t *ent)
2949{
2950 struct page *page = NULL;
2951
2952 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2953 goto out;
2954 page = find_get_page(inode->i_mapping, pgoff);
2955out:
2956 *pagep = page;
2957 *ent = (swp_entry_t){ .val = 0 };
2958}
2959#endif
2960
2961#define shmem_vm_ops generic_file_vm_ops 2455#define shmem_vm_ops generic_file_vm_ops
2962#define shmem_file_operations ramfs_file_operations 2456#define shmem_file_operations ramfs_file_operations
2963#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2457#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2964#define shmem_acct_size(flags, size) 0 2458#define shmem_acct_size(flags, size) 0
2965#define shmem_unacct_size(flags, size) do {} while (0) 2459#define shmem_unacct_size(flags, size) do {} while (0)
2966#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
2967 2460
2968#endif /* CONFIG_SHMEM */ 2461#endif /* CONFIG_SHMEM */
2969 2462
@@ -2987,7 +2480,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2987 if (IS_ERR(shm_mnt)) 2480 if (IS_ERR(shm_mnt))
2988 return (void *)shm_mnt; 2481 return (void *)shm_mnt;
2989 2482
2990 if (size < 0 || size > SHMEM_MAX_BYTES) 2483 if (size < 0 || size > MAX_LFS_FILESIZE)
2991 return ERR_PTR(-EINVAL); 2484 return ERR_PTR(-EINVAL);
2992 2485
2993 if (shmem_acct_size(flags, size)) 2486 if (shmem_acct_size(flags, size))
@@ -3010,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
3010 2503
3011 d_instantiate(path.dentry, inode); 2504 d_instantiate(path.dentry, inode);
3012 inode->i_size = size; 2505 inode->i_size = size;
3013 inode->i_nlink = 0; /* It is unlinked */ 2506 clear_nlink(inode); /* It is unlinked */
3014#ifndef CONFIG_MMU 2507#ifndef CONFIG_MMU
3015 error = ramfs_nommu_expand_for_mapping(inode, size); 2508 error = ramfs_nommu_expand_for_mapping(inode, size);
3016 if (error) 2509 if (error)
diff --git a/mm/slab.c b/mm/slab.c
index 1e523ed47c61..708efe886154 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -622,6 +622,51 @@ int slab_is_available(void)
622static struct lock_class_key on_slab_l3_key; 622static struct lock_class_key on_slab_l3_key;
623static struct lock_class_key on_slab_alc_key; 623static struct lock_class_key on_slab_alc_key;
624 624
625static struct lock_class_key debugobj_l3_key;
626static struct lock_class_key debugobj_alc_key;
627
628static void slab_set_lock_classes(struct kmem_cache *cachep,
629 struct lock_class_key *l3_key, struct lock_class_key *alc_key,
630 int q)
631{
632 struct array_cache **alc;
633 struct kmem_list3 *l3;
634 int r;
635
636 l3 = cachep->nodelists[q];
637 if (!l3)
638 return;
639
640 lockdep_set_class(&l3->list_lock, l3_key);
641 alc = l3->alien;
642 /*
643 * FIXME: This check for BAD_ALIEN_MAGIC
644 * should go away when common slab code is taught to
645 * work even without alien caches.
646 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 * for alloc_alien_cache.
648 */
649 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 return;
651 for_each_node(r) {
652 if (alc[r])
653 lockdep_set_class(&alc[r]->lock, alc_key);
654 }
655}
656
657static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
658{
659 slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
660}
661
662static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
663{
664 int node;
665
666 for_each_online_node(node)
667 slab_set_debugobj_lock_classes_node(cachep, node);
668}
669
625static void init_node_lock_keys(int q) 670static void init_node_lock_keys(int q)
626{ 671{
627 struct cache_sizes *s = malloc_sizes; 672 struct cache_sizes *s = malloc_sizes;
@@ -630,29 +675,14 @@ static void init_node_lock_keys(int q)
630 return; 675 return;
631 676
632 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 677 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
633 struct array_cache **alc;
634 struct kmem_list3 *l3; 678 struct kmem_list3 *l3;
635 int r;
636 679
637 l3 = s->cs_cachep->nodelists[q]; 680 l3 = s->cs_cachep->nodelists[q];
638 if (!l3 || OFF_SLAB(s->cs_cachep)) 681 if (!l3 || OFF_SLAB(s->cs_cachep))
639 continue; 682 continue;
640 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 683
641 alc = l3->alien; 684 slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
642 /* 685 &on_slab_alc_key, q);
643 * FIXME: This check for BAD_ALIEN_MAGIC
644 * should go away when common slab code is taught to
645 * work even without alien caches.
646 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 * for alloc_alien_cache,
648 */
649 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 continue;
651 for_each_node(r) {
652 if (alc[r])
653 lockdep_set_class(&alc[r]->lock,
654 &on_slab_alc_key);
655 }
656 } 686 }
657} 687}
658 688
@@ -671,6 +701,14 @@ static void init_node_lock_keys(int q)
671static inline void init_lock_keys(void) 701static inline void init_lock_keys(void)
672{ 702{
673} 703}
704
705static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
706{
707}
708
709static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
710{
711}
674#endif 712#endif
675 713
676/* 714/*
@@ -1264,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1264 spin_unlock_irq(&l3->list_lock); 1302 spin_unlock_irq(&l3->list_lock);
1265 kfree(shared); 1303 kfree(shared);
1266 free_alien_cache(alien); 1304 free_alien_cache(alien);
1305 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1306 slab_set_debugobj_lock_classes_node(cachep, node);
1267 } 1307 }
1268 init_node_lock_keys(node); 1308 init_node_lock_keys(node);
1269 1309
@@ -1626,6 +1666,9 @@ void __init kmem_cache_init_late(void)
1626{ 1666{
1627 struct kmem_cache *cachep; 1667 struct kmem_cache *cachep;
1628 1668
1669 /* Annotate slab for lockdep -- annotate the malloc caches */
1670 init_lock_keys();
1671
1629 /* 6) resize the head arrays to their final sizes */ 1672 /* 6) resize the head arrays to their final sizes */
1630 mutex_lock(&cache_chain_mutex); 1673 mutex_lock(&cache_chain_mutex);
1631 list_for_each_entry(cachep, &cache_chain, next) 1674 list_for_each_entry(cachep, &cache_chain, next)
@@ -1636,9 +1679,6 @@ void __init kmem_cache_init_late(void)
1636 /* Done! */ 1679 /* Done! */
1637 g_cpucache_up = FULL; 1680 g_cpucache_up = FULL;
1638 1681
1639 /* Annotate slab for lockdep -- annotate the malloc caches */
1640 init_lock_keys();
1641
1642 /* 1682 /*
1643 * Register a cpu startup notifier callback that initializes 1683 * Register a cpu startup notifier callback that initializes
1644 * cpu_cache_get for all new cpus 1684 * cpu_cache_get for all new cpus
@@ -1811,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit)
1811 unsigned char error = 0; 1851 unsigned char error = 0;
1812 int bad_count = 0; 1852 int bad_count = 0;
1813 1853
1814 printk(KERN_ERR "%03x:", offset); 1854 printk(KERN_ERR "%03x: ", offset);
1815 for (i = 0; i < limit; i++) { 1855 for (i = 0; i < limit; i++) {
1816 if (data[offset + i] != POISON_FREE) { 1856 if (data[offset + i] != POISON_FREE) {
1817 error = data[offset + i]; 1857 error = data[offset + i];
1818 bad_count++; 1858 bad_count++;
1819 } 1859 }
1820 printk(" %02x", (unsigned char)data[offset + i]);
1821 } 1860 }
1822 printk("\n"); 1861 print_hex_dump(KERN_CONT, "", 0, 16, 1,
1862 &data[offset], limit, 1);
1823 1863
1824 if (bad_count == 1) { 1864 if (bad_count == 1) {
1825 error ^= POISON_FREE; 1865 error ^= POISON_FREE;
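
dump_line() above now hands the bytes to print_hex_dump() instead of printing them one at a time. As a rough userspace stand-in for the output shape only (the kernel helper additionally takes a log level, prefix string, group size and an optional ASCII column), a 16-bytes-per-row dump can be sketched as follows; hex_dump() and the sample buffer are made up for the example.

#include <stdio.h>
#include <stddef.h>

/* Print 16 bytes per row as "offset: xx xx ...". */
static void hex_dump(const void *buf, size_t len)
{
	const unsigned char *p = buf;

	for (size_t off = 0; off < len; off += 16) {
		printf("%03zx:", off);
		for (size_t i = off; i < off + 16 && i < len; i++)
			printf(" %02x", p[i]);
		printf("\n");
	}
}

int main(void)
{
	unsigned char obj[32];

	for (size_t i = 0; i < sizeof(obj); i++)
		obj[i] = 0x6b;		/* POISON_FREE-style fill */
	obj[5] = 0x42;			/* one corrupted byte */
	hex_dump(obj, sizeof(obj));
	return 0;
}
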
@@ -2426,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2426 goto oops; 2466 goto oops;
2427 } 2467 }
2428 2468
2469 if (flags & SLAB_DEBUG_OBJECTS) {
2470 /*
2471 * Would deadlock through slab_destroy()->call_rcu()->
2472 * debug_object_activate()->kmem_cache_alloc().
2473 */
2474 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2475
2476 slab_set_debugobj_lock_classes(cachep);
2477 }
2478
2429 /* cache setup completed, link it into the list */ 2479 /* cache setup completed, link it into the list */
2430 list_add(&cachep->next, &cache_chain); 2480 list_add(&cachep->next, &cache_chain);
2431oops: 2481oops:
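
The WARN_ON_ONCE() added above documents why SLAB_DESTROY_BY_RCU plus SLAB_DEBUG_OBJECTS is refused: freeing would re-enter the allocator through call_rcu() -> debug_object_activate() -> kmem_cache_alloc(). A loose userspace caricature of that kind of re-entrancy follows, with invented names and a trylock standing in for what would be a hard self-deadlock in the kernel.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

static void *cache_alloc(void);

/* A debug hook that itself wants memory for a tracking record. */
static void debug_hook(void)
{
	void *record = cache_alloc();
	free(record);
}

static void *cache_alloc(void)
{
	if (pthread_mutex_trylock(&cache_lock) != 0) {
		/* Re-entered while the lock is held: a deadlock in disguise. */
		fprintf(stderr, "would deadlock: allocator re-entered\n");
		return NULL;
	}
	void *obj = malloc(32);
	pthread_mutex_unlock(&cache_lock);
	return obj;
}

static void cache_free(void *obj)
{
	pthread_mutex_lock(&cache_lock);
	debug_hook();		/* fires while cache_lock is still held */
	free(obj);
	pthread_mutex_unlock(&cache_lock);
}

int main(void)
{
	cache_free(cache_alloc());
	return 0;
}
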
@@ -2989,14 +3039,9 @@ bad:
2989 printk(KERN_ERR "slab: Internal list corruption detected in " 3039 printk(KERN_ERR "slab: Internal list corruption detected in "
2990 "cache '%s'(%d), slabp %p(%d). Hexdump:\n", 3040 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2991 cachep->name, cachep->num, slabp, slabp->inuse); 3041 cachep->name, cachep->num, slabp, slabp->inuse);
2992 for (i = 0; 3042 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
2993 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 3043 sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
2994 i++) { 3044 1);
2995 if (i % 16 == 0)
2996 printk("\n%03x:", i);
2997 printk(" %02x", ((unsigned char *)slabp)[i]);
2998 }
2999 printk("\n");
3000 BUG(); 3045 BUG();
3001 } 3046 }
3002} 3047}
@@ -3403,7 +3448,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3403 cache_alloc_debugcheck_before(cachep, flags); 3448 cache_alloc_debugcheck_before(cachep, flags);
3404 local_irq_save(save_flags); 3449 local_irq_save(save_flags);
3405 3450
3406 if (nodeid == -1) 3451 if (nodeid == NUMA_NO_NODE)
3407 nodeid = slab_node; 3452 nodeid = slab_node;
3408 3453
3409 if (unlikely(!cachep->nodelists[nodeid])) { 3454 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3934,7 +3979,7 @@ fail:
3934 3979
3935struct ccupdate_struct { 3980struct ccupdate_struct {
3936 struct kmem_cache *cachep; 3981 struct kmem_cache *cachep;
3937 struct array_cache *new[NR_CPUS]; 3982 struct array_cache *new[0];
3938}; 3983};
3939 3984
3940static void do_ccupdate_local(void *info) 3985static void do_ccupdate_local(void *info)
@@ -3956,7 +4001,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3956 struct ccupdate_struct *new; 4001 struct ccupdate_struct *new;
3957 int i; 4002 int i;
3958 4003
3959 new = kzalloc(sizeof(*new), gfp); 4004 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
4005 gfp);
3960 if (!new) 4006 if (!new)
3961 return -ENOMEM; 4007 return -ENOMEM;
3962 4008
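
The ccupdate_struct hunks above shrink the fixed new[NR_CPUS] array to a trailing array whose size is chosen from nr_cpu_ids at allocation time. A minimal userspace sketch of the same pattern, using a C99 flexible array member where the kernel code of this era spells it [0]; the names here are hypothetical.

#include <stdio.h>
#include <stdlib.h>

struct percpu_update {
	const char *name;
	void *slot[];		/* flexible array member, sized at alloc time */
};

static struct percpu_update *percpu_update_alloc(const char *name, int nr_cpus)
{
	struct percpu_update *u =
		calloc(1, sizeof(*u) + nr_cpus * sizeof(u->slot[0]));

	if (u)
		u->name = name;
	return u;
}

int main(void)
{
	int nr_cpus = 8;	/* stand-in for nr_cpu_ids */
	struct percpu_update *u = percpu_update_alloc("demo", nr_cpus);

	if (!u)
		return 1;
	printf("%s: %d slots, allocation is %zu bytes\n", u->name, nr_cpus,
	       sizeof(*u) + nr_cpus * sizeof(u->slot[0]));
	free(u);
	return 0;
}

Only the memory needed for the running configuration is allocated, which is the point of the kernel change when NR_CPUS is large but nr_cpu_ids is small.
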
@@ -4533,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = {
4533 4579
4534static int __init slab_proc_init(void) 4580static int __init slab_proc_init(void)
4535{ 4581{
4536 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4582 proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
4537#ifdef CONFIG_DEBUG_SLAB_LEAK 4583#ifdef CONFIG_DEBUG_SLAB_LEAK
4538 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4584 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4539#endif 4585#endif
diff --git a/mm/slob.c b/mm/slob.c
index bf3918187165..8105be42cad1 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -63,7 +63,7 @@
63#include <linux/swap.h> /* struct reclaim_state */ 63#include <linux/swap.h> /* struct reclaim_state */
64#include <linux/cache.h> 64#include <linux/cache.h>
65#include <linux/init.h> 65#include <linux/init.h>
66#include <linux/module.h> 66#include <linux/export.h>
67#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
68#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/kmemleak.h> 69#include <linux/kmemleak.h>
diff --git a/mm/slub.c b/mm/slub.c
index f8f5e8efeb88..7d2a996c307e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2,10 +2,11 @@
2 * SLUB: A slab allocator that limits cache line use instead of queuing 2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists. 3 * objects in per cpu and per node lists.
4 * 4 *
5 * The allocator synchronizes using per slab locks and only 5 * The allocator synchronizes using per slab locks or atomic operations
6 * uses a centralized lock to manage a pool of partial slabs. 6 * and only uses a centralized lock to manage a pool of partial slabs.
7 * 7 *
8 * (C) 2007 SGI, Christoph Lameter 8 * (C) 2007 SGI, Christoph Lameter
9 * (C) 2011 Linux Foundation, Christoph Lameter
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -33,15 +34,27 @@
33 34
34/* 35/*
35 * Lock order: 36 * Lock order:
36 * 1. slab_lock(page) 37 * 1. slub_lock (Global Semaphore)
37 * 2. slab->list_lock 38 * 2. node->list_lock
39 * 3. slab_lock(page) (Only on some arches and for debugging)
38 * 40 *
39 * The slab_lock protects operations on the object of a particular 41 * slub_lock
40 * slab and its metadata in the page struct. If the slab lock 42 *
41 * has been taken then no allocations nor frees can be performed 43 * The role of the slub_lock is to protect the list of all the slabs
42 * on the objects in the slab nor can the slab be added or removed 44 * and to synchronize major metadata changes to slab cache structures.
43 * from the partial or full lists since this would mean modifying 45 *
44 * the page_struct of the slab. 46 * The slab_lock is only used for debugging and on arches that do not
47 * have the ability to do a cmpxchg_double. It only protects the second
48 * double word in the page struct. Meaning
49 * A. page->freelist -> List of free objects in a page
50 * B. page->counters -> Counters of objects
51 * C. page->frozen -> frozen state
52 *
53 * If a slab is frozen then it is exempt from list management. It is not
54 * on any list. The processor that froze the slab is the one who can
55 * perform list operations on the page. Other processors may put objects
56 * onto the freelist but the processor that froze the slab is the only
57 * one that can retrieve the objects from the page's freelist.
45 * 58 *
46 * The list_lock protects the partial and full list on each node and 59 * The list_lock protects the partial and full list on each node and
47 * the partial slab counter. If taken then no new slabs may be added or 60 * the partial slab counter. If taken then no new slabs may be added or
@@ -54,20 +67,6 @@
54 * slabs, operations can continue without any centralized lock. F.e. 67 * slabs, operations can continue without any centralized lock. F.e.
55 * allocating a long series of objects that fill up slabs does not require 68 * allocating a long series of objects that fill up slabs does not require
56 * the list lock. 69 * the list lock.
57 *
58 * The lock order is sometimes inverted when we are trying to get a slab
59 * off a list. We take the list_lock and then look for a page on the list
60 * to use. While we do that objects in the slabs may be freed. We can
61 * only operate on the slab if we have also taken the slab_lock. So we use
62 * a slab_trylock() on the slab. If trylock was successful then no frees
63 * can occur anymore and we can use the slab for allocations etc. If the
64 * slab_trylock() does not succeed then frees are in progress in the slab and
65 * we must stay away from it for a while since we may cause a bouncing
66 * cacheline if we try to acquire the lock. So go onto the next slab.
67 * If all pages are busy then we may allocate a new slab instead of reusing
68 * a partial slab. A new slab has no one operating on it and thus there is
69 * no danger of cacheline contention.
70 *
71 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
72 * make the slab allocator safe to use in the context of an irq. In addition 71 * make the slab allocator safe to use in the context of an irq. In addition
73 * interrupts are disabled to ensure that the processor does not change 72 * interrupts are disabled to ensure that the processor does not change
@@ -132,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
132/* Enable to test recovery from slab corruption on boot */ 131/* Enable to test recovery from slab corruption on boot */
133#undef SLUB_RESILIENCY_TEST 132#undef SLUB_RESILIENCY_TEST
134 133
134/* Enable to log cmpxchg failures */
135#undef SLUB_DEBUG_CMPXCHG
136
135/* 137/*
136 * Mininum number of partial slabs. These will be left on the partial 138 * Mininum number of partial slabs. These will be left on the partial
137 * lists even if they are empty. kmem_cache_shrink may reclaim them. 139 * lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -167,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
167 169
168#define OO_SHIFT 16 170#define OO_SHIFT 16
169#define OO_MASK ((1 << OO_SHIFT) - 1) 171#define OO_MASK ((1 << OO_SHIFT) - 1)
170#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 172#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
171 173
172/* Internal SLUB flags */ 174/* Internal SLUB flags */
173#define __OBJECT_POISON 0x80000000UL /* Poison object */ 175#define __OBJECT_POISON 0x80000000UL /* Poison object */
176#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
174 177
175static int kmem_size = sizeof(struct kmem_cache); 178static int kmem_size = sizeof(struct kmem_cache);
176 179
@@ -343,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
343 return x.x & OO_MASK; 346 return x.x & OO_MASK;
344} 347}
345 348
349/*
350 * Per slab locking using the pagelock
351 */
352static __always_inline void slab_lock(struct page *page)
353{
354 bit_spin_lock(PG_locked, &page->flags);
355}
356
357static __always_inline void slab_unlock(struct page *page)
358{
359 __bit_spin_unlock(PG_locked, &page->flags);
360}
361
362/* Interrupts must be disabled (for the fallback code to work right) */
363static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
364 void *freelist_old, unsigned long counters_old,
365 void *freelist_new, unsigned long counters_new,
366 const char *n)
367{
368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE
370 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist,
372 freelist_old, counters_old,
373 freelist_new, counters_new))
374 return 1;
375 } else
376#endif
377 {
378 slab_lock(page);
379 if (page->freelist == freelist_old && page->counters == counters_old) {
380 page->freelist = freelist_new;
381 page->counters = counters_new;
382 slab_unlock(page);
383 return 1;
384 }
385 slab_unlock(page);
386 }
387
388 cpu_relax();
389 stat(s, CMPXCHG_DOUBLE_FAIL);
390
391#ifdef SLUB_DEBUG_CMPXCHG
392 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
393#endif
394
395 return 0;
396}
397
398static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
399 void *freelist_old, unsigned long counters_old,
400 void *freelist_new, unsigned long counters_new,
401 const char *n)
402{
403#ifdef CONFIG_CMPXCHG_DOUBLE
404 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist,
406 freelist_old, counters_old,
407 freelist_new, counters_new))
408 return 1;
409 } else
410#endif
411 {
412 unsigned long flags;
413
414 local_irq_save(flags);
415 slab_lock(page);
416 if (page->freelist == freelist_old && page->counters == counters_old) {
417 page->freelist = freelist_new;
418 page->counters = counters_new;
419 slab_unlock(page);
420 local_irq_restore(flags);
421 return 1;
422 }
423 slab_unlock(page);
424 local_irq_restore(flags);
425 }
426
427 cpu_relax();
428 stat(s, CMPXCHG_DOUBLE_FAIL);
429
430#ifdef SLUB_DEBUG_CMPXCHG
431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
432#endif
433
434 return 0;
435}
436
346#ifdef CONFIG_SLUB_DEBUG 437#ifdef CONFIG_SLUB_DEBUG
347/* 438/*
348 * Determine a map of object in use on a page. 439 * Determine a map of object in use on a page.
349 * 440 *
350 * Slab lock or node listlock must be held to guarantee that the page does 441 * Node listlock must be held to guarantee that the page does
351 * not vanish from under us. 442 * not vanish from under us.
352 */ 443 */
353static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 444static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
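
The __cmpxchg_double_slab()/cmpxchg_double_slab() helpers added above treat page->freelist and page->counters as one unit: a cmpxchg_double fast path when the cache carries __CMPXCHG_DOUBLE, otherwise compare both words and swap them under the page bit lock. Below is a userspace model of just that locked fallback, with a pthread mutex standing in for slab_lock() and invented type names; the real fast path and the stat/debug reporting have no counterpart here.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct slab_state {
	pthread_mutex_t lock;
	void *freelist;
	unsigned long counters;
};

static bool cmpxchg_double_locked(struct slab_state *s,
				  void *freelist_old, unsigned long counters_old,
				  void *freelist_new, unsigned long counters_new)
{
	bool ok = false;

	pthread_mutex_lock(&s->lock);
	if (s->freelist == freelist_old && s->counters == counters_old) {
		s->freelist = freelist_new;
		s->counters = counters_new;
		ok = true;
	}
	pthread_mutex_unlock(&s->lock);
	return ok;
}

int main(void)
{
	struct slab_state s = {
		.lock = PTHREAD_MUTEX_INITIALIZER, .freelist = NULL, .counters = 3
	};
	int obj;

	/* Succeeds: both fields still hold the expected values. */
	printf("%d\n", cmpxchg_double_locked(&s, NULL, 3, &obj, 2));
	/* Fails: counters no longer match, so nothing is written. */
	printf("%d\n", cmpxchg_double_locked(&s, &obj, 3, NULL, 1));
	return 0;
}

Either both fields move together or neither does, which is what lets the rest of the patch treat freelist, counters and the frozen bit as one consistent snapshot.
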
@@ -376,34 +467,8 @@ static int disable_higher_order_debug;
376 */ 467 */
377static void print_section(char *text, u8 *addr, unsigned int length) 468static void print_section(char *text, u8 *addr, unsigned int length)
378{ 469{
379 int i, offset; 470 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
380 int newline = 1; 471 length, 1);
381 char ascii[17];
382
383 ascii[16] = 0;
384
385 for (i = 0; i < length; i++) {
386 if (newline) {
387 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
388 newline = 0;
389 }
390 printk(KERN_CONT " %02x", addr[i]);
391 offset = i % 16;
392 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
393 if (offset == 15) {
394 printk(KERN_CONT " %s\n", ascii);
395 newline = 1;
396 }
397 }
398 if (!newline) {
399 i %= 16;
400 while (i < 16) {
401 printk(KERN_CONT " ");
402 ascii[i] = ' ';
403 i++;
404 }
405 printk(KERN_CONT " %s\n", ascii);
406 }
407} 472}
408 473
409static struct track *get_track(struct kmem_cache *s, void *object, 474static struct track *get_track(struct kmem_cache *s, void *object,
@@ -534,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
534 p, p - addr, get_freepointer(s, p)); 599 p, p - addr, get_freepointer(s, p));
535 600
536 if (p > addr + 16) 601 if (p > addr + 16)
537 print_section("Bytes b4", p - 16, 16); 602 print_section("Bytes b4 ", p - 16, 16);
538
539 print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
540 603
604 print_section("Object ", p, min_t(unsigned long, s->objsize,
605 PAGE_SIZE));
541 if (s->flags & SLAB_RED_ZONE) 606 if (s->flags & SLAB_RED_ZONE)
542 print_section("Redzone", p + s->objsize, 607 print_section("Redzone ", p + s->objsize,
543 s->inuse - s->objsize); 608 s->inuse - s->objsize);
544 609
545 if (s->offset) 610 if (s->offset)
@@ -552,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
552 617
553 if (off != s->size) 618 if (off != s->size)
554 /* Beginning of the filler is the free pointer */ 619 /* Beginning of the filler is the free pointer */
555 print_section("Padding", p + off, s->size - off); 620 print_section("Padding ", p + off, s->size - off);
556 621
557 dump_stack(); 622 dump_stack();
558} 623}
@@ -590,49 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
590 memset(p + s->objsize, val, s->inuse - s->objsize); 655 memset(p + s->objsize, val, s->inuse - s->objsize);
591} 656}
592 657
593static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
594{
595 while (bytes) {
596 if (*start != value)
597 return start;
598 start++;
599 bytes--;
600 }
601 return NULL;
602}
603
604static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
605{
606 u64 value64;
607 unsigned int words, prefix;
608
609 if (bytes <= 16)
610 return check_bytes8(start, value, bytes);
611
612 value64 = value | value << 8 | value << 16 | value << 24;
613 value64 = value64 | value64 << 32;
614 prefix = 8 - ((unsigned long)start) % 8;
615
616 if (prefix) {
617 u8 *r = check_bytes8(start, value, prefix);
618 if (r)
619 return r;
620 start += prefix;
621 bytes -= prefix;
622 }
623
624 words = bytes / 8;
625
626 while (words) {
627 if (*(u64 *)start != value64)
628 return check_bytes8(start, value, 8);
629 start += 8;
630 words--;
631 }
632
633 return check_bytes8(start, value, bytes % 8);
634}
635
636static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 658static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
637 void *from, void *to) 659 void *from, void *to)
638{ 660{
@@ -647,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
647 u8 *fault; 669 u8 *fault;
648 u8 *end; 670 u8 *end;
649 671
650 fault = check_bytes(start, value, bytes); 672 fault = memchr_inv(start, value, bytes);
651 if (!fault) 673 if (!fault)
652 return 1; 674 return 1;
653 675
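
check_bytes_and_report() above now relies on memchr_inv(), which returns a pointer to the first byte in a range that does not hold the expected value, or NULL when the whole range matches; the open-coded check_bytes()/check_bytes8() pair is gone. A byte-at-a-time userspace equivalent for illustration (the in-kernel helper does the same scan word-at-a-time; memchr_inv_simple and the sample redzone are made up here).

#include <stdio.h>
#include <stddef.h>
#include <string.h>

/* Return the first byte in [start, start + bytes) that differs from c. */
static void *memchr_inv_simple(const void *start, int c, size_t bytes)
{
	const unsigned char *p = start;

	for (size_t i = 0; i < bytes; i++)
		if (p[i] != (unsigned char)c)
			return (void *)(p + i);
	return NULL;
}

int main(void)
{
	unsigned char redzone[16];
	unsigned char *fault;

	memset(redzone, 0xbb, sizeof(redzone));	/* redzone-style fill */
	redzone[9] = 0x00;			/* simulated overwrite */

	fault = memchr_inv_simple(redzone, 0xbb, sizeof(redzone));
	if (fault)
		printf("first bad byte at offset %td\n", fault - redzone);
	return 0;
}
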
@@ -740,14 +762,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
740 if (!remainder) 762 if (!remainder)
741 return 1; 763 return 1;
742 764
743 fault = check_bytes(end - remainder, POISON_INUSE, remainder); 765 fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
744 if (!fault) 766 if (!fault)
745 return 1; 767 return 1;
746 while (end > fault && end[-1] == POISON_INUSE) 768 while (end > fault && end[-1] == POISON_INUSE)
747 end--; 769 end--;
748 770
749 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 771 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
750 print_section("Padding", end - remainder, remainder); 772 print_section("Padding ", end - remainder, remainder);
751 773
752 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 774 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
753 return 0; 775 return 0;
@@ -838,10 +860,11 @@ static int check_slab(struct kmem_cache *s, struct page *page)
838static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 860static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
839{ 861{
840 int nr = 0; 862 int nr = 0;
841 void *fp = page->freelist; 863 void *fp;
842 void *object = NULL; 864 void *object = NULL;
843 unsigned long max_objects; 865 unsigned long max_objects;
844 866
867 fp = page->freelist;
845 while (fp && nr <= page->objects) { 868 while (fp && nr <= page->objects) {
846 if (fp == search) 869 if (fp == search)
847 return 1; 870 return 1;
@@ -895,7 +918,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
895 page->freelist); 918 page->freelist);
896 919
897 if (!alloc) 920 if (!alloc)
898 print_section("Object", (void *)object, s->objsize); 921 print_section("Object ", (void *)object, s->objsize);
899 922
900 dump_stack(); 923 dump_stack();
901 } 924 }
@@ -946,26 +969,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
946 969
947/* 970/*
948 * Tracking of fully allocated slabs for debugging purposes. 971 * Tracking of fully allocated slabs for debugging purposes.
972 *
973 * list_lock must be held.
949 */ 974 */
950static void add_full(struct kmem_cache_node *n, struct page *page) 975static void add_full(struct kmem_cache *s,
976 struct kmem_cache_node *n, struct page *page)
951{ 977{
952 spin_lock(&n->list_lock); 978 if (!(s->flags & SLAB_STORE_USER))
979 return;
980
953 list_add(&page->lru, &n->full); 981 list_add(&page->lru, &n->full);
954 spin_unlock(&n->list_lock);
955} 982}
956 983
984/*
985 * list_lock must be held.
986 */
957static void remove_full(struct kmem_cache *s, struct page *page) 987static void remove_full(struct kmem_cache *s, struct page *page)
958{ 988{
959 struct kmem_cache_node *n;
960
961 if (!(s->flags & SLAB_STORE_USER)) 989 if (!(s->flags & SLAB_STORE_USER))
962 return; 990 return;
963 991
964 n = get_node(s, page_to_nid(page));
965
966 spin_lock(&n->list_lock);
967 list_del(&page->lru); 992 list_del(&page->lru);
968 spin_unlock(&n->list_lock);
969} 993}
970 994
971/* Tracking of the number of slabs for debugging purposes */ 995/* Tracking of the number of slabs for debugging purposes */
@@ -1021,11 +1045,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa
1021 if (!check_slab(s, page)) 1045 if (!check_slab(s, page))
1022 goto bad; 1046 goto bad;
1023 1047
1024 if (!on_freelist(s, page, object)) {
1025 object_err(s, page, object, "Object already allocated");
1026 goto bad;
1027 }
1028
1029 if (!check_valid_pointer(s, page, object)) { 1048 if (!check_valid_pointer(s, page, object)) {
1030 object_err(s, page, object, "Freelist Pointer check fails"); 1049 object_err(s, page, object, "Freelist Pointer check fails");
1031 goto bad; 1050 goto bad;
@@ -1058,6 +1077,12 @@ bad:
1058static noinline int free_debug_processing(struct kmem_cache *s, 1077static noinline int free_debug_processing(struct kmem_cache *s,
1059 struct page *page, void *object, unsigned long addr) 1078 struct page *page, void *object, unsigned long addr)
1060{ 1079{
1080 unsigned long flags;
1081 int rc = 0;
1082
1083 local_irq_save(flags);
1084 slab_lock(page);
1085
1061 if (!check_slab(s, page)) 1086 if (!check_slab(s, page))
1062 goto fail; 1087 goto fail;
1063 1088
@@ -1072,7 +1097,7 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1072 } 1097 }
1073 1098
1074 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1099 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1075 return 0; 1100 goto out;
1076 1101
1077 if (unlikely(s != page->slab)) { 1102 if (unlikely(s != page->slab)) {
1078 if (!PageSlab(page)) { 1103 if (!PageSlab(page)) {
@@ -1089,18 +1114,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1089 goto fail; 1114 goto fail;
1090 } 1115 }
1091 1116
1092 /* Special debug activities for freeing objects */
1093 if (!PageSlubFrozen(page) && !page->freelist)
1094 remove_full(s, page);
1095 if (s->flags & SLAB_STORE_USER) 1117 if (s->flags & SLAB_STORE_USER)
1096 set_track(s, object, TRACK_FREE, addr); 1118 set_track(s, object, TRACK_FREE, addr);
1097 trace(s, page, object, 0); 1119 trace(s, page, object, 0);
1098 init_object(s, object, SLUB_RED_INACTIVE); 1120 init_object(s, object, SLUB_RED_INACTIVE);
1099 return 1; 1121 rc = 1;
1122out:
1123 slab_unlock(page);
1124 local_irq_restore(flags);
1125 return rc;
1100 1126
1101fail: 1127fail:
1102 slab_fix(s, "Object at 0x%p not freed", object); 1128 slab_fix(s, "Object at 0x%p not freed", object);
1103 return 0; 1129 goto out;
1104} 1130}
1105 1131
1106static int __init setup_slub_debug(char *str) 1132static int __init setup_slub_debug(char *str)
@@ -1200,7 +1226,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1200 { return 1; } 1226 { return 1; }
1201static inline int check_object(struct kmem_cache *s, struct page *page, 1227static inline int check_object(struct kmem_cache *s, struct page *page,
1202 void *object, u8 val) { return 1; } 1228 void *object, u8 val) { return 1; }
1203static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1229static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1230 struct page *page) {}
1231static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1204static inline unsigned long kmem_cache_flags(unsigned long objsize, 1232static inline unsigned long kmem_cache_flags(unsigned long objsize,
1205 unsigned long flags, const char *name, 1233 unsigned long flags, const char *name,
1206 void (*ctor)(void *)) 1234 void (*ctor)(void *))
@@ -1252,6 +1280,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1252 struct kmem_cache_order_objects oo = s->oo; 1280 struct kmem_cache_order_objects oo = s->oo;
1253 gfp_t alloc_gfp; 1281 gfp_t alloc_gfp;
1254 1282
1283 flags &= gfp_allowed_mask;
1284
1285 if (flags & __GFP_WAIT)
1286 local_irq_enable();
1287
1255 flags |= s->allocflags; 1288 flags |= s->allocflags;
1256 1289
1257 /* 1290 /*
@@ -1268,12 +1301,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1268 * Try a lower order alloc if possible 1301 * Try a lower order alloc if possible
1269 */ 1302 */
1270 page = alloc_slab_page(flags, node, oo); 1303 page = alloc_slab_page(flags, node, oo);
1271 if (!page)
1272 return NULL;
1273 1304
1274 stat(s, ORDER_FALLBACK); 1305 if (page)
1306 stat(s, ORDER_FALLBACK);
1275 } 1307 }
1276 1308
1309 if (flags & __GFP_WAIT)
1310 local_irq_disable();
1311
1312 if (!page)
1313 return NULL;
1314
1277 if (kmemcheck_enabled 1315 if (kmemcheck_enabled
1278 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1316 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1279 int pages = 1 << oo_order(oo); 1317 int pages = 1 << oo_order(oo);
@@ -1340,7 +1378,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1340 set_freepointer(s, last, NULL); 1378 set_freepointer(s, last, NULL);
1341 1379
1342 page->freelist = start; 1380 page->freelist = start;
1343 page->inuse = 0; 1381 page->inuse = page->objects;
1382 page->frozen = 1;
1344out: 1383out:
1345 return page; 1384 return page;
1346} 1385}
@@ -1418,79 +1457,80 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1418} 1457}
1419 1458
1420/* 1459/*
1421 * Per slab locking using the pagelock 1460 * Management of partially allocated slabs.
1422 */ 1461 *
1423static __always_inline void slab_lock(struct page *page) 1462 * list_lock must be held.
1424{
1425 bit_spin_lock(PG_locked, &page->flags);
1426}
1427
1428static __always_inline void slab_unlock(struct page *page)
1429{
1430 __bit_spin_unlock(PG_locked, &page->flags);
1431}
1432
1433static __always_inline int slab_trylock(struct page *page)
1434{
1435 int rc = 1;
1436
1437 rc = bit_spin_trylock(PG_locked, &page->flags);
1438 return rc;
1439}
1440
1441/*
1442 * Management of partially allocated slabs
1443 */ 1463 */
1444static void add_partial(struct kmem_cache_node *n, 1464static inline void add_partial(struct kmem_cache_node *n,
1445 struct page *page, int tail) 1465 struct page *page, int tail)
1446{ 1466{
1447 spin_lock(&n->list_lock);
1448 n->nr_partial++; 1467 n->nr_partial++;
1449 if (tail) 1468 if (tail == DEACTIVATE_TO_TAIL)
1450 list_add_tail(&page->lru, &n->partial); 1469 list_add_tail(&page->lru, &n->partial);
1451 else 1470 else
1452 list_add(&page->lru, &n->partial); 1471 list_add(&page->lru, &n->partial);
1453 spin_unlock(&n->list_lock);
1454} 1472}
1455 1473
1456static inline void __remove_partial(struct kmem_cache_node *n, 1474/*
1475 * list_lock must be held.
1476 */
1477static inline void remove_partial(struct kmem_cache_node *n,
1457 struct page *page) 1478 struct page *page)
1458{ 1479{
1459 list_del(&page->lru); 1480 list_del(&page->lru);
1460 n->nr_partial--; 1481 n->nr_partial--;
1461} 1482}
1462 1483
1463static void remove_partial(struct kmem_cache *s, struct page *page)
1464{
1465 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1466
1467 spin_lock(&n->list_lock);
1468 __remove_partial(n, page);
1469 spin_unlock(&n->list_lock);
1470}
1471
1472/* 1484/*
1473 * Lock slab and remove from the partial list. 1485 * Lock slab, remove from the partial list and put the object into the
1486 * per cpu freelist.
1487 *
1488 * Returns a list of objects or NULL if it fails.
1474 * 1489 *
1475 * Must hold list_lock. 1490 * Must hold list_lock.
1476 */ 1491 */
1477static inline int lock_and_freeze_slab(struct kmem_cache_node *n, 1492static inline void *acquire_slab(struct kmem_cache *s,
1478 struct page *page) 1493 struct kmem_cache_node *n, struct page *page,
1494 int mode)
1479{ 1495{
1480 if (slab_trylock(page)) { 1496 void *freelist;
1481 __remove_partial(n, page); 1497 unsigned long counters;
1482 __SetPageSlubFrozen(page); 1498 struct page new;
1483 return 1; 1499
1484 } 1500 /*
1485 return 0; 1501 * Zap the freelist and set the frozen bit.
1502 * The old freelist is the list of objects for the
1503 * per cpu allocation list.
1504 */
1505 do {
1506 freelist = page->freelist;
1507 counters = page->counters;
1508 new.counters = counters;
1509 if (mode)
1510 new.inuse = page->objects;
1511
1512 VM_BUG_ON(new.frozen);
1513 new.frozen = 1;
1514
1515 } while (!__cmpxchg_double_slab(s, page,
1516 freelist, counters,
1517 NULL, new.counters,
1518 "lock and freeze"));
1519
1520 remove_partial(n, page);
1521 return freelist;
1486} 1522}
1487 1523
1524static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
1525
1488/* 1526/*
1489 * Try to allocate a partial slab from a specific node. 1527 * Try to allocate a partial slab from a specific node.
1490 */ 1528 */
1491static struct page *get_partial_node(struct kmem_cache_node *n) 1529static void *get_partial_node(struct kmem_cache *s,
1530 struct kmem_cache_node *n, struct kmem_cache_cpu *c)
1492{ 1531{
1493 struct page *page; 1532 struct page *page, *page2;
1533 void *object = NULL;
1494 1534
1495 /* 1535 /*
1496 * Racy check. If we mistakenly see no partial slabs then we 1536 * Racy check. If we mistakenly see no partial slabs then we
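
acquire_slab() above freezes a partial slab by re-reading freelist and counters, building new counters with the frozen bit set, and retrying __cmpxchg_double_slab() until the update lands. The retry shape, reduced to a single pointer with C11 atomics, is sketched below; only the loop is modelled, the frozen bit and object counters have no counterpart, and the type and function names are invented.

#include <stdatomic.h>
#include <stdio.h>

struct object {
	struct object *next;
	int id;
};

/* Atomically detach the whole list: keep retrying the swap to NULL
 * until it hits the head value the decision was based on. */
static struct object *detach_freelist(_Atomic(struct object *) *head)
{
	struct object *old = atomic_load(head);

	while (!atomic_compare_exchange_weak(head, &old, (struct object *)NULL))
		;	/* a failed CAS refreshes 'old' with the current head */
	return old;
}

int main(void)
{
	struct object b = { NULL, 2 };
	struct object a = { &b, 1 };
	_Atomic(struct object *) freelist = &a;

	for (struct object *p = detach_freelist(&freelist); p; p = p->next)
		printf("took object %d\n", p->id);
	printf("freelist is now %p\n", (void *)atomic_load(&freelist));
	return 0;
}
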
@@ -1502,26 +1542,43 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
1502 return NULL; 1542 return NULL;
1503 1543
1504 spin_lock(&n->list_lock); 1544 spin_lock(&n->list_lock);
1505 list_for_each_entry(page, &n->partial, lru) 1545 list_for_each_entry_safe(page, page2, &n->partial, lru) {
1506 if (lock_and_freeze_slab(n, page)) 1546 void *t = acquire_slab(s, n, page, object == NULL);
1507 goto out; 1547 int available;
1508 page = NULL; 1548
1509out: 1549 if (!t)
1550 break;
1551
1552 if (!object) {
1553 c->page = page;
1554 c->node = page_to_nid(page);
1555 stat(s, ALLOC_FROM_PARTIAL);
1556 object = t;
1557 available = page->objects - page->inuse;
1558 } else {
1559 page->freelist = t;
1560 available = put_cpu_partial(s, page, 0);
1561 }
1562 if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
1563 break;
1564
1565 }
1510 spin_unlock(&n->list_lock); 1566 spin_unlock(&n->list_lock);
1511 return page; 1567 return object;
1512} 1568}
1513 1569
1514/* 1570/*
1515 * Get a page from somewhere. Search in increasing NUMA distances. 1571 * Get a page from somewhere. Search in increasing NUMA distances.
1516 */ 1572 */
1517static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1573static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1574 struct kmem_cache_cpu *c)
1518{ 1575{
1519#ifdef CONFIG_NUMA 1576#ifdef CONFIG_NUMA
1520 struct zonelist *zonelist; 1577 struct zonelist *zonelist;
1521 struct zoneref *z; 1578 struct zoneref *z;
1522 struct zone *zone; 1579 struct zone *zone;
1523 enum zone_type high_zoneidx = gfp_zone(flags); 1580 enum zone_type high_zoneidx = gfp_zone(flags);
1524 struct page *page; 1581 void *object;
1525 1582
1526 /* 1583 /*
1527 * The defrag ratio allows a configuration of the tradeoffs between 1584 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1554,10 +1611,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1554 1611
1555 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1612 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1556 n->nr_partial > s->min_partial) { 1613 n->nr_partial > s->min_partial) {
1557 page = get_partial_node(n); 1614 object = get_partial_node(s, n, c);
1558 if (page) { 1615 if (object) {
1559 put_mems_allowed(); 1616 put_mems_allowed();
1560 return page; 1617 return object;
1561 } 1618 }
1562 } 1619 }
1563 } 1620 }
@@ -1569,63 +1626,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1569/* 1626/*
1570 * Get a partial page, lock it and return it. 1627 * Get a partial page, lock it and return it.
1571 */ 1628 */
1572static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1629static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1630 struct kmem_cache_cpu *c)
1573{ 1631{
1574 struct page *page; 1632 void *object;
1575 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1633 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1576 1634
1577 page = get_partial_node(get_node(s, searchnode)); 1635 object = get_partial_node(s, get_node(s, searchnode), c);
1578 if (page || node != NUMA_NO_NODE) 1636 if (object || node != NUMA_NO_NODE)
1579 return page; 1637 return object;
1580 1638
1581 return get_any_partial(s, flags); 1639 return get_any_partial(s, flags, c);
1582}
1583
1584/*
1585 * Move a page back to the lists.
1586 *
1587 * Must be called with the slab lock held.
1588 *
1589 * On exit the slab lock will have been dropped.
1590 */
1591static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1592 __releases(bitlock)
1593{
1594 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1595
1596 __ClearPageSlubFrozen(page);
1597 if (page->inuse) {
1598
1599 if (page->freelist) {
1600 add_partial(n, page, tail);
1601 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1602 } else {
1603 stat(s, DEACTIVATE_FULL);
1604 if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1605 add_full(n, page);
1606 }
1607 slab_unlock(page);
1608 } else {
1609 stat(s, DEACTIVATE_EMPTY);
1610 if (n->nr_partial < s->min_partial) {
1611 /*
1612 * Adding an empty slab to the partial slabs in order
1613 * to avoid page allocator overhead. This slab needs
1614 * to come after the other slabs with objects in
1615 * so that the others get filled first. That way the
1616 * size of the partial list stays small.
1617 *
1618 * kmem_cache_shrink can reclaim any empty slabs from
1619 * the partial list.
1620 */
1621 add_partial(n, page, 1);
1622 slab_unlock(page);
1623 } else {
1624 slab_unlock(page);
1625 stat(s, FREE_SLAB);
1626 discard_slab(s, page);
1627 }
1628 }
1629} 1640}
1630 1641
1631#ifdef CONFIG_PREEMPT 1642#ifdef CONFIG_PREEMPT
@@ -1694,45 +1705,278 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1694 for_each_possible_cpu(cpu) 1705 for_each_possible_cpu(cpu)
1695 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1706 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1696} 1707}
1708
1697/* 1709/*
1698 * Remove the cpu slab 1710 * Remove the cpu slab
1699 */ 1711 */
1700static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1712static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1701 __releases(bitlock)
1702{ 1713{
1714 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1703 struct page *page = c->page; 1715 struct page *page = c->page;
1704 int tail = 1; 1716 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1705 1717 int lock = 0;
1706 if (page->freelist) 1718 enum slab_modes l = M_NONE, m = M_NONE;
1719 void *freelist;
1720 void *nextfree;
1721 int tail = DEACTIVATE_TO_HEAD;
1722 struct page new;
1723 struct page old;
1724
1725 if (page->freelist) {
1707 stat(s, DEACTIVATE_REMOTE_FREES); 1726 stat(s, DEACTIVATE_REMOTE_FREES);
1727 tail = DEACTIVATE_TO_TAIL;
1728 }
1729
1730 c->tid = next_tid(c->tid);
1731 c->page = NULL;
1732 freelist = c->freelist;
1733 c->freelist = NULL;
1734
1735 /*
1736 * Stage one: Free all available per cpu objects back
1737 * to the page freelist while it is still frozen. Leave the
1738 * last one.
1739 *
1740 * There is no need to take the list->lock because the page
1741 * is still frozen.
1742 */
1743 while (freelist && (nextfree = get_freepointer(s, freelist))) {
1744 void *prior;
1745 unsigned long counters;
1746
1747 do {
1748 prior = page->freelist;
1749 counters = page->counters;
1750 set_freepointer(s, freelist, prior);
1751 new.counters = counters;
1752 new.inuse--;
1753 VM_BUG_ON(!new.frozen);
1754
1755 } while (!__cmpxchg_double_slab(s, page,
1756 prior, counters,
1757 freelist, new.counters,
1758 "drain percpu freelist"));
1759
1760 freelist = nextfree;
1761 }
1762
1708 /* 1763 /*
1709 * Merge cpu freelist into slab freelist. Typically we get here 1764 * Stage two: Ensure that the page is unfrozen while the
1710 * because both freelists are empty. So this is unlikely 1765 * list presence reflects the actual number of objects
1711 * to occur. 1766 * during unfreeze.
1767 *
1768 * We setup the list membership and then perform a cmpxchg
1769 * with the count. If there is a mismatch then the page
1770 * is not unfrozen but the page is on the wrong list.
1771 *
1772 * Then we restart the process which may have to remove
1773 * the page from the list that we just put it on again
1774 * because the number of objects in the slab may have
1775 * changed.
1712 */ 1776 */
1713 while (unlikely(c->freelist)) { 1777redo:
1714 void **object; 1778
1779 old.freelist = page->freelist;
1780 old.counters = page->counters;
1781 VM_BUG_ON(!old.frozen);
1715 1782
1716 tail = 0; /* Hot objects. Put the slab first */ 1783 /* Determine target state of the slab */
1784 new.counters = old.counters;
1785 if (freelist) {
1786 new.inuse--;
1787 set_freepointer(s, freelist, old.freelist);
1788 new.freelist = freelist;
1789 } else
1790 new.freelist = old.freelist;
1717 1791
1718 /* Retrieve object from cpu_freelist */ 1792 new.frozen = 0;
1719 object = c->freelist;
1720 c->freelist = get_freepointer(s, c->freelist);
1721 1793
1722 /* And put onto the regular freelist */ 1794 if (!new.inuse && n->nr_partial > s->min_partial)
1723 set_freepointer(s, object, page->freelist); 1795 m = M_FREE;
1724 page->freelist = object; 1796 else if (new.freelist) {
1725 page->inuse--; 1797 m = M_PARTIAL;
1798 if (!lock) {
1799 lock = 1;
1800 /*
1801 * Taking the spinlock removes the possibility
1802 * that acquire_slab() will see a slab page that
1803 * is frozen
1804 */
1805 spin_lock(&n->list_lock);
1806 }
1807 } else {
1808 m = M_FULL;
1809 if (kmem_cache_debug(s) && !lock) {
1810 lock = 1;
1811 /*
1812 * This also ensures that the scanning of full
1813 * slabs from diagnostic functions will not see
1814 * any frozen slabs.
1815 */
1816 spin_lock(&n->list_lock);
1817 }
1818 }
1819
1820 if (l != m) {
1821
1822 if (l == M_PARTIAL)
1823
1824 remove_partial(n, page);
1825
1826 else if (l == M_FULL)
1827
1828 remove_full(s, page);
1829
1830 if (m == M_PARTIAL) {
1831
1832 add_partial(n, page, tail);
1833 stat(s, tail);
1834
1835 } else if (m == M_FULL) {
1836
1837 stat(s, DEACTIVATE_FULL);
1838 add_full(s, n, page);
1839
1840 }
1841 }
1842
1843 l = m;
1844 if (!__cmpxchg_double_slab(s, page,
1845 old.freelist, old.counters,
1846 new.freelist, new.counters,
1847 "unfreezing slab"))
1848 goto redo;
1849
1850 if (lock)
1851 spin_unlock(&n->list_lock);
1852
1853 if (m == M_FREE) {
1854 stat(s, DEACTIVATE_EMPTY);
1855 discard_slab(s, page);
1856 stat(s, FREE_SLAB);
1726 } 1857 }
1727 c->page = NULL; 1858}
1728 c->tid = next_tid(c->tid); 1859
1729 unfreeze_slab(s, page, tail); 1860/* Unfreeze all the cpu partial slabs */
1861static void unfreeze_partials(struct kmem_cache *s)
1862{
1863 struct kmem_cache_node *n = NULL;
1864 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1865 struct page *page;
1866
1867 while ((page = c->partial)) {
1868 enum slab_modes { M_PARTIAL, M_FREE };
1869 enum slab_modes l, m;
1870 struct page new;
1871 struct page old;
1872
1873 c->partial = page->next;
1874 l = M_FREE;
1875
1876 do {
1877
1878 old.freelist = page->freelist;
1879 old.counters = page->counters;
1880 VM_BUG_ON(!old.frozen);
1881
1882 new.counters = old.counters;
1883 new.freelist = old.freelist;
1884
1885 new.frozen = 0;
1886
1887 if (!new.inuse && (!n || n->nr_partial > s->min_partial))
1888 m = M_FREE;
1889 else {
1890 struct kmem_cache_node *n2 = get_node(s,
1891 page_to_nid(page));
1892
1893 m = M_PARTIAL;
1894 if (n != n2) {
1895 if (n)
1896 spin_unlock(&n->list_lock);
1897
1898 n = n2;
1899 spin_lock(&n->list_lock);
1900 }
1901 }
1902
1903 if (l != m) {
1904 if (l == M_PARTIAL)
1905 remove_partial(n, page);
1906 else
1907 add_partial(n, page, 1);
1908
1909 l = m;
1910 }
1911
1912 } while (!cmpxchg_double_slab(s, page,
1913 old.freelist, old.counters,
1914 new.freelist, new.counters,
1915 "unfreezing slab"));
1916
1917 if (m == M_FREE) {
1918 stat(s, DEACTIVATE_EMPTY);
1919 discard_slab(s, page);
1920 stat(s, FREE_SLAB);
1921 }
1922 }
1923
1924 if (n)
1925 spin_unlock(&n->list_lock);
1926}
1927
1928/*
1929 * Put a page that was just frozen (in __slab_free) into a partial page
1930 * slot if available. This is done without interrupts disabled and without
1931 * preemption disabled. The cmpxchg is racy and may put the partial page
1932 * onto a random cpu's partial slot.
1933 *
1934 * If we did not find a slot then simply move all the partials to the
1935 * per node partial list.
1936 */
1937int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1938{
1939 struct page *oldpage;
1940 int pages;
1941 int pobjects;
1942
1943 do {
1944 pages = 0;
1945 pobjects = 0;
1946 oldpage = this_cpu_read(s->cpu_slab->partial);
1947
1948 if (oldpage) {
1949 pobjects = oldpage->pobjects;
1950 pages = oldpage->pages;
1951 if (drain && pobjects > s->cpu_partial) {
1952 unsigned long flags;
1953 /*
1954 * partial array is full. Move the existing
1955 * set to the per node partial list.
1956 */
1957 local_irq_save(flags);
1958 unfreeze_partials(s);
1959 local_irq_restore(flags);
1960 pobjects = 0;
1961 pages = 0;
1962 }
1963 }
1964
1965 pages++;
1966 pobjects += page->objects - page->inuse;
1967
1968 page->pages = pages;
1969 page->pobjects = pobjects;
1970 page->next = oldpage;
1971
1972 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
1973 stat(s, CPU_PARTIAL_FREE);
1974 return pobjects;
1730} 1975}
1731 1976
1732static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1977static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1733{ 1978{
1734 stat(s, CPUSLAB_FLUSH); 1979 stat(s, CPUSLAB_FLUSH);
1735 slab_lock(c->page);
1736 deactivate_slab(s, c); 1980 deactivate_slab(s, c);
1737} 1981}
1738 1982
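
put_cpu_partial() above parks a just-frozen page on the per-cpu partial list with a this_cpu_cmpxchg() loop rather than a lock, and its comment accepts that preemption may land the page on another CPU's slot. The push itself is the usual compare-and-swap list insert; here is a userspace sketch with C11 atomics, where one shared head stands in for the per-cpu slot and the struct is hypothetical rather than the kernel's struct page fields.

#include <stdatomic.h>
#include <stdio.h>

struct partial_page {
	struct partial_page *next;
	int pobjects;		/* running total of free objects in the list */
};

/* Lock-free push: link to the current head, then swing the head pointer,
 * retrying if another thread moved it in the meantime. */
static void partial_push(_Atomic(struct partial_page *) *head,
			 struct partial_page *page, int free_objects)
{
	struct partial_page *old = atomic_load(head);

	do {
		page->next = old;
		page->pobjects = (old ? old->pobjects : 0) + free_objects;
	} while (!atomic_compare_exchange_weak(head, &old, page));
}

int main(void)
{
	_Atomic(struct partial_page *) partial = NULL;
	struct partial_page a = {0}, b = {0};

	partial_push(&partial, &a, 3);
	partial_push(&partial, &b, 5);

	for (struct partial_page *p = atomic_load(&partial); p; p = p->next)
		printf("page, running total %d\n", p->pobjects);
	return 0;
}
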
@@ -1745,8 +1989,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1745{ 1989{
1746 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 1990 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1747 1991
1748 if (likely(c && c->page)) 1992 if (likely(c)) {
1749 flush_slab(s, c); 1993 if (c->page)
1994 flush_slab(s, c);
1995
1996 unfreeze_partials(s);
1997 }
1750} 1998}
1751 1999
1752static void flush_cpu_slab(void *d) 2000static void flush_cpu_slab(void *d)
@@ -1837,12 +2085,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1837 } 2085 }
1838} 2086}
1839 2087
2088static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2089 int node, struct kmem_cache_cpu **pc)
2090{
2091 void *object;
2092 struct kmem_cache_cpu *c;
2093 struct page *page = new_slab(s, flags, node);
2094
2095 if (page) {
2096 c = __this_cpu_ptr(s->cpu_slab);
2097 if (c->page)
2098 flush_slab(s, c);
2099
2100 /*
2101 * No other reference to the page yet so we can
2102 * muck around with it freely without cmpxchg
2103 */
2104 object = page->freelist;
2105 page->freelist = NULL;
2106
2107 stat(s, ALLOC_SLAB);
2108 c->node = page_to_nid(page);
2109 c->page = page;
2110 *pc = c;
2111 } else
2112 object = NULL;
2113
2114 return object;
2115}
2116
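
new_slab_objects() above can strip the freelist of the page it just allocated with plain assignments because, as its comment says, nobody else holds a reference to the page yet. The same idea in isolation (toy types, not the kernel structures): while a slab is exclusively owned, "take over the whole freelist" is just two stores.

#include <stdio.h>

struct object { struct object *next; };
struct slab   { struct object *freelist; };

/* Freshly built slab: no other cpu can see it, so no cmpxchg is needed. */
static struct object *take_freelist(struct slab *s)
{
    struct object *list = s->freelist;

    s->freelist = NULL;         /* the per-cpu allocator now owns every object */
    return list;
}

int main(void)
{
    struct object o2 = { NULL }, o1 = { &o2 };
    struct slab s = { .freelist = &o1 };

    printf("first object %p, slab freelist now %p\n",
           (void *)take_freelist(&s), (void *)s.freelist);
    return 0;
}
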
1840/* 2117/*
1841 * Slow path. The lockless freelist is empty or we need to perform 2118 * Slow path. The lockless freelist is empty or we need to perform
1842 * debugging duties. 2119 * debugging duties.
1843 * 2120 *
1844 * Interrupts are disabled.
1845 *
1846 * Processing is still very fast if new objects have been freed to the 2121 * Processing is still very fast if new objects have been freed to the
1847 * regular freelist. In that case we simply take over the regular freelist 2122 * regular freelist. In that case we simply take over the regular freelist
1848 * as the lockless freelist and zap the regular freelist. 2123 * as the lockless freelist and zap the regular freelist.
@@ -1859,8 +2134,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1859 unsigned long addr, struct kmem_cache_cpu *c) 2134 unsigned long addr, struct kmem_cache_cpu *c)
1860{ 2135{
1861 void **object; 2136 void **object;
1862 struct page *page;
1863 unsigned long flags; 2137 unsigned long flags;
2138 struct page new;
2139 unsigned long counters;
1864 2140
1865 local_irq_save(flags); 2141 local_irq_save(flags);
1866#ifdef CONFIG_PREEMPT 2142#ifdef CONFIG_PREEMPT
@@ -1872,81 +2148,91 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1872 c = this_cpu_ptr(s->cpu_slab); 2148 c = this_cpu_ptr(s->cpu_slab);
1873#endif 2149#endif
1874 2150
1875 /* We handle __GFP_ZERO in the caller */ 2151 if (!c->page)
1876 gfpflags &= ~__GFP_ZERO;
1877
1878 page = c->page;
1879 if (!page)
1880 goto new_slab; 2152 goto new_slab;
2153redo:
2154 if (unlikely(!node_match(c, node))) {
2155 stat(s, ALLOC_NODE_MISMATCH);
2156 deactivate_slab(s, c);
2157 goto new_slab;
2158 }
1881 2159
1882 slab_lock(page); 2160 stat(s, ALLOC_SLOWPATH);
1883 if (unlikely(!node_match(c, node))) 2161
1884 goto another_slab; 2162 do {
2163 object = c->page->freelist;
2164 counters = c->page->counters;
2165 new.counters = counters;
2166 VM_BUG_ON(!new.frozen);
2167
2168 /*
2169 * If there is no object left then we use this loop to
2170 * deactivate the slab which is simple since no objects
2171 * are left in the slab and therefore we do not need to
2172 * put the page back onto the partial list.
2173 *
2174 * If there are objects left then we retrieve them
2175 * and use them to refill the per cpu queue.
2176 */
2177
2178 new.inuse = c->page->objects;
2179 new.frozen = object != NULL;
2180
2181 } while (!__cmpxchg_double_slab(s, c->page,
2182 object, counters,
2183 NULL, new.counters,
2184 "__slab_alloc"));
2185
2186 if (!object) {
2187 c->page = NULL;
2188 stat(s, DEACTIVATE_BYPASS);
2189 goto new_slab;
2190 }
1885 2191
1886 stat(s, ALLOC_REFILL); 2192 stat(s, ALLOC_REFILL);
1887 2193
1888load_freelist: 2194load_freelist:
1889 object = page->freelist;
1890 if (unlikely(!object))
1891 goto another_slab;
1892 if (kmem_cache_debug(s))
1893 goto debug;
1894
1895 c->freelist = get_freepointer(s, object); 2195 c->freelist = get_freepointer(s, object);
1896 page->inuse = page->objects;
1897 page->freelist = NULL;
1898
1899 slab_unlock(page);
1900 c->tid = next_tid(c->tid); 2196 c->tid = next_tid(c->tid);
1901 local_irq_restore(flags); 2197 local_irq_restore(flags);
1902 stat(s, ALLOC_SLOWPATH);
1903 return object; 2198 return object;
1904 2199
1905another_slab:
1906 deactivate_slab(s, c);
1907
1908new_slab: 2200new_slab:
1909 page = get_partial(s, gfpflags, node); 2201
1910 if (page) { 2202 if (c->partial) {
1911 stat(s, ALLOC_FROM_PARTIAL); 2203 c->page = c->partial;
1912 c->node = page_to_nid(page); 2204 c->partial = c->page->next;
1913 c->page = page; 2205 c->node = page_to_nid(c->page);
1914 goto load_freelist; 2206 stat(s, CPU_PARTIAL_ALLOC);
2207 c->freelist = NULL;
2208 goto redo;
1915 } 2209 }
1916 2210
1917 gfpflags &= gfp_allowed_mask; 2211 /* Then do expensive stuff like retrieving pages from the partial lists */
1918 if (gfpflags & __GFP_WAIT) 2212 object = get_partial(s, gfpflags, node, c);
1919 local_irq_enable();
1920 2213
1921 page = new_slab(s, gfpflags, node); 2214 if (unlikely(!object)) {
1922 2215
1923 if (gfpflags & __GFP_WAIT) 2216 object = new_slab_objects(s, gfpflags, node, &c);
1924 local_irq_disable();
1925 2217
1926 if (page) { 2218 if (unlikely(!object)) {
1927 c = __this_cpu_ptr(s->cpu_slab); 2219 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1928 stat(s, ALLOC_SLAB); 2220 slab_out_of_memory(s, gfpflags, node);
1929 if (c->page)
1930 flush_slab(s, c);
1931 2221
1932 slab_lock(page); 2222 local_irq_restore(flags);
1933 __SetPageSlubFrozen(page); 2223 return NULL;
1934 c->node = page_to_nid(page); 2224 }
1935 c->page = page;
1936 goto load_freelist;
1937 } 2225 }
1938 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1939 slab_out_of_memory(s, gfpflags, node);
1940 local_irq_restore(flags);
1941 return NULL;
1942debug:
1943 if (!alloc_debug_processing(s, page, object, addr))
1944 goto another_slab;
1945 2226
1946 page->inuse++; 2227 if (likely(!kmem_cache_debug(s)))
1947 page->freelist = get_freepointer(s, object); 2228 goto load_freelist;
2229
2230 /* Only entered in the debug case */
2231 if (!alloc_debug_processing(s, c->page, object, addr))
2232 goto new_slab; /* Slab failed checks. Next slab needed */
2233
2234 c->freelist = get_freepointer(s, object);
1948 deactivate_slab(s, c); 2235 deactivate_slab(s, c);
1949 c->page = NULL;
1950 c->node = NUMA_NO_NODE; 2236 c->node = NUMA_NO_NODE;
1951 local_irq_restore(flags); 2237 local_irq_restore(flags);
1952 return object; 2238 return object;
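
The rewritten __slab_alloc() refills the per-cpu freelist by swapping the page's freelist to NULL and updating the counters in a single cmpxchg_double; if the freelist was already empty the same loop doubles as deactivation (the new DEACTIVATE_BYPASS statistic), and the cpu partial list is tried before the more expensive get_partial()/new_slab_objects() paths. A simplified single-word model of the "grab the whole freelist atomically, retry on contention" step; real SLUB pairs the pointer with page->counters in one double-width cmpxchg, which this sketch does not attempt:

#include <stdio.h>

struct object { struct object *next; };

static struct object *page_freelist;    /* models page->freelist */

/* Take every free object in one shot; retry if another cpu freed concurrently. */
static struct object *acquire_freelist(void)
{
    struct object *list;

    do {
        list = __atomic_load_n(&page_freelist, __ATOMIC_ACQUIRE);
    } while (list &&
             !__atomic_compare_exchange_n(&page_freelist, &list, NULL, 0,
                                          __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));
    return list;    /* NULL means the slab had nothing left: deactivate/bypass it */
}

int main(void)
{
    struct object b = { NULL }, a = { &b };

    page_freelist = &a;
    printf("refilled with %p, page freelist is now %p\n",
           (void *)acquire_freelist(), (void *)page_freelist);
    return 0;
}
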
@@ -2096,52 +2382,110 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2096{ 2382{
2097 void *prior; 2383 void *prior;
2098 void **object = (void *)x; 2384 void **object = (void *)x;
2099 unsigned long flags; 2385 int was_frozen;
2386 int inuse;
2387 struct page new;
2388 unsigned long counters;
2389 struct kmem_cache_node *n = NULL;
2390 unsigned long uninitialized_var(flags);
2100 2391
2101 local_irq_save(flags);
2102 slab_lock(page);
2103 stat(s, FREE_SLOWPATH); 2392 stat(s, FREE_SLOWPATH);
2104 2393
2105 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2394 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2106 goto out_unlock; 2395 return;
2107 2396
2108 prior = page->freelist; 2397 do {
2109 set_freepointer(s, object, prior); 2398 prior = page->freelist;
2110 page->freelist = object; 2399 counters = page->counters;
2111 page->inuse--; 2400 set_freepointer(s, object, prior);
2401 new.counters = counters;
2402 was_frozen = new.frozen;
2403 new.inuse--;
2404 if ((!new.inuse || !prior) && !was_frozen && !n) {
2112 2405
2113 if (unlikely(PageSlubFrozen(page))) { 2406 if (!kmem_cache_debug(s) && !prior)
2114 stat(s, FREE_FROZEN); 2407
2115 goto out_unlock; 2408 /*
2116 } 2409 * Slab was on no list before and will be partially empty
2410 * We can defer the list move and instead freeze it.
2411 */
2412 new.frozen = 1;
2413
2414 else { /* Needs to be taken off a list */
2415
2416 n = get_node(s, page_to_nid(page));
2417 /*
2418 * Speculatively acquire the list_lock.
2419 * If the cmpxchg does not succeed then we may
2420 * drop the list_lock without any processing.
2421 *
2422 * Otherwise the list_lock will synchronize with
2423 * other processors updating the list of slabs.
2424 */
2425 spin_lock_irqsave(&n->list_lock, flags);
2426
2427 }
2428 }
2429 inuse = new.inuse;
2117 2430
2118 if (unlikely(!page->inuse)) 2431 } while (!cmpxchg_double_slab(s, page,
2119 goto slab_empty; 2432 prior, counters,
2433 object, new.counters,
2434 "__slab_free"));
2435
2436 if (likely(!n)) {
2437
2438 /*
2439 * If we just froze the page then put it onto the
2440 * per cpu partial list.
2441 */
2442 if (new.frozen && !was_frozen)
2443 put_cpu_partial(s, page, 1);
2444
2445 /*
2446 * The list lock was not taken therefore no list
2447 * activity can be necessary.
2448 */
2449 if (was_frozen)
2450 stat(s, FREE_FROZEN);
2451 return;
2452 }
2120 2453
2121 /* 2454 /*
2122 * Objects left in the slab. If it was not on the partial list before 2455 * was_frozen may have been set after we acquired the list_lock in
2123 * then add it. 2456 * an earlier loop. So we need to check it here again.
2124 */ 2457 */
2125 if (unlikely(!prior)) { 2458 if (was_frozen)
2126 add_partial(get_node(s, page_to_nid(page)), page, 1); 2459 stat(s, FREE_FROZEN);
2127 stat(s, FREE_ADD_PARTIAL); 2460 else {
2128 } 2461 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2462 goto slab_empty;
2129 2463
2130out_unlock: 2464 /*
2131 slab_unlock(page); 2465 * Objects left in the slab. If it was not on the partial list before
2132 local_irq_restore(flags); 2466 * then add it.
2467 */
2468 if (unlikely(!prior)) {
2469 remove_full(s, page);
2470 add_partial(n, page, DEACTIVATE_TO_TAIL);
2471 stat(s, FREE_ADD_PARTIAL);
2472 }
2473 }
2474 spin_unlock_irqrestore(&n->list_lock, flags);
2133 return; 2475 return;
2134 2476
2135slab_empty: 2477slab_empty:
2136 if (prior) { 2478 if (prior) {
2137 /* 2479 /*
2138 * Slab still on the partial list. 2480 * Slab on the partial list.
2139 */ 2481 */
2140 remove_partial(s, page); 2482 remove_partial(n, page);
2141 stat(s, FREE_REMOVE_PARTIAL); 2483 stat(s, FREE_REMOVE_PARTIAL);
2142 } 2484 } else
2143 slab_unlock(page); 2485 /* Slab must be on the full list */
2144 local_irq_restore(flags); 2486 remove_full(s, page);
2487
2488 spin_unlock_irqrestore(&n->list_lock, flags);
2145 stat(s, FREE_SLAB); 2489 stat(s, FREE_SLAB);
2146 discard_slab(s, page); 2490 discard_slab(s, page);
2147} 2491}
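
The new __slab_free() decides, before issuing the cmpxchg, whether the free can stay lock-free: an already frozen slab needs nothing further, a previously full slab is frozen and parked on a per-cpu partial list (unless debugging is on), and only a slab that has to move between node lists takes list_lock, and then only speculatively. That decision flattened into a small pure function; it deliberately ignores the retry loop and the empty-slab discard path, and the enum names are made up:

#include <stdio.h>
#include <stdbool.h>

enum free_action { LOCK_FREE_DONE, FREEZE_TO_CPU_PARTIAL, TAKE_LIST_LOCK };

static enum free_action classify_free(bool was_frozen, bool was_full,
                                      bool now_empty, bool debug_cache)
{
    if (was_frozen)
        return LOCK_FREE_DONE;          /* the owning cpu will deal with it */
    if (was_full && !debug_cache)
        return FREEZE_TO_CPU_PARTIAL;   /* defer the list move, stay lock-free */
    if (was_full || now_empty)
        return TAKE_LIST_LOCK;          /* slab must change node lists */
    return LOCK_FREE_DONE;              /* already on a partial list, stays put */
}

int main(void)
{
    printf("frozen:%d full:%d empty:%d\n",
           classify_free(true,  false, false, false),   /* 0: lock-free */
           classify_free(false, true,  false, false),   /* 1: freeze */
           classify_free(false, false, true,  false));  /* 2: take list_lock */
    return 0;
}
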
@@ -2167,7 +2511,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
2167 slab_free_hook(s, x); 2511 slab_free_hook(s, x);
2168 2512
2169redo: 2513redo:
2170
2171 /* 2514 /*
2172 * Determine the current cpu's per cpu slab. 2515
2173 * The cpu may change afterward. However that does not matter since 2516 * The cpu may change afterward. However that does not matter since
@@ -2415,7 +2758,6 @@ static void early_kmem_cache_node_alloc(int node)
2415{ 2758{
2416 struct page *page; 2759 struct page *page;
2417 struct kmem_cache_node *n; 2760 struct kmem_cache_node *n;
2418 unsigned long flags;
2419 2761
2420 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2762 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2421 2763
@@ -2432,7 +2774,8 @@ static void early_kmem_cache_node_alloc(int node)
2432 n = page->freelist; 2774 n = page->freelist;
2433 BUG_ON(!n); 2775 BUG_ON(!n);
2434 page->freelist = get_freepointer(kmem_cache_node, n); 2776 page->freelist = get_freepointer(kmem_cache_node, n);
2435 page->inuse++; 2777 page->inuse = 1;
2778 page->frozen = 0;
2436 kmem_cache_node->node[node] = n; 2779 kmem_cache_node->node[node] = n;
2437#ifdef CONFIG_SLUB_DEBUG 2780#ifdef CONFIG_SLUB_DEBUG
2438 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2781 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
@@ -2441,14 +2784,7 @@ static void early_kmem_cache_node_alloc(int node)
2441 init_kmem_cache_node(n, kmem_cache_node); 2784 init_kmem_cache_node(n, kmem_cache_node);
2442 inc_slabs_node(kmem_cache_node, node, page->objects); 2785 inc_slabs_node(kmem_cache_node, node, page->objects);
2443 2786
2444 /* 2787 add_partial(n, page, DEACTIVATE_TO_HEAD);
2445 * lockdep requires consistent irq usage for each lock
2446 * so even though there cannot be a race this early in
2447 * the boot sequence, we still disable irqs.
2448 */
2449 local_irq_save(flags);
2450 add_partial(n, page, 0);
2451 local_irq_restore(flags);
2452} 2788}
2453 2789
2454static void free_kmem_cache_nodes(struct kmem_cache *s) 2790static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2654,11 +2990,44 @@ static int kmem_cache_open(struct kmem_cache *s,
2654 } 2990 }
2655 } 2991 }
2656 2992
2993#ifdef CONFIG_CMPXCHG_DOUBLE
2994 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
2995 /* Enable fast mode */
2996 s->flags |= __CMPXCHG_DOUBLE;
2997#endif
2998
2657 /* 2999 /*
2658 * The larger the object size is, the more pages we want on the partial 3000 * The larger the object size is, the more pages we want on the partial
2659 * list to avoid pounding the page allocator excessively. 3001 * list to avoid pounding the page allocator excessively.
2660 */ 3002 */
2661 set_min_partial(s, ilog2(s->size)); 3003 set_min_partial(s, ilog2(s->size) / 2);
3004
3005 /*
3006 * cpu_partial determines the maximum number of objects kept in the
3007 * per cpu partial lists of a processor.
3008 *
3009 * Per cpu partial lists mainly contain slabs that just have one
3010 * object freed. If they are used for allocation then they can be
3011 * filled up again with minimal effort. The slab will never hit the
3012 * per node partial lists and therefore no locking will be required.
3013 *
3014 * This setting also determines
3015 *
3016 * A) The number of objects from per cpu partial slabs dumped to the
3017 * per node list when we reach the limit.
3018 * B) The number of objects in cpu partial slabs to extract from the
3019 * per node list when we run out of per cpu objects. We only fetch 50%
3020 * to keep some capacity around for frees.
3021 */
3022 if (s->size >= PAGE_SIZE)
3023 s->cpu_partial = 2;
3024 else if (s->size >= 1024)
3025 s->cpu_partial = 6;
3026 else if (s->size >= 256)
3027 s->cpu_partial = 13;
3028 else
3029 s->cpu_partial = 30;
3030
2662 s->refcount = 1; 3031 s->refcount = 1;
2663#ifdef CONFIG_NUMA 3032#ifdef CONFIG_NUMA
2664 s->remote_node_defrag_ratio = 1000; 3033 s->remote_node_defrag_ratio = 1000;
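
kmem_cache_open() now derives both knobs from the object size: min_partial becomes ilog2(size)/2 and cpu_partial steps down from 30 objects for small caches to 2 for page-sized ones, so large objects do not pin much memory per cpu. The same thresholds restated as a standalone helper (userspace sketch; PAGE_SIZE assumed to be 4 KiB here):

#include <stdio.h>

#define PAGE_SIZE 4096UL   /* assumption: 4 KiB pages */

static int cpu_partial_for_size(unsigned long size)
{
    if (size >= PAGE_SIZE)
        return 2;
    else if (size >= 1024)
        return 6;
    else if (size >= 256)
        return 13;
    else
        return 30;
}

int main(void)
{
    unsigned long sizes[] = { 64, 256, 1024, 8192 };

    for (int i = 0; i < 4; i++)
        printf("object size %5lu -> cpu_partial %d\n",
               sizes[i], cpu_partial_for_size(sizes[i]));
    return 0;
}

The value is also tunable at runtime through the cpu_partial sysfs attribute added further down in this patch.
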
@@ -2717,23 +3086,22 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2717 3086
2718/* 3087/*
2719 * Attempt to free all partial slabs on a node. 3088 * Attempt to free all partial slabs on a node.
3089 * This is called from kmem_cache_close(). We must be the last thread
3090 * using the cache and therefore we do not need to lock anymore.
2720 */ 3091 */
2721static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 3092static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2722{ 3093{
2723 unsigned long flags;
2724 struct page *page, *h; 3094 struct page *page, *h;
2725 3095
2726 spin_lock_irqsave(&n->list_lock, flags);
2727 list_for_each_entry_safe(page, h, &n->partial, lru) { 3096 list_for_each_entry_safe(page, h, &n->partial, lru) {
2728 if (!page->inuse) { 3097 if (!page->inuse) {
2729 __remove_partial(n, page); 3098 remove_partial(n, page);
2730 discard_slab(s, page); 3099 discard_slab(s, page);
2731 } else { 3100 } else {
2732 list_slab_objects(s, page, 3101 list_slab_objects(s, page,
2733 "Objects remaining on kmem_cache_close()"); 3102 "Objects remaining on kmem_cache_close()");
2734 } 3103 }
2735 } 3104 }
2736 spin_unlock_irqrestore(&n->list_lock, flags);
2737} 3105}
2738 3106
2739/* 3107/*
@@ -2767,6 +3135,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
2767 s->refcount--; 3135 s->refcount--;
2768 if (!s->refcount) { 3136 if (!s->refcount) {
2769 list_del(&s->list); 3137 list_del(&s->list);
3138 up_write(&slub_lock);
2770 if (kmem_cache_close(s)) { 3139 if (kmem_cache_close(s)) {
2771 printk(KERN_ERR "SLUB %s: %s called for cache that " 3140 printk(KERN_ERR "SLUB %s: %s called for cache that "
2772 "still has objects.\n", s->name, __func__); 3141 "still has objects.\n", s->name, __func__);
@@ -2775,8 +3144,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2775 if (s->flags & SLAB_DESTROY_BY_RCU) 3144 if (s->flags & SLAB_DESTROY_BY_RCU)
2776 rcu_barrier(); 3145 rcu_barrier();
2777 sysfs_slab_remove(s); 3146 sysfs_slab_remove(s);
2778 } 3147 } else
2779 up_write(&slub_lock); 3148 up_write(&slub_lock);
2780} 3149}
2781EXPORT_SYMBOL(kmem_cache_destroy); 3150EXPORT_SYMBOL(kmem_cache_destroy);
2782 3151
@@ -3094,29 +3463,23 @@ int kmem_cache_shrink(struct kmem_cache *s)
3094 * list_lock. page->inuse here is the upper limit. 3463 * list_lock. page->inuse here is the upper limit.
3095 */ 3464 */
3096 list_for_each_entry_safe(page, t, &n->partial, lru) { 3465 list_for_each_entry_safe(page, t, &n->partial, lru) {
3097 if (!page->inuse && slab_trylock(page)) { 3466 list_move(&page->lru, slabs_by_inuse + page->inuse);
3098 /* 3467 if (!page->inuse)
3099 * Must hold slab lock here because slab_free 3468 n->nr_partial--;
3100 * may have freed the last object and be
3101 * waiting to release the slab.
3102 */
3103 __remove_partial(n, page);
3104 slab_unlock(page);
3105 discard_slab(s, page);
3106 } else {
3107 list_move(&page->lru,
3108 slabs_by_inuse + page->inuse);
3109 }
3110 } 3469 }
3111 3470
3112 /* 3471 /*
3113 * Rebuild the partial list with the slabs filled up most 3472 * Rebuild the partial list with the slabs filled up most
3114 * first and the least used slabs at the end. 3473 * first and the least used slabs at the end.
3115 */ 3474 */
3116 for (i = objects - 1; i >= 0; i--) 3475 for (i = objects - 1; i > 0; i--)
3117 list_splice(slabs_by_inuse + i, n->partial.prev); 3476 list_splice(slabs_by_inuse + i, n->partial.prev);
3118 3477
3119 spin_unlock_irqrestore(&n->list_lock, flags); 3478 spin_unlock_irqrestore(&n->list_lock, flags);
3479
3480 /* Release empty slabs */
3481 list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
3482 discard_slab(s, page);
3120 } 3483 }
3121 3484
3122 kfree(slabs_by_inuse); 3485 kfree(slabs_by_inuse);
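
kmem_cache_shrink() now buckets every partial slab by page->inuse while holding list_lock, splices buckets objects-1 down to 1 back so the fullest slabs lead the list, and frees whatever landed in bucket 0 (the empty slabs) only after the lock is dropped. The bucket-sort idea in plain C, with a toy slab type and malloc'd nodes standing in for the list_head machinery:

#include <stdio.h>
#include <stdlib.h>

struct slab { int inuse; struct slab *next; };   /* toy slab */

#define OBJECTS 4   /* objects per slab in this toy cache */

int main(void)
{
    int inuse_vals[] = { 0, 3, 1, 0, 2, 1 };
    struct slab *buckets[OBJECTS] = { NULL };

    /* Phase 1 (under list_lock in the kernel): bucket by inuse. */
    for (int i = 0; i < 6; i++) {
        struct slab *s = malloc(sizeof(*s));
        s->inuse = inuse_vals[i];
        s->next = buckets[s->inuse];
        buckets[s->inuse] = s;
    }

    /* Phase 2: rebuild the partial list, fullest slabs first (i = OBJECTS-1..1). */
    printf("new partial list:");
    for (int i = OBJECTS - 1; i > 0; i--)
        for (struct slab *s = buckets[i]; s; s = s->next)
            printf(" inuse=%d", s->inuse);
    printf("\n");

    /* Phase 3 (after dropping the lock): bucket 0 holds only empty slabs. */
    int freed = 0;
    for (struct slab *s = buckets[0], *next; s; s = next) {
        next = s->next;
        free(s);
        freed++;
    }
    printf("discarded %d empty slabs\n", freed);
    return 0;
}
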
@@ -3689,12 +4052,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3689static void validate_slab_slab(struct kmem_cache *s, struct page *page, 4052static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3690 unsigned long *map) 4053 unsigned long *map)
3691{ 4054{
3692 if (slab_trylock(page)) { 4055 slab_lock(page);
3693 validate_slab(s, page, map); 4056 validate_slab(s, page, map);
3694 slab_unlock(page); 4057 slab_unlock(page);
3695 } else
3696 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3697 s->name, page);
3698} 4058}
3699 4059
3700static int validate_slab_node(struct kmem_cache *s, 4060static int validate_slab_node(struct kmem_cache *s,
@@ -4075,6 +4435,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4075 4435
4076 for_each_possible_cpu(cpu) { 4436 for_each_possible_cpu(cpu) {
4077 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4437 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4438 struct page *page;
4078 4439
4079 if (!c || c->node < 0) 4440 if (!c || c->node < 0)
4080 continue; 4441 continue;
@@ -4090,6 +4451,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4090 total += x; 4451 total += x;
4091 nodes[c->node] += x; 4452 nodes[c->node] += x;
4092 } 4453 }
4454 page = c->partial;
4455
4456 if (page) {
4457 x = page->pobjects;
4458 total += x;
4459 nodes[c->node] += x;
4460 }
4093 per_cpu[c->node]++; 4461 per_cpu[c->node]++;
4094 } 4462 }
4095 } 4463 }
@@ -4168,11 +4536,12 @@ struct slab_attribute {
4168}; 4536};
4169 4537
4170#define SLAB_ATTR_RO(_name) \ 4538#define SLAB_ATTR_RO(_name) \
4171 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 4539 static struct slab_attribute _name##_attr = \
4540 __ATTR(_name, 0400, _name##_show, NULL)
4172 4541
4173#define SLAB_ATTR(_name) \ 4542#define SLAB_ATTR(_name) \
4174 static struct slab_attribute _name##_attr = \ 4543 static struct slab_attribute _name##_attr = \
4175 __ATTR(_name, 0644, _name##_show, _name##_store) 4544 __ATTR(_name, 0600, _name##_show, _name##_store)
4176 4545
4177static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4546static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4178{ 4547{
@@ -4241,6 +4610,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4241} 4610}
4242SLAB_ATTR(min_partial); 4611SLAB_ATTR(min_partial);
4243 4612
4613static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4614{
4615 return sprintf(buf, "%u\n", s->cpu_partial);
4616}
4617
4618static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4619 size_t length)
4620{
4621 unsigned long objects;
4622 int err;
4623
4624 err = strict_strtoul(buf, 10, &objects);
4625 if (err)
4626 return err;
4627
4628 s->cpu_partial = objects;
4629 flush_all(s);
4630 return length;
4631}
4632SLAB_ATTR(cpu_partial);
4633
4244static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4634static ssize_t ctor_show(struct kmem_cache *s, char *buf)
4245{ 4635{
4246 if (!s->ctor) 4636 if (!s->ctor)
@@ -4279,6 +4669,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4279} 4669}
4280SLAB_ATTR_RO(objects_partial); 4670SLAB_ATTR_RO(objects_partial);
4281 4671
4672static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
4673{
4674 int objects = 0;
4675 int pages = 0;
4676 int cpu;
4677 int len;
4678
4679 for_each_online_cpu(cpu) {
4680 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4681
4682 if (page) {
4683 pages += page->pages;
4684 objects += page->pobjects;
4685 }
4686 }
4687
4688 len = sprintf(buf, "%d(%d)", objects, pages);
4689
4690#ifdef CONFIG_SMP
4691 for_each_online_cpu(cpu) {
4692 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4693
4694 if (page && len < PAGE_SIZE - 20)
4695 len += sprintf(buf + len, " C%d=%d(%d)", cpu,
4696 page->pobjects, page->pages);
4697 }
4698#endif
4699 return len + sprintf(buf + len, "\n");
4700}
4701SLAB_ATTR_RO(slabs_cpu_partial);
4702
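
slabs_cpu_partial_show() reports an aggregate "objects(pages)" pair and, on SMP, appends one " C<cpu>=objects(pages)" entry for every online cpu that currently holds partial slabs; the attribute appears as slabs_cpu_partial under the cache's /sys/kernel/slab/ directory. A few lines showing how such a line is assembled from per-cpu samples (the sample values are made up):

#include <stdio.h>

int main(void)
{
    /* made-up per-cpu (objects, pages) samples for four cpus */
    int objects[] = { 12, 0, 7, 3 };
    int pages[]   = {  2, 0, 1, 1 };
    char buf[256];
    int len, tot_objects = 0, tot_pages = 0;

    for (int cpu = 0; cpu < 4; cpu++) {
        tot_objects += objects[cpu];
        tot_pages += pages[cpu];
    }
    len = sprintf(buf, "%d(%d)", tot_objects, tot_pages);
    for (int cpu = 0; cpu < 4; cpu++)
        if (pages[cpu])
            len += sprintf(buf + len, " C%d=%d(%d)",
                           cpu, objects[cpu], pages[cpu]);
    printf("%s\n", buf);   /* prints: 22(4) C0=12(2) C2=7(1) C3=3(1) */
    return 0;
}
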
4282static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4703static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4283{ 4704{
4284 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4705 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4342,8 +4763,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s,
4342 const char *buf, size_t length) 4763 const char *buf, size_t length)
4343{ 4764{
4344 s->flags &= ~SLAB_DEBUG_FREE; 4765 s->flags &= ~SLAB_DEBUG_FREE;
4345 if (buf[0] == '1') 4766 if (buf[0] == '1') {
4767 s->flags &= ~__CMPXCHG_DOUBLE;
4346 s->flags |= SLAB_DEBUG_FREE; 4768 s->flags |= SLAB_DEBUG_FREE;
4769 }
4347 return length; 4770 return length;
4348} 4771}
4349SLAB_ATTR(sanity_checks); 4772SLAB_ATTR(sanity_checks);
@@ -4357,8 +4780,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4357 size_t length) 4780 size_t length)
4358{ 4781{
4359 s->flags &= ~SLAB_TRACE; 4782 s->flags &= ~SLAB_TRACE;
4360 if (buf[0] == '1') 4783 if (buf[0] == '1') {
4784 s->flags &= ~__CMPXCHG_DOUBLE;
4361 s->flags |= SLAB_TRACE; 4785 s->flags |= SLAB_TRACE;
4786 }
4362 return length; 4787 return length;
4363} 4788}
4364SLAB_ATTR(trace); 4789SLAB_ATTR(trace);
@@ -4375,8 +4800,10 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4375 return -EBUSY; 4800 return -EBUSY;
4376 4801
4377 s->flags &= ~SLAB_RED_ZONE; 4802 s->flags &= ~SLAB_RED_ZONE;
4378 if (buf[0] == '1') 4803 if (buf[0] == '1') {
4804 s->flags &= ~__CMPXCHG_DOUBLE;
4379 s->flags |= SLAB_RED_ZONE; 4805 s->flags |= SLAB_RED_ZONE;
4806 }
4380 calculate_sizes(s, -1); 4807 calculate_sizes(s, -1);
4381 return length; 4808 return length;
4382} 4809}
@@ -4394,8 +4821,10 @@ static ssize_t poison_store(struct kmem_cache *s,
4394 return -EBUSY; 4821 return -EBUSY;
4395 4822
4396 s->flags &= ~SLAB_POISON; 4823 s->flags &= ~SLAB_POISON;
4397 if (buf[0] == '1') 4824 if (buf[0] == '1') {
4825 s->flags &= ~__CMPXCHG_DOUBLE;
4398 s->flags |= SLAB_POISON; 4826 s->flags |= SLAB_POISON;
4827 }
4399 calculate_sizes(s, -1); 4828 calculate_sizes(s, -1);
4400 return length; 4829 return length;
4401} 4830}
@@ -4413,8 +4842,10 @@ static ssize_t store_user_store(struct kmem_cache *s,
4413 return -EBUSY; 4842 return -EBUSY;
4414 4843
4415 s->flags &= ~SLAB_STORE_USER; 4844 s->flags &= ~SLAB_STORE_USER;
4416 if (buf[0] == '1') 4845 if (buf[0] == '1') {
4846 s->flags &= ~__CMPXCHG_DOUBLE;
4417 s->flags |= SLAB_STORE_USER; 4847 s->flags |= SLAB_STORE_USER;
4848 }
4418 calculate_sizes(s, -1); 4849 calculate_sizes(s, -1);
4419 return length; 4850 return length;
4420} 4851}
@@ -4579,6 +5010,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4579STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 5010STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4580STAT_ATTR(ALLOC_SLAB, alloc_slab); 5011STAT_ATTR(ALLOC_SLAB, alloc_slab);
4581STAT_ATTR(ALLOC_REFILL, alloc_refill); 5012STAT_ATTR(ALLOC_REFILL, alloc_refill);
5013STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4582STAT_ATTR(FREE_SLAB, free_slab); 5014STAT_ATTR(FREE_SLAB, free_slab);
4583STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 5015STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4584STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 5016STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
@@ -4586,7 +5018,12 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4586STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 5018STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4587STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 5019STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4588STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 5020STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
5021STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4589STAT_ATTR(ORDER_FALLBACK, order_fallback); 5022STAT_ATTR(ORDER_FALLBACK, order_fallback);
5023STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
5024STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
5025STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
5026STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
4590#endif 5027#endif
4591 5028
4592static struct attribute *slab_attrs[] = { 5029static struct attribute *slab_attrs[] = {
@@ -4595,6 +5032,7 @@ static struct attribute *slab_attrs[] = {
4595 &objs_per_slab_attr.attr, 5032 &objs_per_slab_attr.attr,
4596 &order_attr.attr, 5033 &order_attr.attr,
4597 &min_partial_attr.attr, 5034 &min_partial_attr.attr,
5035 &cpu_partial_attr.attr,
4598 &objects_attr.attr, 5036 &objects_attr.attr,
4599 &objects_partial_attr.attr, 5037 &objects_partial_attr.attr,
4600 &partial_attr.attr, 5038 &partial_attr.attr,
@@ -4607,6 +5045,7 @@ static struct attribute *slab_attrs[] = {
4607 &destroy_by_rcu_attr.attr, 5045 &destroy_by_rcu_attr.attr,
4608 &shrink_attr.attr, 5046 &shrink_attr.attr,
4609 &reserved_attr.attr, 5047 &reserved_attr.attr,
5048 &slabs_cpu_partial_attr.attr,
4610#ifdef CONFIG_SLUB_DEBUG 5049#ifdef CONFIG_SLUB_DEBUG
4611 &total_objects_attr.attr, 5050 &total_objects_attr.attr,
4612 &slabs_attr.attr, 5051 &slabs_attr.attr,
@@ -4636,6 +5075,7 @@ static struct attribute *slab_attrs[] = {
4636 &alloc_from_partial_attr.attr, 5075 &alloc_from_partial_attr.attr,
4637 &alloc_slab_attr.attr, 5076 &alloc_slab_attr.attr,
4638 &alloc_refill_attr.attr, 5077 &alloc_refill_attr.attr,
5078 &alloc_node_mismatch_attr.attr,
4639 &free_slab_attr.attr, 5079 &free_slab_attr.attr,
4640 &cpuslab_flush_attr.attr, 5080 &cpuslab_flush_attr.attr,
4641 &deactivate_full_attr.attr, 5081 &deactivate_full_attr.attr,
@@ -4643,7 +5083,12 @@ static struct attribute *slab_attrs[] = {
4643 &deactivate_to_head_attr.attr, 5083 &deactivate_to_head_attr.attr,
4644 &deactivate_to_tail_attr.attr, 5084 &deactivate_to_tail_attr.attr,
4645 &deactivate_remote_frees_attr.attr, 5085 &deactivate_remote_frees_attr.attr,
5086 &deactivate_bypass_attr.attr,
4646 &order_fallback_attr.attr, 5087 &order_fallback_attr.attr,
5088 &cmpxchg_double_fail_attr.attr,
5089 &cmpxchg_double_cpu_fail_attr.attr,
5090 &cpu_partial_alloc_attr.attr,
5091 &cpu_partial_free_attr.attr,
4647#endif 5092#endif
4648#ifdef CONFIG_FAILSLAB 5093#ifdef CONFIG_FAILSLAB
4649 &failslab_attr.attr, 5094 &failslab_attr.attr,
@@ -4995,7 +5440,7 @@ static const struct file_operations proc_slabinfo_operations = {
4995 5440
4996static int __init slab_proc_init(void) 5441static int __init slab_proc_init(void)
4997{ 5442{
4998 proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); 5443 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
4999 return 0; 5444 return 0;
5000} 5445}
5001module_init(slab_proc_init); 5446module_init(slab_proc_init);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 64b984091edb..1b7e22ab9b09 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -21,7 +21,6 @@
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/module.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/spinlock.h> 25#include <linux/spinlock.h>
27#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
diff --git a/mm/sparse.c b/mm/sparse.c
index 858e1dff9b2a..61d7cde23111 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -6,7 +6,7 @@
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/highmem.h> 8#include <linux/highmem.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include "internal.h" 12#include "internal.h"
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b3..a91caf754d9b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -21,7 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/pagevec.h> 22#include <linux/pagevec.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/module.h> 24#include <linux/export.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/buffer_head.h> /* for try_to_release_page() */ 26#include <linux/buffer_head.h> /* for try_to_release_page() */
27#include <linux/percpu_counter.h> 27#include <linux/percpu_counter.h>
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
78{ 78{
79 if (unlikely(PageTail(page))) { 79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */ 80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page; 81 struct page *page_head = compound_trans_head(page);
82 smp_rmb(); 82
83 /* 83 if (likely(page != page_head &&
84 * If PageTail is still set after smp_rmb() we can be sure 84 get_page_unless_zero(page_head))) {
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags; 85 unsigned long flags;
90 /* 86 /*
91 * Verify that our page_head wasn't converted 87 * page_head wasn't a dangling pointer but it
92 * to a a regular page before we got a 88 * may not be a head page anymore by the time
93 * reference on it. 89 * we obtain the lock. That is ok as long as it
90 * can't be freed from under us.
94 */ 91 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head); 92 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) { 93 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */ 94 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags); 95 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head)); 96 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head)) 97 if (put_page_testzero(page_head))
115 __put_single_page(page_head); 98 __put_single_page(page_head);
116 out_put_single: 99 out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
121 VM_BUG_ON(page_head != page->first_page); 104 VM_BUG_ON(page_head != page->first_page);
122 /* 105 /*
123 * We can release the refcount taken by 106 * We can release the refcount taken by
124 * get_page_unless_zero now that 107 * get_page_unless_zero() now that
125 * split_huge_page_refcount is blocked on the 108 * __split_huge_page_refcount() is blocked on
126 * compound_lock. 109 * the compound_lock.
127 */ 110 */
128 if (put_page_testzero(page_head)) 111 if (put_page_testzero(page_head))
129 VM_BUG_ON(1); 112 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */ 113 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0); 114 VM_BUG_ON(page_mapcount(page) <= 0);
132 atomic_dec(&page->_count); 115 atomic_dec(&page->_mapcount);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 116 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
117 VM_BUG_ON(atomic_read(&page->_count) != 0);
134 compound_unlock_irqrestore(page_head, flags); 118 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) { 119 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head)) 120 if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
160} 144}
161EXPORT_SYMBOL(put_page); 145EXPORT_SYMBOL(put_page);
162 146
147/*
148 * This function is exported but must not be called by anything other
149 * than get_page(). It implements the slow path of get_page().
150 */
151bool __get_page_tail(struct page *page)
152{
153 /*
154 * This takes care of get_page() if run on a tail page
155 * returned by one of the get_user_pages/follow_page variants.
156 * get_user_pages/follow_page itself doesn't need the compound
157 * lock because it runs __get_page_tail_foll() under the
158 * proper PT lock that already serializes against
159 * split_huge_page().
160 */
161 unsigned long flags;
162 bool got = false;
163 struct page *page_head = compound_trans_head(page);
164
165 if (likely(page != page_head && get_page_unless_zero(page_head))) {
166 /*
167 * page_head wasn't a dangling pointer but it
168 * may not be a head page anymore by the time
169 * we obtain the lock. That is ok as long as it
170 * can't be freed from under us.
171 */
172 flags = compound_lock_irqsave(page_head);
173 /* here __split_huge_page_refcount won't run anymore */
174 if (likely(PageTail(page))) {
175 __get_page_tail_foll(page, false);
176 got = true;
177 }
178 compound_unlock_irqrestore(page_head, flags);
179 if (unlikely(!got))
180 put_page(page_head);
181 }
182 return got;
183}
184EXPORT_SYMBOL(__get_page_tail);
185
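
__get_page_tail() follows the same shape as the reworked put_compound_page(): speculatively pin the head page, take the compound lock so __split_huge_page_refcount() can no longer run, and only then re-check that the page is still a tail, dropping the speculative pin if the huge page was split in the meantime. The check, lock, re-check pattern reduced to a generic single-threaded pthread sketch; the mutex and flags are stand-ins, not the kernel's compound_lock or page flags:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t compound_lock = PTHREAD_MUTEX_INITIALIZER;
static bool page_is_tail = true;  /* a concurrent "split" may clear this */
static int head_refcount = 1;

static bool get_page_tail(void)
{
    bool got = false;

    head_refcount++;                      /* speculative pin of the head page */
    pthread_mutex_lock(&compound_lock);   /* a split cannot get past this point */
    if (page_is_tail)                     /* re-check under the lock */
        got = true;                       /* ...and take the real references here */
    pthread_mutex_unlock(&compound_lock);
    if (!got)
        head_refcount--;                  /* undo the speculative pin */
    return got;
}

int main(void)
{
    printf("got=%d head_refcount=%d\n", get_page_tail(), head_refcount);
    return 0;
}
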
163/** 186/**
164 * put_pages_list() - release a list of pages 187 * put_pages_list() - release a list of pages
165 * @pages: list of pages threaded on page->lru 188 * @pages: list of pages threaded on page->lru
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 46680461785b..78cc4d1f6cce 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -6,7 +6,6 @@
6 * 6 *
7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie 7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie
8 */ 8 */
9#include <linux/module.h>
10#include <linux/mm.h> 9#include <linux/mm.h>
11#include <linux/gfp.h> 10#include <linux/gfp.h>
12#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1b8c33907242..b1cd12060723 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -21,7 +21,6 @@
21#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/ksm.h> 24#include <linux/ksm.h>
26#include <linux/rmap.h> 25#include <linux/rmap.h>
27#include <linux/security.h> 26#include <linux/security.h>
@@ -1617,7 +1616,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1617 1616
1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1617 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1619 err = try_to_unuse(type); 1618 err = try_to_unuse(type);
1620 test_set_oom_score_adj(oom_score_adj); 1619 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1621 1620
1622 if (err) { 1621 if (err) {
1623 /* 1622 /*
@@ -1924,20 +1923,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1924 1923
1925 /* 1924 /*
1926 * Find out how many pages are allowed for a single swap 1925 * Find out how many pages are allowed for a single swap
1927 * device. There are two limiting factors: 1) the number of 1926 * device. There are three limiting factors: 1) the number
1928 * bits for the swap offset in the swp_entry_t type and 1927 * of bits for the swap offset in the swp_entry_t type, and
1929 * 2) the number of bits in the a swap pte as defined by 1928 * 2) the number of bits in the swap pte as defined by
1930 * the different architectures. In order to find the 1929 * the different architectures, and 3) the number of free bits
1931 * largest possible bit mask a swap entry with swap type 0 1930 * in an exceptional radix_tree entry. In order to find the
1931 * largest possible bit mask, a swap entry with swap type 0
1932 * and swap offset ~0UL is created, encoded to a swap pte, 1932 * and swap offset ~0UL is created, encoded to a swap pte,
1933 * decoded to a swp_entry_t again and finally the swap 1933 * decoded to a swp_entry_t again, and finally the swap
1934 * offset is extracted. This will mask all the bits from 1934 * offset is extracted. This will mask all the bits from
1935 * the initial ~0UL mask that can't be encoded in either 1935 * the initial ~0UL mask that can't be encoded in either
1936 * the swp_entry_t or the architecture definition of a 1936 * the swp_entry_t or the architecture definition of a
1937 * swap pte. 1937 * swap pte. Then the same is done for a radix_tree entry.
1938 */ 1938 */
1939 maxpages = swp_offset(pte_to_swp_entry( 1939 maxpages = swp_offset(pte_to_swp_entry(
1940 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 1940 swp_entry_to_pte(swp_entry(0, ~0UL))));
1941 maxpages = swp_offset(radix_to_swp_entry(
1942 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1943
1941 if (maxpages > swap_header->info.last_page) { 1944 if (maxpages > swap_header->info.last_page) {
1942 maxpages = swap_header->info.last_page + 1; 1945 maxpages = swap_header->info.last_page + 1;
1943 /* p->max is an unsigned int: don't overflow it */ 1946 /* p->max is an unsigned int: don't overflow it */
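
The updated read_swap_header() derives maxpages by round-tripping an all-ones offset through every encoding a swap entry has to survive: first the architecture's swap pte, then (new in this patch) the exceptional radix_tree entry used by shmem, so whichever encoding keeps the fewest offset bits ends up limiting the device size. The masking idea with two hypothetical encoders of made-up widths:

#include <stdio.h>

/* Hypothetical encodings that can only keep a limited number of offset bits. */
static unsigned long pte_roundtrip(unsigned long off)   { return off & ((1UL << 27) - 1); }
static unsigned long radix_roundtrip(unsigned long off) { return off & ((1UL << 25) - 1); }

int main(void)
{
    unsigned long maxpages;

    maxpages = pte_roundtrip(~0UL);             /* survive the swap pte encoding */
    maxpages = radix_roundtrip(maxpages) + 1;   /* then the radix_tree entry */
    printf("maxpages = %lu (the narrower encoding wins)\n", maxpages);
    return 0;
}
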
diff --git a/mm/thrash.c b/mm/thrash.c
index e53f7d02c17c..57ad495dbd54 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -29,7 +29,7 @@
29 29
30static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg; 32static struct mem_cgroup *swap_token_memcg;
33 33
34#ifdef CONFIG_CGROUP_MEM_RES_CTLR 34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) 35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
diff --git a/mm/truncate.c b/mm/truncate.c
index 232eb2736a79..632b15e29f74 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,7 +12,7 @@
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
@@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
336 unsigned long count = 0; 336 unsigned long count = 0;
337 int i; 337 int i;
338 338
339 /*
340 * Note: this function may get called on a shmem/tmpfs mapping:
341 * pagevec_lookup() might then return 0 prematurely (because it
342 * got a gangful of swap entries); but it's hardly worth worrying
343 * about - it can rarely have anything to free from such a mapping
344 * (most pages are dirty), and already skips over any difficulties.
345 */
346
339 pagevec_init(&pvec, 0); 347 pagevec_init(&pvec, 0);
340 while (index <= end && pagevec_lookup(&pvec, mapping, index, 348 while (index <= end && pagevec_lookup(&pvec, mapping, index,
341 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 349 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
diff --git a/mm/util.c b/mm/util.c
index 88ea1bd661c0..136ac4f322b8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,7 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/string.h> 3#include <linux/string.h>
4#include <linux/module.h> 4#include <linux/export.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 464621d18eb2..b669aa6f6caf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
725#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 725#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
726#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 726#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
727#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 727#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
728#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 728#define VMAP_BBMAP_BITS \
729 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 729 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
730 VMALLOC_PAGES / NR_CPUS / 16)) 730 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
731 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
731 732
732#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 733#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
733 734
@@ -1252,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1252DEFINE_RWLOCK(vmlist_lock); 1253DEFINE_RWLOCK(vmlist_lock);
1253struct vm_struct *vmlist; 1254struct vm_struct *vmlist;
1254 1255
1255static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1256static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1256 unsigned long flags, void *caller) 1257 unsigned long flags, void *caller)
1257{ 1258{
1258 struct vm_struct *tmp, **p;
1259
1260 vm->flags = flags; 1259 vm->flags = flags;
1261 vm->addr = (void *)va->va_start; 1260 vm->addr = (void *)va->va_start;
1262 vm->size = va->va_end - va->va_start; 1261 vm->size = va->va_end - va->va_start;
1263 vm->caller = caller; 1262 vm->caller = caller;
1264 va->private = vm; 1263 va->private = vm;
1265 va->flags |= VM_VM_AREA; 1264 va->flags |= VM_VM_AREA;
1265}
1266 1266
1267static void insert_vmalloc_vmlist(struct vm_struct *vm)
1268{
1269 struct vm_struct *tmp, **p;
1270
1271 vm->flags &= ~VM_UNLIST;
1267 write_lock(&vmlist_lock); 1272 write_lock(&vmlist_lock);
1268 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1273 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1269 if (tmp->addr >= vm->addr) 1274 if (tmp->addr >= vm->addr)
@@ -1274,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1274 write_unlock(&vmlist_lock); 1279 write_unlock(&vmlist_lock);
1275} 1280}
1276 1281
1282static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1283 unsigned long flags, void *caller)
1284{
1285 setup_vmalloc_vm(vm, va, flags, caller);
1286 insert_vmalloc_vmlist(vm);
1287}
1288
1277static struct vm_struct *__get_vm_area_node(unsigned long size, 1289static struct vm_struct *__get_vm_area_node(unsigned long size,
1278 unsigned long align, unsigned long flags, unsigned long start, 1290 unsigned long align, unsigned long flags, unsigned long start,
1279 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1291 unsigned long end, int node, gfp_t gfp_mask, void *caller)
@@ -1312,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1312 return NULL; 1324 return NULL;
1313 } 1325 }
1314 1326
1315 insert_vmalloc_vm(area, va, flags, caller); 1327 /*
1328 * When this function is called from __vmalloc_node_range,
1329 * we do not add vm_struct to vmlist here to avoid
1330 * accessing uninitialized members of vm_struct such as
1331 * pages and nr_pages fields. They will be set later.
1332 * To distinguish it from others, we use a VM_UNLIST flag.
1333 */
1334 if (flags & VM_UNLIST)
1335 setup_vmalloc_vm(area, va, flags, caller);
1336 else
1337 insert_vmalloc_vm(area, va, flags, caller);
1338
1316 return area; 1339 return area;
1317} 1340}
1318 1341
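
The VM_UNLIST change splits insert_vmalloc_vm() so that __vmalloc_node_range() can finish initializing the vm_struct (its pages and nr_pages fields) before the area ever appears on vmlist; anything walking the list can then never observe a half-built entry. The initialize-then-publish pattern in miniature (illustrative names; the kernel takes vmlist_lock around the publish step):

#include <stdio.h>
#include <stddef.h>

struct area {
    void *addr;
    int nr_pages;            /* would be garbage if published too early */
    struct area *next;
};

static struct area *arealist;    /* models vmlist; readers traverse it */

static void setup_area(struct area *a, void *addr, int nr_pages)
{
    a->addr = addr;          /* fully initialize first... */
    a->nr_pages = nr_pages;
}

static void publish_area(struct area *a)
{
    a->next = arealist;      /* ...then link it where others can see it */
    arealist = a;
}

int main(void)
{
    static char backing[4096];
    struct area a;

    setup_area(&a, backing, 1);   /* VM_UNLIST phase: visible to nobody yet */
    publish_area(&a);             /* insert_vmalloc_vmlist() equivalent */
    printf("published %p with %d page(s)\n", arealist->addr, arealist->nr_pages);
    return 0;
}
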
@@ -1380,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr)
1380 va = find_vmap_area((unsigned long)addr); 1403 va = find_vmap_area((unsigned long)addr);
1381 if (va && va->flags & VM_VM_AREA) { 1404 if (va && va->flags & VM_VM_AREA) {
1382 struct vm_struct *vm = va->private; 1405 struct vm_struct *vm = va->private;
1383 struct vm_struct *tmp, **p; 1406
1384 /* 1407 if (!(vm->flags & VM_UNLIST)) {
1385 * remove from list and disallow access to this vm_struct 1408 struct vm_struct *tmp, **p;
1386 * before unmap. (address range confliction is maintained by 1409 /*
1387 * vmap.) 1410 * remove from list and disallow access to
1388 */ 1411 * this vm_struct before unmap. (address range
1389 write_lock(&vmlist_lock); 1412 * confliction is maintained by vmap.)
1390 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1413 */
1391 ; 1414 write_lock(&vmlist_lock);
1392 *p = tmp->next; 1415 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1393 write_unlock(&vmlist_lock); 1416 ;
1417 *p = tmp->next;
1418 write_unlock(&vmlist_lock);
1419 }
1394 1420
1395 vmap_debug_free_range(va->va_start, va->va_end); 1421 vmap_debug_free_range(va->va_start, va->va_end);
1396 free_unmap_vmap_area(va); 1422 free_unmap_vmap_area(va);
@@ -1567,8 +1593,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1567 return area->addr; 1593 return area->addr;
1568 1594
1569fail: 1595fail:
1570 warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " 1596 warn_alloc_failed(gfp_mask, order,
1571 "allocated %ld of %ld bytes\n", 1597 "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
1572 (area->nr_pages*PAGE_SIZE), area->size); 1598 (area->nr_pages*PAGE_SIZE), area->size);
1573 vfree(area->addr); 1599 vfree(area->addr);
1574 return NULL; 1600 return NULL;
@@ -1599,17 +1625,22 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1599 1625
1600 size = PAGE_ALIGN(size); 1626 size = PAGE_ALIGN(size);
1601 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1627 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1602 return NULL; 1628 goto fail;
1603
1604 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1605 gfp_mask, caller);
1606 1629
1630 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
1631 start, end, node, gfp_mask, caller);
1607 if (!area) 1632 if (!area)
1608 return NULL; 1633 goto fail;
1609 1634
1610 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1635 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1611 1636
1612 /* 1637 /*
1638 * In this function, newly allocated vm_struct is not added
1639 * to vmlist at __get_vm_area_node(). so, it is added here.
1640 */
1641 insert_vmalloc_vmlist(area);
1642
1643 /*
1613 * A ref_count = 3 is needed because the vm_struct and vmap_area 1644 * A ref_count = 3 is needed because the vm_struct and vmap_area
1614 * structures allocated in the __get_vm_area_node() function contain 1645 * structures allocated in the __get_vm_area_node() function contain
1615 * references to the virtual address of the vmalloc'ed block. 1646 * references to the virtual address of the vmalloc'ed block.
@@ -1617,6 +1648,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1617 kmemleak_alloc(addr, real_size, 3, gfp_mask); 1648 kmemleak_alloc(addr, real_size, 3, gfp_mask);
1618 1649
1619 return addr; 1650 return addr;
1651
1652fail:
1653 warn_alloc_failed(gfp_mask, 0,
1654 "vmalloc: allocation failure: %lu bytes\n",
1655 real_size);
1656 return NULL;
1620} 1657}
1621 1658
1622/** 1659/**
@@ -2139,6 +2176,14 @@ struct vm_struct *alloc_vm_area(size_t size)
2139 return NULL; 2176 return NULL;
2140 } 2177 }
2141 2178
2179 /*
2180 * If the allocated address space is passed to a hypercall
2181 * before being used then we cannot rely on a page fault to
2182 * trigger an update of the page tables. So sync all the page
2183 * tables here.
2184 */
2185 vmalloc_sync_all();
2186
2142 return area; 2187 return area;
2143} 2188}
2144EXPORT_SYMBOL_GPL(alloc_vm_area); 2189EXPORT_SYMBOL_GPL(alloc_vm_area);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7ef69124fa3e..a1893c050795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -105,7 +105,6 @@ struct scan_control {
105 105
106 /* Which cgroup do we reclaim from */ 106 /* Which cgroup do we reclaim from */
107 struct mem_cgroup *mem_cgroup; 107 struct mem_cgroup *mem_cgroup;
108 struct memcg_scanrecord *memcg_record;
109 108
110 /* 109 /*
111 * Nodemask of nodes allowed by the caller. If NULL, all nodes 110 * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -496,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
496 return PAGE_ACTIVATE; 495 return PAGE_ACTIVATE;
497 } 496 }
498 497
499 /*
500 * Wait on writeback if requested to. This happens when
501 * direct reclaiming a large contiguous area and the
502 * first attempt to free a range of pages fails.
503 */
504 if (PageWriteback(page) &&
505 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
506 wait_on_page_writeback(page);
507
508 if (!PageWriteback(page)) { 498 if (!PageWriteback(page)) {
509 /* synchronous write or broken a_ops? */ 499 /* synchronous write or broken a_ops? */
510 ClearPageReclaim(page); 500 ClearPageReclaim(page);
@@ -643,13 +633,14 @@ redo:
643 lru = LRU_UNEVICTABLE; 633 lru = LRU_UNEVICTABLE;
644 add_page_to_unevictable_list(page); 634 add_page_to_unevictable_list(page);
645 /* 635 /*
646 * When racing with an mlock clearing (page is 636 * When racing with an mlock or AS_UNEVICTABLE clearing
647 * unlocked), make sure that if the other thread does 637 * (page is unlocked) make sure that if the other thread
648 * not observe our setting of PG_lru and fails 638 * does not observe our setting of PG_lru and fails
649 * isolation, we see PG_mlocked cleared below and move 639 * isolation/check_move_unevictable_page,
640 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
650 * the page back to the evictable list. 641 * the page back to the evictable list.
651 * 642 *
652 * The other side is TestClearPageMlocked(). 643 * The other side is TestClearPageMlocked() or shmem_lock().
653 */ 644 */
654 smp_mb(); 645 smp_mb();
655 } 646 }
@@ -760,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
760 */ 751 */
761static unsigned long shrink_page_list(struct list_head *page_list, 752static unsigned long shrink_page_list(struct list_head *page_list,
762 struct zone *zone, 753 struct zone *zone,
763 struct scan_control *sc) 754 struct scan_control *sc,
755 int priority,
756 unsigned long *ret_nr_dirty,
757 unsigned long *ret_nr_writeback)
764{ 758{
765 LIST_HEAD(ret_pages); 759 LIST_HEAD(ret_pages);
766 LIST_HEAD(free_pages); 760 LIST_HEAD(free_pages);
@@ -768,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
768 unsigned long nr_dirty = 0; 762 unsigned long nr_dirty = 0;
769 unsigned long nr_congested = 0; 763 unsigned long nr_congested = 0;
770 unsigned long nr_reclaimed = 0; 764 unsigned long nr_reclaimed = 0;
765 unsigned long nr_writeback = 0;
771 766
772 cond_resched(); 767 cond_resched();
773 768
@@ -804,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
804 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 799 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
805 800
806 if (PageWriteback(page)) { 801 if (PageWriteback(page)) {
802 nr_writeback++;
807 /* 803 /*
808 * Synchronous reclaim is performed in two passes, 804 * Synchronous reclaim cannot queue pages for
809 * first an asynchronous pass over the list to 805 * writeback due to the possibility of stack overflow
810 * start parallel writeback, and a second synchronous 806 * but if it encounters a page under writeback, wait
811 * pass to wait for the IO to complete. Wait here 807 * for the IO to complete.
812 * for any page for which writeback has already
813 * started.
814 */ 808 */
815 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 809 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
816 may_enter_fs) 810 may_enter_fs)
@@ -866,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
866 if (PageDirty(page)) { 860 if (PageDirty(page)) {
867 nr_dirty++; 861 nr_dirty++;
868 862
863 /*
864 * Only kswapd can writeback filesystem pages to
865 * avoid risk of stack overflow but do not writeback
866 * unless under significant pressure.
867 */
868 if (page_is_file_cache(page) &&
869 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
870 /*
871 * Immediately reclaim when written back.
872 * Similar in principle to deactivate_page()
873 * except we already have the page isolated
874 * and know it's dirty
875 */
876 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
877 SetPageReclaim(page);
878
879 goto keep_locked;
880 }
881
869 if (references == PAGEREF_RECLAIM_CLEAN) 882 if (references == PAGEREF_RECLAIM_CLEAN)
870 goto keep_locked; 883 goto keep_locked;
871 if (!may_enter_fs) 884 if (!may_enter_fs)
@@ -1000,6 +1013,8 @@ keep_lumpy:
1000 1013
1001 list_splice(&ret_pages, page_list); 1014 list_splice(&ret_pages, page_list);
1002 count_vm_events(PGACTIVATE, pgactivate); 1015 count_vm_events(PGACTIVATE, pgactivate);
1016 *ret_nr_dirty += nr_dirty;
1017 *ret_nr_writeback += nr_writeback;
1003 return nr_reclaimed; 1018 return nr_reclaimed;
1004} 1019}
1005 1020
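
The dirty-page handling added above encodes a policy: only kswapd may start filesystem writeback from reclaim (direct reclaimers risk overflowing the stack), and even kswapd holds off until the scanning priority has dropped a couple of levels; everybody else tags the page with SetPageReclaim(), bumps NR_VMSCAN_IMMEDIATE and skips it. That policy as one boolean helper; DEF_PRIORITY is assumed to be 12, as in kernels of this vintage:

#include <stdio.h>
#include <stdbool.h>

#define DEF_PRIORITY 12   /* assumption: matches include/linux/mmzone.h here */

/* Inverse of the skip test: !current_is_kswapd() || priority >= DEF_PRIORITY - 2 */
static bool may_writeback_file_page(bool is_kswapd, int priority)
{
    return is_kswapd && priority < DEF_PRIORITY - 2;
}

int main(void)
{
    printf("direct reclaim, priority 12: %d\n", may_writeback_file_page(false, 12));
    printf("kswapd,         priority 12: %d\n", may_writeback_file_page(true, 12));
    printf("kswapd,         priority 6:  %d\n", may_writeback_file_page(true, 6));
    return 0;
}
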
@@ -1013,23 +1028,27 @@ keep_lumpy:
1013 * 1028 *
1014 * returns 0 on success, -ve errno on failure. 1029 * returns 0 on success, -ve errno on failure.
1015 */ 1030 */
1016int __isolate_lru_page(struct page *page, int mode, int file) 1031int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1017{ 1032{
1033 bool all_lru_mode;
1018 int ret = -EINVAL; 1034 int ret = -EINVAL;
1019 1035
1020 /* Only take pages on the LRU. */ 1036 /* Only take pages on the LRU. */
1021 if (!PageLRU(page)) 1037 if (!PageLRU(page))
1022 return ret; 1038 return ret;
1023 1039
1040 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1041 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1042
1024 /* 1043 /*
1025 * When checking the active state, we need to be sure we are 1044 * When checking the active state, we need to be sure we are
1026 * dealing with comparible boolean values. Take the logical not 1045 * dealing with comparible boolean values. Take the logical not
1027 * of each. 1046 * of each.
1028 */ 1047 */
1029 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 1048 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1030 return ret; 1049 return ret;
1031 1050
1032 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) 1051 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1033 return ret; 1052 return ret;
1034 1053
1035 /* 1054 /*
@@ -1042,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1042 1061
1043 ret = -EBUSY; 1062 ret = -EBUSY;
1044 1063
1064 if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
1065 return ret;
1066
1067 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1068 return ret;
1069
1045 if (likely(get_page_unless_zero(page))) { 1070 if (likely(get_page_unless_zero(page))) {
1046 /* 1071 /*
1047 * Be careful not to clear PageLRU until after we're 1072 * Be careful not to clear PageLRU until after we're
@@ -1077,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1077 */ 1102 */
1078static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1103static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1079 struct list_head *src, struct list_head *dst, 1104 struct list_head *src, struct list_head *dst,
1080 unsigned long *scanned, int order, int mode, int file) 1105 unsigned long *scanned, int order, isolate_mode_t mode,
1106 int file)
1081{ 1107{
1082 unsigned long nr_taken = 0; 1108 unsigned long nr_taken = 0;
1083 unsigned long nr_lumpy_taken = 0; 1109 unsigned long nr_lumpy_taken = 0;
@@ -1202,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1202static unsigned long isolate_pages_global(unsigned long nr, 1228static unsigned long isolate_pages_global(unsigned long nr,
1203 struct list_head *dst, 1229 struct list_head *dst,
1204 unsigned long *scanned, int order, 1230 unsigned long *scanned, int order,
1205 int mode, struct zone *z, 1231 isolate_mode_t mode,
1206 int active, int file) 1232 struct zone *z, int active, int file)
1207{ 1233{
1208 int lru = LRU_BASE; 1234 int lru = LRU_BASE;
1209 if (active) 1235 if (active)
@@ -1349,8 +1375,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1349 int file = is_file_lru(lru); 1375 int file = is_file_lru(lru);
1350 int numpages = hpage_nr_pages(page); 1376 int numpages = hpage_nr_pages(page);
1351 reclaim_stat->recent_rotated[file] += numpages; 1377 reclaim_stat->recent_rotated[file] += numpages;
1352 if (!scanning_global_lru(sc))
1353 sc->memcg_record->nr_rotated[file] += numpages;
1354 } 1378 }
1355 if (!pagevec_add(&pvec, page)) { 1379 if (!pagevec_add(&pvec, page)) {
1356 spin_unlock_irq(&zone->lru_lock); 1380 spin_unlock_irq(&zone->lru_lock);
@@ -1394,14 +1418,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
1394 1418
1395 reclaim_stat->recent_scanned[0] += *nr_anon; 1419 reclaim_stat->recent_scanned[0] += *nr_anon;
1396 reclaim_stat->recent_scanned[1] += *nr_file; 1420 reclaim_stat->recent_scanned[1] += *nr_file;
1397 if (!scanning_global_lru(sc)) {
1398 sc->memcg_record->nr_scanned[0] += *nr_anon;
1399 sc->memcg_record->nr_scanned[1] += *nr_file;
1400 }
1401} 1421}
1402 1422
1403/* 1423/*
1404 * Returns true if the caller should wait to clean dirty/writeback pages. 1424 * Returns true if a direct reclaim should wait on pages under writeback.
1405 * 1425 *
1406 * If we are direct reclaiming for contiguous pages and we do not reclaim 1426 * If we are direct reclaiming for contiguous pages and we do not reclaim
1407 * everything in the list, try again and wait for writeback IO to complete. 1427 * everything in the list, try again and wait for writeback IO to complete.
@@ -1423,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1423 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) 1443 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1424 return false; 1444 return false;
1425 1445
1426 /* If we have relaimed everything on the isolated list, no stall */ 1446 /* If we have reclaimed everything on the isolated list, no stall */
1427 if (nr_freed == nr_taken) 1447 if (nr_freed == nr_taken)
1428 return false; 1448 return false;
1429 1449
@@ -1455,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1455 unsigned long nr_taken; 1475 unsigned long nr_taken;
1456 unsigned long nr_anon; 1476 unsigned long nr_anon;
1457 unsigned long nr_file; 1477 unsigned long nr_file;
1478 unsigned long nr_dirty = 0;
1479 unsigned long nr_writeback = 0;
1480 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1458 1481
1459 while (unlikely(too_many_isolated(zone, file, sc))) { 1482 while (unlikely(too_many_isolated(zone, file, sc))) {
1460 congestion_wait(BLK_RW_ASYNC, HZ/10); 1483 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1465,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1465 } 1488 }
1466 1489
1467 set_reclaim_mode(priority, sc, false); 1490 set_reclaim_mode(priority, sc, false);
1491 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1492 reclaim_mode |= ISOLATE_ACTIVE;
1493
1468 lru_add_drain(); 1494 lru_add_drain();
1495
1496 if (!sc->may_unmap)
1497 reclaim_mode |= ISOLATE_UNMAPPED;
1498 if (!sc->may_writepage)
1499 reclaim_mode |= ISOLATE_CLEAN;
1500
1469 spin_lock_irq(&zone->lru_lock); 1501 spin_lock_irq(&zone->lru_lock);
1470 1502
1471 if (scanning_global_lru(sc)) { 1503 if (scanning_global_lru(sc)) {
1472 nr_taken = isolate_pages_global(nr_to_scan, 1504 nr_taken = isolate_pages_global(nr_to_scan, &page_list,
1473 &page_list, &nr_scanned, sc->order, 1505 &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
1474 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1475 ISOLATE_BOTH : ISOLATE_INACTIVE,
1476 zone, 0, file);
1477 zone->pages_scanned += nr_scanned; 1506 zone->pages_scanned += nr_scanned;
1478 if (current_is_kswapd()) 1507 if (current_is_kswapd())
1479 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1508 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1482,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1482 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1511 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1483 nr_scanned); 1512 nr_scanned);
1484 } else { 1513 } else {
1485 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1514 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1486 &page_list, &nr_scanned, sc->order, 1515 &nr_scanned, sc->order, reclaim_mode, zone,
1487 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1516 sc->mem_cgroup, 0, file);
1488 ISOLATE_BOTH : ISOLATE_INACTIVE,
1489 zone, sc->mem_cgroup,
1490 0, file);
1491 /* 1517 /*
1492 * mem_cgroup_isolate_pages() keeps track of 1518 * mem_cgroup_isolate_pages() keeps track of
1493 * scanned pages on its own. 1519 * scanned pages on its own.
@@ -1503,17 +1529,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1503 1529
1504 spin_unlock_irq(&zone->lru_lock); 1530 spin_unlock_irq(&zone->lru_lock);
1505 1531
1506 nr_reclaimed = shrink_page_list(&page_list, zone, sc); 1532 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
1533 &nr_dirty, &nr_writeback);
1507 1534
1508	/* Check if we should synchronously wait for writeback */ 1535	/* Check if we should synchronously wait for writeback */
1509 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1536 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1510 set_reclaim_mode(priority, sc, true); 1537 set_reclaim_mode(priority, sc, true);
1511 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1538 nr_reclaimed += shrink_page_list(&page_list, zone, sc,
1539 priority, &nr_dirty, &nr_writeback);
1512 } 1540 }
1513 1541
1514 if (!scanning_global_lru(sc))
1515 sc->memcg_record->nr_freed[file] += nr_reclaimed;
1516
1517 local_irq_disable(); 1542 local_irq_disable();
1518 if (current_is_kswapd()) 1543 if (current_is_kswapd())
1519 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1544 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1521,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1521 1546
1522 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1547 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1523 1548
1549 /*
1550 * If reclaim is isolating dirty pages under writeback, it implies
1551 * that the long-lived page allocation rate is exceeding the page
1552 * laundering rate. Either the global limits are not being effective
1553 * at throttling processes due to the page distribution throughout
1554 * zones or there is heavy usage of a slow backing device. The
1555 * only option is to throttle from reclaim context which is not ideal
1556 * as there is no guarantee the dirtying process is throttled in the
1557 * same way balance_dirty_pages() manages.
1558 *
1559 * This scales the number of dirty pages that must be under writeback
1560 * before throttling depending on priority. It is a simple backoff
1561 * function that has the most effect in the range DEF_PRIORITY to
1562	 * DEF_PRIORITY-2, which is the range in which reclaim is considered
1563	 * to be in trouble.
1564 *
1565 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
1566 * DEF_PRIORITY-1 50% must be PageWriteback
1567 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
1568 * ...
1569 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1570 * isolated page is PageWriteback
1571 */
1572 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
1573 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1574
1524 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1575 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1525 zone_idx(zone), 1576 zone_idx(zone),
1526 nr_scanned, nr_reclaimed, 1577 nr_scanned, nr_reclaimed,
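
The comment and check added at the end of this hunk implement a simple backoff: throttle once nr_writeback >= nr_taken >> (DEF_PRIORITY - priority). The short worked example below just evaluates that expression for a SWAP_CLUSTER_MAX-sized batch, using the kernel's values of 32 and DEF_PRIORITY 12; remember the kernel additionally requires nr_writeback to be non-zero, so a computed threshold of 0 means "any page under writeback".

#include <stdio.h>

#define DEF_PRIORITY     12
#define SWAP_CLUSTER_MAX 32UL

int main(void)
{
        unsigned long nr_taken = SWAP_CLUSTER_MAX;
        int priority;

        for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--) {
                /* throttle when nr_writeback >= threshold (and != 0) */
                unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);

                printf("priority %2d: throttle once %2lu of %lu isolated "
                       "pages are PageWriteback\n",
                       priority, threshold, nr_taken);
        }
        return 0;
}
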
@@ -1592,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1592 struct page *page; 1643 struct page *page;
1593 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1644 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1594 unsigned long nr_rotated = 0; 1645 unsigned long nr_rotated = 0;
1646 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1595 1647
1596 lru_add_drain(); 1648 lru_add_drain();
1649
1650 if (!sc->may_unmap)
1651 reclaim_mode |= ISOLATE_UNMAPPED;
1652 if (!sc->may_writepage)
1653 reclaim_mode |= ISOLATE_CLEAN;
1654
1597 spin_lock_irq(&zone->lru_lock); 1655 spin_lock_irq(&zone->lru_lock);
1598 if (scanning_global_lru(sc)) { 1656 if (scanning_global_lru(sc)) {
1599 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1657 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1600 &pgscanned, sc->order, 1658 &pgscanned, sc->order,
1601 ISOLATE_ACTIVE, zone, 1659 reclaim_mode, zone,
1602 1, file); 1660 1, file);
1603 zone->pages_scanned += pgscanned; 1661 zone->pages_scanned += pgscanned;
1604 } else { 1662 } else {
1605 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1663 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1606 &pgscanned, sc->order, 1664 &pgscanned, sc->order,
1607 ISOLATE_ACTIVE, zone, 1665 reclaim_mode, zone,
1608 sc->mem_cgroup, 1, file); 1666 sc->mem_cgroup, 1, file);
1609 /* 1667 /*
1610 * mem_cgroup_isolate_pages() keeps track of 1668 * mem_cgroup_isolate_pages() keeps track of
@@ -1613,8 +1671,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1613 } 1671 }
1614 1672
1615 reclaim_stat->recent_scanned[file] += nr_taken; 1673 reclaim_stat->recent_scanned[file] += nr_taken;
1616 if (!scanning_global_lru(sc))
1617 sc->memcg_record->nr_scanned[file] += nr_taken;
1618 1674
1619 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1675 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1620 if (file) 1676 if (file)
@@ -1666,8 +1722,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1666 * get_scan_ratio. 1722 * get_scan_ratio.
1667 */ 1723 */
1668 reclaim_stat->recent_rotated[file] += nr_rotated; 1724 reclaim_stat->recent_rotated[file] += nr_rotated;
1669 if (!scanning_global_lru(sc))
1670 sc->memcg_record->nr_rotated[file] += nr_rotated;
1671 1725
1672 move_active_pages_to_lru(zone, &l_active, 1726 move_active_pages_to_lru(zone, &l_active,
1673 LRU_ACTIVE + file * LRU_FILE); 1727 LRU_ACTIVE + file * LRU_FILE);
@@ -1713,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1713 if (scanning_global_lru(sc)) 1767 if (scanning_global_lru(sc))
1714 low = inactive_anon_is_low_global(zone); 1768 low = inactive_anon_is_low_global(zone);
1715 else 1769 else
1716 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1770 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
1717 return low; 1771 return low;
1718} 1772}
1719#else 1773#else
@@ -1756,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1756 if (scanning_global_lru(sc)) 1810 if (scanning_global_lru(sc))
1757 low = inactive_file_is_low_global(zone); 1811 low = inactive_file_is_low_global(zone);
1758 else 1812 else
1759 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); 1813 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
1760 return low; 1814 return low;
1761} 1815}
1762 1816
@@ -1808,23 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1808 u64 fraction[2], denominator; 1862 u64 fraction[2], denominator;
1809 enum lru_list l; 1863 enum lru_list l;
1810 int noswap = 0; 1864 int noswap = 0;
1811 int force_scan = 0; 1865 bool force_scan = false;
1812 unsigned long nr_force_scan[2];
1813
1814 1866
1815 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1867 /*
1816 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1868 * If the zone or memcg is small, nr[l] can be 0. This
1817 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1869 * results in no scanning on this priority and a potential
1818 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1870 * priority drop. Global direct reclaim can go to the next
1819 1871 * zone and tends to have no problems. Global kswapd is for
1820 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1872 * zone balancing and it needs to scan a minimum amount. When
1821 /* kswapd does zone balancing and need to scan this zone */ 1873 * reclaiming for a memcg, a priority drop can cause high
1822 if (scanning_global_lru(sc) && current_is_kswapd()) 1874 * latencies, so it's better to scan a minimum amount there as
1823 force_scan = 1; 1875 * well.
1824 /* memcg may have small limit and need to avoid priority drop */ 1876 */
1825 if (!scanning_global_lru(sc)) 1877 if (scanning_global_lru(sc) && current_is_kswapd())
1826 force_scan = 1; 1878 force_scan = true;
1827 } 1879 if (!scanning_global_lru(sc))
1880 force_scan = true;
1828 1881
1829 /* If we have no swap space, do not bother scanning anon pages. */ 1882 /* If we have no swap space, do not bother scanning anon pages. */
1830 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1883 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1832,11 +1885,14 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1832 fraction[0] = 0; 1885 fraction[0] = 0;
1833 fraction[1] = 1; 1886 fraction[1] = 1;
1834 denominator = 1; 1887 denominator = 1;
1835 nr_force_scan[0] = 0;
1836 nr_force_scan[1] = SWAP_CLUSTER_MAX;
1837 goto out; 1888 goto out;
1838 } 1889 }
1839 1890
1891 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1892 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1893 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1894 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1895
1840 if (scanning_global_lru(sc)) { 1896 if (scanning_global_lru(sc)) {
1841 free = zone_page_state(zone, NR_FREE_PAGES); 1897 free = zone_page_state(zone, NR_FREE_PAGES);
1842 /* If we have very few page cache pages, 1898 /* If we have very few page cache pages,
@@ -1845,8 +1901,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1845 fraction[0] = 1; 1901 fraction[0] = 1;
1846 fraction[1] = 0; 1902 fraction[1] = 0;
1847 denominator = 1; 1903 denominator = 1;
1848 nr_force_scan[0] = SWAP_CLUSTER_MAX;
1849 nr_force_scan[1] = 0;
1850 goto out; 1904 goto out;
1851 } 1905 }
1852 } 1906 }
@@ -1895,11 +1949,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1895 fraction[0] = ap; 1949 fraction[0] = ap;
1896 fraction[1] = fp; 1950 fraction[1] = fp;
1897 denominator = ap + fp + 1; 1951 denominator = ap + fp + 1;
1898 if (force_scan) {
1899 unsigned long scan = SWAP_CLUSTER_MAX;
1900 nr_force_scan[0] = div64_u64(scan * ap, denominator);
1901 nr_force_scan[1] = div64_u64(scan * fp, denominator);
1902 }
1903out: 1952out:
1904 for_each_evictable_lru(l) { 1953 for_each_evictable_lru(l) {
1905 int file = is_file_lru(l); 1954 int file = is_file_lru(l);
@@ -1908,20 +1957,10 @@ out:
1908 scan = zone_nr_lru_pages(zone, sc, l); 1957 scan = zone_nr_lru_pages(zone, sc, l);
1909 if (priority || noswap) { 1958 if (priority || noswap) {
1910 scan >>= priority; 1959 scan >>= priority;
1960 if (!scan && force_scan)
1961 scan = SWAP_CLUSTER_MAX;
1911 scan = div64_u64(scan * fraction[file], denominator); 1962 scan = div64_u64(scan * fraction[file], denominator);
1912 } 1963 }
1913
1914 /*
1915 * If zone is small or memcg is small, nr[l] can be 0.
1916 * This results no-scan on this priority and priority drop down.
1917 * For global direct reclaim, it can visit next zone and tend
1918 * not to have problems. For global kswapd, it's for zone
1919 * balancing and it need to scan a small amounts. When using
1920 * memcg, priority drop can cause big latency. So, it's better
1921 * to scan small amount. See may_noscan above.
1922 */
1923 if (!scan && force_scan)
1924 scan = nr_force_scan[file];
1925 nr[l] = scan; 1964 nr[l] = scan;
1926 } 1965 }
1927} 1966}
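
The reworked get_scan_count() above drops the nr_force_scan[] bookkeeping: the force_scan case is now handled by bumping a zero scan target up to SWAP_CLUSTER_MAX before the anon/file proportional split. The sketch below reproduces only that arithmetic in userspace; the LRU size, fraction and denominator are made-up inputs, div64_u64() is replaced by plain 64-bit division, and the noswap special case is omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32ULL

static uint64_t scan_target(uint64_t lru_pages, int priority, bool force_scan,
                            uint64_t fraction, uint64_t denominator)
{
        uint64_t scan = lru_pages;

        if (priority) {
                scan >>= priority;
                if (!scan && force_scan)
                        scan = SWAP_CLUSTER_MAX;
                scan = scan * fraction / denominator;  /* div64_u64() in the kernel */
        }
        return scan;
}

int main(void)
{
        /* A small memcg LRU of 1000 pages at priority 12: 1000 >> 12 == 0. */
        printf("without force_scan: %llu pages\n",
               (unsigned long long)scan_target(1000, 12, false, 3, 4));
        printf("with    force_scan: %llu pages\n",
               (unsigned long long)scan_target(1000, 12, true, 3, 4));
        return 0;
}
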
@@ -2000,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone,
2000 enum lru_list l; 2039 enum lru_list l;
2001 unsigned long nr_reclaimed, nr_scanned; 2040 unsigned long nr_reclaimed, nr_scanned;
2002 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2041 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2042 struct blk_plug plug;
2003 2043
2004restart: 2044restart:
2005 nr_reclaimed = 0; 2045 nr_reclaimed = 0;
2006 nr_scanned = sc->nr_scanned; 2046 nr_scanned = sc->nr_scanned;
2007 get_scan_count(zone, sc, nr, priority); 2047 get_scan_count(zone, sc, nr, priority);
2008 2048
2049 blk_start_plug(&plug);
2009 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2050 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2010 nr[LRU_INACTIVE_FILE]) { 2051 nr[LRU_INACTIVE_FILE]) {
2011 for_each_evictable_lru(l) { 2052 for_each_evictable_lru(l) {
@@ -2029,6 +2070,7 @@ restart:
2029 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2070 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
2030 break; 2071 break;
2031 } 2072 }
2073 blk_finish_plug(&plug);
2032 sc->nr_reclaimed += nr_reclaimed; 2074 sc->nr_reclaimed += nr_reclaimed;
2033 2075
2034 /* 2076 /*
@@ -2061,14 +2103,19 @@ restart:
2061 * 2103 *
2062 * If a zone is deemed to be full of pinned pages then just give it a light 2104 * If a zone is deemed to be full of pinned pages then just give it a light
2063 * scan then give up on it. 2105 * scan then give up on it.
2106 *
2107 * This function returns true if a zone is being reclaimed for a costly
2108 * high-order allocation and compaction is either ready to begin or deferred.
2109 * This indicates to the caller that it should retry the allocation or fail.
2064 */ 2110 */
2065static void shrink_zones(int priority, struct zonelist *zonelist, 2111static bool shrink_zones(int priority, struct zonelist *zonelist,
2066 struct scan_control *sc) 2112 struct scan_control *sc)
2067{ 2113{
2068 struct zoneref *z; 2114 struct zoneref *z;
2069 struct zone *zone; 2115 struct zone *zone;
2070 unsigned long nr_soft_reclaimed; 2116 unsigned long nr_soft_reclaimed;
2071 unsigned long nr_soft_scanned; 2117 unsigned long nr_soft_scanned;
2118 bool should_abort_reclaim = false;
2072 2119
2073 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2120 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2074 gfp_zone(sc->gfp_mask), sc->nodemask) { 2121 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2083,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2083 continue; 2130 continue;
2084 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2131 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2085 continue; /* Let kswapd poll it */ 2132 continue; /* Let kswapd poll it */
2133 if (COMPACTION_BUILD) {
2134 /*
2135 * If we already have plenty of memory free for
2136 * compaction in this zone, don't free any more.
2137 * Even though compaction is invoked for any
2138 * non-zero order, only frequent costly order
2139 * reclamation is disruptive enough to become a
 2140		 * noticeable problem, like transparent huge page
2141 * allocations.
2142 */
2143 if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2144 (compaction_suitable(zone, sc->order) ||
2145 compaction_deferred(zone))) {
2146 should_abort_reclaim = true;
2147 continue;
2148 }
2149 }
2086 /* 2150 /*
2087 * This steals pages from memory cgroups over softlimit 2151 * This steals pages from memory cgroups over softlimit
2088 * and returns the number of reclaimed pages and 2152 * and returns the number of reclaimed pages and
@@ -2100,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2100 2164
2101 shrink_zone(priority, zone, sc); 2165 shrink_zone(priority, zone, sc);
2102 } 2166 }
2167
2168 return should_abort_reclaim;
2103} 2169}
2104 2170
2105static bool zone_reclaimable(struct zone *zone) 2171static bool zone_reclaimable(struct zone *zone)
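
The COMPACTION_BUILD block added to shrink_zones() above, together with the new boolean return value, lets direct reclaim bail out early when a costly high-order allocation could be satisfied by compaction instead of more reclaim. The sketch below reduces compaction_suitable()/compaction_deferred() to booleans supplied by the caller, which is an invention for illustration; PAGE_ALLOC_COSTLY_ORDER is 3 as in the kernel.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3

/* Mirrors the early-abort test: skip reclaiming this zone and tell the
 * caller to abort when compaction can (or will shortly) take over. */
static bool skip_zone_for_compaction(int order, bool compaction_built,
                                     bool suitable, bool deferred)
{
        return compaction_built &&
               order > PAGE_ALLOC_COSTLY_ORDER &&
               (suitable || deferred);
}

int main(void)
{
        /* e.g. a transparent-huge-page-sized request with compaction ready */
        printf("order 9, compaction ready: skip=%d\n",
               skip_zone_for_compaction(9, true, true, false));
        /* low orders never trigger the early abort */
        printf("order 1, compaction ready: skip=%d\n",
               skip_zone_for_compaction(1, true, true, false));
        return 0;
}
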
@@ -2164,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2164 sc->nr_scanned = 0; 2230 sc->nr_scanned = 0;
2165 if (!priority) 2231 if (!priority)
2166 disable_swap_token(sc->mem_cgroup); 2232 disable_swap_token(sc->mem_cgroup);
2167 shrink_zones(priority, zonelist, sc); 2233 if (shrink_zones(priority, zonelist, sc))
2234 break;
2235
2168 /* 2236 /*
2169 * Don't shrink slabs when reclaiming memory from 2237 * Don't shrink slabs when reclaiming memory from
2170 * over limit cgroups 2238 * over limit cgroups
@@ -2198,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2198 */ 2266 */
2199 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2267 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2200 if (total_scanned > writeback_threshold) { 2268 if (total_scanned > writeback_threshold) {
2201 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 2269 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2270 WB_REASON_TRY_TO_FREE_PAGES);
2202 sc->may_writepage = 1; 2271 sc->may_writepage = 1;
2203 } 2272 }
2204 2273
@@ -2268,10 +2337,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2268#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2337#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2269 2338
2270unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2339unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2271 gfp_t gfp_mask, bool noswap, 2340 gfp_t gfp_mask, bool noswap,
2272 struct zone *zone, 2341 struct zone *zone,
2273 struct memcg_scanrecord *rec, 2342 unsigned long *nr_scanned)
2274 unsigned long *scanned)
2275{ 2343{
2276 struct scan_control sc = { 2344 struct scan_control sc = {
2277 .nr_scanned = 0, 2345 .nr_scanned = 0,
@@ -2281,9 +2349,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2281 .may_swap = !noswap, 2349 .may_swap = !noswap,
2282 .order = 0, 2350 .order = 0,
2283 .mem_cgroup = mem, 2351 .mem_cgroup = mem,
2284 .memcg_record = rec,
2285 }; 2352 };
2286 unsigned long start, end;
2287 2353
2288 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2354 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2289 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2355 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2292,7 +2358,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2292 sc.may_writepage, 2358 sc.may_writepage,
2293 sc.gfp_mask); 2359 sc.gfp_mask);
2294 2360
2295 start = sched_clock();
2296 /* 2361 /*
2297 * NOTE: Although we can get the priority field, using it 2362 * NOTE: Although we can get the priority field, using it
2298 * here is not a good idea, since it limits the pages we can scan. 2363 * here is not a good idea, since it limits the pages we can scan.
@@ -2301,25 +2366,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2301 * the priority and make it zero. 2366 * the priority and make it zero.
2302 */ 2367 */
2303 shrink_zone(0, zone, &sc); 2368 shrink_zone(0, zone, &sc);
2304 end = sched_clock();
2305
2306 if (rec)
2307 rec->elapsed += end - start;
2308 *scanned = sc.nr_scanned;
2309 2369
2310 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2370 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2311 2371
2372 *nr_scanned = sc.nr_scanned;
2312 return sc.nr_reclaimed; 2373 return sc.nr_reclaimed;
2313} 2374}
2314 2375
2315unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2376unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2316 gfp_t gfp_mask, 2377 gfp_t gfp_mask,
2317 bool noswap, 2378 bool noswap)
2318 struct memcg_scanrecord *rec)
2319{ 2379{
2320 struct zonelist *zonelist; 2380 struct zonelist *zonelist;
2321 unsigned long nr_reclaimed; 2381 unsigned long nr_reclaimed;
2322 unsigned long start, end;
2323 int nid; 2382 int nid;
2324 struct scan_control sc = { 2383 struct scan_control sc = {
2325 .may_writepage = !laptop_mode, 2384 .may_writepage = !laptop_mode,
@@ -2328,7 +2387,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2328 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2387 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2329 .order = 0, 2388 .order = 0,
2330 .mem_cgroup = mem_cont, 2389 .mem_cgroup = mem_cont,
2331 .memcg_record = rec,
2332 .nodemask = NULL, /* we don't care the placement */ 2390 .nodemask = NULL, /* we don't care the placement */
2333 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2391 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2334 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2392 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2337,7 +2395,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2337 .gfp_mask = sc.gfp_mask, 2395 .gfp_mask = sc.gfp_mask,
2338 }; 2396 };
2339 2397
2340 start = sched_clock();
2341 /* 2398 /*
2342 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't 2399 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2343 * take care of from where we get pages. So the node where we start the 2400 * take care of from where we get pages. So the node where we start the
@@ -2352,9 +2409,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2352 sc.gfp_mask); 2409 sc.gfp_mask);
2353 2410
2354 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2411 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2355 end = sched_clock();
2356 if (rec)
2357 rec->elapsed += end - start;
2358 2412
2359 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2413 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2360 2414
@@ -2529,6 +2583,9 @@ loop_again:
2529 high_wmark_pages(zone), 0, 0)) { 2583 high_wmark_pages(zone), 0, 0)) {
2530 end_zone = i; 2584 end_zone = i;
2531 break; 2585 break;
2586 } else {
2587 /* If balanced, clear the congested flag */
2588 zone_clear_flag(zone, ZONE_CONGESTED);
2532 } 2589 }
2533 } 2590 }
2534 if (i < 0) 2591 if (i < 0)
@@ -2719,6 +2776,8 @@ out:
2719 2776
2720 /* If balanced, clear the congested flag */ 2777 /* If balanced, clear the congested flag */
2721 zone_clear_flag(zone, ZONE_CONGESTED); 2778 zone_clear_flag(zone, ZONE_CONGESTED);
2779 if (i <= *classzone_idx)
2780 balanced += zone->present_pages;
2722 } 2781 }
2723 } 2782 }
2724 2783
@@ -2792,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2792static int kswapd(void *p) 2851static int kswapd(void *p)
2793{ 2852{
2794 unsigned long order, new_order; 2853 unsigned long order, new_order;
2854 unsigned balanced_order;
2795 int classzone_idx, new_classzone_idx; 2855 int classzone_idx, new_classzone_idx;
2856 int balanced_classzone_idx;
2796 pg_data_t *pgdat = (pg_data_t*)p; 2857 pg_data_t *pgdat = (pg_data_t*)p;
2797 struct task_struct *tsk = current; 2858 struct task_struct *tsk = current;
2798 2859
@@ -2823,7 +2884,9 @@ static int kswapd(void *p)
2823 set_freezable(); 2884 set_freezable();
2824 2885
2825 order = new_order = 0; 2886 order = new_order = 0;
2887 balanced_order = 0;
2826 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2888 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2889 balanced_classzone_idx = classzone_idx;
2827 for ( ; ; ) { 2890 for ( ; ; ) {
2828 int ret; 2891 int ret;
2829 2892
@@ -2832,7 +2895,8 @@ static int kswapd(void *p)
2832 * new request of a similar or harder type will succeed soon 2895 * new request of a similar or harder type will succeed soon
2833 * so consider going to sleep on the basis we reclaimed at 2896 * so consider going to sleep on the basis we reclaimed at
2834 */ 2897 */
2835 if (classzone_idx >= new_classzone_idx && order == new_order) { 2898 if (balanced_classzone_idx >= new_classzone_idx &&
2899 balanced_order == new_order) {
2836 new_order = pgdat->kswapd_max_order; 2900 new_order = pgdat->kswapd_max_order;
2837 new_classzone_idx = pgdat->classzone_idx; 2901 new_classzone_idx = pgdat->classzone_idx;
2838 pgdat->kswapd_max_order = 0; 2902 pgdat->kswapd_max_order = 0;
@@ -2847,9 +2911,12 @@ static int kswapd(void *p)
2847 order = new_order; 2911 order = new_order;
2848 classzone_idx = new_classzone_idx; 2912 classzone_idx = new_classzone_idx;
2849 } else { 2913 } else {
2850 kswapd_try_to_sleep(pgdat, order, classzone_idx); 2914 kswapd_try_to_sleep(pgdat, balanced_order,
2915 balanced_classzone_idx);
2851 order = pgdat->kswapd_max_order; 2916 order = pgdat->kswapd_max_order;
2852 classzone_idx = pgdat->classzone_idx; 2917 classzone_idx = pgdat->classzone_idx;
2918 new_order = order;
2919 new_classzone_idx = classzone_idx;
2853 pgdat->kswapd_max_order = 0; 2920 pgdat->kswapd_max_order = 0;
2854 pgdat->classzone_idx = pgdat->nr_zones - 1; 2921 pgdat->classzone_idx = pgdat->nr_zones - 1;
2855 } 2922 }
@@ -2864,7 +2931,9 @@ static int kswapd(void *p)
2864 */ 2931 */
2865 if (!ret) { 2932 if (!ret) {
2866 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2933 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2867 order = balance_pgdat(pgdat, order, &classzone_idx); 2934 balanced_classzone_idx = classzone_idx;
2935 balanced_order = balance_pgdat(pgdat, order,
2936 &balanced_classzone_idx);
2868 } 2937 }
2869 } 2938 }
2870 return 0; 2939 return 0;
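
The kswapd changes above make the sleep/wakeup logic compare any pending request against the order and classzone index that balance_pgdat() actually managed to balance, rather than the ones that were merely requested. The snippet below only evaluates that comparison for two cases; the zone index 2 is an arbitrary illustrative value.

#include <stdbool.h>
#include <stdio.h>

/* The test at the top of kswapd's loop: did the completed balancing cover
 * the pending request, so a fresh one may be picked up from the pgdat? */
static bool balanced_covers_request(int balanced_order, int balanced_classzone_idx,
                                    int new_order, int new_classzone_idx)
{
        return balanced_classzone_idx >= new_classzone_idx &&
               balanced_order == new_order;
}

int main(void)
{
        /* asked for order 3 up to zone index 2, but only order 0 was balanced */
        printf("covered=%d\n", balanced_covers_request(0, 2, 3, 2));
        /* the request was balanced exactly as asked */
        printf("covered=%d\n", balanced_covers_request(3, 2, 3, 2));
        return 0;
}
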
@@ -3376,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
3376 3445
3377} 3446}
3378 3447
3379/** 3448static void warn_scan_unevictable_pages(void)
3380 * scan_zone_unevictable_pages - check unevictable list for evictable pages
3381 * @zone - zone of which to scan the unevictable list
3382 *
3383 * Scan @zone's unevictable LRU lists to check for pages that have become
3384 * evictable. Move those that have to @zone's inactive list where they
3385 * become candidates for reclaim, unless shrink_inactive_zone() decides
3386 * to reactivate them. Pages that are still unevictable are rotated
3387 * back onto @zone's unevictable list.
3388 */
3389#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
3390static void scan_zone_unevictable_pages(struct zone *zone)
3391{ 3449{
3392 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 3450 printk_once(KERN_WARNING
3393 unsigned long scan; 3451 "The scan_unevictable_pages sysctl/node-interface has been "
3394 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); 3452 "disabled for lack of a legitimate use case. If you have "
3395 3453 "one, please send an email to linux-mm@kvack.org.\n");
3396 while (nr_to_scan > 0) {
3397 unsigned long batch_size = min(nr_to_scan,
3398 SCAN_UNEVICTABLE_BATCH_SIZE);
3399
3400 spin_lock_irq(&zone->lru_lock);
3401 for (scan = 0; scan < batch_size; scan++) {
3402 struct page *page = lru_to_page(l_unevictable);
3403
3404 if (!trylock_page(page))
3405 continue;
3406
3407 prefetchw_prev_lru_page(page, l_unevictable, flags);
3408
3409 if (likely(PageLRU(page) && PageUnevictable(page)))
3410 check_move_unevictable_page(page, zone);
3411
3412 unlock_page(page);
3413 }
3414 spin_unlock_irq(&zone->lru_lock);
3415
3416 nr_to_scan -= batch_size;
3417 }
3418}
3419
3420
3421/**
3422 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
3423 *
3424 * A really big hammer: scan all zones' unevictable LRU lists to check for
3425 * pages that have become evictable. Move those back to the zones'
3426 * inactive list where they become candidates for reclaim.
3427 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
3428 * and we add swap to the system. As such, it runs in the context of a task
3429 * that has possibly/probably made some previously unevictable pages
3430 * evictable.
3431 */
3432static void scan_all_zones_unevictable_pages(void)
3433{
3434 struct zone *zone;
3435
3436 for_each_zone(zone) {
3437 scan_zone_unevictable_pages(zone);
3438 }
3439} 3454}
3440 3455
3441/* 3456/*
@@ -3448,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
3448 void __user *buffer, 3463 void __user *buffer,
3449 size_t *length, loff_t *ppos) 3464 size_t *length, loff_t *ppos)
3450{ 3465{
3466 warn_scan_unevictable_pages();
3451 proc_doulongvec_minmax(table, write, buffer, length, ppos); 3467 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3452
3453 if (write && *(unsigned long *)table->data)
3454 scan_all_zones_unevictable_pages();
3455
3456 scan_unevictable_pages = 0; 3468 scan_unevictable_pages = 0;
3457 return 0; 3469 return 0;
3458} 3470}
@@ -3467,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
3467 struct sysdev_attribute *attr, 3479 struct sysdev_attribute *attr,
3468 char *buf) 3480 char *buf)
3469{ 3481{
3482 warn_scan_unevictable_pages();
3470 return sprintf(buf, "0\n"); /* always zero; should fit... */ 3483 return sprintf(buf, "0\n"); /* always zero; should fit... */
3471} 3484}
3472 3485
@@ -3474,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
3474 struct sysdev_attribute *attr, 3487 struct sysdev_attribute *attr,
3475 const char *buf, size_t count) 3488 const char *buf, size_t count)
3476{ 3489{
3477 struct zone *node_zones = NODE_DATA(dev->id)->node_zones; 3490 warn_scan_unevictable_pages();
3478 struct zone *zone;
3479 unsigned long res;
3480 unsigned long req = strict_strtoul(buf, 10, &res);
3481
3482 if (!req)
3483 return 1; /* zero is no-op */
3484
3485 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
3486 if (!populated_zone(zone))
3487 continue;
3488 scan_zone_unevictable_pages(zone);
3489 }
3490 return 1; 3491 return 1;
3491} 3492}
3492 3493
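
The hunks above gut the scan_unevictable_pages sysctl and sysfs knobs, leaving only a one-time deprecation warning issued through printk_once(). The userspace sketch below shows the same warn-once pattern with a static flag; it is an analogue of printk_once(), not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

static void warn_scan_unevictable_pages(void)
{
        static bool warned;   /* printk_once() keeps similar one-shot state */

        if (!warned) {
                warned = true;
                fprintf(stderr,
                        "scan_unevictable_pages is deprecated and is now a no-op\n");
        }
}

int main(void)
{
        warn_scan_unevictable_pages();
        warn_scan_unevictable_pages();   /* second call is silent */
        return 0;
}
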
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b2..8fd603b1665e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
78 * 78 *
79 * vm_stat contains the global counters 79 * vm_stat contains the global counters
80 */ 80 */
81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
82EXPORT_SYMBOL(vm_stat); 82EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
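
The first vmstat.c hunk aligns the global vm_stat[] counters to a cache line with __cacheline_aligned_in_smp so that frequent cross-CPU updates do not false-share a line with neighbouring data. A userspace analogue using C11 alignas is sketched below; the 64-byte line size and the array length are assumptions for illustration, while the kernel macro expands to a compiler alignment attribute sized for the architecture.

#include <stdalign.h>
#include <stdio.h>

#define CACHE_LINE 64   /* assumed cache line size */

struct stats {
        alignas(CACHE_LINE) long vm_stat[40];   /* hot, written from many CPUs */
};

int main(void)
{
        printf("vm_stat block: %zu bytes, aligned to %zu\n",
               sizeof(struct stats), alignof(struct stats));
        return 0;
}
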
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
659} 659}
660#endif 660#endif
661 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) 662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
663#ifdef CONFIG_ZONE_DMA 663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma", 664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else 665#else
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = {
702 "nr_unstable", 702 "nr_unstable",
703 "nr_bounce", 703 "nr_bounce",
704 "nr_vmscan_write", 704 "nr_vmscan_write",
705 "nr_vmscan_immediate_reclaim",
705 "nr_writeback_temp", 706 "nr_writeback_temp",
706 "nr_isolated_anon", 707 "nr_isolated_anon",
707 "nr_isolated_file", 708 "nr_isolated_file",
@@ -788,7 +789,7 @@ const char * const vmstat_text[] = {
788 789
789#endif /* CONFIG_VM_EVENTS_COUNTERS */ 790#endif /* CONFIG_VM_EVENTS_COUNTERS */
790}; 791};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ 792#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
792 793
793 794
794#ifdef CONFIG_PROC_FS 795#ifdef CONFIG_PROC_FS