Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig  5
-rw-r--r--  mm/Makefile  3
-rw-r--r--  mm/backing-dev.c  127
-rw-r--r--  mm/bootmem.c  2
-rw-r--r--  mm/bounce.c  11
-rw-r--r--  mm/compaction.c  26
-rw-r--r--  mm/debug-pagealloc.c  56
-rw-r--r--  mm/dmapool.c  5
-rw-r--r--  mm/failslab.c  39
-rw-r--r--  mm/filemap.c  140
-rw-r--r--  mm/filemap_xip.c  2
-rw-r--r--  mm/fremap.c  1
-rw-r--r--  mm/highmem.c  8
-rw-r--r--  mm/huge_memory.c  97
-rw-r--r--  mm/hugetlb.c  43
-rw-r--r--  mm/init-mm.c  2
-rw-r--r--  mm/internal.h  46
-rw-r--r--  mm/kmemleak.c  4
-rw-r--r--  mm/ksm.c  3
-rw-r--r--  mm/maccess.c  2
-rw-r--r--  mm/madvise.c  2
-rw-r--r--  mm/memblock.c  17
-rw-r--r--  mm/memcontrol.c  1373
-rw-r--r--  mm/memory-failure.c  105
-rw-r--r--  mm/memory.c  129
-rw-r--r--  mm/memory_hotplug.c  70
-rw-r--r--  mm/mempolicy.c  29
-rw-r--r--  mm/mempool.c  2
-rw-r--r--  mm/migrate.c  85
-rw-r--r--  mm/mincore.c  11
-rw-r--r--  mm/mlock.c  15
-rw-r--r--  mm/mm_init.c  2
-rw-r--r--  mm/mmap.c  45
-rw-r--r--  mm/mmu_context.c  2
-rw-r--r--  mm/mmu_notifier.c  2
-rw-r--r--  mm/mmzone.c  1
-rw-r--r--  mm/mremap.c  42
-rw-r--r--  mm/nobootmem.c  2
-rw-r--r--  mm/nommu.c  41
-rw-r--r--  mm/oom_kill.c  61
-rw-r--r--  mm/page-writeback.c  915
-rw-r--r--  mm/page_alloc.c  135
-rw-r--r--  mm/page_cgroup.c  22
-rw-r--r--  mm/pagewalk.c  49
-rw-r--r--  mm/process_vm_access.c  496
-rw-r--r--  mm/quicklist.c  1
-rw-r--r--  mm/readahead.c  2
-rw-r--r--  mm/rmap.c  15
-rw-r--r--  mm/shmem.c  1827
-rw-r--r--  mm/slab.c  135
-rw-r--r--  mm/slob.c  10
-rw-r--r--  mm/slub.c  1278
-rw-r--r--  mm/sparse-vmemmap.c  1
-rw-r--r--  mm/sparse.c  4
-rw-r--r--  mm/swap.c  85
-rw-r--r--  mm/swap_state.c  1
-rw-r--r--  mm/swapfile.c  52
-rw-r--r--  mm/thrash.c  19
-rw-r--r--  mm/truncate.c  156
-rw-r--r--  mm/util.c  2
-rw-r--r--  mm/vmalloc.c  126
-rw-r--r--  mm/vmscan.c  454
-rw-r--r--  mm/vmstat.c  7
63 files changed, 5101 insertions, 3349 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 7c5697116fcf..e338407f1225 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP
137config ARCH_DISCARD_MEMBLOCK 137config ARCH_DISCARD_MEMBLOCK
138 boolean 138 boolean
139 139
140config NO_BOOTMEM
141 boolean
142
140# eventually, we can have this option just 'select SPARSEMEM' 143# eventually, we can have this option just 'select SPARSEMEM'
141config MEMORY_HOTPLUG 144config MEMORY_HOTPLUG
142 bool "Allow for memory hot-add" 145 bool "Allow for memory hot-add"
@@ -362,7 +365,7 @@ config CLEANCACHE
362 for clean pages that the kernel's pageframe replacement algorithm 365 for clean pages that the kernel's pageframe replacement algorithm
363 (PFRA) would like to keep around, but can't since there isn't enough 366 (PFRA) would like to keep around, but can't since there isn't enough
364 memory. So when the PFRA "evicts" a page, it first attempts to use 367 memory. So when the PFRA "evicts" a page, it first attempts to use
365 cleancacne code to put the data contained in that page into 368 cleancache code to put the data contained in that page into
366 "transcendent memory", memory that is not directly accessible or 369 "transcendent memory", memory that is not directly accessible or
367 addressable by the kernel and is of unknown and possibly 370 addressable by the kernel and is of unknown and possibly
368 time-varying size. And when a cleancache-enabled 371 time-varying size. And when a cleancache-enabled
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1bf..50ec00ef2a0e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,8 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o 8 vmalloc.o pagewalk.o pgtable-generic.o \
9 process_vm_access.o
9 10
10obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 11obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 12 maccess.o page_alloc.o page-writeback.o \
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09a..71034f41a2ba 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
45static int bdi_sync_supers(void *); 45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long); 46static void sync_supers_timer_fn(unsigned long);
47 47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{
50 if (wb1 < wb2) {
51 spin_lock(&wb1->list_lock);
52 spin_lock_nested(&wb2->list_lock, 1);
53 } else {
54 spin_lock(&wb2->list_lock);
55 spin_lock_nested(&wb1->list_lock, 1);
56 }
57}
58
48#ifdef CONFIG_DEBUG_FS 59#ifdef CONFIG_DEBUG_FS
49#include <linux/debugfs.h> 60#include <linux/debugfs.h>
50#include <linux/seq_file.h> 61#include <linux/seq_file.h>
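The bdi_lock_two() helper added in the hunk above always takes the two list_locks in ascending address order, so every pair of callers nests them identically and an ABBA deadlock is impossible; spin_lock_nested(..., 1) only tells lockdep that acquiring a second lock of the same class is deliberate. bdi_destroy() later in this patch relies on exactly this helper before splicing lists between the two writeback structures. A minimal user-space sketch of the same address-ordering idea, using pthreads and illustrative names (struct wb, lock_two), not kernel API:

    #include <pthread.h>
    #include <stdint.h>

    struct wb {
            pthread_mutex_t list_lock;
    };

    /* Lock the lower-addressed structure first so that two threads locking
     * the same pair in opposite argument order can never deadlock. */
    static void lock_two(struct wb *wb1, struct wb *wb2)
    {
            if ((uintptr_t)wb1 < (uintptr_t)wb2) {
                    pthread_mutex_lock(&wb1->list_lock);
                    pthread_mutex_lock(&wb2->list_lock);
            } else {
                    pthread_mutex_lock(&wb2->list_lock);
                    pthread_mutex_lock(&wb1->list_lock);
            }
    }

    static void unlock_two(struct wb *wb1, struct wb *wb2)
    {
            pthread_mutex_unlock(&wb1->list_lock);
            pthread_mutex_unlock(&wb2->list_lock);
    }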
@@ -67,34 +78,44 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
67 struct inode *inode; 78 struct inode *inode;
68 79
69 nr_dirty = nr_io = nr_more_io = 0; 80 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 81 spin_lock(&wb->list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 83 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 85 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 87 nr_more_io++;
77 spin_unlock(&inode_wb_list_lock); 88 spin_unlock(&wb->list_lock);
78 89
79 global_dirty_limits(&background_thresh, &dirty_thresh); 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92
82#define K(x) ((x) << (PAGE_SHIFT - 10)) 93#define K(x) ((x) << (PAGE_SHIFT - 10))
83 seq_printf(m, 94 seq_printf(m,
84 "BdiWriteback: %8lu kB\n" 95 "BdiWriteback: %10lu kB\n"
85 "BdiReclaimable: %8lu kB\n" 96 "BdiReclaimable: %10lu kB\n"
86 "BdiDirtyThresh: %8lu kB\n" 97 "BdiDirtyThresh: %10lu kB\n"
87 "DirtyThresh: %8lu kB\n" 98 "DirtyThresh: %10lu kB\n"
88 "BackgroundThresh: %8lu kB\n" 99 "BackgroundThresh: %10lu kB\n"
89 "b_dirty: %8lu\n" 100 "BdiDirtied: %10lu kB\n"
90 "b_io: %8lu\n" 101 "BdiWritten: %10lu kB\n"
91 "b_more_io: %8lu\n" 102 "BdiWriteBandwidth: %10lu kBps\n"
92 "bdi_list: %8u\n" 103 "b_dirty: %10lu\n"
93 "state: %8lx\n", 104 "b_io: %10lu\n"
105 "b_more_io: %10lu\n"
106 "bdi_list: %10u\n"
107 "state: %10lx\n",
94 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 108 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 109 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96 K(bdi_thresh), K(dirty_thresh), 110 K(bdi_thresh),
97 K(background_thresh), nr_dirty, nr_io, nr_more_io, 111 K(dirty_thresh),
112 K(background_thresh),
113 (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
114 (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
115 (unsigned long) K(bdi->write_bandwidth),
116 nr_dirty,
117 nr_io,
118 nr_more_io,
98 !list_empty(&bdi->bdi_list), bdi->state); 119 !list_empty(&bdi->bdi_list), bdi->state);
99#undef K 120#undef K
100 121
@@ -249,18 +270,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
249 return wb_has_dirty_io(&bdi->wb); 270 return wb_has_dirty_io(&bdi->wb);
250} 271}
251 272
252static void bdi_flush_io(struct backing_dev_info *bdi)
253{
254 struct writeback_control wbc = {
255 .sync_mode = WB_SYNC_NONE,
256 .older_than_this = NULL,
257 .range_cyclic = 1,
258 .nr_to_write = 1024,
259 };
260
261 writeback_inodes_wb(&bdi->wb, &wbc);
262}
263
264/* 273/*
265 * kupdated() used to do this. We cannot do it from the bdi_forker_thread() 274 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 * or we risk deadlocking on ->s_umount. The longer term solution would be 275 * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -352,6 +361,17 @@ static unsigned long bdi_longest_inactive(void)
352 return max(5UL * 60 * HZ, interval); 361 return max(5UL * 60 * HZ, interval);
353} 362}
354 363
364/*
365 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
366 * shutdown
367 */
368static void bdi_clear_pending(struct backing_dev_info *bdi)
369{
370 clear_bit(BDI_pending, &bdi->state);
371 smp_mb__after_clear_bit();
372 wake_up_bit(&bdi->state, BDI_pending);
373}
374
355static int bdi_forker_thread(void *ptr) 375static int bdi_forker_thread(void *ptr)
356{ 376{
357 struct bdi_writeback *me = ptr; 377 struct bdi_writeback *me = ptr;
@@ -383,6 +403,12 @@ static int bdi_forker_thread(void *ptr)
383 } 403 }
384 404
385 spin_lock_bh(&bdi_lock); 405 spin_lock_bh(&bdi_lock);
406 /*
407 * In the following loop we are going to check whether we have
408 * some work to do without any synchronization with tasks
409 * waking us up to do work for them. Set the task state here
410 * so that we don't miss wakeups after verifying conditions.
411 */
386 set_current_state(TASK_INTERRUPTIBLE); 412 set_current_state(TASK_INTERRUPTIBLE);
387 413
388 list_for_each_entry(bdi, &bdi_list, bdi_list) { 414 list_for_each_entry(bdi, &bdi_list, bdi_list) {
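The comment added above documents the classic lost-wakeup ordering: the forker thread must mark itself TASK_INTERRUPTIBLE before it tests whether any bdi needs work, so a waker that sets the condition and calls wake_up after the test still flips the thread back to runnable instead of the wakeup being lost. User-space code usually avoids the same hazard by testing the condition under the mutex paired with a condition variable; a small sketch with illustrative names, not a translation of the kernel code:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static bool have_work;

    /* The condition test and the sleep are atomic with respect to the
     * waker, which modifies have_work under the same mutex. */
    static void wait_for_work(void)
    {
            pthread_mutex_lock(&lock);
            while (!have_work)
                    pthread_cond_wait(&cond, &lock);
            have_work = false;
            pthread_mutex_unlock(&lock);
    }

    static void queue_work(void)
    {
            pthread_mutex_lock(&lock);
            have_work = true;
            pthread_cond_signal(&cond);
            pthread_mutex_unlock(&lock);
    }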
@@ -446,9 +472,11 @@ static int bdi_forker_thread(void *ptr)
446 if (IS_ERR(task)) { 472 if (IS_ERR(task)) {
447 /* 473 /*
448 * If thread creation fails, force writeout of 474 * If thread creation fails, force writeout of
449 * the bdi from the thread. 475 * the bdi from the thread. Hopefully 1024 is
476 * large enough for efficient IO.
450 */ 477 */
451 bdi_flush_io(bdi); 478 writeback_inodes_wb(&bdi->wb, 1024,
479 WB_REASON_FORKER_THREAD);
452 } else { 480 } else {
453 /* 481 /*
454 * The spinlock makes sure we do not lose 482 * The spinlock makes sure we do not lose
@@ -461,11 +489,13 @@ static int bdi_forker_thread(void *ptr)
461 spin_unlock_bh(&bdi->wb_lock); 489 spin_unlock_bh(&bdi->wb_lock);
462 wake_up_process(task); 490 wake_up_process(task);
463 } 491 }
492 bdi_clear_pending(bdi);
464 break; 493 break;
465 494
466 case KILL_THREAD: 495 case KILL_THREAD:
467 __set_current_state(TASK_RUNNING); 496 __set_current_state(TASK_RUNNING);
468 kthread_stop(task); 497 kthread_stop(task);
498 bdi_clear_pending(bdi);
469 break; 499 break;
470 500
471 case NO_ACTION: 501 case NO_ACTION:
@@ -481,16 +511,8 @@ static int bdi_forker_thread(void *ptr)
481 else 511 else
482 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 512 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
483 try_to_freeze(); 513 try_to_freeze();
484 /* Back to the main loop */ 514 break;
485 continue;
486 } 515 }
487
488 /*
489 * Clear pending bit and wakeup anybody waiting to tear us down.
490 */
491 clear_bit(BDI_pending, &bdi->state);
492 smp_mb__after_clear_bit();
493 wake_up_bit(&bdi->state, BDI_pending);
494 } 516 }
495 517
496 return 0; 518 return 0;
@@ -505,7 +527,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
505 list_del_rcu(&bdi->bdi_list); 527 list_del_rcu(&bdi->bdi_list);
506 spin_unlock_bh(&bdi_lock); 528 spin_unlock_bh(&bdi_lock);
507 529
508 synchronize_rcu(); 530 synchronize_rcu_expedited();
509} 531}
510 532
511int bdi_register(struct backing_dev_info *bdi, struct device *parent, 533int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -606,6 +628,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
606void bdi_unregister(struct backing_dev_info *bdi) 628void bdi_unregister(struct backing_dev_info *bdi)
607{ 629{
608 if (bdi->dev) { 630 if (bdi->dev) {
631 bdi_set_min_ratio(bdi, 0);
609 trace_writeback_bdi_unregister(bdi); 632 trace_writeback_bdi_unregister(bdi);
610 bdi_prune_sb(bdi); 633 bdi_prune_sb(bdi);
611 del_timer_sync(&bdi->wb.wakeup_timer); 634 del_timer_sync(&bdi->wb.wakeup_timer);
@@ -628,9 +651,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
628 INIT_LIST_HEAD(&wb->b_dirty); 651 INIT_LIST_HEAD(&wb->b_dirty);
629 INIT_LIST_HEAD(&wb->b_io); 652 INIT_LIST_HEAD(&wb->b_io);
630 INIT_LIST_HEAD(&wb->b_more_io); 653 INIT_LIST_HEAD(&wb->b_more_io);
654 spin_lock_init(&wb->list_lock);
631 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 655 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
632} 656}
633 657
658/*
659 * Initial write bandwidth: 100 MB/s
660 */
661#define INIT_BW (100 << (20 - PAGE_SHIFT))
662
634int bdi_init(struct backing_dev_info *bdi) 663int bdi_init(struct backing_dev_info *bdi)
635{ 664{
636 int i, err; 665 int i, err;
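INIT_BW in the hunk above encodes an initial estimate of 100 MB/s in pages per second: 100 MB is 100 << 20 bytes, and dividing by the page size (1 << PAGE_SHIFT) leaves 100 << (20 - PAGE_SHIFT). With the common 4 KiB page size (PAGE_SHIFT = 12) that is 100 << 8 = 25600 pages/s. A tiny standalone check of the arithmetic, assuming 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12                          /* assumption: 4 KiB pages */
    #define INIT_BW (100 << (20 - PAGE_SHIFT))     /* 100 MB/s in pages/s */

    int main(void)
    {
            /* 25600 pages/s * 4096 bytes/page = 104857600 bytes/s = 100 MB/s */
            printf("%d pages/s = %ld bytes/s\n",
                   INIT_BW, (long)INIT_BW << PAGE_SHIFT);
            return 0;
    }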
@@ -653,6 +682,15 @@ int bdi_init(struct backing_dev_info *bdi)
653 } 682 }
654 683
655 bdi->dirty_exceeded = 0; 684 bdi->dirty_exceeded = 0;
685
686 bdi->bw_time_stamp = jiffies;
687 bdi->written_stamp = 0;
688
689 bdi->balanced_dirty_ratelimit = INIT_BW;
690 bdi->dirty_ratelimit = INIT_BW;
691 bdi->write_bandwidth = INIT_BW;
692 bdi->avg_write_bandwidth = INIT_BW;
693
656 err = prop_local_init_percpu(&bdi->completions); 694 err = prop_local_init_percpu(&bdi->completions);
657 695
658 if (err) { 696 if (err) {
@@ -676,15 +714,24 @@ void bdi_destroy(struct backing_dev_info *bdi)
676 if (bdi_has_dirty_io(bdi)) { 714 if (bdi_has_dirty_io(bdi)) {
677 struct bdi_writeback *dst = &default_backing_dev_info.wb; 715 struct bdi_writeback *dst = &default_backing_dev_info.wb;
678 716
679 spin_lock(&inode_wb_list_lock); 717 bdi_lock_two(&bdi->wb, dst);
680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 718 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
681 list_splice(&bdi->wb.b_io, &dst->b_io); 719 list_splice(&bdi->wb.b_io, &dst->b_io);
682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 720 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
683 spin_unlock(&inode_wb_list_lock); 721 spin_unlock(&bdi->wb.list_lock);
722 spin_unlock(&dst->list_lock);
684 } 723 }
685 724
686 bdi_unregister(bdi); 725 bdi_unregister(bdi);
687 726
727 /*
728 * If bdi_unregister() had already been called earlier, the
729 * wakeup_timer could still be armed because bdi_prune_sb()
730 * can race with the bdi_wakeup_thread_delayed() calls from
731 * __mark_inode_dirty().
732 */
733 del_timer_sync(&bdi->wb.wakeup_timer);
734
688 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 735 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
689 percpu_counter_destroy(&bdi->bdi_stat[i]); 736 percpu_counter_destroy(&bdi->bdi_stat[i]);
690 737
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 01d5a4b3dd0c..1a77012ecdb3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,7 +12,7 @@
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h> 18#include <linux/memblock.h>
diff --git a/mm/bounce.c b/mm/bounce.c
index 1481de68184b..4e9ae722af83 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -4,7 +4,7 @@
4 */ 4 */
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/swap.h> 8#include <linux/swap.h>
9#include <linux/gfp.h> 9#include <linux/gfp.h>
10#include <linux/bio.h> 10#include <linux/bio.h>
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/hash.h> 15#include <linux/hash.h>
16#include <linux/highmem.h> 16#include <linux/highmem.h>
17#include <linux/bootmem.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18 19
19#include <trace/events/block.h> 20#include <trace/events/block.h>
@@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool;
26#ifdef CONFIG_HIGHMEM 27#ifdef CONFIG_HIGHMEM
27static __init int init_emergency_pool(void) 28static __init int init_emergency_pool(void)
28{ 29{
29 struct sysinfo i; 30#ifndef CONFIG_MEMORY_HOTPLUG
30 si_meminfo(&i); 31 if (max_pfn <= max_low_pfn)
31 si_swapinfo(&i);
32
33 if (!i.totalhigh)
34 return 0; 32 return 0;
33#endif
35 34
36 page_pool = mempool_create_page_pool(POOL_SIZE, 0); 35 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
37 BUG_ON(!page_pool); 36 BUG_ON(!page_pool);
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc604bd5649..899d95638586 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
35 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */ 36 bool sync; /* Synchronous migration */
37 37
38 /* Account for isolated anon and file pages */
39 unsigned long nr_anon;
40 unsigned long nr_file;
41
42 unsigned int order; /* order a direct compactor needs */ 38 unsigned int order; /* order a direct compactor needs */
43 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
44 struct zone *zone; 40 struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
223static void acct_isolated(struct zone *zone, struct compact_control *cc) 219static void acct_isolated(struct zone *zone, struct compact_control *cc)
224{ 220{
225 struct page *page; 221 struct page *page;
226 unsigned int count[NR_LRU_LISTS] = { 0, }; 222 unsigned int count[2] = { 0, };
227 223
228 list_for_each_entry(page, &cc->migratepages, lru) { 224 list_for_each_entry(page, &cc->migratepages, lru)
229 int lru = page_lru_base_type(page); 225 count[!!page_is_file_cache(page)]++;
230 count[lru]++;
231 }
232 226
233 cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
234 cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
235 __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
236 __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
237} 229}
238 230
239/* Similar to reclaim, but different enough that they don't share logic */ 231/* Similar to reclaim, but different enough that they don't share logic */
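The rewritten acct_isolated() above no longer keeps a full NR_LRU_LISTS array; it indexes a two-element array with !!page_is_file_cache(page), where the double negation collapses any non-zero result to 1, so count[0] accumulates anonymous pages and count[1] file-backed ones. The same idiom in a self-contained form, with is_file_backed() standing in as a hypothetical predicate:

    #include <stdio.h>

    /* Hypothetical stand-in for page_is_file_cache(): any non-zero value
     * means "file backed". */
    static int is_file_backed(int flags)
    {
            return flags & 0x40;
    }

    int main(void)
    {
            int flags[] = { 0x00, 0x40, 0x41, 0x02 };
            unsigned int count[2] = { 0, };        /* [0] anon, [1] file */

            for (unsigned int i = 0; i < sizeof(flags) / sizeof(flags[0]); i++)
                    count[!!is_file_backed(flags[i])]++;  /* !! maps non-zero to 1 */

            printf("anon=%u file=%u\n", count[0], count[1]);  /* anon=2 file=2 */
            return 0;
    }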
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
269 unsigned long last_pageblock_nr = 0, pageblock_nr; 261 unsigned long last_pageblock_nr = 0, pageblock_nr;
270 unsigned long nr_scanned = 0, nr_isolated = 0; 262 unsigned long nr_scanned = 0, nr_isolated = 0;
271 struct list_head *migratelist = &cc->migratepages; 263 struct list_head *migratelist = &cc->migratepages;
264 isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
272 265
273 /* Do not scan outside zone boundaries */ 266 /* Do not scan outside zone boundaries */
274 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 267 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -356,8 +349,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
356 continue; 349 continue;
357 } 350 }
358 351
352 if (!cc->sync)
353 mode |= ISOLATE_CLEAN;
354
359 /* Try isolate the page */ 355 /* Try isolate the page */
360 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) 356 if (__isolate_lru_page(page, mode, 0) != 0)
361 continue; 357 continue;
362 358
363 VM_BUG_ON(PageTransCompound(page)); 359 VM_BUG_ON(PageTransCompound(page));
@@ -586,7 +582,7 @@ out:
586 return ret; 582 return ret;
587} 583}
588 584
589unsigned long compact_zone_order(struct zone *zone, 585static unsigned long compact_zone_order(struct zone *zone,
590 int order, gfp_t gfp_mask, 586 int order, gfp_t gfp_mask,
591 bool sync) 587 bool sync)
592{ 588{
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index a1e3324de2b5..7cea557407f4 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -1,7 +1,10 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/string.h>
2#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/highmem.h>
3#include <linux/page-debug-flags.h> 5#include <linux/page-debug-flags.h>
4#include <linux/poison.h> 6#include <linux/poison.h>
7#include <linux/ratelimit.h>
5 8
6static inline void set_page_poison(struct page *page) 9static inline void set_page_poison(struct page *page)
7{ 10{
@@ -18,28 +21,13 @@ static inline bool page_poison(struct page *page)
18 return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); 21 return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
19} 22}
20 23
21static void poison_highpage(struct page *page)
22{
23 /*
24 * Page poisoning for highmem pages is not implemented.
25 *
26 * This can be called from interrupt contexts.
27 * So we need to create a new kmap_atomic slot for this
28 * application and it will need interrupt protection.
29 */
30}
31
32static void poison_page(struct page *page) 24static void poison_page(struct page *page)
33{ 25{
34 void *addr; 26 void *addr = kmap_atomic(page);
35 27
36 if (PageHighMem(page)) {
37 poison_highpage(page);
38 return;
39 }
40 set_page_poison(page); 28 set_page_poison(page);
41 addr = page_address(page);
42 memset(addr, PAGE_POISON, PAGE_SIZE); 29 memset(addr, PAGE_POISON, PAGE_SIZE);
30 kunmap_atomic(addr);
43} 31}
44 32
45static void poison_pages(struct page *page, int n) 33static void poison_pages(struct page *page, int n)
@@ -59,14 +47,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
59 47
60static void check_poison_mem(unsigned char *mem, size_t bytes) 48static void check_poison_mem(unsigned char *mem, size_t bytes)
61{ 49{
50 static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
62 unsigned char *start; 51 unsigned char *start;
63 unsigned char *end; 52 unsigned char *end;
64 53
65 for (start = mem; start < mem + bytes; start++) { 54 start = memchr_inv(mem, PAGE_POISON, bytes);
66 if (*start != PAGE_POISON) 55 if (!start)
67 break;
68 }
69 if (start == mem + bytes)
70 return; 56 return;
71 57
72 for (end = mem + bytes - 1; end > start; end--) { 58 for (end = mem + bytes - 1; end > start; end--) {
@@ -74,7 +60,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
74 break; 60 break;
75 } 61 }
76 62
77 if (!printk_ratelimit()) 63 if (!__ratelimit(&ratelimit))
78 return; 64 return;
79 else if (start == end && single_bit_flip(*start, PAGE_POISON)) 65 else if (start == end && single_bit_flip(*start, PAGE_POISON))
80 printk(KERN_ERR "pagealloc: single bit error\n"); 66 printk(KERN_ERR "pagealloc: single bit error\n");
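check_poison_mem() now lets memchr_inv() find the first byte differing from PAGE_POISON instead of an open-coded loop, and replaces the global printk_ratelimit() with a private ratelimit state so other printk users cannot consume its budget. User-space libc has no memchr_inv(), so the sketch below open-codes that scan together with the single-bit-flip test the function applies; the names and poison value are illustrative:

    #include <stdio.h>
    #include <stddef.h>

    #define PAGE_POISON 0xaa     /* assumption: any fixed poison byte works */

    /* Return the first byte differing from 'c', or NULL if all bytes match
     * (a user-space stand-in for the kernel's memchr_inv()). */
    static const unsigned char *first_mismatch(const unsigned char *mem,
                                               unsigned char c, size_t bytes)
    {
            for (size_t i = 0; i < bytes; i++)
                    if (mem[i] != c)
                            return mem + i;
            return NULL;
    }

    /* A corruption that flips exactly one bit XORs to a power of two. */
    static int single_bit_flip(unsigned char a, unsigned char b)
    {
            unsigned char diff = a ^ b;

            return diff && (diff & (diff - 1)) == 0;
    }

    int main(void)
    {
            unsigned char page[64];
            const unsigned char *bad;

            for (size_t i = 0; i < sizeof(page); i++)
                    page[i] = PAGE_POISON;
            page[17] ^= 0x08;                      /* inject a one-bit error */

            bad = first_mismatch(page, PAGE_POISON, sizeof(page));
            if (bad)
                    printf("offset %td: %s\n", bad - page,
                           single_bit_flip(*bad, PAGE_POISON) ?
                           "single bit error" : "memory corruption");
            return 0;
    }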
@@ -86,27 +72,17 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
86 dump_stack(); 72 dump_stack();
87} 73}
88 74
89static void unpoison_highpage(struct page *page)
90{
91 /*
92 * See comment in poison_highpage().
93 * Highmem pages should not be poisoned for now
94 */
95 BUG_ON(page_poison(page));
96}
97
98static void unpoison_page(struct page *page) 75static void unpoison_page(struct page *page)
99{ 76{
100 if (PageHighMem(page)) { 77 void *addr;
101 unpoison_highpage(page); 78
79 if (!page_poison(page))
102 return; 80 return;
103 }
104 if (page_poison(page)) {
105 void *addr = page_address(page);
106 81
107 check_poison_mem(addr, PAGE_SIZE); 82 addr = kmap_atomic(page);
108 clear_page_poison(page); 83 check_poison_mem(addr, PAGE_SIZE);
109 } 84 clear_page_poison(page);
85 kunmap_atomic(addr);
110} 86}
111 87
112static void unpoison_pages(struct page *page, int n) 88static void unpoison_pages(struct page *page, int n)
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519a..c5ab33bca0a8 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -27,11 +27,12 @@
27#include <linux/dmapool.h> 27#include <linux/dmapool.h>
28#include <linux/kernel.h> 28#include <linux/kernel.h>
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/poison.h> 32#include <linux/poison.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/stat.h>
35#include <linux/spinlock.h> 36#include <linux/spinlock.h>
36#include <linux/string.h> 37#include <linux/string.h>
37#include <linux/types.h> 38#include <linux/types.h>
@@ -500,7 +501,7 @@ void dmam_pool_destroy(struct dma_pool *pool)
500{ 501{
501 struct device *dev = pool->dev; 502 struct device *dev = pool->dev;
502 503
503 dma_pool_destroy(pool);
504 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); 504 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
505 dma_pool_destroy(pool);
505} 506}
506EXPORT_SYMBOL(dmam_pool_destroy); 507EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240ddc..0dd7b8fec71c 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct {
5 struct fault_attr attr; 5 struct fault_attr attr;
6 u32 ignore_gfp_wait; 6 u32 ignore_gfp_wait;
7 int cache_filter; 7 int cache_filter;
8#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
9 struct dentry *ignore_gfp_wait_file;
10 struct dentry *cache_filter_file;
11#endif
12} failslab = { 8} failslab = {
13 .attr = FAULT_ATTR_INITIALIZER, 9 .attr = FAULT_ATTR_INITIALIZER,
14 .ignore_gfp_wait = 1, 10 .ignore_gfp_wait = 1,
@@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab);
38#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 34#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
39static int __init failslab_debugfs_init(void) 35static int __init failslab_debugfs_init(void)
40{ 36{
41 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
42 struct dentry *dir; 37 struct dentry *dir;
43 int err; 38 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
44
45 err = init_fault_attr_dentries(&failslab.attr, "failslab");
46 if (err)
47 return err;
48 dir = failslab.attr.dentries.dir;
49 39
50 failslab.ignore_gfp_wait_file = 40 dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
51 debugfs_create_bool("ignore-gfp-wait", mode, dir, 41 if (IS_ERR(dir))
52 &failslab.ignore_gfp_wait); 42 return PTR_ERR(dir);
53 43
54 failslab.cache_filter_file = 44 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
55 debugfs_create_bool("cache-filter", mode, dir, 45 &failslab.ignore_gfp_wait))
56 &failslab.cache_filter); 46 goto fail;
47 if (!debugfs_create_bool("cache-filter", mode, dir,
48 &failslab.cache_filter))
49 goto fail;
57 50
58 if (!failslab.ignore_gfp_wait_file || 51 return 0;
59 !failslab.cache_filter_file) { 52fail:
60 err = -ENOMEM; 53 debugfs_remove_recursive(dir);
61 debugfs_remove(failslab.cache_filter_file);
62 debugfs_remove(failslab.ignore_gfp_wait_file);
63 cleanup_fault_attr_dentries(&failslab.attr);
64 }
65 54
66 return err; 55 return -ENOMEM;
67} 56}
68 57
69late_initcall(failslab_debugfs_init); 58late_initcall(failslab_debugfs_init);
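The reworked failslab_debugfs_init() stops tracking one dentry per attribute: fault_create_debugfs_attr() hands back the directory, and any later failure jumps to a single label that calls debugfs_remove_recursive() on it, so one teardown undoes however much was created. A generic user-space sketch of the same "single cleanup label" error handling, under the assumption that each setup step can fail independently (attr_dir_init and the file handles are illustrative, not kernel API):

    #include <stdio.h>

    struct attr_dir {
            FILE *a;
            FILE *b;
    };

    /* Open two attribute files; on any failure, release whatever was set up
     * so far from one label and report a single error code. */
    static int attr_dir_init(struct attr_dir *d, const char *pa, const char *pb)
    {
            d->a = d->b = NULL;

            d->a = fopen(pa, "w");
            if (!d->a)
                    goto fail;
            d->b = fopen(pb, "w");
            if (!d->b)
                    goto fail;
            return 0;

    fail:
            if (d->b)
                    fclose(d->b);
            if (d->a)
                    fclose(d->a);
            return -1;      /* analogous to the -ENOMEM return above */
    }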
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d3457..c0018f2d50e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,7 @@
9 * most "normal" filesystems (but you don't /have/ to use this: 9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example) 10 * the NFS filesystem used to do this differently, for example)
11 */ 11 */
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
@@ -33,7 +33,6 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */
37#include <linux/cleancache.h> 36#include <linux/cleancache.h>
38#include "internal.h" 37#include "internal.h"
39 38
@@ -78,10 +77,7 @@
78 * ->i_mutex (generic_file_buffered_write) 77 * ->i_mutex (generic_file_buffered_write)
79 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 78 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 79 *
81 * ->i_mutex 80 * bdi->wb.list_lock
82 * ->i_alloc_sem (various)
83 *
84 * inode_wb_list_lock
85 * sb_lock (fs/fs-writeback.c) 81 * sb_lock (fs/fs-writeback.c)
86 * ->mapping->tree_lock (__sync_single_inode) 82 * ->mapping->tree_lock (__sync_single_inode)
87 * 83 *
@@ -99,9 +95,9 @@
99 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 95 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
100 * ->private_lock (page_remove_rmap->set_page_dirty) 96 * ->private_lock (page_remove_rmap->set_page_dirty)
101 * ->tree_lock (page_remove_rmap->set_page_dirty) 97 * ->tree_lock (page_remove_rmap->set_page_dirty)
102 * inode_wb_list_lock (page_remove_rmap->set_page_dirty) 98 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
103 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 99 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
104 * inode_wb_list_lock (zap_pte_range->set_page_dirty) 100 * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
105 * ->inode->i_lock (zap_pte_range->set_page_dirty) 101 * ->inode->i_lock (zap_pte_range->set_page_dirty)
106 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
107 * 103 *
@@ -131,6 +127,7 @@ void __delete_from_page_cache(struct page *page)
131 127
132 radix_tree_delete(&mapping->page_tree, page->index); 128 radix_tree_delete(&mapping->page_tree, page->index);
133 page->mapping = NULL; 129 page->mapping = NULL;
130 /* Leave page->index set: truncation lookup relies upon it */
134 mapping->nrpages--; 131 mapping->nrpages--;
135 __dec_zone_page_state(page, NR_FILE_PAGES); 132 __dec_zone_page_state(page, NR_FILE_PAGES);
136 if (PageSwapBacked(page)) 133 if (PageSwapBacked(page))
@@ -464,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
464 int error; 461 int error;
465 462
466 VM_BUG_ON(!PageLocked(page)); 463 VM_BUG_ON(!PageLocked(page));
464 VM_BUG_ON(PageSwapBacked(page));
467 465
468 error = mem_cgroup_cache_charge(page, current->mm, 466 error = mem_cgroup_cache_charge(page, current->mm,
469 gfp_mask & GFP_RECLAIM_MASK); 467 gfp_mask & GFP_RECLAIM_MASK);
@@ -481,11 +479,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
481 if (likely(!error)) { 479 if (likely(!error)) {
482 mapping->nrpages++; 480 mapping->nrpages++;
483 __inc_zone_page_state(page, NR_FILE_PAGES); 481 __inc_zone_page_state(page, NR_FILE_PAGES);
484 if (PageSwapBacked(page))
485 __inc_zone_page_state(page, NR_SHMEM);
486 spin_unlock_irq(&mapping->tree_lock); 482 spin_unlock_irq(&mapping->tree_lock);
487 } else { 483 } else {
488 page->mapping = NULL; 484 page->mapping = NULL;
485 /* Leave page->index set: truncation relies upon it */
489 spin_unlock_irq(&mapping->tree_lock); 486 spin_unlock_irq(&mapping->tree_lock);
490 mem_cgroup_uncharge_cache_page(page); 487 mem_cgroup_uncharge_cache_page(page);
491 page_cache_release(page); 488 page_cache_release(page);
@@ -503,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
503{ 500{
504 int ret; 501 int ret;
505 502
506 /*
507 * Splice_read and readahead add shmem/tmpfs pages into the page cache
508 * before shmem_readpage has a chance to mark them as SwapBacked: they
509 * need to go on the anon lru below, and mem_cgroup_cache_charge
510 * (called in add_to_page_cache) needs to know where they're going too.
511 */
512 if (mapping_cap_swap_backed(mapping))
513 SetPageSwapBacked(page);
514
515 ret = add_to_page_cache(page, mapping, offset, gfp_mask); 503 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
516 if (ret == 0) { 504 if (ret == 0)
517 if (page_is_file_cache(page)) 505 lru_cache_add_file(page);
518 lru_cache_add_file(page);
519 else
520 lru_cache_add_anon(page);
521 }
522 return ret; 506 return ret;
523} 507}
524EXPORT_SYMBOL_GPL(add_to_page_cache_lru); 508EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -715,9 +699,16 @@ repeat:
715 page = radix_tree_deref_slot(pagep); 699 page = radix_tree_deref_slot(pagep);
716 if (unlikely(!page)) 700 if (unlikely(!page))
717 goto out; 701 goto out;
718 if (radix_tree_deref_retry(page)) 702 if (radix_tree_exception(page)) {
719 goto repeat; 703 if (radix_tree_deref_retry(page))
720 704 goto repeat;
705 /*
706 * Otherwise, shmem/tmpfs must be storing a swap entry
707 * here as an exceptional entry: so return it without
708 * attempting to raise page count.
709 */
710 goto out;
711 }
721 if (!page_cache_get_speculative(page)) 712 if (!page_cache_get_speculative(page))
722 goto repeat; 713 goto repeat;
723 714
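The "exceptional entry" this hunk tolerates is a radix-tree slot holding a shmem swap entry encoded as an integer with a low tag bit set, rather than a struct page pointer; radix_tree_exception() simply tests those low bits, which real page pointers never have because they are word aligned. A minimal sketch of that low-bit tagging trick; the bit value and helper names are illustrative, not the kernel's actual encoding:

    #include <stdint.h>
    #include <stdio.h>

    #define ENTRY_EXCEPTIONAL 0x2UL       /* assumption: one low tag bit */

    /* Pack a small integer value into a "pointer" slot with the tag set. */
    static void *make_exceptional(unsigned long value)
    {
            return (void *)(uintptr_t)((value << 2) | ENTRY_EXCEPTIONAL);
    }

    /* Ordinary object pointers are aligned, so their low bits are clear. */
    static int entry_is_exceptional(const void *entry)
    {
            return ((uintptr_t)entry & ENTRY_EXCEPTIONAL) != 0;
    }

    int main(void)
    {
            long page_like;                       /* stands in for a struct page */
            void *slot_a = &page_like;            /* normal pointer entry */
            void *slot_b = make_exceptional(42);  /* encoded value, not a pointer */

            printf("a exceptional? %d\n", entry_is_exceptional(slot_a));  /* 0 */
            printf("b exceptional? %d\n", entry_is_exceptional(slot_b));  /* 1 */
            return 0;
    }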
@@ -754,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
754 745
755repeat: 746repeat:
756 page = find_get_page(mapping, offset); 747 page = find_get_page(mapping, offset);
757 if (page) { 748 if (page && !radix_tree_exception(page)) {
758 lock_page(page); 749 lock_page(page);
759 /* Has the page been truncated? */ 750 /* Has the page been truncated? */
760 if (unlikely(page->mapping != mapping)) { 751 if (unlikely(page->mapping != mapping)) {
@@ -836,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
836{ 827{
837 unsigned int i; 828 unsigned int i;
838 unsigned int ret; 829 unsigned int ret;
839 unsigned int nr_found; 830 unsigned int nr_found, nr_skip;
840 831
841 rcu_read_lock(); 832 rcu_read_lock();
842restart: 833restart:
843 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 834 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
844 (void ***)pages, start, nr_pages); 835 (void ***)pages, NULL, start, nr_pages);
845 ret = 0; 836 ret = 0;
837 nr_skip = 0;
846 for (i = 0; i < nr_found; i++) { 838 for (i = 0; i < nr_found; i++) {
847 struct page *page; 839 struct page *page;
848repeat: 840repeat:
@@ -850,13 +842,23 @@ repeat:
850 if (unlikely(!page)) 842 if (unlikely(!page))
851 continue; 843 continue;
852 844
853 /* 845 if (radix_tree_exception(page)) {
854 * This can only trigger when the entry at index 0 moves out 846 if (radix_tree_deref_retry(page)) {
855 * of or back to the root: none yet gotten, safe to restart. 847 /*
856 */ 848 * Transient condition which can only trigger
857 if (radix_tree_deref_retry(page)) { 849 * when entry at index 0 moves out of or back
858 WARN_ON(start | i); 850 * to root: none yet gotten, safe to restart.
859 goto restart; 851 */
852 WARN_ON(start | i);
853 goto restart;
854 }
855 /*
856 * Otherwise, shmem/tmpfs must be storing a swap entry
857 * here as an exceptional entry: so skip over it -
858 * we only reach this from invalidate_mapping_pages().
859 */
860 nr_skip++;
861 continue;
860 } 862 }
861 863
862 if (!page_cache_get_speculative(page)) 864 if (!page_cache_get_speculative(page))
@@ -876,7 +878,7 @@ repeat:
876 * If all entries were removed before we could secure them, 878 * If all entries were removed before we could secure them,
877 * try again, because callers stop trying once 0 is returned. 879 * try again, because callers stop trying once 0 is returned.
878 */ 880 */
879 if (unlikely(!ret && nr_found)) 881 if (unlikely(!ret && nr_found > nr_skip))
880 goto restart; 882 goto restart;
881 rcu_read_unlock(); 883 rcu_read_unlock();
882 return ret; 884 return ret;
@@ -904,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
904 rcu_read_lock(); 906 rcu_read_lock();
905restart: 907restart:
906 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 908 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
907 (void ***)pages, index, nr_pages); 909 (void ***)pages, NULL, index, nr_pages);
908 ret = 0; 910 ret = 0;
909 for (i = 0; i < nr_found; i++) { 911 for (i = 0; i < nr_found; i++) {
910 struct page *page; 912 struct page *page;
@@ -913,12 +915,22 @@ repeat:
913 if (unlikely(!page)) 915 if (unlikely(!page))
914 continue; 916 continue;
915 917
916 /* 918 if (radix_tree_exception(page)) {
917 * This can only trigger when the entry at index 0 moves out 919 if (radix_tree_deref_retry(page)) {
918 * of or back to the root: none yet gotten, safe to restart. 920 /*
919 */ 921 * Transient condition which can only trigger
920 if (radix_tree_deref_retry(page)) 922 * when entry at index 0 moves out of or back
921 goto restart; 923 * to root: none yet gotten, safe to restart.
924 */
925 goto restart;
926 }
927 /*
928 * Otherwise, shmem/tmpfs must be storing a swap entry
929 * here as an exceptional entry: so stop looking for
930 * contiguous pages.
931 */
932 break;
933 }
922 934
923 if (!page_cache_get_speculative(page)) 935 if (!page_cache_get_speculative(page))
924 goto repeat; 936 goto repeat;
@@ -978,12 +990,21 @@ repeat:
978 if (unlikely(!page)) 990 if (unlikely(!page))
979 continue; 991 continue;
980 992
981 /* 993 if (radix_tree_exception(page)) {
982 * This can only trigger when the entry at index 0 moves out 994 if (radix_tree_deref_retry(page)) {
983 * of or back to the root: none yet gotten, safe to restart. 995 /*
984 */ 996 * Transient condition which can only trigger
985 if (radix_tree_deref_retry(page)) 997 * when entry at index 0 moves out of or back
986 goto restart; 998 * to root: none yet gotten, safe to restart.
999 */
1000 goto restart;
1001 }
1002 /*
1003 * This function is never used on a shmem/tmpfs
1004 * mapping, so a swap entry won't be found here.
1005 */
1006 BUG();
1007 }
987 1008
988 if (!page_cache_get_speculative(page)) 1009 if (!page_cache_get_speculative(page))
989 goto repeat; 1010 goto repeat;
@@ -1795,7 +1816,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1795 1816
1796static struct page *__read_cache_page(struct address_space *mapping, 1817static struct page *__read_cache_page(struct address_space *mapping,
1797 pgoff_t index, 1818 pgoff_t index,
1798 int (*filler)(void *,struct page*), 1819 int (*filler)(void *, struct page *),
1799 void *data, 1820 void *data,
1800 gfp_t gfp) 1821 gfp_t gfp)
1801{ 1822{
@@ -1826,7 +1847,7 @@ repeat:
1826 1847
1827static struct page *do_read_cache_page(struct address_space *mapping, 1848static struct page *do_read_cache_page(struct address_space *mapping,
1828 pgoff_t index, 1849 pgoff_t index,
1829 int (*filler)(void *,struct page*), 1850 int (*filler)(void *, struct page *),
1830 void *data, 1851 void *data,
1831 gfp_t gfp) 1852 gfp_t gfp)
1832 1853
@@ -1866,7 +1887,7 @@ out:
1866 * @mapping: the page's address_space 1887 * @mapping: the page's address_space
1867 * @index: the page index 1888 * @index: the page index
1868 * @filler: function to perform the read 1889 * @filler: function to perform the read
1869 * @data: destination for read data 1890 * @data: first arg to filler(data, page) function, often left as NULL
1870 * 1891 *
1871 * Same as read_cache_page, but don't wait for page to become unlocked 1892 * Same as read_cache_page, but don't wait for page to become unlocked
1872 * after submitting it to the filler. 1893 * after submitting it to the filler.
@@ -1878,7 +1899,7 @@ out:
1878 */ 1899 */
1879struct page *read_cache_page_async(struct address_space *mapping, 1900struct page *read_cache_page_async(struct address_space *mapping,
1880 pgoff_t index, 1901 pgoff_t index,
1881 int (*filler)(void *,struct page*), 1902 int (*filler)(void *, struct page *),
1882 void *data) 1903 void *data)
1883{ 1904{
1884 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 1905 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -1926,7 +1947,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1926 * @mapping: the page's address_space 1947 * @mapping: the page's address_space
1927 * @index: the page index 1948 * @index: the page index
1928 * @filler: function to perform the read 1949 * @filler: function to perform the read
1929 * @data: destination for read data 1950 * @data: first arg to filler(data, page) function, often left as NULL
1930 * 1951 *
1931 * Read into the page cache. If a page already exists, and PageUptodate() is 1952 * Read into the page cache. If a page already exists, and PageUptodate() is
1932 * not set, try to fill the page then wait for it to become unlocked. 1953 * not set, try to fill the page then wait for it to become unlocked.
@@ -1935,7 +1956,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1935 */ 1956 */
1936struct page *read_cache_page(struct address_space *mapping, 1957struct page *read_cache_page(struct address_space *mapping,
1937 pgoff_t index, 1958 pgoff_t index,
1938 int (*filler)(void *,struct page*), 1959 int (*filler)(void *, struct page *),
1939 void *data) 1960 void *data)
1940{ 1961{
1941 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); 1962 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
@@ -2094,6 +2115,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
2094 } else { 2115 } else {
2095 const struct iovec *iov = i->iov; 2116 const struct iovec *iov = i->iov;
2096 size_t base = i->iov_offset; 2117 size_t base = i->iov_offset;
2118 unsigned long nr_segs = i->nr_segs;
2097 2119
2098 /* 2120 /*
2099 * The !iov->iov_len check ensures we skip over unlikely 2121 * The !iov->iov_len check ensures we skip over unlikely
@@ -2109,11 +2131,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
2109 base += copy; 2131 base += copy;
2110 if (iov->iov_len == base) { 2132 if (iov->iov_len == base) {
2111 iov++; 2133 iov++;
2134 nr_segs--;
2112 base = 0; 2135 base = 0;
2113 } 2136 }
2114 } 2137 }
2115 i->iov = iov; 2138 i->iov = iov;
2116 i->iov_offset = base; 2139 i->iov_offset = base;
2140 i->nr_segs = nr_segs;
2117 } 2141 }
2118} 2142}
2119EXPORT_SYMBOL(iov_iter_advance); 2143EXPORT_SYMBOL(iov_iter_advance);
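The iov_iter_advance() change above fixes the segment bookkeeping: when a segment is fully consumed the code already stepped the iov pointer, but it also has to decrement and write back nr_segs so the remaining count stays consistent. A user-space analogue over struct iovec showing the same bookkeeping; iter_advance() is a simplified sketch, not the kernel function, and it ignores the zero-length-segment subtlety the original comments on:

    #include <stdio.h>
    #include <sys/uio.h>

    struct iter {
            const struct iovec *iov;
            unsigned long nr_segs;
            size_t iov_offset;
    };

    /* Consume 'bytes' from the front of the iovec array, stepping iov and
     * decrementing nr_segs whenever a segment is fully used up. */
    static void iter_advance(struct iter *i, size_t bytes)
    {
            const struct iovec *iov = i->iov;
            size_t base = i->iov_offset;
            unsigned long nr_segs = i->nr_segs;

            while (bytes && nr_segs) {
                    size_t avail = iov->iov_len - base;
                    size_t copy = bytes < avail ? bytes : avail;

                    bytes -= copy;
                    base += copy;
                    if (base == iov->iov_len) {
                            iov++;
                            nr_segs--;   /* the piece the fix above adds */
                            base = 0;
                    }
            }
            i->iov = iov;
            i->iov_offset = base;
            i->nr_segs = nr_segs;
    }

    int main(void)
    {
            char a[8], b[8];
            struct iovec v[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
            struct iter it = { v, 2, 0 };

            iter_advance(&it, 10);     /* crosses into the second segment */
            printf("segs=%lu offset=%zu\n", it.nr_segs, it.iov_offset);  /* 1 2 */
            return 0;
    }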
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 93356cd12828..f91b2f687343 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -10,7 +10,7 @@
10 10
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
diff --git a/mm/fremap.c b/mm/fremap.c
index b8e0e2d468af..9ed4fd432467 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -13,7 +13,6 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/swapops.h> 14#include <linux/swapops.h>
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/module.h>
17#include <linux/syscalls.h> 16#include <linux/syscalls.h>
18#include <linux/mmu_notifier.h> 17#include <linux/mmu_notifier.h>
19 18
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2ed..57d82c6250c3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -17,7 +17,7 @@
17 */ 17 */
18 18
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/module.h> 20#include <linux/export.h>
21#include <linux/swap.h> 21#include <linux/swap.h>
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -250,7 +250,7 @@ void *kmap_high_get(struct page *page)
250#endif 250#endif
251 251
252/** 252/**
253 * kunmap_high - map a highmem page into memory 253 * kunmap_high - unmap a highmem page into memory
254 * @page: &struct page to unmap 254 * @page: &struct page to unmap
255 * 255 *
256 * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called 256 * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
@@ -326,7 +326,7 @@ static struct page_address_slot {
326 spinlock_t lock; /* Protect this bucket's list */ 326 spinlock_t lock; /* Protect this bucket's list */
327} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; 327} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
328 328
329static struct page_address_slot *page_slot(struct page *page) 329static struct page_address_slot *page_slot(const struct page *page)
330{ 330{
331 return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; 331 return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
332} 332}
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page)
337 * 337 *
338 * Returns the page's virtual address. 338 * Returns the page's virtual address.
339 */ 339 */
340void *page_address(struct page *page) 340void *page_address(const struct page *page)
341{ 341{
342 unsigned long flags; 342 unsigned long flags;
343 void *ret; 343 void *ret;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd2..4298abaae153 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,7 +89,8 @@ struct khugepaged_scan {
89 struct list_head mm_head; 89 struct list_head mm_head;
90 struct mm_slot *mm_slot; 90 struct mm_slot *mm_slot;
91 unsigned long address; 91 unsigned long address;
92} khugepaged_scan = { 92};
93static struct khugepaged_scan khugepaged_scan = {
93 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 94 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
94}; 95};
95 96
@@ -829,7 +830,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
829 830
830 for (i = 0; i < HPAGE_PMD_NR; i++) { 831 for (i = 0; i < HPAGE_PMD_NR; i++) {
831 copy_user_highpage(pages[i], page + i, 832 copy_user_highpage(pages[i], page + i,
832 haddr + PAGE_SHIFT*i, vma); 833 haddr + PAGE_SIZE * i, vma);
833 __SetPageUptodate(pages[i]); 834 __SetPageUptodate(pages[i]);
834 cond_resched(); 835 cond_resched();
835 } 836 }
@@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 990 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
990 VM_BUG_ON(!PageCompound(page)); 991 VM_BUG_ON(!PageCompound(page));
991 if (flags & FOLL_GET) 992 if (flags & FOLL_GET)
992 get_page(page); 993 get_page_foll(page);
993 994
994out: 995out:
995 return page; 996 return page;
@@ -1052,6 +1053,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1052 return ret; 1053 return ret;
1053} 1054}
1054 1055
1056int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1057 unsigned long old_addr,
1058 unsigned long new_addr, unsigned long old_end,
1059 pmd_t *old_pmd, pmd_t *new_pmd)
1060{
1061 int ret = 0;
1062 pmd_t pmd;
1063
1064 struct mm_struct *mm = vma->vm_mm;
1065
1066 if ((old_addr & ~HPAGE_PMD_MASK) ||
1067 (new_addr & ~HPAGE_PMD_MASK) ||
1068 old_end - old_addr < HPAGE_PMD_SIZE ||
1069 (new_vma->vm_flags & VM_NOHUGEPAGE))
1070 goto out;
1071
1072 /*
1073 * The destination pmd shouldn't be established, free_pgtables()
1074 * should have release it.
1075 */
1076 if (WARN_ON(!pmd_none(*new_pmd))) {
1077 VM_BUG_ON(pmd_trans_huge(*new_pmd));
1078 goto out;
1079 }
1080
1081 spin_lock(&mm->page_table_lock);
1082 if (likely(pmd_trans_huge(*old_pmd))) {
1083 if (pmd_trans_splitting(*old_pmd)) {
1084 spin_unlock(&mm->page_table_lock);
1085 wait_split_huge_page(vma->anon_vma, old_pmd);
1086 ret = -1;
1087 } else {
1088 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1089 VM_BUG_ON(!pmd_none(*new_pmd));
1090 set_pmd_at(mm, new_addr, new_pmd, pmd);
1091 spin_unlock(&mm->page_table_lock);
1092 ret = 1;
1093 }
1094 } else {
1095 spin_unlock(&mm->page_table_lock);
1096 }
1097out:
1098 return ret;
1099}
1100
1055int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1101int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1056 unsigned long addr, pgprot_t newprot) 1102 unsigned long addr, pgprot_t newprot)
1057{ 1103{
@@ -1156,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page)
1156 unsigned long head_index = page->index; 1202 unsigned long head_index = page->index;
1157 struct zone *zone = page_zone(page); 1203 struct zone *zone = page_zone(page);
1158 int zonestat; 1204 int zonestat;
1205 int tail_count = 0;
1159 1206
1160 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1207 /* prevent PageLRU to go away from under us, and freeze lru stats */
1161 spin_lock_irq(&zone->lru_lock); 1208 spin_lock_irq(&zone->lru_lock);
@@ -1164,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page)
1164 for (i = 1; i < HPAGE_PMD_NR; i++) { 1211 for (i = 1; i < HPAGE_PMD_NR; i++) {
1165 struct page *page_tail = page + i; 1212 struct page *page_tail = page + i;
1166 1213
1167 /* tail_page->_count cannot change */ 1214 /* tail_page->_mapcount cannot change */
1168 atomic_sub(atomic_read(&page_tail->_count), &page->_count); 1215 BUG_ON(page_mapcount(page_tail) < 0);
1169 BUG_ON(page_count(page) <= 0); 1216 tail_count += page_mapcount(page_tail);
1170 atomic_add(page_mapcount(page) + 1, &page_tail->_count); 1217 /* check for overflow */
1171 BUG_ON(atomic_read(&page_tail->_count) <= 0); 1218 BUG_ON(tail_count < 0);
1219 BUG_ON(atomic_read(&page_tail->_count) != 0);
1220 /*
1221 * tail_page->_count is zero and not changing from
1222 * under us. But get_page_unless_zero() may be running
1223 * from under us on the tail_page. If we used
1224 * atomic_set() below instead of atomic_add(), we
1225 * would then run atomic_set() concurrently with
1226 * get_page_unless_zero(), and atomic_set() is
1227 * implemented in C not using locked ops. spin_unlock
1228 * on x86 sometime uses locked ops because of PPro
1229 * errata 66, 92, so unless somebody can guarantee
1230 * atomic_set() here would be safe on all archs (and
1231 * not only on x86), it's safer to use atomic_add().
1232 */
1233 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1234 &page_tail->_count);
1172 1235
1173 /* after clearing PageTail the gup refcount can be released */ 1236 /* after clearing PageTail the gup refcount can be released */
1174 smp_mb(); 1237 smp_mb();
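The long comment in this hunk explains why the tail page's _count is raised with atomic_add() rather than atomic_set(): get_page_unless_zero() may be incrementing the counter concurrently, and a read-modify-write preserves that increment where a plain store would silently overwrite it. The same distinction in portable C11 atomics, run sequentially here only to make the difference visible (an illustration, not the kernel code):

    #include <stdatomic.h>
    #include <stdio.h>

    int main(void)
    {
            atomic_int count = 0;

            /* Pretend a concurrent speculative reference arrived first. */
            atomic_fetch_add(&count, 1);

            /* Read-modify-write: the concurrent increment survives. */
            atomic_fetch_add(&count, 5);
            printf("after fetch_add: %d\n", atomic_load(&count));   /* 6 */

            /* A blind store would have discarded it. */
            atomic_store(&count, 5);
            printf("after store:     %d\n", atomic_load(&count));   /* 5 */
            return 0;
    }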
@@ -1186,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page)
1186 (1L << PG_uptodate))); 1249 (1L << PG_uptodate)));
1187 page_tail->flags |= (1L << PG_dirty); 1250 page_tail->flags |= (1L << PG_dirty);
1188 1251
1189 /* 1252 /* clear PageTail before overwriting first_page */
1190 * 1) clear PageTail before overwriting first_page
1191 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1192 */
1193 smp_wmb(); 1253 smp_wmb();
1194 1254
1195 /* 1255 /*
@@ -1206,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page)
1206 * status is achieved setting a reserved bit in the 1266 * status is achieved setting a reserved bit in the
1207 * pmd, not by clearing the present bit. 1267 * pmd, not by clearing the present bit.
1208 */ 1268 */
1209 BUG_ON(page_mapcount(page_tail));
1210 page_tail->_mapcount = page->_mapcount; 1269 page_tail->_mapcount = page->_mapcount;
1211 1270
1212 BUG_ON(page_tail->mapping); 1271 BUG_ON(page_tail->mapping);
@@ -1223,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page)
1223 1282
1224 lru_add_page_tail(zone, page, page_tail); 1283 lru_add_page_tail(zone, page, page_tail);
1225 } 1284 }
1285 atomic_sub(tail_count, &page->_count);
1286 BUG_ON(atomic_read(&page->_count) <= 0);
1226 1287
1227 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1288 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1228 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1289 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
@@ -1596,14 +1657,13 @@ void __khugepaged_exit(struct mm_struct *mm)
1596 list_del(&mm_slot->mm_node); 1657 list_del(&mm_slot->mm_node);
1597 free = 1; 1658 free = 1;
1598 } 1659 }
1660 spin_unlock(&khugepaged_mm_lock);
1599 1661
1600 if (free) { 1662 if (free) {
1601 spin_unlock(&khugepaged_mm_lock);
1602 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1663 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1603 free_mm_slot(mm_slot); 1664 free_mm_slot(mm_slot);
1604 mmdrop(mm); 1665 mmdrop(mm);
1605 } else if (mm_slot) { 1666 } else if (mm_slot) {
1606 spin_unlock(&khugepaged_mm_lock);
1607 /* 1667 /*
1608 * This is required to serialize against 1668 * This is required to serialize against
1609 * khugepaged_test_exit() (which is guaranteed to run 1669 * khugepaged_test_exit() (which is guaranteed to run
@@ -1614,8 +1674,7 @@ void __khugepaged_exit(struct mm_struct *mm)
1614 */ 1674 */
1615 down_write(&mm->mmap_sem); 1675 down_write(&mm->mmap_sem);
1616 up_write(&mm->mmap_sem); 1676 up_write(&mm->mmap_sem);
1617 } else 1677 }
1618 spin_unlock(&khugepaged_mm_lock);
1619} 1678}
1620 1679
1621static void release_pte_page(struct page *page) 1680static void release_pte_page(struct page *page)
@@ -1908,7 +1967,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1908 BUG_ON(!pmd_none(*pmd)); 1967 BUG_ON(!pmd_none(*pmd));
1909 page_add_new_anon_rmap(new_page, vma, address); 1968 page_add_new_anon_rmap(new_page, vma, address);
1910 set_pmd_at(mm, address, pmd, _pmd); 1969 set_pmd_at(mm, address, pmd, _pmd);
1911 update_mmu_cache(vma, address, entry); 1970 update_mmu_cache(vma, address, _pmd);
1912 prepare_pmd_huge_pte(pgtable, mm); 1971 prepare_pmd_huge_pte(pgtable, mm);
1913 mm->nr_ptes--; 1972 mm->nr_ptes--;
1914 spin_unlock(&mm->page_table_lock); 1973 spin_unlock(&mm->page_table_lock);
@@ -2026,6 +2085,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
2026 2085
2027static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 2086static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2028 struct page **hpage) 2087 struct page **hpage)
2088 __releases(&khugepaged_mm_lock)
2089 __acquires(&khugepaged_mm_lock)
2029{ 2090{
2030 struct mm_slot *mm_slot; 2091 struct mm_slot *mm_slot;
2031 struct mm_struct *mm; 2092 struct mm_struct *mm;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bfcf153bc829..bb28a5f9db8d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,7 +24,7 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <asm/io.h> 27#include <linux/io.h>
28 28
29#include <linux/hugetlb.h> 29#include <linux/hugetlb.h>
30#include <linux/node.h> 30#include <linux/node.h>
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock);
62 * must either hold the mmap_sem for write, or the mmap_sem for read and 62 * must either hold the mmap_sem for write, or the mmap_sem for read and
63 * the hugetlb_instantiation mutex: 63 * the hugetlb_instantiation mutex:
64 * 64 *
65 * down_write(&mm->mmap_sem); 65 * down_write(&mm->mmap_sem);
66 * or 66 * or
67 * down_read(&mm->mmap_sem); 67 * down_read(&mm->mmap_sem);
68 * mutex_lock(&hugetlb_instantiation_mutex); 68 * mutex_lock(&hugetlb_instantiation_mutex);
69 */ 69 */
70struct file_region { 70struct file_region {
71 struct list_head link; 71 struct list_head link;
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page)
503 h->nr_huge_pages--; 503 h->nr_huge_pages--;
504 h->nr_huge_pages_node[page_to_nid(page)]--; 504 h->nr_huge_pages_node[page_to_nid(page)]--;
505 for (i = 0; i < pages_per_huge_page(h); i++) { 505 for (i = 0; i < pages_per_huge_page(h); i++) {
506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
507 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 507 1 << PG_referenced | 1 << PG_dirty |
508 1 << PG_private | 1<< PG_writeback); 508 1 << PG_active | 1 << PG_reserved |
509 1 << PG_private | 1 << PG_writeback);
509 } 510 }
510 set_compound_page_dtor(page, NULL); 511 set_compound_page_dtor(page, NULL);
511 set_page_refcounted(page); 512 set_page_refcounted(page);
@@ -591,7 +592,6 @@ int PageHuge(struct page *page)
591 592
592 return dtor == free_huge_page; 593 return dtor == free_huge_page;
593} 594}
594
595EXPORT_SYMBOL_GPL(PageHuge); 595EXPORT_SYMBOL_GPL(PageHuge);
596 596
597static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 597static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
@@ -1105,8 +1105,16 @@ static void __init gather_bootmem_prealloc(void)
1105 struct huge_bootmem_page *m; 1105 struct huge_bootmem_page *m;
1106 1106
1107 list_for_each_entry(m, &huge_boot_pages, list) { 1107 list_for_each_entry(m, &huge_boot_pages, list) {
1108 struct page *page = virt_to_page(m);
1109 struct hstate *h = m->hstate; 1108 struct hstate *h = m->hstate;
1109 struct page *page;
1110
1111#ifdef CONFIG_HIGHMEM
1112 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1113 free_bootmem_late((unsigned long)m,
1114 sizeof(struct huge_bootmem_page));
1115#else
1116 page = virt_to_page(m);
1117#endif
1110 __ClearPageReserved(page); 1118 __ClearPageReserved(page);
1111 WARN_ON(page_count(page) != 1); 1119 WARN_ON(page_count(page) != 1);
1112 prep_compound_huge_page(page, h->order); 1120 prep_compound_huge_page(page, h->order);
@@ -2124,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2124 pte_t entry; 2132 pte_t entry;
2125 2133
2126 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2134 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2127 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2135 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2128 update_mmu_cache(vma, address, ptep); 2136 update_mmu_cache(vma, address, ptep);
2129 }
2130} 2137}
2131 2138
2132 2139
@@ -2181,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte)
2181 if (huge_pte_none(pte) || pte_present(pte)) 2188 if (huge_pte_none(pte) || pte_present(pte))
2182 return 0; 2189 return 0;
2183 swp = pte_to_swp_entry(pte); 2190 swp = pte_to_swp_entry(pte);
2184 if (non_swap_entry(swp) && is_migration_entry(swp)) { 2191 if (non_swap_entry(swp) && is_migration_entry(swp))
2185 return 1; 2192 return 1;
2186 } else 2193 else
2187 return 0; 2194 return 0;
2188} 2195}
2189 2196
@@ -2194,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2194 if (huge_pte_none(pte) || pte_present(pte)) 2201 if (huge_pte_none(pte) || pte_present(pte))
2195 return 0; 2202 return 0;
2196 swp = pte_to_swp_entry(pte); 2203 swp = pte_to_swp_entry(pte);
2197 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { 2204 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2198 return 1; 2205 return 1;
2199 } else 2206 else
2200 return 0; 2207 return 0;
2201} 2208}
2202 2209
@@ -2415,6 +2422,8 @@ retry_avoidcopy:
2415 * anon_vma prepared. 2422 * anon_vma prepared.
2416 */ 2423 */
2417 if (unlikely(anon_vma_prepare(vma))) { 2424 if (unlikely(anon_vma_prepare(vma))) {
2425 page_cache_release(new_page);
2426 page_cache_release(old_page);
2418 /* Caller expects lock to be held */ 2427 /* Caller expects lock to be held */
2419 spin_lock(&mm->page_table_lock); 2428 spin_lock(&mm->page_table_lock);
2420 return VM_FAULT_OOM; 2429 return VM_FAULT_OOM;
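
The two added page_cache_release() calls plug a reference leak: when anon_vma_prepare() fails in hugetlb_cow(), the function bailed out with VM_FAULT_OOM while still holding the references it had taken on old_page and new_page. The general shape of the fix, shown with a hypothetical refcounted object rather than struct page:

#include <assert.h>

struct obj { int refcount; };

static void get_obj(struct obj *o) { o->refcount++; }
static void put_obj(struct obj *o) { assert(o->refcount > 0); o->refcount--; }

/* Returns 0 on success, -1 on failure; must not leak references. */
static int do_cow(struct obj *oldp, struct obj *newp, int prepare_fails)
{
	get_obj(oldp);
	get_obj(newp);

	if (prepare_fails) {
		/* error path: drop every reference taken above */
		put_obj(newp);
		put_obj(oldp);
		return -1;
	}

	/* ... the real copy work would happen here ... */
	put_obj(newp);
	put_obj(oldp);
	return 0;
}

int main(void)
{
	struct obj oldp = { 1 }, newp = { 1 };

	do_cow(&oldp, &newp, 1);
	assert(oldp.refcount == 1 && newp.refcount == 1); /* nothing leaked */
	return 0;
}
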
@@ -2559,7 +2568,7 @@ retry:
2559 * So we need to block hugepage fault by PG_hwpoison bit check. 2568 * So we need to block hugepage fault by PG_hwpoison bit check.
2560 */ 2569 */
2561 if (unlikely(PageHWPoison(page))) { 2570 if (unlikely(PageHWPoison(page))) {
2562 ret = VM_FAULT_HWPOISON | 2571 ret = VM_FAULT_HWPOISON |
2563 VM_FAULT_SET_HINDEX(h - hstates); 2572 VM_FAULT_SET_HINDEX(h - hstates);
2564 goto backout_unlocked; 2573 goto backout_unlocked;
2565 } 2574 }
@@ -2627,7 +2636,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2627 migration_entry_wait(mm, (pmd_t *)ptep, address); 2636 migration_entry_wait(mm, (pmd_t *)ptep, address);
2628 return 0; 2637 return 0;
2629 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2638 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2630 return VM_FAULT_HWPOISON_LARGE | 2639 return VM_FAULT_HWPOISON_LARGE |
2631 VM_FAULT_SET_HINDEX(h - hstates); 2640 VM_FAULT_SET_HINDEX(h - hstates);
2632 } 2641 }
2633 2642
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 4019979b2637..a56a851908d2 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,7 +5,7 @@
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7 7
8#include <asm/atomic.h> 8#include <linux/atomic.h>
9#include <asm/pgtable.h> 9#include <asm/pgtable.h>
10#include <asm/mmu.h> 10#include <asm/mmu.h>
11 11
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb49..2189af491783 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38} 38}
39 39
40static inline void __get_page_tail_foll(struct page *page,
41 bool get_page_head)
42{
43 /*
44 * If we're getting a tail page, the elevated page->_count is
45 * required only in the head page and we will elevate the head
46 * page->_count and tail page->_mapcount.
47 *
48 * We elevate page_tail->_mapcount for tail pages to force
49 * page_tail->_count to be zero at all times to avoid getting
50 * false positives from get_page_unless_zero() with
51 * speculative page access (like in
52 * page_cache_get_speculative()) on tail pages.
53 */
54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
55 VM_BUG_ON(atomic_read(&page->_count) != 0);
56 VM_BUG_ON(page_mapcount(page) < 0);
57 if (get_page_head)
58 atomic_inc(&page->first_page->_count);
59 atomic_inc(&page->_mapcount);
60}
61
62/*
63 * This is meant to be called as the FOLL_GET operation of
64 * follow_page() and it must be called while holding the proper PT
65 * lock while the pte (or pmd_trans_huge) is still mapping the page.
66 */
67static inline void get_page_foll(struct page *page)
68{
69 if (unlikely(PageTail(page)))
70 /*
71 * This is safe only because
72 * __split_huge_page_refcount() can't run under
73 * get_page_foll() because we hold the proper PT lock.
74 */
75 __get_page_tail_foll(page, true);
76 else {
77 /*
78 * Getting a normal page or the head of a compound page
79 * requires to already have an elevated page->_count.
80 */
81 VM_BUG_ON(atomic_read(&page->_count) <= 0);
82 atomic_inc(&page->_count);
83 }
84}
85
40extern unsigned long highest_memmap_pfn; 86extern unsigned long highest_memmap_pfn;
41 87
42/* 88/*
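
The new get_page_foll()/__get_page_tail_foll() helpers spell out the compound-page refcounting rule described in the comment: a tail page keeps its own _count at zero, so speculative get_page_unless_zero() users can never grab it, and an extra reference instead elevates the head page's _count while being recorded in the tail's _mapcount. A compressed userspace model of that rule, with plain ints standing in for the atomic counters and the asserts mirroring the VM_BUG_ONs:

#include <assert.h>
#include <stdbool.h>

struct page {
	int count;               /* page->_count */
	int mapcount;            /* page->_mapcount (base offset ignored) */
	bool tail;               /* PageTail() */
	struct page *first_page; /* head page, valid only for tails */
};

/* Model of __get_page_tail_foll(page, true). */
static void get_tail_foll(struct page *page)
{
	assert(page->tail);
	assert(page->first_page->count > 0);
	assert(page->count == 0);        /* tails keep _count at zero */
	page->first_page->count++;       /* the reference pins the head... */
	page->mapcount++;                /* ...and is recorded in the tail */
}

/* Model of get_page_foll(): called with the page-table lock held. */
static void get_page_foll(struct page *page)
{
	if (page->tail)
		get_tail_foll(page);
	else {
		assert(page->count > 0); /* head/normal pages are already pinned */
		page->count++;
	}
}

int main(void)
{
	struct page head = { .count = 1 };
	struct page tail = { .tail = true, .first_page = &head };

	get_page_foll(&tail);
	assert(head.count == 2 && tail.count == 0 && tail.mapcount == 1);
	get_page_foll(&head);
	assert(head.count == 3);
	return 0;
}
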
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index aacee45616fc..f3b2a00fe9c1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -69,7 +69,7 @@
69#include <linux/sched.h> 69#include <linux/sched.h>
70#include <linux/jiffies.h> 70#include <linux/jiffies.h>
71#include <linux/delay.h> 71#include <linux/delay.h>
72#include <linux/module.h> 72#include <linux/export.h>
73#include <linux/kthread.h> 73#include <linux/kthread.h>
74#include <linux/prio_tree.h> 74#include <linux/prio_tree.h>
75#include <linux/fs.h> 75#include <linux/fs.h>
@@ -96,7 +96,7 @@
96 96
97#include <asm/sections.h> 97#include <asm/sections.h>
98#include <asm/processor.h> 98#include <asm/processor.h>
99#include <asm/atomic.h> 99#include <linux/atomic.h>
100 100
101#include <linux/kmemcheck.h> 101#include <linux/kmemcheck.h>
102#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index 9a68b0cf0a1c..310544a379ae 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1905,7 +1905,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1905 1905
1906 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1906 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1907 err = unmerge_and_remove_all_rmap_items(); 1907 err = unmerge_and_remove_all_rmap_items();
1908 test_set_oom_score_adj(oom_score_adj); 1908 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
1909 oom_score_adj);
1909 if (err) { 1910 if (err) {
1910 ksm_run = KSM_RUN_STOP; 1911 ksm_run = KSM_RUN_STOP;
1911 count = err; 1912 count = err;
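
The ksm change stops run_store() from blindly writing the saved oom_score_adj back: compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj) restores the old value only if the score is still the OOM_SCORE_ADJ_MAX that run_store() itself installed, so an adjustment made by somebody else while unmerging ran is not clobbered. The same restore-only-if-unchanged idea in portable C11, with a stand-in shared score variable:

#include <assert.h>
#include <stdatomic.h>

#define SCORE_ADJ_MAX 1000

static _Atomic int score_adj = 0;

/* test_set_*() analogue: install @new, return the previous value. */
static int test_set_score(int new)
{
	return atomic_exchange(&score_adj, new);
}

/* compare_swap_*() analogue: restore @new only if still @expected. */
static void compare_swap_score(int expected, int new)
{
	atomic_compare_exchange_strong(&score_adj, &expected, new);
}

int main(void)
{
	int saved = test_set_score(SCORE_ADJ_MAX);   /* saved == 0 */

	/* Nobody touched it in between: the old value comes back. */
	compare_swap_score(SCORE_ADJ_MAX, saved);
	assert(atomic_load(&score_adj) == 0);

	/* A concurrent writer changed it: leave their value alone. */
	test_set_score(SCORE_ADJ_MAX);
	atomic_store(&score_adj, -500);              /* someone else's update */
	compare_swap_score(SCORE_ADJ_MAX, saved);
	assert(atomic_load(&score_adj) == -500);
	return 0;
}
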
diff --git a/mm/maccess.c b/mm/maccess.c
index 4cee182ab5f3..d53adf9ba84b 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Access kernel memory without faulting. 2 * Access kernel memory without faulting.
3 */ 3 */
4#include <linux/module.h> 4#include <linux/export.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/uaccess.h> 6#include <linux/uaccess.h>
7 7
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed503..74bf193eff04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
218 endoff = (loff_t)(end - vma->vm_start - 1) 218 endoff = (loff_t)(end - vma->vm_start - 1)
219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
220 220
221 /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ 221 /* vmtruncate_range needs to take i_mutex */
222 up_read(&current->mm->mmap_sem); 222 up_read(&current->mm->mmap_sem);
223 error = vmtruncate_range(mapping->host, offset, endoff); 223 error = vmtruncate_range(mapping->host, offset, endoff);
224 down_read(&current->mm->mmap_sem); 224 down_read(&current->mm->mmap_sem);
diff --git a/mm/memblock.c b/mm/memblock.c
index a75723d62631..a57092f63a86 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -47,7 +47,8 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
47 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 47 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
48} 48}
49 49
50long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) 50static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
51 phys_addr_t base, phys_addr_t size)
51{ 52{
52 unsigned long i; 53 unsigned long i;
53 54
@@ -773,6 +774,12 @@ phys_addr_t __init memblock_phys_mem_size(void)
773 return memblock.memory_size; 774 return memblock.memory_size;
774} 775}
775 776
777/* lowest address */
778phys_addr_t __init_memblock memblock_start_of_DRAM(void)
779{
780 return memblock.memory.regions[0].base;
781}
782
776phys_addr_t __init_memblock memblock_end_of_DRAM(void) 783phys_addr_t __init_memblock memblock_end_of_DRAM(void)
777{ 784{
778 int idx = memblock.memory.cnt - 1; 785 int idx = memblock.memory.cnt - 1;
@@ -912,9 +919,9 @@ void __init memblock_analyze(void)
912 919
913 /* Check marker in the unused last array entry */ 920 /* Check marker in the unused last array entry */
914 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base 921 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
915 != (phys_addr_t)RED_INACTIVE); 922 != MEMBLOCK_INACTIVE);
916 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base 923 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
917 != (phys_addr_t)RED_INACTIVE); 924 != MEMBLOCK_INACTIVE);
918 925
919 memblock.memory_size = 0; 926 memblock.memory_size = 0;
920 927
@@ -940,8 +947,8 @@ void __init memblock_init(void)
940 memblock.reserved.max = INIT_MEMBLOCK_REGIONS; 947 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
941 948
942 /* Write a marker in the unused last array entry */ 949 /* Write a marker in the unused last array entry */
943 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 950 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
944 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 951 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
945 952
946 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 953 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
947 * This simplifies the memblock_add() code below... 954 * This simplifies the memblock_add() code below...
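
memblock_start_of_DRAM() is the counterpart to memblock_end_of_DRAM(): with the region array kept sorted by base address, the lowest RAM address is simply regions[0].base and the highest is the end of the last populated region, while the marker written into the never-used last slot (MEMBLOCK_INACTIVE here, RED_INACTIVE before) lets memblock_analyze() catch accidental writes past the array. A compact model of both ideas; the region layout and the poison value are invented for the sketch:

#include <assert.h>
#include <stdint.h>

#define INIT_REGIONS  4
#define POISON        0xdeadbeefdeadbeefULL  /* arbitrary sentinel value */

struct region { uint64_t base, size; };

/* One extra slot holds the sentinel; regions[] is sorted by base. */
static struct region regions[INIT_REGIONS + 1] = {
	{ 0x00100000,  0x3ff00000 },   /* 1 MiB .. 1 GiB */
	{ 0x100000000, 0x40000000 },   /* 4 GiB .. 5 GiB */
	{ 0, 0 }, { 0, 0 },
	{ POISON, 0 },                 /* must never be overwritten */
};
static int nr_regions = 2;

static uint64_t start_of_dram(void) { return regions[0].base; }

static uint64_t end_of_dram(void)
{
	int last = nr_regions - 1;
	return regions[last].base + regions[last].size;
}

int main(void)
{
	assert(start_of_dram() == 0x00100000);
	assert(end_of_dram()   == 0x140000000ULL);
	/* memblock_analyze()-style sanity check on the sentinel. */
	assert(regions[INIT_REGIONS].base == POISON);
	return 0;
}
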
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e57d25..6aff93c98aca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,9 +33,9 @@
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h> 34#include <linux/rcupdate.h>
35#include <linux/limits.h> 35#include <linux/limits.h>
36#include <linux/export.h>
36#include <linux/mutex.h> 37#include <linux/mutex.h>
37#include <linux/rbtree.h> 38#include <linux/rbtree.h>
38#include <linux/shmem_fs.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/swap.h> 40#include <linux/swap.h>
41#include <linux/swapops.h> 41#include <linux/swapops.h>
@@ -202,8 +202,8 @@ struct mem_cgroup_eventfd_list {
202 struct eventfd_ctx *eventfd; 202 struct eventfd_ctx *eventfd;
203}; 203};
204 204
205static void mem_cgroup_threshold(struct mem_cgroup *mem); 205static void mem_cgroup_threshold(struct mem_cgroup *memcg);
206static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 206static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
207 207
208/* 208/*
209 * The memory controller data structure. The memory controller controls both 209 * The memory controller data structure. The memory controller controls both
@@ -246,10 +246,13 @@ struct mem_cgroup {
246 * Should the accounting and control be hierarchical, per subtree? 246 * Should the accounting and control be hierarchical, per subtree?
247 */ 247 */
248 bool use_hierarchy; 248 bool use_hierarchy;
249 atomic_t oom_lock; 249
250 bool oom_lock;
251 atomic_t under_oom;
252
250 atomic_t refcnt; 253 atomic_t refcnt;
251 254
252 unsigned int swappiness; 255 int swappiness;
253 /* OOM-Killer disable */ 256 /* OOM-Killer disable */
254 int oom_kill_disable; 257 int oom_kill_disable;
255 258
@@ -360,29 +363,29 @@ enum charge_type {
360#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 363#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
361#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 364#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
362 365
363static void mem_cgroup_get(struct mem_cgroup *mem); 366static void mem_cgroup_get(struct mem_cgroup *memcg);
364static void mem_cgroup_put(struct mem_cgroup *mem); 367static void mem_cgroup_put(struct mem_cgroup *memcg);
365static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 368static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
366static void drain_all_stock_async(struct mem_cgroup *mem); 369static void drain_all_stock_async(struct mem_cgroup *memcg);
367 370
368static struct mem_cgroup_per_zone * 371static struct mem_cgroup_per_zone *
369mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 372mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
370{ 373{
371 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 374 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
372} 375}
373 376
374struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 377struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
375{ 378{
376 return &mem->css; 379 return &memcg->css;
377} 380}
378 381
379static struct mem_cgroup_per_zone * 382static struct mem_cgroup_per_zone *
380page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) 383page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
381{ 384{
382 int nid = page_to_nid(page); 385 int nid = page_to_nid(page);
383 int zid = page_zonenum(page); 386 int zid = page_zonenum(page);
384 387
385 return mem_cgroup_zoneinfo(mem, nid, zid); 388 return mem_cgroup_zoneinfo(memcg, nid, zid);
386} 389}
387 390
388static struct mem_cgroup_tree_per_zone * 391static struct mem_cgroup_tree_per_zone *
@@ -401,7 +404,7 @@ soft_limit_tree_from_page(struct page *page)
401} 404}
402 405
403static void 406static void
404__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 407__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
405 struct mem_cgroup_per_zone *mz, 408 struct mem_cgroup_per_zone *mz,
406 struct mem_cgroup_tree_per_zone *mctz, 409 struct mem_cgroup_tree_per_zone *mctz,
407 unsigned long long new_usage_in_excess) 410 unsigned long long new_usage_in_excess)
@@ -435,7 +438,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
435} 438}
436 439
437static void 440static void
438__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 441__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
439 struct mem_cgroup_per_zone *mz, 442 struct mem_cgroup_per_zone *mz,
440 struct mem_cgroup_tree_per_zone *mctz) 443 struct mem_cgroup_tree_per_zone *mctz)
441{ 444{
@@ -446,17 +449,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
446} 449}
447 450
448static void 451static void
449mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 452mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
450 struct mem_cgroup_per_zone *mz, 453 struct mem_cgroup_per_zone *mz,
451 struct mem_cgroup_tree_per_zone *mctz) 454 struct mem_cgroup_tree_per_zone *mctz)
452{ 455{
453 spin_lock(&mctz->lock); 456 spin_lock(&mctz->lock);
454 __mem_cgroup_remove_exceeded(mem, mz, mctz); 457 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
455 spin_unlock(&mctz->lock); 458 spin_unlock(&mctz->lock);
456} 459}
457 460
458 461
459static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 462static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
460{ 463{
461 unsigned long long excess; 464 unsigned long long excess;
462 struct mem_cgroup_per_zone *mz; 465 struct mem_cgroup_per_zone *mz;
@@ -469,9 +472,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
469 * Necessary to update all ancestors when hierarchy is used. 472 * Necessary to update all ancestors when hierarchy is used.
470 * because their event counter is not touched. 473 * because their event counter is not touched.
471 */ 474 */
472 for (; mem; mem = parent_mem_cgroup(mem)) { 475 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
473 mz = mem_cgroup_zoneinfo(mem, nid, zid); 476 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
474 excess = res_counter_soft_limit_excess(&mem->res); 477 excess = res_counter_soft_limit_excess(&memcg->res);
475 /* 478 /*
476 * We have to update the tree if mz is on RB-tree or 479 * We have to update the tree if mz is on RB-tree or
477 * mem is over its softlimit. 480 * mem is over its softlimit.
@@ -480,18 +483,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
480 spin_lock(&mctz->lock); 483 spin_lock(&mctz->lock);
481 /* if on-tree, remove it */ 484 /* if on-tree, remove it */
482 if (mz->on_tree) 485 if (mz->on_tree)
483 __mem_cgroup_remove_exceeded(mem, mz, mctz); 486 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
484 /* 487 /*
485 * Insert again. mz->usage_in_excess will be updated. 488 * Insert again. mz->usage_in_excess will be updated.
486 * If excess is 0, no tree ops. 489 * If excess is 0, no tree ops.
487 */ 490 */
488 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 491 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
489 spin_unlock(&mctz->lock); 492 spin_unlock(&mctz->lock);
490 } 493 }
491 } 494 }
492} 495}
493 496
494static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 497static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
495{ 498{
496 int node, zone; 499 int node, zone;
497 struct mem_cgroup_per_zone *mz; 500 struct mem_cgroup_per_zone *mz;
@@ -499,9 +502,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
499 502
500 for_each_node_state(node, N_POSSIBLE) { 503 for_each_node_state(node, N_POSSIBLE) {
501 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 504 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
502 mz = mem_cgroup_zoneinfo(mem, node, zone); 505 mz = mem_cgroup_zoneinfo(memcg, node, zone);
503 mctz = soft_limit_tree_node_zone(node, zone); 506 mctz = soft_limit_tree_node_zone(node, zone);
504 mem_cgroup_remove_exceeded(mem, mz, mctz); 507 mem_cgroup_remove_exceeded(memcg, mz, mctz);
505 } 508 }
506 } 509 }
507} 510}
@@ -562,7 +565,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
562 * common workload, threashold and synchonization as vmstat[] should be 565 * common workload, threashold and synchonization as vmstat[] should be
563 * implemented. 566 * implemented.
564 */ 567 */
565static long mem_cgroup_read_stat(struct mem_cgroup *mem, 568static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
566 enum mem_cgroup_stat_index idx) 569 enum mem_cgroup_stat_index idx)
567{ 570{
568 long val = 0; 571 long val = 0;
@@ -570,111 +573,131 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
570 573
571 get_online_cpus(); 574 get_online_cpus();
572 for_each_online_cpu(cpu) 575 for_each_online_cpu(cpu)
573 val += per_cpu(mem->stat->count[idx], cpu); 576 val += per_cpu(memcg->stat->count[idx], cpu);
574#ifdef CONFIG_HOTPLUG_CPU 577#ifdef CONFIG_HOTPLUG_CPU
575 spin_lock(&mem->pcp_counter_lock); 578 spin_lock(&memcg->pcp_counter_lock);
576 val += mem->nocpu_base.count[idx]; 579 val += memcg->nocpu_base.count[idx];
577 spin_unlock(&mem->pcp_counter_lock); 580 spin_unlock(&memcg->pcp_counter_lock);
578#endif 581#endif
579 put_online_cpus(); 582 put_online_cpus();
580 return val; 583 return val;
581} 584}
582 585
583static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 586static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
584 bool charge) 587 bool charge)
585{ 588{
586 int val = (charge) ? 1 : -1; 589 int val = (charge) ? 1 : -1;
587 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 590 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
588} 591}
589 592
590void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) 593void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
591{ 594{
592 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); 595 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
593} 596}
594 597
595void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) 598void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
596{ 599{
597 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); 600 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
598} 601}
599 602
600static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, 603static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
601 enum mem_cgroup_events_index idx) 604 enum mem_cgroup_events_index idx)
602{ 605{
603 unsigned long val = 0; 606 unsigned long val = 0;
604 int cpu; 607 int cpu;
605 608
606 for_each_online_cpu(cpu) 609 for_each_online_cpu(cpu)
607 val += per_cpu(mem->stat->events[idx], cpu); 610 val += per_cpu(memcg->stat->events[idx], cpu);
608#ifdef CONFIG_HOTPLUG_CPU 611#ifdef CONFIG_HOTPLUG_CPU
609 spin_lock(&mem->pcp_counter_lock); 612 spin_lock(&memcg->pcp_counter_lock);
610 val += mem->nocpu_base.events[idx]; 613 val += memcg->nocpu_base.events[idx];
611 spin_unlock(&mem->pcp_counter_lock); 614 spin_unlock(&memcg->pcp_counter_lock);
612#endif 615#endif
613 return val; 616 return val;
614} 617}
615 618
616static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 619static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
617 bool file, int nr_pages) 620 bool file, int nr_pages)
618{ 621{
619 preempt_disable(); 622 preempt_disable();
620 623
621 if (file) 624 if (file)
622 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); 625 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
626 nr_pages);
623 else 627 else
624 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); 628 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
629 nr_pages);
625 630
626 /* pagein of a big page is an event. So, ignore page size */ 631 /* pagein of a big page is an event. So, ignore page size */
627 if (nr_pages > 0) 632 if (nr_pages > 0)
628 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 633 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
629 else { 634 else {
630 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 635 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
631 nr_pages = -nr_pages; /* for event */ 636 nr_pages = -nr_pages; /* for event */
632 } 637 }
633 638
634 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 639 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
635 640
636 preempt_enable(); 641 preempt_enable();
637} 642}
638 643
639static unsigned long 644unsigned long
640mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) 645mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
646 unsigned int lru_mask)
641{ 647{
642 struct mem_cgroup_per_zone *mz; 648 struct mem_cgroup_per_zone *mz;
649 enum lru_list l;
650 unsigned long ret = 0;
651
652 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
653
654 for_each_lru(l) {
655 if (BIT(l) & lru_mask)
656 ret += MEM_CGROUP_ZSTAT(mz, l);
657 }
658 return ret;
659}
660
661static unsigned long
662mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
663 int nid, unsigned int lru_mask)
664{
643 u64 total = 0; 665 u64 total = 0;
644 int zid; 666 int zid;
645 667
646 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 668 for (zid = 0; zid < MAX_NR_ZONES; zid++)
647 mz = mem_cgroup_zoneinfo(mem, nid, zid); 669 total += mem_cgroup_zone_nr_lru_pages(memcg,
648 total += MEM_CGROUP_ZSTAT(mz, idx); 670 nid, zid, lru_mask);
649 } 671
650 return total; 672 return total;
651} 673}
652static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 674
653 enum lru_list idx) 675static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
676 unsigned int lru_mask)
654{ 677{
655 int nid; 678 int nid;
656 u64 total = 0; 679 u64 total = 0;
657 680
658 for_each_online_node(nid) 681 for_each_node_state(nid, N_HIGH_MEMORY)
659 total += mem_cgroup_get_zonestat_node(mem, nid, idx); 682 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
660 return total; 683 return total;
661} 684}
662 685
663static bool __memcg_event_check(struct mem_cgroup *mem, int target) 686static bool __memcg_event_check(struct mem_cgroup *memcg, int target)
664{ 687{
665 unsigned long val, next; 688 unsigned long val, next;
666 689
667 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 690 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
668 next = this_cpu_read(mem->stat->targets[target]); 691 next = __this_cpu_read(memcg->stat->targets[target]);
669 /* from time_after() in jiffies.h */ 692 /* from time_after() in jiffies.h */
670 return ((long)next - (long)val < 0); 693 return ((long)next - (long)val < 0);
671} 694}
672 695
673static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) 696static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
674{ 697{
675 unsigned long val, next; 698 unsigned long val, next;
676 699
677 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 700 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
678 701
679 switch (target) { 702 switch (target) {
680 case MEM_CGROUP_TARGET_THRESH: 703 case MEM_CGROUP_TARGET_THRESH:
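
The reworked mem_cgroup_zone_nr_lru_pages() replaces the old family of single-purpose helpers with one function driven by an LRU bitmask: callers pass something like BIT(LRU_INACTIVE_ANON) or LRU_ALL_FILE, the per-zone counters of every selected list are summed, and the node-wide and memcg-wide totals are just loops around that same call. A userspace sketch of the mask-driven summation (the counter layout is invented):

#include <assert.h>

#define BIT(n) (1U << (n))

enum lru_list {
	LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
	LRU_INACTIVE_FILE, LRU_ACTIVE_FILE,
	LRU_UNEVICTABLE, NR_LRU_LISTS
};

#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))

#define NR_NODES 2
#define NR_ZONES 3

/* zstat[node][zone][lru]: toy per-zone LRU page counts */
static unsigned long zstat[NR_NODES][NR_ZONES][NR_LRU_LISTS];

static unsigned long zone_nr_lru_pages(int nid, int zid, unsigned int mask)
{
	unsigned long ret = 0;
	int l;

	for (l = 0; l < NR_LRU_LISTS; l++)
		if (BIT(l) & mask)
			ret += zstat[nid][zid][l];
	return ret;
}

static unsigned long node_nr_lru_pages(int nid, unsigned int mask)
{
	unsigned long total = 0;
	int zid;

	for (zid = 0; zid < NR_ZONES; zid++)
		total += zone_nr_lru_pages(nid, zid, mask);
	return total;
}

int main(void)
{
	zstat[0][1][LRU_ACTIVE_FILE]   = 100;
	zstat[0][2][LRU_INACTIVE_FILE] = 40;
	zstat[1][0][LRU_ACTIVE_ANON]   = 7;

	assert(node_nr_lru_pages(0, LRU_ALL_FILE) == 140);
	assert(node_nr_lru_pages(1, LRU_ALL_ANON) == 7);
	assert(node_nr_lru_pages(1, LRU_ALL_FILE) == 0);
	return 0;
}
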
@@ -690,34 +713,36 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
690 return; 713 return;
691 } 714 }
692 715
693 this_cpu_write(mem->stat->targets[target], next); 716 __this_cpu_write(memcg->stat->targets[target], next);
694} 717}
695 718
696/* 719/*
697 * Check events in order. 720 * Check events in order.
698 * 721 *
699 */ 722 */
700static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 723static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
701{ 724{
725 preempt_disable();
702 /* threshold event is triggered in finer grain than soft limit */ 726 /* threshold event is triggered in finer grain than soft limit */
703 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { 727 if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) {
704 mem_cgroup_threshold(mem); 728 mem_cgroup_threshold(memcg);
705 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); 729 __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH);
706 if (unlikely(__memcg_event_check(mem, 730 if (unlikely(__memcg_event_check(memcg,
707 MEM_CGROUP_TARGET_SOFTLIMIT))) { 731 MEM_CGROUP_TARGET_SOFTLIMIT))) {
708 mem_cgroup_update_tree(mem, page); 732 mem_cgroup_update_tree(memcg, page);
709 __mem_cgroup_target_update(mem, 733 __mem_cgroup_target_update(memcg,
710 MEM_CGROUP_TARGET_SOFTLIMIT); 734 MEM_CGROUP_TARGET_SOFTLIMIT);
711 } 735 }
712#if MAX_NUMNODES > 1 736#if MAX_NUMNODES > 1
713 if (unlikely(__memcg_event_check(mem, 737 if (unlikely(__memcg_event_check(memcg,
714 MEM_CGROUP_TARGET_NUMAINFO))) { 738 MEM_CGROUP_TARGET_NUMAINFO))) {
715 atomic_inc(&mem->numainfo_events); 739 atomic_inc(&memcg->numainfo_events);
716 __mem_cgroup_target_update(mem, 740 __mem_cgroup_target_update(memcg,
717 MEM_CGROUP_TARGET_NUMAINFO); 741 MEM_CGROUP_TARGET_NUMAINFO);
718 } 742 }
719#endif 743#endif
720 } 744 }
745 preempt_enable();
721} 746}
722 747
723static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 748static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
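
memcg_check_events() now runs under preempt_disable() so that the cheaper non-preempt-safe __this_cpu_* accessors can be used, and the threshold test it relies on is the time_after()-style comparison shown in __memcg_event_check(): (long)next - (long)val < 0 stays correct even when the event counter wraps around. A standalone illustration of that signed-difference check plus the target update, loosely modelled on __memcg_event_check()/__mem_cgroup_target_update() with an arbitrary interval:

#include <assert.h>
#include <stdbool.h>

#define EVENTS_TARGET 128      /* arbitrary interval for this sketch */

static unsigned long events;   /* stands in for the per-cpu event counter */
static unsigned long target;   /* next value at which work is due */

/* __memcg_event_check() analogue: has the counter passed the target? */
static bool event_check(void)
{
	return (long)target - (long)events < 0;
}

/* __mem_cgroup_target_update() analogue: schedule the next check. */
static void target_update(void)
{
	target = events + EVENTS_TARGET;
}

int main(void)
{
	/* Works even right at the wraparound point of an unsigned long. */
	events = (unsigned long)-10;   /* ULONG_MAX - 9 */
	target_update();               /* target wraps to a small value */
	assert(!event_check());

	events += 200;                 /* counter wraps past the target */
	assert(event_check());
	target_update();
	assert(!event_check());
	return 0;
}
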
@@ -743,7 +768,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
743 768
744struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 769struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
745{ 770{
746 struct mem_cgroup *mem = NULL; 771 struct mem_cgroup *memcg = NULL;
747 772
748 if (!mm) 773 if (!mm)
749 return NULL; 774 return NULL;
@@ -754,25 +779,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
754 */ 779 */
755 rcu_read_lock(); 780 rcu_read_lock();
756 do { 781 do {
757 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 782 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
758 if (unlikely(!mem)) 783 if (unlikely(!memcg))
759 break; 784 break;
760 } while (!css_tryget(&mem->css)); 785 } while (!css_tryget(&memcg->css));
761 rcu_read_unlock(); 786 rcu_read_unlock();
762 return mem; 787 return memcg;
763} 788}
764 789
765/* The caller has to guarantee "mem" exists before calling this */ 790/* The caller has to guarantee "mem" exists before calling this */
766static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) 791static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg)
767{ 792{
768 struct cgroup_subsys_state *css; 793 struct cgroup_subsys_state *css;
769 int found; 794 int found;
770 795
771 if (!mem) /* ROOT cgroup has the smallest ID */ 796 if (!memcg) /* ROOT cgroup has the smallest ID */
772 return root_mem_cgroup; /*css_put/get against root is ignored*/ 797 return root_mem_cgroup; /*css_put/get against root is ignored*/
773 if (!mem->use_hierarchy) { 798 if (!memcg->use_hierarchy) {
774 if (css_tryget(&mem->css)) 799 if (css_tryget(&memcg->css))
775 return mem; 800 return memcg;
776 return NULL; 801 return NULL;
777 } 802 }
778 rcu_read_lock(); 803 rcu_read_lock();
@@ -780,13 +805,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
780 * searching a memory cgroup which has the smallest ID under given 805 * searching a memory cgroup which has the smallest ID under given
781 * ROOT cgroup. (ID >= 1) 806 * ROOT cgroup. (ID >= 1)
782 */ 807 */
783 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); 808 css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
784 if (css && css_tryget(css)) 809 if (css && css_tryget(css))
785 mem = container_of(css, struct mem_cgroup, css); 810 memcg = container_of(css, struct mem_cgroup, css);
786 else 811 else
787 mem = NULL; 812 memcg = NULL;
788 rcu_read_unlock(); 813 rcu_read_unlock();
789 return mem; 814 return memcg;
790} 815}
791 816
792static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 817static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
@@ -840,29 +865,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
840 for_each_mem_cgroup_tree_cond(iter, NULL, true) 865 for_each_mem_cgroup_tree_cond(iter, NULL, true)
841 866
842 867
843static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 868static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
844{ 869{
845 return (mem == root_mem_cgroup); 870 return (memcg == root_mem_cgroup);
846} 871}
847 872
848void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 873void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
849{ 874{
850 struct mem_cgroup *mem; 875 struct mem_cgroup *memcg;
851 876
852 if (!mm) 877 if (!mm)
853 return; 878 return;
854 879
855 rcu_read_lock(); 880 rcu_read_lock();
856 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 881 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
857 if (unlikely(!mem)) 882 if (unlikely(!memcg))
858 goto out; 883 goto out;
859 884
860 switch (idx) { 885 switch (idx) {
861 case PGMAJFAULT: 886 case PGMAJFAULT:
862 mem_cgroup_pgmajfault(mem, 1); 887 mem_cgroup_pgmajfault(memcg, 1);
863 break; 888 break;
864 case PGFAULT: 889 case PGFAULT:
865 mem_cgroup_pgfault(mem, 1); 890 mem_cgroup_pgfault(memcg, 1);
866 break; 891 break;
867 default: 892 default:
868 BUG(); 893 BUG();
@@ -971,6 +996,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
971 return; 996 return;
972 pc = lookup_page_cgroup(page); 997 pc = lookup_page_cgroup(page);
973 VM_BUG_ON(PageCgroupAcctLRU(pc)); 998 VM_BUG_ON(PageCgroupAcctLRU(pc));
999 /*
1000 * putback: charge:
1001 * SetPageLRU SetPageCgroupUsed
1002 * smp_mb smp_mb
1003 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1004 *
1005 * Ensure that one of the two sides adds the page to the memcg
1006 * LRU during a race.
1007 */
1008 smp_mb();
974 if (!PageCgroupUsed(pc)) 1009 if (!PageCgroupUsed(pc))
975 return; 1010 return;
976 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1011 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
@@ -1022,7 +1057,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page)
1022 unsigned long flags; 1057 unsigned long flags;
1023 struct zone *zone = page_zone(page); 1058 struct zone *zone = page_zone(page);
1024 struct page_cgroup *pc = lookup_page_cgroup(page); 1059 struct page_cgroup *pc = lookup_page_cgroup(page);
1025 1060 /*
1061 * putback: charge:
1062 * SetPageLRU SetPageCgroupUsed
1063 * smp_mb smp_mb
1064 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1065 *
1066 * Ensure that one of the two sides adds the page to the memcg
1067 * LRU during a race.
1068 */
1069 smp_mb();
1026 /* taking care of that the page is added to LRU while we commit it */ 1070 /* taking care of that the page is added to LRU while we commit it */
1027 if (likely(!PageLRU(page))) 1071 if (likely(!PageLRU(page)))
1028 return; 1072 return;
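
The comment added in both LRU hooks documents a classic pairing of full barriers: the putback side sets PageLRU, issues smp_mb(), then tests PageCgroupUsed, while the charge side sets PageCgroupUsed, issues smp_mb(), then tests PageLRU, so however the two race, at least one of them observes the other's flag and links the page onto the memcg LRU. The same guarantee demonstrated with C11 seq_cst fences and two threads; this is a model of the idea, not the kernel code:

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

static atomic_bool page_lru;      /* "PageLRU" flag        */
static atomic_bool cgroup_used;   /* "PageCgroupUsed" flag */
static atomic_int  linked;        /* how many sides linked the page */

static void *putback(void *arg)
{
	(void)arg;
	atomic_store_explicit(&page_lru, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);           /* smp_mb() */
	if (atomic_load_explicit(&cgroup_used, memory_order_relaxed))
		atomic_fetch_add(&linked, 1);                /* add to memcg LRU */
	return NULL;
}

static void *charge(void *arg)
{
	(void)arg;
	atomic_store_explicit(&cgroup_used, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);           /* smp_mb() */
	if (atomic_load_explicit(&page_lru, memory_order_relaxed))
		atomic_fetch_add(&linked, 1);                /* add to memcg LRU */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, putback, NULL);
	pthread_create(&b, NULL, charge, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With both fences in place, the two loads cannot both miss. */
	assert(atomic_load(&linked) >= 1);
	return 0;
}
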
@@ -1043,7 +1087,22 @@ void mem_cgroup_move_lists(struct page *page,
1043 mem_cgroup_add_lru_list(page, to); 1087 mem_cgroup_add_lru_list(page, to);
1044} 1088}
1045 1089
1046int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1090/*
1091 * Checks whether given mem is same or in the root_mem_cgroup's
1092 * hierarchy subtree
1093 */
1094static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1095 struct mem_cgroup *memcg)
1096{
1097 if (root_memcg != memcg) {
1098 return (root_memcg->use_hierarchy &&
1099 css_is_ancestor(&memcg->css, &root_memcg->css));
1100 }
1101
1102 return true;
1103}
1104
1105int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1047{ 1106{
1048 int ret; 1107 int ret;
1049 struct mem_cgroup *curr = NULL; 1108 struct mem_cgroup *curr = NULL;
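
mem_cgroup_same_or_subtree() centralizes a test that used to be open-coded: is memcg the root itself, or, when the root uses hierarchy, a descendant of it? Both task_in_mem_cgroup() and mem_cgroup_under_move() further down are rewritten on top of it. A toy model of the check using explicit parent pointers in place of css_is_ancestor():

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct memcg {
	struct memcg *parent;
	bool use_hierarchy;
};

static bool is_ancestor(const struct memcg *anc, const struct memcg *m)
{
	for (; m; m = m->parent)
		if (m == anc)
			return true;
	return false;
}

static bool same_or_subtree(const struct memcg *root, const struct memcg *m)
{
	if (root != m)
		return root->use_hierarchy && is_ancestor(root, m);
	return true;
}

int main(void)
{
	struct memcg root  = { NULL, true };
	struct memcg child = { &root, true };
	struct memcg other = { NULL, true };

	assert(same_or_subtree(&root, &root));
	assert(same_or_subtree(&root, &child));
	assert(!same_or_subtree(&root, &other));

	root.use_hierarchy = false;    /* flat mode: only identity matches */
	assert(!same_or_subtree(&root, &child));
	return 0;
}
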
@@ -1057,28 +1116,29 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1057 if (!curr) 1116 if (!curr)
1058 return 0; 1117 return 0;
1059 /* 1118 /*
1060 * We should check use_hierarchy of "mem" not "curr". Because checking 1119 * We should check use_hierarchy of "memcg" not "curr". Because checking
1061 * use_hierarchy of "curr" here make this function true if hierarchy is 1120 * use_hierarchy of "curr" here make this function true if hierarchy is
1062 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 1121 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
1063 * hierarchy(even if use_hierarchy is disabled in "mem"). 1122 * hierarchy(even if use_hierarchy is disabled in "memcg").
1064 */ 1123 */
1065 if (mem->use_hierarchy) 1124 ret = mem_cgroup_same_or_subtree(memcg, curr);
1066 ret = css_is_ancestor(&curr->css, &mem->css);
1067 else
1068 ret = (curr == mem);
1069 css_put(&curr->css); 1125 css_put(&curr->css);
1070 return ret; 1126 return ret;
1071} 1127}
1072 1128
1073static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 1129int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1074{ 1130{
1075 unsigned long active; 1131 unsigned long inactive_ratio;
1132 int nid = zone_to_nid(zone);
1133 int zid = zone_idx(zone);
1076 unsigned long inactive; 1134 unsigned long inactive;
1135 unsigned long active;
1077 unsigned long gb; 1136 unsigned long gb;
1078 unsigned long inactive_ratio;
1079 1137
1080 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 1138 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1081 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 1139 BIT(LRU_INACTIVE_ANON));
1140 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1141 BIT(LRU_ACTIVE_ANON));
1082 1142
1083 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1143 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1084 if (gb) 1144 if (gb)
@@ -1086,139 +1146,23 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
1086 else 1146 else
1087 inactive_ratio = 1; 1147 inactive_ratio = 1;
1088 1148
1089 if (present_pages) { 1149 return inactive * inactive_ratio < active;
1090 present_pages[0] = inactive;
1091 present_pages[1] = active;
1092 }
1093
1094 return inactive_ratio;
1095} 1150}
1096 1151
1097int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 1152int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
1098{ 1153{
1099 unsigned long active; 1154 unsigned long active;
1100 unsigned long inactive; 1155 unsigned long inactive;
1101 unsigned long present_pages[2];
1102 unsigned long inactive_ratio;
1103
1104 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
1105
1106 inactive = present_pages[0];
1107 active = present_pages[1];
1108
1109 if (inactive * inactive_ratio < active)
1110 return 1;
1111
1112 return 0;
1113}
1114
1115int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1116{
1117 unsigned long active;
1118 unsigned long inactive;
1119
1120 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
1121 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
1122
1123 return (active > inactive);
1124}
1125
1126unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
1127 struct zone *zone,
1128 enum lru_list lru)
1129{
1130 int nid = zone_to_nid(zone);
1131 int zid = zone_idx(zone); 1156 int zid = zone_idx(zone);
1132 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1157 int nid = zone_to_nid(zone);
1133
1134 return MEM_CGROUP_ZSTAT(mz, lru);
1135}
1136
1137static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1138 int nid)
1139{
1140 unsigned long ret;
1141
1142 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
1143 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
1144
1145 return ret;
1146}
1147
1148static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1149 int nid)
1150{
1151 unsigned long ret;
1152
1153 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1154 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1155 return ret;
1156}
1157
1158#if MAX_NUMNODES > 1
1159static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1160{
1161 u64 total = 0;
1162 int nid;
1163
1164 for_each_node_state(nid, N_HIGH_MEMORY)
1165 total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
1166
1167 return total;
1168}
1169
1170static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
1171{
1172 u64 total = 0;
1173 int nid;
1174
1175 for_each_node_state(nid, N_HIGH_MEMORY)
1176 total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
1177
1178 return total;
1179}
1180
1181static unsigned long
1182mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
1183{
1184 return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
1185}
1186
1187static unsigned long
1188mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
1189{
1190 u64 total = 0;
1191 int nid;
1192
1193 for_each_node_state(nid, N_HIGH_MEMORY)
1194 total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
1195
1196 return total;
1197}
1198
1199static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1200 int nid)
1201{
1202 enum lru_list l;
1203 u64 total = 0;
1204
1205 for_each_lru(l)
1206 total += mem_cgroup_get_zonestat_node(memcg, nid, l);
1207
1208 return total;
1209}
1210
1211static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
1212{
1213 u64 total = 0;
1214 int nid;
1215 1158
1216 for_each_node_state(nid, N_HIGH_MEMORY) 1159 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1217 total += mem_cgroup_node_nr_lru_pages(memcg, nid); 1160 BIT(LRU_INACTIVE_FILE));
1161 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1162 BIT(LRU_ACTIVE_FILE));
1218 1163
1219 return total; 1164 return (active > inactive);
1220} 1165}
1221#endif /* CONFIG_NUMA */
1222 1166
1223struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1167struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1224 struct zone *zone) 1168 struct zone *zone)
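
The per-zone rewrite of mem_cgroup_inactive_anon_is_low() keeps the usual reclaim heuristic: the tolerated inactive:active imbalance grows with the size of the anon LRU, the ratio being derived from the list size in gigabytes (int_sqrt(10 * gb) in the kernel) and defaulting to 1 for sub-gigabyte cgroups, and the list counts as "low" once inactive * ratio < active. A worked example of the arithmetic, assuming 4 KiB pages:

#include <assert.h>
#include <math.h>

#define PAGE_SHIFT 12   /* 4 KiB pages assumed throughout this sketch */

/* "Is the inactive anon list low?" for the given page counts. */
static int inactive_anon_is_low(unsigned long inactive, unsigned long active)
{
	unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);
	unsigned long ratio = gb ? (unsigned long)sqrt(10.0 * gb) : 1;

	return inactive * ratio < active;
}

int main(void)
{
	/* 4 GiB of anon pages -> ratio = floor(sqrt(40)) = 6:
	 * deactivation starts once less than ~1/7 of it is inactive. */
	unsigned long total = 4UL << (30 - PAGE_SHIFT);

	assert(inactive_anon_is_low(total / 8, total - total / 8));
	assert(!inactive_anon_is_low(total / 4, total - total / 4));

	/* Small cgroup (< 1 GiB): ratio is 1, plain inactive < active test. */
	assert(inactive_anon_is_low(100, 200));
	assert(!inactive_anon_is_low(200, 100));
	return 0;
}
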
@@ -1251,7 +1195,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1251unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1195unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1252 struct list_head *dst, 1196 struct list_head *dst,
1253 unsigned long *scanned, int order, 1197 unsigned long *scanned, int order,
1254 int mode, struct zone *z, 1198 isolate_mode_t mode,
1199 struct zone *z,
1255 struct mem_cgroup *mem_cont, 1200 struct mem_cgroup *mem_cont,
1256 int active, int file) 1201 int active, int file)
1257{ 1202{
@@ -1319,17 +1264,17 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1319 * Returns the maximum amount of memory @mem can be charged with, in 1264 * Returns the maximum amount of memory @mem can be charged with, in
1320 * pages. 1265 * pages.
1321 */ 1266 */
1322static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) 1267static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1323{ 1268{
1324 unsigned long long margin; 1269 unsigned long long margin;
1325 1270
1326 margin = res_counter_margin(&mem->res); 1271 margin = res_counter_margin(&memcg->res);
1327 if (do_swap_account) 1272 if (do_swap_account)
1328 margin = min(margin, res_counter_margin(&mem->memsw)); 1273 margin = min(margin, res_counter_margin(&memcg->memsw));
1329 return margin >> PAGE_SHIFT; 1274 return margin >> PAGE_SHIFT;
1330} 1275}
1331 1276
1332static unsigned int get_swappiness(struct mem_cgroup *memcg) 1277int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1333{ 1278{
1334 struct cgroup *cgrp = memcg->css.cgroup; 1279 struct cgroup *cgrp = memcg->css.cgroup;
1335 1280
@@ -1340,33 +1285,33 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
1340 return memcg->swappiness; 1285 return memcg->swappiness;
1341} 1286}
1342 1287
1343static void mem_cgroup_start_move(struct mem_cgroup *mem) 1288static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1344{ 1289{
1345 int cpu; 1290 int cpu;
1346 1291
1347 get_online_cpus(); 1292 get_online_cpus();
1348 spin_lock(&mem->pcp_counter_lock); 1293 spin_lock(&memcg->pcp_counter_lock);
1349 for_each_online_cpu(cpu) 1294 for_each_online_cpu(cpu)
1350 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1295 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1351 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1296 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1352 spin_unlock(&mem->pcp_counter_lock); 1297 spin_unlock(&memcg->pcp_counter_lock);
1353 put_online_cpus(); 1298 put_online_cpus();
1354 1299
1355 synchronize_rcu(); 1300 synchronize_rcu();
1356} 1301}
1357 1302
1358static void mem_cgroup_end_move(struct mem_cgroup *mem) 1303static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1359{ 1304{
1360 int cpu; 1305 int cpu;
1361 1306
1362 if (!mem) 1307 if (!memcg)
1363 return; 1308 return;
1364 get_online_cpus(); 1309 get_online_cpus();
1365 spin_lock(&mem->pcp_counter_lock); 1310 spin_lock(&memcg->pcp_counter_lock);
1366 for_each_online_cpu(cpu) 1311 for_each_online_cpu(cpu)
1367 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1312 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1368 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1313 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1369 spin_unlock(&mem->pcp_counter_lock); 1314 spin_unlock(&memcg->pcp_counter_lock);
1370 put_online_cpus(); 1315 put_online_cpus();
1371} 1316}
1372/* 1317/*
@@ -1381,13 +1326,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem)
1381 * waiting at hith-memory prressure caused by "move". 1326 * waiting at hith-memory prressure caused by "move".
1382 */ 1327 */
1383 1328
1384static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1329static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
1385{ 1330{
1386 VM_BUG_ON(!rcu_read_lock_held()); 1331 VM_BUG_ON(!rcu_read_lock_held());
1387 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1332 return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1388} 1333}
1389 1334
1390static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1335static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1391{ 1336{
1392 struct mem_cgroup *from; 1337 struct mem_cgroup *from;
1393 struct mem_cgroup *to; 1338 struct mem_cgroup *to;
@@ -1401,19 +1346,18 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1401 to = mc.to; 1346 to = mc.to;
1402 if (!from) 1347 if (!from)
1403 goto unlock; 1348 goto unlock;
1404 if (from == mem || to == mem 1349
1405 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) 1350 ret = mem_cgroup_same_or_subtree(memcg, from)
1406 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) 1351 || mem_cgroup_same_or_subtree(memcg, to);
1407 ret = true;
1408unlock: 1352unlock:
1409 spin_unlock(&mc.lock); 1353 spin_unlock(&mc.lock);
1410 return ret; 1354 return ret;
1411} 1355}
1412 1356
1413static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1357static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1414{ 1358{
1415 if (mc.moving_task && current != mc.moving_task) { 1359 if (mc.moving_task && current != mc.moving_task) {
1416 if (mem_cgroup_under_move(mem)) { 1360 if (mem_cgroup_under_move(memcg)) {
1417 DEFINE_WAIT(wait); 1361 DEFINE_WAIT(wait);
1418 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1362 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1419 /* moving charge context might have finished. */ 1363 /* moving charge context might have finished. */
@@ -1497,12 +1441,12 @@ done:
1497 * This function returns the number of memcg under hierarchy tree. Returns 1441 * This function returns the number of memcg under hierarchy tree. Returns
1498 * 1(self count) if no children. 1442 * 1(self count) if no children.
1499 */ 1443 */
1500static int mem_cgroup_count_children(struct mem_cgroup *mem) 1444static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1501{ 1445{
1502 int num = 0; 1446 int num = 0;
1503 struct mem_cgroup *iter; 1447 struct mem_cgroup *iter;
1504 1448
1505 for_each_mem_cgroup_tree(iter, mem) 1449 for_each_mem_cgroup_tree(iter, memcg)
1506 num++; 1450 num++;
1507 return num; 1451 return num;
1508} 1452}
@@ -1532,21 +1476,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1532 * that to reclaim free pages from. 1476 * that to reclaim free pages from.
1533 */ 1477 */
1534static struct mem_cgroup * 1478static struct mem_cgroup *
1535mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1479mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
1536{ 1480{
1537 struct mem_cgroup *ret = NULL; 1481 struct mem_cgroup *ret = NULL;
1538 struct cgroup_subsys_state *css; 1482 struct cgroup_subsys_state *css;
1539 int nextid, found; 1483 int nextid, found;
1540 1484
1541 if (!root_mem->use_hierarchy) { 1485 if (!root_memcg->use_hierarchy) {
1542 css_get(&root_mem->css); 1486 css_get(&root_memcg->css);
1543 ret = root_mem; 1487 ret = root_memcg;
1544 } 1488 }
1545 1489
1546 while (!ret) { 1490 while (!ret) {
1547 rcu_read_lock(); 1491 rcu_read_lock();
1548 nextid = root_mem->last_scanned_child + 1; 1492 nextid = root_memcg->last_scanned_child + 1;
1549 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1493 css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
1550 &found); 1494 &found);
1551 if (css && css_tryget(css)) 1495 if (css && css_tryget(css))
1552 ret = container_of(css, struct mem_cgroup, css); 1496 ret = container_of(css, struct mem_cgroup, css);
@@ -1555,9 +1499,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1555 /* Updates scanning parameter */ 1499 /* Updates scanning parameter */
1556 if (!css) { 1500 if (!css) {
1557 /* this means start scan from ID:1 */ 1501 /* this means start scan from ID:1 */
1558 root_mem->last_scanned_child = 0; 1502 root_memcg->last_scanned_child = 0;
1559 } else 1503 } else
1560 root_mem->last_scanned_child = found; 1504 root_memcg->last_scanned_child = found;
1561 } 1505 }
1562 1506
1563 return ret; 1507 return ret;
@@ -1573,14 +1517,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1573 * reclaimable pages on a node. Returns true if there are any reclaimable 1517 * reclaimable pages on a node. Returns true if there are any reclaimable
1574 * pages in the node. 1518 * pages in the node.
1575 */ 1519 */
1576static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1520static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1577 int nid, bool noswap) 1521 int nid, bool noswap)
1578{ 1522{
1579 if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) 1523 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1580 return true; 1524 return true;
1581 if (noswap || !total_swap_pages) 1525 if (noswap || !total_swap_pages)
1582 return false; 1526 return false;
1583 if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) 1527 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1584 return true; 1528 return true;
1585 return false; 1529 return false;
1586 1530
@@ -1593,29 +1537,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1593 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1537 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1594 * 1538 *
1595 */ 1539 */
1596static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) 1540static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1597{ 1541{
1598 int nid; 1542 int nid;
1599 /* 1543 /*
1600 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1544 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1601 * pagein/pageout changes since the last update. 1545 * pagein/pageout changes since the last update.
1602 */ 1546 */
1603 if (!atomic_read(&mem->numainfo_events)) 1547 if (!atomic_read(&memcg->numainfo_events))
1604 return; 1548 return;
1605 if (atomic_inc_return(&mem->numainfo_updating) > 1) 1549 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1606 return; 1550 return;
1607 1551
1608 /* make a nodemask where this memcg uses memory from */ 1552 /* make a nodemask where this memcg uses memory from */
1609 mem->scan_nodes = node_states[N_HIGH_MEMORY]; 1553 memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1610 1554
1611 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1555 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1612 1556
1613 if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) 1557 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1614 node_clear(nid, mem->scan_nodes); 1558 node_clear(nid, memcg->scan_nodes);
1615 } 1559 }
1616 1560
1617 atomic_set(&mem->numainfo_events, 0); 1561 atomic_set(&memcg->numainfo_events, 0);
1618 atomic_set(&mem->numainfo_updating, 0); 1562 atomic_set(&memcg->numainfo_updating, 0);
1619} 1563}
1620 1564
1621/* 1565/*
@@ -1630,16 +1574,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1630 * 1574 *
1631 * Now, we use round-robin. Better algorithm is welcomed. 1575 * Now, we use round-robin. Better algorithm is welcomed.
1632 */ 1576 */
1633int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1577int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1634{ 1578{
1635 int node; 1579 int node;
1636 1580
1637 mem_cgroup_may_update_nodemask(mem); 1581 mem_cgroup_may_update_nodemask(memcg);
1638 node = mem->last_scanned_node; 1582 node = memcg->last_scanned_node;
1639 1583
1640 node = next_node(node, mem->scan_nodes); 1584 node = next_node(node, memcg->scan_nodes);
1641 if (node == MAX_NUMNODES) 1585 if (node == MAX_NUMNODES)
1642 node = first_node(mem->scan_nodes); 1586 node = first_node(memcg->scan_nodes);
1643 /* 1587 /*
1644 * We call this when we hit limit, not when pages are added to LRU. 1588 * We call this when we hit limit, not when pages are added to LRU.
1645 * No LRU may hold pages because all pages are UNEVICTABLE or 1589 * No LRU may hold pages because all pages are UNEVICTABLE or
@@ -1649,7 +1593,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1649 if (unlikely(node == MAX_NUMNODES)) 1593 if (unlikely(node == MAX_NUMNODES))
1650 node = numa_node_id(); 1594 node = numa_node_id();
1651 1595
1652 mem->last_scanned_node = node; 1596 memcg->last_scanned_node = node;
1653 return node; 1597 return node;
1654} 1598}
1655 1599
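
mem_cgroup_select_victim_node() walks scan_nodes round-robin: continue from last_scanned_node with next_node(), wrap back to first_node() when the end of the mask is reached, and fall back to the local node if the mask turns out to be empty. The same rotation over a plain 64-bit mask:

#include <assert.h>

#define MAX_NODES 64

/* next_node()-style helper: first set bit strictly above @node. */
static int next_node(int node, unsigned long long mask)
{
	for (node++; node < MAX_NODES; node++)
		if (mask & (1ULL << node))
			return node;
	return MAX_NODES;                 /* no more nodes */
}

static int first_node(unsigned long long mask)
{
	return next_node(-1, mask);
}

/* Round-robin victim selection over @mask, remembering the last pick. */
static int select_victim_node(int *last, unsigned long long mask)
{
	int node = next_node(*last, mask);

	if (node == MAX_NODES)
		node = first_node(mask);
	if (node == MAX_NODES)            /* empty mask: fall back */
		node = 0;                 /* numa_node_id() stand-in */
	*last = node;
	return node;
}

int main(void)
{
	unsigned long long mask = (1ULL << 1) | (1ULL << 4) | (1ULL << 9);
	int last = 9;                     /* pretend node 9 was scanned last */

	assert(select_victim_node(&last, mask) == 1);   /* wrapped around */
	assert(select_victim_node(&last, mask) == 4);
	assert(select_victim_node(&last, mask) == 9);
	assert(select_victim_node(&last, mask) == 1);
	assert(select_victim_node(&last, 0) == 0);      /* empty mask */
	return 0;
}
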
@@ -1659,7 +1603,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1659 * unused nodes. But scan_nodes is lazily updated and may not cotain 1603 * unused nodes. But scan_nodes is lazily updated and may not cotain
1660 * enough new information. We need to do double check. 1604 * enough new information. We need to do double check.
1661 */ 1605 */
1662bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1606bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1663{ 1607{
1664 int nid; 1608 int nid;
1665 1609
@@ -1667,12 +1611,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1667 * quick check...making use of scan_node. 1611 * quick check...making use of scan_node.
1668 * We can skip unused nodes. 1612 * We can skip unused nodes.
1669 */ 1613 */
1670 if (!nodes_empty(mem->scan_nodes)) { 1614 if (!nodes_empty(memcg->scan_nodes)) {
1671 for (nid = first_node(mem->scan_nodes); 1615 for (nid = first_node(memcg->scan_nodes);
1672 nid < MAX_NUMNODES; 1616 nid < MAX_NUMNODES;
1673 nid = next_node(nid, mem->scan_nodes)) { 1617 nid = next_node(nid, memcg->scan_nodes)) {
1674 1618
1675 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1619 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1676 return true; 1620 return true;
1677 } 1621 }
1678 } 1622 }
@@ -1680,23 +1624,23 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1680 * Check rest of nodes. 1624 * Check rest of nodes.
1681 */ 1625 */
1682 for_each_node_state(nid, N_HIGH_MEMORY) { 1626 for_each_node_state(nid, N_HIGH_MEMORY) {
1683 if (node_isset(nid, mem->scan_nodes)) 1627 if (node_isset(nid, memcg->scan_nodes))
1684 continue; 1628 continue;
1685 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1629 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1686 return true; 1630 return true;
1687 } 1631 }
1688 return false; 1632 return false;
1689} 1633}
1690 1634
1691#else 1635#else
1692int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1636int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1693{ 1637{
1694 return 0; 1638 return 0;
1695} 1639}
1696 1640
1697bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1641bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1698{ 1642{
1699 return test_mem_cgroup_node_reclaimable(mem, 0, noswap); 1643 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1700} 1644}
1701#endif 1645#endif
1702 1646
@@ -1705,14 +1649,14 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1705 * we reclaimed from, so that we don't end up penalizing one child extensively 1649 * we reclaimed from, so that we don't end up penalizing one child extensively
1706 * based on its position in the children list. 1650 * based on its position in the children list.
1707 * 1651 *
1708 * root_mem is the original ancestor that we've been reclaim from. 1652 * root_memcg is the original ancestor that we've been reclaim from.
1709 * 1653 *
1710 * We give up and return to the caller when we visit root_mem twice. 1654 * We give up and return to the caller when we visit root_memcg twice.
1711 * (other groups can be removed while we're walking....) 1655 * (other groups can be removed while we're walking....)
1712 * 1656 *
1713 * If shrink==true, for avoiding to free too much, this returns immedieately. 1657 * If shrink==true, for avoiding to free too much, this returns immedieately.
1714 */ 1658 */
1715static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1659static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1716 struct zone *zone, 1660 struct zone *zone,
1717 gfp_t gfp_mask, 1661 gfp_t gfp_mask,
1718 unsigned long reclaim_options, 1662 unsigned long reclaim_options,
@@ -1727,15 +1671,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1727 unsigned long excess; 1671 unsigned long excess;
1728 unsigned long nr_scanned; 1672 unsigned long nr_scanned;
1729 1673
1730 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1674 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1731 1675
1732 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1676 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1733 if (!check_soft && root_mem->memsw_is_minimum) 1677 if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
1734 noswap = true; 1678 noswap = true;
1735 1679
1736 while (1) { 1680 while (1) {
1737 victim = mem_cgroup_select_victim(root_mem); 1681 victim = mem_cgroup_select_victim(root_memcg);
1738 if (victim == root_mem) { 1682 if (victim == root_memcg) {
1739 loop++; 1683 loop++;
1740 /* 1684 /*
1741 * We are not draining per cpu cached charges during 1685 * We are not draining per cpu cached charges during
@@ -1744,7 +1688,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1744 * charges will not give any. 1688 * charges will not give any.
1745 */ 1689 */
1746 if (!check_soft && loop >= 1) 1690 if (!check_soft && loop >= 1)
1747 drain_all_stock_async(root_mem); 1691 drain_all_stock_async(root_memcg);
1748 if (loop >= 2) { 1692 if (loop >= 2) {
1749 /* 1693 /*
1750 * If we have not been able to reclaim 1694 * If we have not been able to reclaim
@@ -1776,12 +1720,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1776 /* we use swappiness of local cgroup */ 1720 /* we use swappiness of local cgroup */
1777 if (check_soft) { 1721 if (check_soft) {
1778 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1722 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1779 noswap, get_swappiness(victim), zone, 1723 noswap, zone, &nr_scanned);
1780 &nr_scanned);
1781 *total_scanned += nr_scanned; 1724 *total_scanned += nr_scanned;
1782 } else 1725 } else
1783 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1726 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1784 noswap, get_swappiness(victim)); 1727 noswap);
1785 css_put(&victim->css); 1728 css_put(&victim->css);
1786 /* 1729 /*
1787 * At shrinking usage, we can't check we should stop here or 1730 * At shrinking usage, we can't check we should stop here or
@@ -1792,9 +1735,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1792 return ret; 1735 return ret;
1793 total += ret; 1736 total += ret;
1794 if (check_soft) { 1737 if (check_soft) {
1795 if (!res_counter_soft_limit_excess(&root_mem->res)) 1738 if (!res_counter_soft_limit_excess(&root_memcg->res))
1796 return total; 1739 return total;
1797 } else if (mem_cgroup_margin(root_mem)) 1740 } else if (mem_cgroup_margin(root_memcg))
1798 return total; 1741 return total;
1799 } 1742 }
1800 return total; 1743 return total;
@@ -1803,23 +1746,64 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1803/* 1746/*
1804 * Check whether the OOM killer is already running under our hierarchy. 1747 * Check whether the OOM killer is already running under our hierarchy.
1805 * If someone is running it, return false. 1748 * If someone is running it, return false.
1749 * Has to be called with memcg_oom_lock held.
1806 */ 1750 */
1807static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1751static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1808{ 1752{
1809 int x, lock_count = 0; 1753 struct mem_cgroup *iter, *failed = NULL;
1810 struct mem_cgroup *iter; 1754 bool cond = true;
1811 1755
1812 for_each_mem_cgroup_tree(iter, mem) { 1756 for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1813 x = atomic_inc_return(&iter->oom_lock); 1757 if (iter->oom_lock) {
1814 lock_count = max(x, lock_count); 1758 /*
1759 * this subtree of our hierarchy is already locked,
1760 * so we cannot grant the lock.
1761 */
1762 failed = iter;
1763 cond = false;
1764 } else
1765 iter->oom_lock = true;
1815 } 1766 }
1816 1767
1817 if (lock_count == 1) 1768 if (!failed)
1818 return true; 1769 return true;
1770
1771 /*
1772 * OK, we failed to lock the whole subtree so we have to clean up
1773 * what we already set up, up to the failing subtree
1774 */
1775 cond = true;
1776 for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1777 if (iter == failed) {
1778 cond = false;
1779 continue;
1780 }
1781 iter->oom_lock = false;
1782 }
1819 return false; 1783 return false;
1820} 1784}
1821 1785
1822static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1786/*
1787 * Has to be called with memcg_oom_lock held.
1788 */
1789static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1790{
1791 struct mem_cgroup *iter;
1792
1793 for_each_mem_cgroup_tree(iter, memcg)
1794 iter->oom_lock = false;
1795 return 0;
1796}
1797
1798static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1799{
1800 struct mem_cgroup *iter;
1801
1802 for_each_mem_cgroup_tree(iter, memcg)
1803 atomic_inc(&iter->under_oom);
1804}
1805
1806static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1823{ 1807{
1824 struct mem_cgroup *iter; 1808 struct mem_cgroup *iter;
1825 1809
@@ -1828,13 +1812,11 @@ static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1828 * mem_cgroup_oom_lock() may not be called. We have to use 1812 * mem_cgroup_oom_lock() may not be called. We have to use
1829 * atomic_add_unless() here. 1813 * atomic_add_unless() here.
1830 */ 1814 */
1831 for_each_mem_cgroup_tree(iter, mem) 1815 for_each_mem_cgroup_tree(iter, memcg)
1832 atomic_add_unless(&iter->oom_lock, -1, 0); 1816 atomic_add_unless(&iter->under_oom, -1, 0);
1833 return 0;
1834} 1817}
1835 1818
1836 1819static DEFINE_SPINLOCK(memcg_oom_lock);
1837static DEFINE_MUTEX(memcg_oom_mutex);
1838static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1820static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1839 1821
1840struct oom_wait_info { 1822struct oom_wait_info {
@@ -1845,85 +1827,85 @@ struct oom_wait_info {
1845static int memcg_oom_wake_function(wait_queue_t *wait, 1827static int memcg_oom_wake_function(wait_queue_t *wait,
1846 unsigned mode, int sync, void *arg) 1828 unsigned mode, int sync, void *arg)
1847{ 1829{
1848 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; 1830 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
1831 *oom_wait_memcg;
1849 struct oom_wait_info *oom_wait_info; 1832 struct oom_wait_info *oom_wait_info;
1850 1833
1851 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1834 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1835 oom_wait_memcg = oom_wait_info->mem;
1852 1836
1853 if (oom_wait_info->mem == wake_mem)
1854 goto wakeup;
1855 /* if no hierarchy, no match */
1856 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1857 return 0;
1858 /* 1837 /*
1859 * Both oom_wait_info->mem and wake_mem are stable under us. 1838 * Both oom_wait_info->mem and wake_mem are stable under us.
1860 * Then we can use css_is_ancestor without worrying about RCU. 1839 * Then we can use css_is_ancestor without worrying about RCU.
1861 */ 1840 */
1862 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && 1841 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1863 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) 1842 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1864 return 0; 1843 return 0;
1865
1866wakeup:
1867 return autoremove_wake_function(wait, mode, sync, arg); 1844 return autoremove_wake_function(wait, mode, sync, arg);
1868} 1845}
1869 1846
1870static void memcg_wakeup_oom(struct mem_cgroup *mem) 1847static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1871{ 1848{
1872 /* for filtering, pass "mem" as argument. */ 1849 /* for filtering, pass "memcg" as argument. */
1873 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1850 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1874} 1851}
1875 1852
1876static void memcg_oom_recover(struct mem_cgroup *mem) 1853static void memcg_oom_recover(struct mem_cgroup *memcg)
1877{ 1854{
1878 if (mem && atomic_read(&mem->oom_lock)) 1855 if (memcg && atomic_read(&memcg->under_oom))
1879 memcg_wakeup_oom(mem); 1856 memcg_wakeup_oom(memcg);
1880} 1857}
1881 1858
1882/* 1859/*
1883 * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop. 1860 * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
1884 */ 1861 */
1885bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1862bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1886{ 1863{
1887 struct oom_wait_info owait; 1864 struct oom_wait_info owait;
1888 bool locked, need_to_kill; 1865 bool locked, need_to_kill;
1889 1866
1890 owait.mem = mem; 1867 owait.mem = memcg;
1891 owait.wait.flags = 0; 1868 owait.wait.flags = 0;
1892 owait.wait.func = memcg_oom_wake_function; 1869 owait.wait.func = memcg_oom_wake_function;
1893 owait.wait.private = current; 1870 owait.wait.private = current;
1894 INIT_LIST_HEAD(&owait.wait.task_list); 1871 INIT_LIST_HEAD(&owait.wait.task_list);
1895 need_to_kill = true; 1872 need_to_kill = true;
1896 /* At first, try to OOM lock hierarchy under mem.*/ 1873 mem_cgroup_mark_under_oom(memcg);
1897 mutex_lock(&memcg_oom_mutex); 1874
1898 locked = mem_cgroup_oom_lock(mem); 1875 /* At first, try to OOM lock hierarchy under memcg.*/
1876 spin_lock(&memcg_oom_lock);
1877 locked = mem_cgroup_oom_lock(memcg);
1899 /* 1878 /*
1900 * Even if signal_pending(), we can't quit charge() loop without 1879 * Even if signal_pending(), we can't quit charge() loop without
1901 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1880 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1902 * under OOM is always welcomed, use TASK_KILLABLE here. 1881 * under OOM is always welcomed, use TASK_KILLABLE here.
1903 */ 1882 */
1904 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1883 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1905 if (!locked || mem->oom_kill_disable) 1884 if (!locked || memcg->oom_kill_disable)
1906 need_to_kill = false; 1885 need_to_kill = false;
1907 if (locked) 1886 if (locked)
1908 mem_cgroup_oom_notify(mem); 1887 mem_cgroup_oom_notify(memcg);
1909 mutex_unlock(&memcg_oom_mutex); 1888 spin_unlock(&memcg_oom_lock);
1910 1889
1911 if (need_to_kill) { 1890 if (need_to_kill) {
1912 finish_wait(&memcg_oom_waitq, &owait.wait); 1891 finish_wait(&memcg_oom_waitq, &owait.wait);
1913 mem_cgroup_out_of_memory(mem, mask); 1892 mem_cgroup_out_of_memory(memcg, mask);
1914 } else { 1893 } else {
1915 schedule(); 1894 schedule();
1916 finish_wait(&memcg_oom_waitq, &owait.wait); 1895 finish_wait(&memcg_oom_waitq, &owait.wait);
1917 } 1896 }
1918 mutex_lock(&memcg_oom_mutex); 1897 spin_lock(&memcg_oom_lock);
1919 mem_cgroup_oom_unlock(mem); 1898 if (locked)
1920 memcg_wakeup_oom(mem); 1899 mem_cgroup_oom_unlock(memcg);
1921 mutex_unlock(&memcg_oom_mutex); 1900 memcg_wakeup_oom(memcg);
1901 spin_unlock(&memcg_oom_lock);
1902
1903 mem_cgroup_unmark_under_oom(memcg);
1922 1904
1923 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1905 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1924 return false; 1906 return false;
1925 /* Give chance to dying process */ 1907 /* Give chance to dying process */
1926 schedule_timeout(1); 1908 schedule_timeout_uninterruptible(1);
1927 return true; 1909 return true;
1928} 1910}
1929 1911
@@ -1954,7 +1936,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1954void mem_cgroup_update_page_stat(struct page *page, 1936void mem_cgroup_update_page_stat(struct page *page,
1955 enum mem_cgroup_page_stat_item idx, int val) 1937 enum mem_cgroup_page_stat_item idx, int val)
1956{ 1938{
1957 struct mem_cgroup *mem; 1939 struct mem_cgroup *memcg;
1958 struct page_cgroup *pc = lookup_page_cgroup(page); 1940 struct page_cgroup *pc = lookup_page_cgroup(page);
1959 bool need_unlock = false; 1941 bool need_unlock = false;
1960 unsigned long uninitialized_var(flags); 1942 unsigned long uninitialized_var(flags);
@@ -1963,16 +1945,16 @@ void mem_cgroup_update_page_stat(struct page *page,
1963 return; 1945 return;
1964 1946
1965 rcu_read_lock(); 1947 rcu_read_lock();
1966 mem = pc->mem_cgroup; 1948 memcg = pc->mem_cgroup;
1967 if (unlikely(!mem || !PageCgroupUsed(pc))) 1949 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1968 goto out; 1950 goto out;
1969 /* pc->mem_cgroup is unstable ? */ 1951 /* pc->mem_cgroup is unstable ? */
1970 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 1952 if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
1971 /* take a lock to access pc->mem_cgroup */ 1953 /* take a lock to access pc->mem_cgroup */
1972 move_lock_page_cgroup(pc, &flags); 1954 move_lock_page_cgroup(pc, &flags);
1973 need_unlock = true; 1955 need_unlock = true;
1974 mem = pc->mem_cgroup; 1956 memcg = pc->mem_cgroup;
1975 if (!mem || !PageCgroupUsed(pc)) 1957 if (!memcg || !PageCgroupUsed(pc))
1976 goto out; 1958 goto out;
1977 } 1959 }
1978 1960
@@ -1988,7 +1970,7 @@ void mem_cgroup_update_page_stat(struct page *page,
1988 BUG(); 1970 BUG();
1989 } 1971 }
1990 1972
1991 this_cpu_add(mem->stat->count[idx], val); 1973 this_cpu_add(memcg->stat->count[idx], val);
1992 1974
1993out: 1975out:
1994 if (unlikely(need_unlock)) 1976 if (unlikely(need_unlock))
@@ -2019,13 +2001,13 @@ static DEFINE_MUTEX(percpu_charge_mutex);
2019 * cgroup which is not current target, returns false. This stock will be 2001 * cgroup which is not current target, returns false. This stock will be
2020 * refilled. 2002 * refilled.
2021 */ 2003 */
2022static bool consume_stock(struct mem_cgroup *mem) 2004static bool consume_stock(struct mem_cgroup *memcg)
2023{ 2005{
2024 struct memcg_stock_pcp *stock; 2006 struct memcg_stock_pcp *stock;
2025 bool ret = true; 2007 bool ret = true;
2026 2008
2027 stock = &get_cpu_var(memcg_stock); 2009 stock = &get_cpu_var(memcg_stock);
2028 if (mem == stock->cached && stock->nr_pages) 2010 if (memcg == stock->cached && stock->nr_pages)
2029 stock->nr_pages--; 2011 stock->nr_pages--;
2030 else /* need to call res_counter_charge */ 2012 else /* need to call res_counter_charge */
2031 ret = false; 2013 ret = false;
@@ -2066,72 +2048,83 @@ static void drain_local_stock(struct work_struct *dummy)
2066 * Cache charges(val), which come from res_counter, in the local per-cpu area. 2048 * Cache charges(val), which come from res_counter, in the local per-cpu area.
2067 * They will be consumed by the consume_stock() function later. 2049 * They will be consumed by the consume_stock() function later.
2068 */ 2050 */
2069static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 2051static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2070{ 2052{
2071 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2053 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2072 2054
2073 if (stock->cached != mem) { /* reset if necessary */ 2055 if (stock->cached != memcg) { /* reset if necessary */
2074 drain_stock(stock); 2056 drain_stock(stock);
2075 stock->cached = mem; 2057 stock->cached = memcg;
2076 } 2058 }
2077 stock->nr_pages += nr_pages; 2059 stock->nr_pages += nr_pages;
2078 put_cpu_var(memcg_stock); 2060 put_cpu_var(memcg_stock);
2079} 2061}
2080 2062
2081/* 2063/*
2082 * Tries to drain stocked charges in other cpus. This function is asynchronous 2064 * Drains all per-CPU charge caches for the given root_memcg and the whole subtree
2083 * and just puts a work item per cpu for draining locally on each cpu. The caller can 2065 * of the hierarchy under it. The sync flag says whether we should block
2084 * expect some charges will come back to res_counter later but cannot wait for 2066 * until the work is done.
2085 * it.
2086 */ 2067 */
2087static void drain_all_stock_async(struct mem_cgroup *root_mem) 2068static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2088{ 2069{
2089 int cpu, curcpu; 2070 int cpu, curcpu;
2090 /* 2071
2091 * If someone calls draining, avoid adding more kworker runs.
2092 */
2093 if (!mutex_trylock(&percpu_charge_mutex))
2094 return;
2095 /* Notify other cpus that system-wide "drain" is running */ 2072 /* Notify other cpus that system-wide "drain" is running */
2096 get_online_cpus(); 2073 get_online_cpus();
2097 /* 2074 curcpu = get_cpu();
2098 * Get a hint for avoiding draining charges on the current cpu,
2099 * which must be exhausted by our charging. It is not required that
2100 * this be a precise check, so we use raw_smp_processor_id() instead of
2101 * getcpu()/putcpu().
2102 */
2103 curcpu = raw_smp_processor_id();
2104 for_each_online_cpu(cpu) { 2075 for_each_online_cpu(cpu) {
2105 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2076 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2106 struct mem_cgroup *mem; 2077 struct mem_cgroup *memcg;
2107 2078
2108 if (cpu == curcpu) 2079 memcg = stock->cached;
2080 if (!memcg || !stock->nr_pages)
2109 continue; 2081 continue;
2110 2082 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2111 mem = stock->cached;
2112 if (!mem)
2113 continue; 2083 continue;
2114 if (mem != root_mem) { 2084 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2115 if (!root_mem->use_hierarchy) 2085 if (cpu == curcpu)
2116 continue; 2086 drain_local_stock(&stock->work);
2117 /* check whether "mem" is under tree of "root_mem" */ 2087 else
2118 if (!css_is_ancestor(&mem->css, &root_mem->css)) 2088 schedule_work_on(cpu, &stock->work);
2119 continue;
2120 } 2089 }
2121 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2122 schedule_work_on(cpu, &stock->work);
2123 } 2090 }
2091 put_cpu();
2092
2093 if (!sync)
2094 goto out;
2095
2096 for_each_online_cpu(cpu) {
2097 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2098 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2099 flush_work(&stock->work);
2100 }
2101out:
2124 put_online_cpus(); 2102 put_online_cpus();
2103}
2104
2105/*
2106 * Tries to drain stocked charges in other cpus. This function is asynchronous
2107 * and just put a work per cpu for draining localy on each cpu. Caller can
2108 * expects some charges will be back to res_counter later but cannot wait for
2109 * it.
2110 */
2111static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2112{
2113 /*
2114 * If someone calls draining, avoid adding more kworker runs.
2115 */
2116 if (!mutex_trylock(&percpu_charge_mutex))
2117 return;
2118 drain_all_stock(root_memcg, false);
2125 mutex_unlock(&percpu_charge_mutex); 2119 mutex_unlock(&percpu_charge_mutex);
2126 /* We don't wait for flush_work */
2127} 2120}
2128 2121
2129/* This is a synchronous drain interface. */ 2122/* This is a synchronous drain interface. */
2130static void drain_all_stock_sync(void) 2123static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2131{ 2124{
2132 /* called when force_empty is called */ 2125 /* called when force_empty is called */
2133 mutex_lock(&percpu_charge_mutex); 2126 mutex_lock(&percpu_charge_mutex);
2134 schedule_on_each_cpu(drain_local_stock); 2127 drain_all_stock(root_memcg, true);
2135 mutex_unlock(&percpu_charge_mutex); 2128 mutex_unlock(&percpu_charge_mutex);
2136} 2129}
2137 2130
@@ -2139,35 +2132,35 @@ static void drain_all_stock_sync(void)
2139 * This function drains the percpu counter value from a DEAD cpu and 2132 * This function drains the percpu counter value from a DEAD cpu and
2140 * moves it to the local cpu. Note that this function can be preempted. 2133 * moves it to the local cpu. Note that this function can be preempted.
2141 */ 2134 */
2142static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 2135static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2143{ 2136{
2144 int i; 2137 int i;
2145 2138
2146 spin_lock(&mem->pcp_counter_lock); 2139 spin_lock(&memcg->pcp_counter_lock);
2147 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2140 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2148 long x = per_cpu(mem->stat->count[i], cpu); 2141 long x = per_cpu(memcg->stat->count[i], cpu);
2149 2142
2150 per_cpu(mem->stat->count[i], cpu) = 0; 2143 per_cpu(memcg->stat->count[i], cpu) = 0;
2151 mem->nocpu_base.count[i] += x; 2144 memcg->nocpu_base.count[i] += x;
2152 } 2145 }
2153 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2146 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2154 unsigned long x = per_cpu(mem->stat->events[i], cpu); 2147 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2155 2148
2156 per_cpu(mem->stat->events[i], cpu) = 0; 2149 per_cpu(memcg->stat->events[i], cpu) = 0;
2157 mem->nocpu_base.events[i] += x; 2150 memcg->nocpu_base.events[i] += x;
2158 } 2151 }
2159 /* need to clear ON_MOVE value, works as a kind of lock. */ 2152 /* need to clear ON_MOVE value, works as a kind of lock. */
2160 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 2153 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2161 spin_unlock(&mem->pcp_counter_lock); 2154 spin_unlock(&memcg->pcp_counter_lock);
2162} 2155}
2163 2156
2164static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 2157static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2165{ 2158{
2166 int idx = MEM_CGROUP_ON_MOVE; 2159 int idx = MEM_CGROUP_ON_MOVE;
2167 2160
2168 spin_lock(&mem->pcp_counter_lock); 2161 spin_lock(&memcg->pcp_counter_lock);
2169 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 2162 per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2170 spin_unlock(&mem->pcp_counter_lock); 2163 spin_unlock(&memcg->pcp_counter_lock);
2171} 2164}
2172 2165
2173static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2166static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
@@ -2205,7 +2198,7 @@ enum {
2205 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2198 CHARGE_OOM_DIE, /* the current is killed because of OOM */
2206}; 2199};
2207 2200
2208static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2201static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2209 unsigned int nr_pages, bool oom_check) 2202 unsigned int nr_pages, bool oom_check)
2210{ 2203{
2211 unsigned long csize = nr_pages * PAGE_SIZE; 2204 unsigned long csize = nr_pages * PAGE_SIZE;
@@ -2214,16 +2207,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
2214 unsigned long flags = 0; 2207 unsigned long flags = 0;
2215 int ret; 2208 int ret;
2216 2209
2217 ret = res_counter_charge(&mem->res, csize, &fail_res); 2210 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2218 2211
2219 if (likely(!ret)) { 2212 if (likely(!ret)) {
2220 if (!do_swap_account) 2213 if (!do_swap_account)
2221 return CHARGE_OK; 2214 return CHARGE_OK;
2222 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 2215 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2223 if (likely(!ret)) 2216 if (likely(!ret))
2224 return CHARGE_OK; 2217 return CHARGE_OK;
2225 2218
2226 res_counter_uncharge(&mem->res, csize); 2219 res_counter_uncharge(&memcg->res, csize);
2227 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2220 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2228 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2221 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2229 } else 2222 } else
@@ -2281,12 +2274,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
2281static int __mem_cgroup_try_charge(struct mm_struct *mm, 2274static int __mem_cgroup_try_charge(struct mm_struct *mm,
2282 gfp_t gfp_mask, 2275 gfp_t gfp_mask,
2283 unsigned int nr_pages, 2276 unsigned int nr_pages,
2284 struct mem_cgroup **memcg, 2277 struct mem_cgroup **ptr,
2285 bool oom) 2278 bool oom)
2286{ 2279{
2287 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2280 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2288 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2281 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2289 struct mem_cgroup *mem = NULL; 2282 struct mem_cgroup *memcg = NULL;
2290 int ret; 2283 int ret;
2291 2284
2292 /* 2285 /*
@@ -2304,17 +2297,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2304 * thread group leader migrates. It's possible that mm is not 2297 * thread group leader migrates. It's possible that mm is not
2305 * set; if so, charge the init_mm (happens for pagecache usage). 2298 * set; if so, charge the init_mm (happens for pagecache usage).
2306 */ 2299 */
2307 if (!*memcg && !mm) 2300 if (!*ptr && !mm)
2308 goto bypass; 2301 goto bypass;
2309again: 2302again:
2310 if (*memcg) { /* css should be a valid one */ 2303 if (*ptr) { /* css should be a valid one */
2311 mem = *memcg; 2304 memcg = *ptr;
2312 VM_BUG_ON(css_is_removed(&mem->css)); 2305 VM_BUG_ON(css_is_removed(&memcg->css));
2313 if (mem_cgroup_is_root(mem)) 2306 if (mem_cgroup_is_root(memcg))
2314 goto done; 2307 goto done;
2315 if (nr_pages == 1 && consume_stock(mem)) 2308 if (nr_pages == 1 && consume_stock(memcg))
2316 goto done; 2309 goto done;
2317 css_get(&mem->css); 2310 css_get(&memcg->css);
2318 } else { 2311 } else {
2319 struct task_struct *p; 2312 struct task_struct *p;
2320 2313
@@ -2322,7 +2315,7 @@ again:
2322 p = rcu_dereference(mm->owner); 2315 p = rcu_dereference(mm->owner);
2323 /* 2316 /*
2324 * Because we don't have task_lock(), "p" can exit. 2317 * Because we don't have task_lock(), "p" can exit.
2325 * In that case, "mem" can point to root or p can be NULL in a 2318 * In that case, "memcg" can point to root or p can be NULL in a
2326 * race with swapoff. Then, we have a small risk of mis-accounting. 2319 * race with swapoff. Then, we have a small risk of mis-accounting.
2327 * But this kind of mis-accounting by a race always happens because 2320 * But this kind of mis-accounting by a race always happens because
2328 * we don't have cgroup_mutex(). It's overkill and we allow that 2321 * we don't have cgroup_mutex(). It's overkill and we allow that
@@ -2330,12 +2323,12 @@ again:
2330 * (*) swapoff et al. will charge against the mm-struct, not against 2323 * (*) swapoff et al. will charge against the mm-struct, not against
2331 * task-struct. So, mm->owner can be NULL. 2324 * task-struct. So, mm->owner can be NULL.
2332 */ 2325 */
2333 mem = mem_cgroup_from_task(p); 2326 memcg = mem_cgroup_from_task(p);
2334 if (!mem || mem_cgroup_is_root(mem)) { 2327 if (!memcg || mem_cgroup_is_root(memcg)) {
2335 rcu_read_unlock(); 2328 rcu_read_unlock();
2336 goto done; 2329 goto done;
2337 } 2330 }
2338 if (nr_pages == 1 && consume_stock(mem)) { 2331 if (nr_pages == 1 && consume_stock(memcg)) {
2339 /* 2332 /*
2340 * It seems dagerous to access memcg without css_get(). 2333 * It seems dagerous to access memcg without css_get().
2341 * But considering how consume_stok works, it's not 2334 * But considering how consume_stok works, it's not
@@ -2348,7 +2341,7 @@ again:
2348 goto done; 2341 goto done;
2349 } 2342 }
2350 /* after here, we may be blocked. we need to get refcnt */ 2343 /* after here, we may be blocked. we need to get refcnt */
2351 if (!css_tryget(&mem->css)) { 2344 if (!css_tryget(&memcg->css)) {
2352 rcu_read_unlock(); 2345 rcu_read_unlock();
2353 goto again; 2346 goto again;
2354 } 2347 }
@@ -2360,7 +2353,7 @@ again:
2360 2353
2361 /* If killed, bypass charge */ 2354 /* If killed, bypass charge */
2362 if (fatal_signal_pending(current)) { 2355 if (fatal_signal_pending(current)) {
2363 css_put(&mem->css); 2356 css_put(&memcg->css);
2364 goto bypass; 2357 goto bypass;
2365 } 2358 }
2366 2359
@@ -2370,43 +2363,43 @@ again:
2370 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2363 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2371 } 2364 }
2372 2365
2373 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2366 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2374 switch (ret) { 2367 switch (ret) {
2375 case CHARGE_OK: 2368 case CHARGE_OK:
2376 break; 2369 break;
2377 case CHARGE_RETRY: /* not in OOM situation but retry */ 2370 case CHARGE_RETRY: /* not in OOM situation but retry */
2378 batch = nr_pages; 2371 batch = nr_pages;
2379 css_put(&mem->css); 2372 css_put(&memcg->css);
2380 mem = NULL; 2373 memcg = NULL;
2381 goto again; 2374 goto again;
2382 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2375 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2383 css_put(&mem->css); 2376 css_put(&memcg->css);
2384 goto nomem; 2377 goto nomem;
2385 case CHARGE_NOMEM: /* OOM routine works */ 2378 case CHARGE_NOMEM: /* OOM routine works */
2386 if (!oom) { 2379 if (!oom) {
2387 css_put(&mem->css); 2380 css_put(&memcg->css);
2388 goto nomem; 2381 goto nomem;
2389 } 2382 }
2390 /* If oom, we never return -ENOMEM */ 2383 /* If oom, we never return -ENOMEM */
2391 nr_oom_retries--; 2384 nr_oom_retries--;
2392 break; 2385 break;
2393 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2386 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2394 css_put(&mem->css); 2387 css_put(&memcg->css);
2395 goto bypass; 2388 goto bypass;
2396 } 2389 }
2397 } while (ret != CHARGE_OK); 2390 } while (ret != CHARGE_OK);
2398 2391
2399 if (batch > nr_pages) 2392 if (batch > nr_pages)
2400 refill_stock(mem, batch - nr_pages); 2393 refill_stock(memcg, batch - nr_pages);
2401 css_put(&mem->css); 2394 css_put(&memcg->css);
2402done: 2395done:
2403 *memcg = mem; 2396 *ptr = memcg;
2404 return 0; 2397 return 0;
2405nomem: 2398nomem:
2406 *memcg = NULL; 2399 *ptr = NULL;
2407 return -ENOMEM; 2400 return -ENOMEM;
2408bypass: 2401bypass:
2409 *memcg = NULL; 2402 *ptr = NULL;
2410 return 0; 2403 return 0;
2411} 2404}
2412 2405
@@ -2415,15 +2408,15 @@ bypass:
2415 * This function is for that and do uncharge, put css's refcnt. 2408 * This function is for that and do uncharge, put css's refcnt.
2416 * gotten by try_charge(). 2409 * gotten by try_charge().
2417 */ 2410 */
2418static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2411static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2419 unsigned int nr_pages) 2412 unsigned int nr_pages)
2420{ 2413{
2421 if (!mem_cgroup_is_root(mem)) { 2414 if (!mem_cgroup_is_root(memcg)) {
2422 unsigned long bytes = nr_pages * PAGE_SIZE; 2415 unsigned long bytes = nr_pages * PAGE_SIZE;
2423 2416
2424 res_counter_uncharge(&mem->res, bytes); 2417 res_counter_uncharge(&memcg->res, bytes);
2425 if (do_swap_account) 2418 if (do_swap_account)
2426 res_counter_uncharge(&mem->memsw, bytes); 2419 res_counter_uncharge(&memcg->memsw, bytes);
2427 } 2420 }
2428} 2421}
2429 2422
@@ -2448,7 +2441,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2448 2441
2449struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2442struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2450{ 2443{
2451 struct mem_cgroup *mem = NULL; 2444 struct mem_cgroup *memcg = NULL;
2452 struct page_cgroup *pc; 2445 struct page_cgroup *pc;
2453 unsigned short id; 2446 unsigned short id;
2454 swp_entry_t ent; 2447 swp_entry_t ent;
@@ -2458,23 +2451,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2458 pc = lookup_page_cgroup(page); 2451 pc = lookup_page_cgroup(page);
2459 lock_page_cgroup(pc); 2452 lock_page_cgroup(pc);
2460 if (PageCgroupUsed(pc)) { 2453 if (PageCgroupUsed(pc)) {
2461 mem = pc->mem_cgroup; 2454 memcg = pc->mem_cgroup;
2462 if (mem && !css_tryget(&mem->css)) 2455 if (memcg && !css_tryget(&memcg->css))
2463 mem = NULL; 2456 memcg = NULL;
2464 } else if (PageSwapCache(page)) { 2457 } else if (PageSwapCache(page)) {
2465 ent.val = page_private(page); 2458 ent.val = page_private(page);
2466 id = lookup_swap_cgroup(ent); 2459 id = lookup_swap_cgroup(ent);
2467 rcu_read_lock(); 2460 rcu_read_lock();
2468 mem = mem_cgroup_lookup(id); 2461 memcg = mem_cgroup_lookup(id);
2469 if (mem && !css_tryget(&mem->css)) 2462 if (memcg && !css_tryget(&memcg->css))
2470 mem = NULL; 2463 memcg = NULL;
2471 rcu_read_unlock(); 2464 rcu_read_unlock();
2472 } 2465 }
2473 unlock_page_cgroup(pc); 2466 unlock_page_cgroup(pc);
2474 return mem; 2467 return memcg;
2475} 2468}
2476 2469
2477static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2470static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2478 struct page *page, 2471 struct page *page,
2479 unsigned int nr_pages, 2472 unsigned int nr_pages,
2480 struct page_cgroup *pc, 2473 struct page_cgroup *pc,
@@ -2483,14 +2476,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2483 lock_page_cgroup(pc); 2476 lock_page_cgroup(pc);
2484 if (unlikely(PageCgroupUsed(pc))) { 2477 if (unlikely(PageCgroupUsed(pc))) {
2485 unlock_page_cgroup(pc); 2478 unlock_page_cgroup(pc);
2486 __mem_cgroup_cancel_charge(mem, nr_pages); 2479 __mem_cgroup_cancel_charge(memcg, nr_pages);
2487 return; 2480 return;
2488 } 2481 }
2489 /* 2482 /*
2490 * we don't need page_cgroup_lock for tail pages, because they are not 2483 * we don't need page_cgroup_lock for tail pages, because they are not
2491 * accessed by any other context at this point. 2484 * accessed by any other context at this point.
2492 */ 2485 */
2493 pc->mem_cgroup = mem; 2486 pc->mem_cgroup = memcg;
2494 /* 2487 /*
2495 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2488 * We access a page_cgroup asynchronously without lock_page_cgroup().
2496 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2489 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
@@ -2513,14 +2506,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2513 break; 2506 break;
2514 } 2507 }
2515 2508
2516 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2509 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2517 unlock_page_cgroup(pc); 2510 unlock_page_cgroup(pc);
2518 /* 2511 /*
2519 * "charge_statistics" updated event counter. Then, check it. 2512 * "charge_statistics" updated event counter. Then, check it.
2520 * Insert ancestor (and ancestor's ancestors) into the softlimit RB-tree 2513 * Insert ancestor (and ancestor's ancestors) into the softlimit RB-tree
2521 * if they exceed the softlimit. 2514 * if they exceed the softlimit.
2522 */ 2515 */
2523 memcg_check_events(mem, page); 2516 memcg_check_events(memcg, page);
2524} 2517}
2525 2518
2526#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2519#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2707,7 +2700,7 @@ out:
2707static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2700static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2708 gfp_t gfp_mask, enum charge_type ctype) 2701 gfp_t gfp_mask, enum charge_type ctype)
2709{ 2702{
2710 struct mem_cgroup *mem = NULL; 2703 struct mem_cgroup *memcg = NULL;
2711 unsigned int nr_pages = 1; 2704 unsigned int nr_pages = 1;
2712 struct page_cgroup *pc; 2705 struct page_cgroup *pc;
2713 bool oom = true; 2706 bool oom = true;
@@ -2726,11 +2719,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2726 pc = lookup_page_cgroup(page); 2719 pc = lookup_page_cgroup(page);
2727 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ 2720 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2728 2721
2729 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); 2722 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2730 if (ret || !mem) 2723 if (ret || !memcg)
2731 return ret; 2724 return ret;
2732 2725
2733 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); 2726 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
2734 return 0; 2727 return 0;
2735} 2728}
2736 2729
@@ -2759,7 +2752,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2759 enum charge_type ctype); 2752 enum charge_type ctype);
2760 2753
2761static void 2754static void
2762__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, 2755__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2763 enum charge_type ctype) 2756 enum charge_type ctype)
2764{ 2757{
2765 struct page_cgroup *pc = lookup_page_cgroup(page); 2758 struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2769,7 +2762,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2769 * LRU. Take care of it. 2762 * LRU. Take care of it.
2770 */ 2763 */
2771 mem_cgroup_lru_del_before_commit(page); 2764 mem_cgroup_lru_del_before_commit(page);
2772 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 2765 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2773 mem_cgroup_lru_add_after_commit(page); 2766 mem_cgroup_lru_add_after_commit(page);
2774 return; 2767 return;
2775} 2768}
@@ -2777,44 +2770,20 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2777int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2770int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2778 gfp_t gfp_mask) 2771 gfp_t gfp_mask)
2779{ 2772{
2780 struct mem_cgroup *mem = NULL; 2773 struct mem_cgroup *memcg = NULL;
2781 int ret; 2774 int ret;
2782 2775
2783 if (mem_cgroup_disabled()) 2776 if (mem_cgroup_disabled())
2784 return 0; 2777 return 0;
2785 if (PageCompound(page)) 2778 if (PageCompound(page))
2786 return 0; 2779 return 0;
2787 /*
2788 * Corner case handling. This is usually called from add_to_page_cache().
2789 * But some FS (shmem) precharges this page before calling it
2790 * and calls add_to_page_cache() with GFP_NOWAIT.
2791 *
2792 * For GFP_NOWAIT case, the page may be pre-charged before calling
2793 * add_to_page_cache(). (See shmem.c.) Check it here and avoid
2794 * charging twice. (It works but has to pay a slightly larger cost.)
2795 * And when the page is SwapCache, it should take swap information
2796 * into account. This is under lock_page() now.
2797 */
2798 if (!(gfp_mask & __GFP_WAIT)) {
2799 struct page_cgroup *pc;
2800
2801 pc = lookup_page_cgroup(page);
2802 if (!pc)
2803 return 0;
2804 lock_page_cgroup(pc);
2805 if (PageCgroupUsed(pc)) {
2806 unlock_page_cgroup(pc);
2807 return 0;
2808 }
2809 unlock_page_cgroup(pc);
2810 }
2811 2780
2812 if (unlikely(!mm)) 2781 if (unlikely(!mm))
2813 mm = &init_mm; 2782 mm = &init_mm;
2814 2783
2815 if (page_is_file_cache(page)) { 2784 if (page_is_file_cache(page)) {
2816 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); 2785 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
2817 if (ret || !mem) 2786 if (ret || !memcg)
2818 return ret; 2787 return ret;
2819 2788
2820 /* 2789 /*
@@ -2822,15 +2791,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2822 * put that would remove them from the LRU list, make 2791 * put that would remove them from the LRU list, make
2823 * sure that they get relinked properly. 2792 * sure that they get relinked properly.
2824 */ 2793 */
2825 __mem_cgroup_commit_charge_lrucare(page, mem, 2794 __mem_cgroup_commit_charge_lrucare(page, memcg,
2826 MEM_CGROUP_CHARGE_TYPE_CACHE); 2795 MEM_CGROUP_CHARGE_TYPE_CACHE);
2827 return ret; 2796 return ret;
2828 } 2797 }
2829 /* shmem */ 2798 /* shmem */
2830 if (PageSwapCache(page)) { 2799 if (PageSwapCache(page)) {
2831 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2800 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2832 if (!ret) 2801 if (!ret)
2833 __mem_cgroup_commit_charge_swapin(page, mem, 2802 __mem_cgroup_commit_charge_swapin(page, memcg,
2834 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2803 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2835 } else 2804 } else
2836 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2805 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
@@ -2849,7 +2818,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2849 struct page *page, 2818 struct page *page,
2850 gfp_t mask, struct mem_cgroup **ptr) 2819 gfp_t mask, struct mem_cgroup **ptr)
2851{ 2820{
2852 struct mem_cgroup *mem; 2821 struct mem_cgroup *memcg;
2853 int ret; 2822 int ret;
2854 2823
2855 *ptr = NULL; 2824 *ptr = NULL;
@@ -2867,12 +2836,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2867 */ 2836 */
2868 if (!PageSwapCache(page)) 2837 if (!PageSwapCache(page))
2869 goto charge_cur_mm; 2838 goto charge_cur_mm;
2870 mem = try_get_mem_cgroup_from_page(page); 2839 memcg = try_get_mem_cgroup_from_page(page);
2871 if (!mem) 2840 if (!memcg)
2872 goto charge_cur_mm; 2841 goto charge_cur_mm;
2873 *ptr = mem; 2842 *ptr = memcg;
2874 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2843 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2875 css_put(&mem->css); 2844 css_put(&memcg->css);
2876 return ret; 2845 return ret;
2877charge_cur_mm: 2846charge_cur_mm:
2878 if (unlikely(!mm)) 2847 if (unlikely(!mm))
@@ -2932,16 +2901,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2932 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2901 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2933} 2902}
2934 2903
2935void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2904void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2936{ 2905{
2937 if (mem_cgroup_disabled()) 2906 if (mem_cgroup_disabled())
2938 return; 2907 return;
2939 if (!mem) 2908 if (!memcg)
2940 return; 2909 return;
2941 __mem_cgroup_cancel_charge(mem, 1); 2910 __mem_cgroup_cancel_charge(memcg, 1);
2942} 2911}
2943 2912
2944static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 2913static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
2945 unsigned int nr_pages, 2914 unsigned int nr_pages,
2946 const enum charge_type ctype) 2915 const enum charge_type ctype)
2947{ 2916{
@@ -2959,7 +2928,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2959 * uncharges. Then, it's ok to ignore memcg's refcnt. 2928 * uncharges. Then, it's ok to ignore memcg's refcnt.
2960 */ 2929 */
2961 if (!batch->memcg) 2930 if (!batch->memcg)
2962 batch->memcg = mem; 2931 batch->memcg = memcg;
2963 /* 2932 /*
2964 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2933 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2965 * In those cases, all pages freed continuously can be expected to be in 2934 * In those cases, all pages freed continuously can be expected to be in
@@ -2979,7 +2948,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2979 * merge a series of uncharges to an uncharge of res_counter. 2948 * merge a series of uncharges to an uncharge of res_counter.
2981 * If not, we uncharge res_counter one by one. 2949 * If not, we uncharge res_counter one by one.
2981 */ 2950 */
2982 if (batch->memcg != mem) 2951 if (batch->memcg != memcg)
2983 goto direct_uncharge; 2952 goto direct_uncharge;
2984 /* remember freed charge and uncharge it later */ 2953 /* remember freed charge and uncharge it later */
2985 batch->nr_pages++; 2954 batch->nr_pages++;
@@ -2987,11 +2956,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2987 batch->memsw_nr_pages++; 2956 batch->memsw_nr_pages++;
2988 return; 2957 return;
2989direct_uncharge: 2958direct_uncharge:
2990 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 2959 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
2991 if (uncharge_memsw) 2960 if (uncharge_memsw)
2992 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 2961 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2993 if (unlikely(batch->memcg != mem)) 2962 if (unlikely(batch->memcg != memcg))
2994 memcg_oom_recover(mem); 2963 memcg_oom_recover(memcg);
2995 return; 2964 return;
2996} 2965}
2997 2966
@@ -3001,7 +2970,7 @@ direct_uncharge:
3001static struct mem_cgroup * 2970static struct mem_cgroup *
3002__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2971__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3003{ 2972{
3004 struct mem_cgroup *mem = NULL; 2973 struct mem_cgroup *memcg = NULL;
3005 unsigned int nr_pages = 1; 2974 unsigned int nr_pages = 1;
3006 struct page_cgroup *pc; 2975 struct page_cgroup *pc;
3007 2976
@@ -3024,7 +2993,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3024 2993
3025 lock_page_cgroup(pc); 2994 lock_page_cgroup(pc);
3026 2995
3027 mem = pc->mem_cgroup; 2996 memcg = pc->mem_cgroup;
3028 2997
3029 if (!PageCgroupUsed(pc)) 2998 if (!PageCgroupUsed(pc))
3030 goto unlock_out; 2999 goto unlock_out;
@@ -3047,7 +3016,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3047 break; 3016 break;
3048 } 3017 }
3049 3018
3050 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 3019 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
3051 3020
3052 ClearPageCgroupUsed(pc); 3021 ClearPageCgroupUsed(pc);
3053 /* 3022 /*
@@ -3059,18 +3028,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3059 3028
3060 unlock_page_cgroup(pc); 3029 unlock_page_cgroup(pc);
3061 /* 3030 /*
3062 * even after unlock, we have mem->res.usage here and this memcg 3031 * even after unlock, we have memcg->res.usage here and this memcg
3063 * will never be freed. 3032 * will never be freed.
3064 */ 3033 */
3065 memcg_check_events(mem, page); 3034 memcg_check_events(memcg, page);
3066 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3035 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3067 mem_cgroup_swap_statistics(mem, true); 3036 mem_cgroup_swap_statistics(memcg, true);
3068 mem_cgroup_get(mem); 3037 mem_cgroup_get(memcg);
3069 } 3038 }
3070 if (!mem_cgroup_is_root(mem)) 3039 if (!mem_cgroup_is_root(memcg))
3071 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 3040 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3072 3041
3073 return mem; 3042 return memcg;
3074 3043
3075unlock_out: 3044unlock_out:
3076 unlock_page_cgroup(pc); 3045 unlock_page_cgroup(pc);
@@ -3260,7 +3229,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3260int mem_cgroup_prepare_migration(struct page *page, 3229int mem_cgroup_prepare_migration(struct page *page,
3261 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3230 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3262{ 3231{
3263 struct mem_cgroup *mem = NULL; 3232 struct mem_cgroup *memcg = NULL;
3264 struct page_cgroup *pc; 3233 struct page_cgroup *pc;
3265 enum charge_type ctype; 3234 enum charge_type ctype;
3266 int ret = 0; 3235 int ret = 0;
@@ -3274,8 +3243,8 @@ int mem_cgroup_prepare_migration(struct page *page,
3274 pc = lookup_page_cgroup(page); 3243 pc = lookup_page_cgroup(page);
3275 lock_page_cgroup(pc); 3244 lock_page_cgroup(pc);
3276 if (PageCgroupUsed(pc)) { 3245 if (PageCgroupUsed(pc)) {
3277 mem = pc->mem_cgroup; 3246 memcg = pc->mem_cgroup;
3278 css_get(&mem->css); 3247 css_get(&memcg->css);
3279 /* 3248 /*
3280 * At migrating an anonymous page, its mapcount goes down 3249 * At migrating an anonymous page, its mapcount goes down
3281 * to 0 and uncharge() will be called. But, even if it's fully 3250 * to 0 and uncharge() will be called. But, even if it's fully
@@ -3313,12 +3282,12 @@ int mem_cgroup_prepare_migration(struct page *page,
3313 * If the page is not charged at this point, 3282 * If the page is not charged at this point,
3314 * we return here. 3283 * we return here.
3315 */ 3284 */
3316 if (!mem) 3285 if (!memcg)
3317 return 0; 3286 return 0;
3318 3287
3319 *ptr = mem; 3288 *ptr = memcg;
3320 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3289 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3321 css_put(&mem->css);/* drop extra refcnt */ 3290 css_put(&memcg->css);/* drop extra refcnt */
3322 if (ret || *ptr == NULL) { 3291 if (ret || *ptr == NULL) {
3323 if (PageAnon(page)) { 3292 if (PageAnon(page)) {
3324 lock_page_cgroup(pc); 3293 lock_page_cgroup(pc);
@@ -3344,21 +3313,21 @@ int mem_cgroup_prepare_migration(struct page *page,
3344 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3313 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3345 else 3314 else
3346 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3315 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3347 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 3316 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
3348 return ret; 3317 return ret;
3349} 3318}
3350 3319
3351/* remove redundant charge if migration failed*/ 3320/* remove redundant charge if migration failed*/
3352void mem_cgroup_end_migration(struct mem_cgroup *mem, 3321void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3353 struct page *oldpage, struct page *newpage, bool migration_ok) 3322 struct page *oldpage, struct page *newpage, bool migration_ok)
3354{ 3323{
3355 struct page *used, *unused; 3324 struct page *used, *unused;
3356 struct page_cgroup *pc; 3325 struct page_cgroup *pc;
3357 3326
3358 if (!mem) 3327 if (!memcg)
3359 return; 3328 return;
3360 /* blocks rmdir() */ 3329 /* blocks rmdir() */
3361 cgroup_exclude_rmdir(&mem->css); 3330 cgroup_exclude_rmdir(&memcg->css);
3362 if (!migration_ok) { 3331 if (!migration_ok) {
3363 used = oldpage; 3332 used = oldpage;
3364 unused = newpage; 3333 unused = newpage;
@@ -3394,32 +3363,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
3394 * So, rmdir()->pre_destroy() can be called while we do this charge. 3363 * So, rmdir()->pre_destroy() can be called while we do this charge.
3395 * In that case, we need to call pre_destroy() again. check it here. 3364 * In that case, we need to call pre_destroy() again. check it here.
3396 */ 3365 */
3397 cgroup_release_and_wakeup_rmdir(&mem->css); 3366 cgroup_release_and_wakeup_rmdir(&memcg->css);
3398}
3399
3400/*
3401 * A call to try to shrink memory usage on charge failure at shmem's swapin.
3402 * Calling hierarchical_reclaim is not enough because we should update
3403 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
3404 * Moreover, considering the hierarchy, we should reclaim from the mem_over_limit,
3405 * not from the memcg which this page would be charged to.
3406 * try_charge_swapin does all of this work properly.
3407 */
3408int mem_cgroup_shmem_charge_fallback(struct page *page,
3409 struct mm_struct *mm,
3410 gfp_t gfp_mask)
3411{
3412 struct mem_cgroup *mem;
3413 int ret;
3414
3415 if (mem_cgroup_disabled())
3416 return 0;
3417
3418 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
3419 if (!ret)
3420 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
3421
3422 return ret;
3423} 3367}
3424 3368
3425#ifdef CONFIG_DEBUG_VM 3369#ifdef CONFIG_DEBUG_VM
@@ -3498,7 +3442,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3498 /* 3442 /*
3499 * Rather than hide all in some function, I do this in 3443 * Rather than hide all in some function, I do this in
3500 * open coded manner. You see what this really does. 3444 * open coded manner. You see what this really does.
3501 * We have to guarantee mem->res.limit < mem->memsw.limit. 3445 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3502 */ 3446 */
3503 mutex_lock(&set_limit_mutex); 3447 mutex_lock(&set_limit_mutex);
3504 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3448 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3560,7 +3504,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3560 /* 3504 /*
3561 * Rather than hide all in some function, I do this in 3505 * Rather than hide all in some function, I do this in
3562 * open coded manner. You see what this really does. 3506 * open coded manner. You see what this really does.
3563 * We have to guarantee mem->res.limit < mem->memsw.limit. 3507 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3564 */ 3508 */
3565 mutex_lock(&set_limit_mutex); 3509 mutex_lock(&set_limit_mutex);
3566 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3510 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3698,7 +3642,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3698 * This routine traverse page_cgroup in given list and drop them all. 3642 * This routine traverse page_cgroup in given list and drop them all.
3699 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3643 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
3700 */ 3644 */
3701static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3645static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3702 int node, int zid, enum lru_list lru) 3646 int node, int zid, enum lru_list lru)
3703{ 3647{
3704 struct zone *zone; 3648 struct zone *zone;
@@ -3709,7 +3653,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3709 int ret = 0; 3653 int ret = 0;
3710 3654
3711 zone = &NODE_DATA(node)->node_zones[zid]; 3655 zone = &NODE_DATA(node)->node_zones[zid];
3712 mz = mem_cgroup_zoneinfo(mem, node, zid); 3656 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3713 list = &mz->lists[lru]; 3657 list = &mz->lists[lru];
3714 3658
3715 loop = MEM_CGROUP_ZSTAT(mz, lru); 3659 loop = MEM_CGROUP_ZSTAT(mz, lru);
@@ -3736,7 +3680,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3736 3680
3737 page = lookup_cgroup_page(pc); 3681 page = lookup_cgroup_page(pc);
3738 3682
3739 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); 3683 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3740 if (ret == -ENOMEM) 3684 if (ret == -ENOMEM)
3741 break; 3685 break;
3742 3686
@@ -3757,14 +3701,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3757 * make the mem_cgroup's charge 0 if there is no task. 3701 * make the mem_cgroup's charge 0 if there is no task.
3758 * This enables deleting this mem_cgroup. 3702 * This enables deleting this mem_cgroup.
3759 */ 3703 */
3760static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3704static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
3761{ 3705{
3762 int ret; 3706 int ret;
3763 int node, zid, shrink; 3707 int node, zid, shrink;
3764 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3708 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3765 struct cgroup *cgrp = mem->css.cgroup; 3709 struct cgroup *cgrp = memcg->css.cgroup;
3766 3710
3767 css_get(&mem->css); 3711 css_get(&memcg->css);
3768 3712
3769 shrink = 0; 3713 shrink = 0;
3770 /* should free all ? */ 3714 /* should free all ? */
@@ -3780,14 +3724,14 @@ move_account:
3780 goto out; 3724 goto out;
3781 /* This is for making sure all *used* pages are on the LRU. */ 3725 /* This is for making sure all *used* pages are on the LRU. */
3782 lru_add_drain_all(); 3726 lru_add_drain_all();
3783 drain_all_stock_sync(); 3727 drain_all_stock_sync(memcg);
3784 ret = 0; 3728 ret = 0;
3785 mem_cgroup_start_move(mem); 3729 mem_cgroup_start_move(memcg);
3786 for_each_node_state(node, N_HIGH_MEMORY) { 3730 for_each_node_state(node, N_HIGH_MEMORY) {
3787 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3731 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3788 enum lru_list l; 3732 enum lru_list l;
3789 for_each_lru(l) { 3733 for_each_lru(l) {
3790 ret = mem_cgroup_force_empty_list(mem, 3734 ret = mem_cgroup_force_empty_list(memcg,
3791 node, zid, l); 3735 node, zid, l);
3792 if (ret) 3736 if (ret)
3793 break; 3737 break;
@@ -3796,16 +3740,16 @@ move_account:
3796 if (ret) 3740 if (ret)
3797 break; 3741 break;
3798 } 3742 }
3799 mem_cgroup_end_move(mem); 3743 mem_cgroup_end_move(memcg);
3800 memcg_oom_recover(mem); 3744 memcg_oom_recover(memcg);
3801 /* it seems parent cgroup doesn't have enough mem */ 3745 /* it seems parent cgroup doesn't have enough mem */
3802 if (ret == -ENOMEM) 3746 if (ret == -ENOMEM)
3803 goto try_to_free; 3747 goto try_to_free;
3804 cond_resched(); 3748 cond_resched();
3805 /* "ret" should also be checked to ensure all lists are empty. */ 3749 /* "ret" should also be checked to ensure all lists are empty. */
3806 } while (mem->res.usage > 0 || ret); 3750 } while (memcg->res.usage > 0 || ret);
3807out: 3751out:
3808 css_put(&mem->css); 3752 css_put(&memcg->css);
3809 return ret; 3753 return ret;
3810 3754
3811try_to_free: 3755try_to_free:
@@ -3818,15 +3762,15 @@ try_to_free:
3818 lru_add_drain_all(); 3762 lru_add_drain_all();
3819 /* try to free all pages in this cgroup */ 3763 /* try to free all pages in this cgroup */
3820 shrink = 1; 3764 shrink = 1;
3821 while (nr_retries && mem->res.usage > 0) { 3765 while (nr_retries && memcg->res.usage > 0) {
3822 int progress; 3766 int progress;
3823 3767
3824 if (signal_pending(current)) { 3768 if (signal_pending(current)) {
3825 ret = -EINTR; 3769 ret = -EINTR;
3826 goto out; 3770 goto out;
3827 } 3771 }
3828 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3772 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3829 false, get_swappiness(mem)); 3773 false);
3830 if (!progress) { 3774 if (!progress) {
3831 nr_retries--; 3775 nr_retries--;
3832 /* maybe some writeback is necessary */ 3776 /* maybe some writeback is necessary */
@@ -3854,12 +3798,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3854 u64 val) 3798 u64 val)
3855{ 3799{
3856 int retval = 0; 3800 int retval = 0;
3857 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3801 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3858 struct cgroup *parent = cont->parent; 3802 struct cgroup *parent = cont->parent;
3859 struct mem_cgroup *parent_mem = NULL; 3803 struct mem_cgroup *parent_memcg = NULL;
3860 3804
3861 if (parent) 3805 if (parent)
3862 parent_mem = mem_cgroup_from_cont(parent); 3806 parent_memcg = mem_cgroup_from_cont(parent);
3863 3807
3864 cgroup_lock(); 3808 cgroup_lock();
3865 /* 3809 /*
@@ -3870,10 +3814,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3870 * For the root cgroup, parent_mem is NULL, we allow value to be 3814 * For the root cgroup, parent_mem is NULL, we allow value to be
3871 * set if there are no children. 3815 * set if there are no children.
3872 */ 3816 */
3873 if ((!parent_mem || !parent_mem->use_hierarchy) && 3817 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3874 (val == 1 || val == 0)) { 3818 (val == 1 || val == 0)) {
3875 if (list_empty(&cont->children)) 3819 if (list_empty(&cont->children))
3876 mem->use_hierarchy = val; 3820 memcg->use_hierarchy = val;
3877 else 3821 else
3878 retval = -EBUSY; 3822 retval = -EBUSY;
3879 } else 3823 } else
@@ -3884,14 +3828,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3884} 3828}
3885 3829
3886 3830
3887static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, 3831static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3888 enum mem_cgroup_stat_index idx) 3832 enum mem_cgroup_stat_index idx)
3889{ 3833{
3890 struct mem_cgroup *iter; 3834 struct mem_cgroup *iter;
3891 long val = 0; 3835 long val = 0;
3892 3836
3893 /* Per-cpu values can be negative, use a signed accumulator */ 3837 /* Per-cpu values can be negative, use a signed accumulator */
3894 for_each_mem_cgroup_tree(iter, mem) 3838 for_each_mem_cgroup_tree(iter, memcg)
3895 val += mem_cgroup_read_stat(iter, idx); 3839 val += mem_cgroup_read_stat(iter, idx);
3896 3840
3897 if (val < 0) /* race ? */ 3841 if (val < 0) /* race ? */
@@ -3899,29 +3843,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3899 return val; 3843 return val;
3900} 3844}
3901 3845
3902static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3846static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3903{ 3847{
3904 u64 val; 3848 u64 val;
3905 3849
3906 if (!mem_cgroup_is_root(mem)) { 3850 if (!mem_cgroup_is_root(memcg)) {
3907 if (!swap) 3851 if (!swap)
3908 return res_counter_read_u64(&mem->res, RES_USAGE); 3852 return res_counter_read_u64(&memcg->res, RES_USAGE);
3909 else 3853 else
3910 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3854 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
3911 } 3855 }
3912 3856
3913 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); 3857 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
3914 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); 3858 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3915 3859
3916 if (swap) 3860 if (swap)
3917 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3861 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
3918 3862
3919 return val << PAGE_SHIFT; 3863 return val << PAGE_SHIFT;
3920} 3864}
3921 3865
3922static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3866static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3923{ 3867{
3924 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3868 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3925 u64 val; 3869 u64 val;
3926 int type, name; 3870 int type, name;
3927 3871
@@ -3930,15 +3874,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3930 switch (type) { 3874 switch (type) {
3931 case _MEM: 3875 case _MEM:
3932 if (name == RES_USAGE) 3876 if (name == RES_USAGE)
3933 val = mem_cgroup_usage(mem, false); 3877 val = mem_cgroup_usage(memcg, false);
3934 else 3878 else
3935 val = res_counter_read_u64(&mem->res, name); 3879 val = res_counter_read_u64(&memcg->res, name);
3936 break; 3880 break;
3937 case _MEMSWAP: 3881 case _MEMSWAP:
3938 if (name == RES_USAGE) 3882 if (name == RES_USAGE)
3939 val = mem_cgroup_usage(mem, true); 3883 val = mem_cgroup_usage(memcg, true);
3940 else 3884 else
3941 val = res_counter_read_u64(&mem->memsw, name); 3885 val = res_counter_read_u64(&memcg->memsw, name);
3942 break; 3886 break;
3943 default: 3887 default:
3944 BUG(); 3888 BUG();
@@ -4026,24 +3970,24 @@ out:
4026 3970
4027static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3971static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4028{ 3972{
4029 struct mem_cgroup *mem; 3973 struct mem_cgroup *memcg;
4030 int type, name; 3974 int type, name;
4031 3975
4032 mem = mem_cgroup_from_cont(cont); 3976 memcg = mem_cgroup_from_cont(cont);
4033 type = MEMFILE_TYPE(event); 3977 type = MEMFILE_TYPE(event);
4034 name = MEMFILE_ATTR(event); 3978 name = MEMFILE_ATTR(event);
4035 switch (name) { 3979 switch (name) {
4036 case RES_MAX_USAGE: 3980 case RES_MAX_USAGE:
4037 if (type == _MEM) 3981 if (type == _MEM)
4038 res_counter_reset_max(&mem->res); 3982 res_counter_reset_max(&memcg->res);
4039 else 3983 else
4040 res_counter_reset_max(&mem->memsw); 3984 res_counter_reset_max(&memcg->memsw);
4041 break; 3985 break;
4042 case RES_FAILCNT: 3986 case RES_FAILCNT:
4043 if (type == _MEM) 3987 if (type == _MEM)
4044 res_counter_reset_failcnt(&mem->res); 3988 res_counter_reset_failcnt(&memcg->res);
4045 else 3989 else
4046 res_counter_reset_failcnt(&mem->memsw); 3990 res_counter_reset_failcnt(&memcg->memsw);
4047 break; 3991 break;
4048 } 3992 }
4049 3993
@@ -4060,7 +4004,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4060static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4004static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4061 struct cftype *cft, u64 val) 4005 struct cftype *cft, u64 val)
4062{ 4006{
4063 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4007 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4064 4008
4065 if (val >= (1 << NR_MOVE_TYPE)) 4009 if (val >= (1 << NR_MOVE_TYPE))
4066 return -EINVAL; 4010 return -EINVAL;
@@ -4070,7 +4014,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4070 * inconsistent. 4014 * inconsistent.
4071 */ 4015 */
4072 cgroup_lock(); 4016 cgroup_lock();
4073 mem->move_charge_at_immigrate = val; 4017 memcg->move_charge_at_immigrate = val;
4074 cgroup_unlock(); 4018 cgroup_unlock();
4075 4019
4076 return 0; 4020 return 0;
@@ -4127,49 +4071,49 @@ struct {
4127 4071
4128 4072
4129static void 4073static void
4130mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4074mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4131{ 4075{
4132 s64 val; 4076 s64 val;
4133 4077
4134 /* per cpu stat */ 4078 /* per cpu stat */
4135 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 4079 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4136 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4080 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4137 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 4081 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4138 s->stat[MCS_RSS] += val * PAGE_SIZE; 4082 s->stat[MCS_RSS] += val * PAGE_SIZE;
4139 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 4083 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4140 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4084 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4141 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); 4085 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4142 s->stat[MCS_PGPGIN] += val; 4086 s->stat[MCS_PGPGIN] += val;
4143 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); 4087 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4144 s->stat[MCS_PGPGOUT] += val; 4088 s->stat[MCS_PGPGOUT] += val;
4145 if (do_swap_account) { 4089 if (do_swap_account) {
4146 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4090 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4147 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4091 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4148 } 4092 }
4149 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); 4093 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4150 s->stat[MCS_PGFAULT] += val; 4094 s->stat[MCS_PGFAULT] += val;
4151 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); 4095 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4152 s->stat[MCS_PGMAJFAULT] += val; 4096 s->stat[MCS_PGMAJFAULT] += val;
4153 4097
4154 /* per zone stat */ 4098 /* per zone stat */
4155 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 4099 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4156 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4100 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4157 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 4101 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4158 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4102 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4159 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 4103 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4160 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4104 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4161 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 4105 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4162 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4106 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4163 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 4107 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4164 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4108 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4165} 4109}
4166 4110
4167static void 4111static void
4168mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4112mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4169{ 4113{
4170 struct mem_cgroup *iter; 4114 struct mem_cgroup *iter;
4171 4115
4172 for_each_mem_cgroup_tree(iter, mem) 4116 for_each_mem_cgroup_tree(iter, memcg)
4173 mem_cgroup_get_local_stat(iter, s); 4117 mem_cgroup_get_local_stat(iter, s);
4174} 4118}
4175 4119
@@ -4182,35 +4126,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4182 struct cgroup *cont = m->private; 4126 struct cgroup *cont = m->private;
4183 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4127 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4184 4128
4185 total_nr = mem_cgroup_nr_lru_pages(mem_cont); 4129 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4186 seq_printf(m, "total=%lu", total_nr); 4130 seq_printf(m, "total=%lu", total_nr);
4187 for_each_node_state(nid, N_HIGH_MEMORY) { 4131 for_each_node_state(nid, N_HIGH_MEMORY) {
4188 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); 4132 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4189 seq_printf(m, " N%d=%lu", nid, node_nr); 4133 seq_printf(m, " N%d=%lu", nid, node_nr);
4190 } 4134 }
4191 seq_putc(m, '\n'); 4135 seq_putc(m, '\n');
4192 4136
4193 file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); 4137 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4194 seq_printf(m, "file=%lu", file_nr); 4138 seq_printf(m, "file=%lu", file_nr);
4195 for_each_node_state(nid, N_HIGH_MEMORY) { 4139 for_each_node_state(nid, N_HIGH_MEMORY) {
4196 node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); 4140 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4141 LRU_ALL_FILE);
4197 seq_printf(m, " N%d=%lu", nid, node_nr); 4142 seq_printf(m, " N%d=%lu", nid, node_nr);
4198 } 4143 }
4199 seq_putc(m, '\n'); 4144 seq_putc(m, '\n');
4200 4145
4201 anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); 4146 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4202 seq_printf(m, "anon=%lu", anon_nr); 4147 seq_printf(m, "anon=%lu", anon_nr);
4203 for_each_node_state(nid, N_HIGH_MEMORY) { 4148 for_each_node_state(nid, N_HIGH_MEMORY) {
4204 node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); 4149 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4150 LRU_ALL_ANON);
4205 seq_printf(m, " N%d=%lu", nid, node_nr); 4151 seq_printf(m, " N%d=%lu", nid, node_nr);
4206 } 4152 }
4207 seq_putc(m, '\n'); 4153 seq_putc(m, '\n');
4208 4154
4209 unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); 4155 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4210 seq_printf(m, "unevictable=%lu", unevictable_nr); 4156 seq_printf(m, "unevictable=%lu", unevictable_nr);
4211 for_each_node_state(nid, N_HIGH_MEMORY) { 4157 for_each_node_state(nid, N_HIGH_MEMORY) {
4212 node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, 4158 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4213 nid); 4159 BIT(LRU_UNEVICTABLE));
4214 seq_printf(m, " N%d=%lu", nid, node_nr); 4160 seq_printf(m, " N%d=%lu", nid, node_nr);
4215 } 4161 }
4216 seq_putc(m, '\n'); 4162 seq_putc(m, '\n');
@@ -4253,8 +4199,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4253 } 4199 }
4254 4200
4255#ifdef CONFIG_DEBUG_VM 4201#ifdef CONFIG_DEBUG_VM
4256 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
4257
4258 { 4202 {
4259 int nid, zid; 4203 int nid, zid;
4260 struct mem_cgroup_per_zone *mz; 4204 struct mem_cgroup_per_zone *mz;
@@ -4288,7 +4232,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4288{ 4232{
4289 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4233 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4290 4234
4291 return get_swappiness(memcg); 4235 return mem_cgroup_swappiness(memcg);
4292} 4236}
4293 4237
4294static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4238static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
@@ -4391,20 +4335,20 @@ static int compare_thresholds(const void *a, const void *b)
4391 return _a->threshold - _b->threshold; 4335 return _a->threshold - _b->threshold;
4392} 4336}
4393 4337
4394static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4338static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4395{ 4339{
4396 struct mem_cgroup_eventfd_list *ev; 4340 struct mem_cgroup_eventfd_list *ev;
4397 4341
4398 list_for_each_entry(ev, &mem->oom_notify, list) 4342 list_for_each_entry(ev, &memcg->oom_notify, list)
4399 eventfd_signal(ev->eventfd, 1); 4343 eventfd_signal(ev->eventfd, 1);
4400 return 0; 4344 return 0;
4401} 4345}
4402 4346
4403static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4347static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4404{ 4348{
4405 struct mem_cgroup *iter; 4349 struct mem_cgroup *iter;
4406 4350
4407 for_each_mem_cgroup_tree(iter, mem) 4351 for_each_mem_cgroup_tree(iter, memcg)
4408 mem_cgroup_oom_notify_cb(iter); 4352 mem_cgroup_oom_notify_cb(iter);
4409} 4353}
4410 4354
@@ -4578,15 +4522,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4578 if (!event) 4522 if (!event)
4579 return -ENOMEM; 4523 return -ENOMEM;
4580 4524
4581 mutex_lock(&memcg_oom_mutex); 4525 spin_lock(&memcg_oom_lock);
4582 4526
4583 event->eventfd = eventfd; 4527 event->eventfd = eventfd;
4584 list_add(&event->list, &memcg->oom_notify); 4528 list_add(&event->list, &memcg->oom_notify);
4585 4529
4586 /* already in OOM ? */ 4530 /* already in OOM ? */
4587 if (atomic_read(&memcg->oom_lock)) 4531 if (atomic_read(&memcg->under_oom))
4588 eventfd_signal(eventfd, 1); 4532 eventfd_signal(eventfd, 1);
4589 mutex_unlock(&memcg_oom_mutex); 4533 spin_unlock(&memcg_oom_lock);
4590 4534
4591 return 0; 4535 return 0;
4592} 4536}
@@ -4594,32 +4538,32 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4594static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4538static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4595 struct cftype *cft, struct eventfd_ctx *eventfd) 4539 struct cftype *cft, struct eventfd_ctx *eventfd)
4596{ 4540{
4597 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4541 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4598 struct mem_cgroup_eventfd_list *ev, *tmp; 4542 struct mem_cgroup_eventfd_list *ev, *tmp;
4599 int type = MEMFILE_TYPE(cft->private); 4543 int type = MEMFILE_TYPE(cft->private);
4600 4544
4601 BUG_ON(type != _OOM_TYPE); 4545 BUG_ON(type != _OOM_TYPE);
4602 4546
4603 mutex_lock(&memcg_oom_mutex); 4547 spin_lock(&memcg_oom_lock);
4604 4548
4605 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4549 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4606 if (ev->eventfd == eventfd) { 4550 if (ev->eventfd == eventfd) {
4607 list_del(&ev->list); 4551 list_del(&ev->list);
4608 kfree(ev); 4552 kfree(ev);
4609 } 4553 }
4610 } 4554 }
4611 4555
4612 mutex_unlock(&memcg_oom_mutex); 4556 spin_unlock(&memcg_oom_lock);
4613} 4557}
4614 4558
4615static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4559static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4616 struct cftype *cft, struct cgroup_map_cb *cb) 4560 struct cftype *cft, struct cgroup_map_cb *cb)
4617{ 4561{
4618 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4562 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4619 4563
4620 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4564 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4621 4565
4622 if (atomic_read(&mem->oom_lock)) 4566 if (atomic_read(&memcg->under_oom))
4623 cb->fill(cb, "under_oom", 1); 4567 cb->fill(cb, "under_oom", 1);
4624 else 4568 else
4625 cb->fill(cb, "under_oom", 0); 4569 cb->fill(cb, "under_oom", 0);
@@ -4629,7 +4573,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4629static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4573static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4630 struct cftype *cft, u64 val) 4574 struct cftype *cft, u64 val)
4631{ 4575{
4632 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4576 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4633 struct mem_cgroup *parent; 4577 struct mem_cgroup *parent;
4634 4578
4635 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4579 /* cannot set to root cgroup and only 0 and 1 are allowed */
@@ -4641,13 +4585,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4641 cgroup_lock(); 4585 cgroup_lock();
4642 /* oom-kill-disable is a flag for subhierarchy. */ 4586 /* oom-kill-disable is a flag for subhierarchy. */
4643 if ((parent->use_hierarchy) || 4587 if ((parent->use_hierarchy) ||
4644 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4588 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4645 cgroup_unlock(); 4589 cgroup_unlock();
4646 return -EINVAL; 4590 return -EINVAL;
4647 } 4591 }
4648 mem->oom_kill_disable = val; 4592 memcg->oom_kill_disable = val;
4649 if (!val) 4593 if (!val)
4650 memcg_oom_recover(mem); 4594 memcg_oom_recover(memcg);
4651 cgroup_unlock(); 4595 cgroup_unlock();
4652 return 0; 4596 return 0;
4653} 4597}
@@ -4783,7 +4727,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4783} 4727}
4784#endif 4728#endif
4785 4729
4786static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4730static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4787{ 4731{
4788 struct mem_cgroup_per_node *pn; 4732 struct mem_cgroup_per_node *pn;
4789 struct mem_cgroup_per_zone *mz; 4733 struct mem_cgroup_per_zone *mz;
@@ -4803,21 +4747,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4803 if (!pn) 4747 if (!pn)
4804 return 1; 4748 return 1;
4805 4749
4806 mem->info.nodeinfo[node] = pn;
4807 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4750 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4808 mz = &pn->zoneinfo[zone]; 4751 mz = &pn->zoneinfo[zone];
4809 for_each_lru(l) 4752 for_each_lru(l)
4810 INIT_LIST_HEAD(&mz->lists[l]); 4753 INIT_LIST_HEAD(&mz->lists[l]);
4811 mz->usage_in_excess = 0; 4754 mz->usage_in_excess = 0;
4812 mz->on_tree = false; 4755 mz->on_tree = false;
4813 mz->mem = mem; 4756 mz->mem = memcg;
4814 } 4757 }
4758 memcg->info.nodeinfo[node] = pn;
4815 return 0; 4759 return 0;
4816} 4760}
4817 4761
4818static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4762static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4819{ 4763{
4820 kfree(mem->info.nodeinfo[node]); 4764 kfree(memcg->info.nodeinfo[node]);
4821} 4765}
4822 4766
4823static struct mem_cgroup *mem_cgroup_alloc(void) 4767static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -4859,51 +4803,51 @@ out_free:
4859 * Removal of cgroup itself succeeds regardless of refs from swap. 4803 * Removal of cgroup itself succeeds regardless of refs from swap.
4860 */ 4804 */
4861 4805
4862static void __mem_cgroup_free(struct mem_cgroup *mem) 4806static void __mem_cgroup_free(struct mem_cgroup *memcg)
4863{ 4807{
4864 int node; 4808 int node;
4865 4809
4866 mem_cgroup_remove_from_trees(mem); 4810 mem_cgroup_remove_from_trees(memcg);
4867 free_css_id(&mem_cgroup_subsys, &mem->css); 4811 free_css_id(&mem_cgroup_subsys, &memcg->css);
4868 4812
4869 for_each_node_state(node, N_POSSIBLE) 4813 for_each_node_state(node, N_POSSIBLE)
4870 free_mem_cgroup_per_zone_info(mem, node); 4814 free_mem_cgroup_per_zone_info(memcg, node);
4871 4815
4872 free_percpu(mem->stat); 4816 free_percpu(memcg->stat);
4873 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4817 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4874 kfree(mem); 4818 kfree(memcg);
4875 else 4819 else
4876 vfree(mem); 4820 vfree(memcg);
4877} 4821}
4878 4822
4879static void mem_cgroup_get(struct mem_cgroup *mem) 4823static void mem_cgroup_get(struct mem_cgroup *memcg)
4880{ 4824{
4881 atomic_inc(&mem->refcnt); 4825 atomic_inc(&memcg->refcnt);
4882} 4826}
4883 4827
4884static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4828static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4885{ 4829{
4886 if (atomic_sub_and_test(count, &mem->refcnt)) { 4830 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4887 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4831 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4888 __mem_cgroup_free(mem); 4832 __mem_cgroup_free(memcg);
4889 if (parent) 4833 if (parent)
4890 mem_cgroup_put(parent); 4834 mem_cgroup_put(parent);
4891 } 4835 }
4892} 4836}
4893 4837
4894static void mem_cgroup_put(struct mem_cgroup *mem) 4838static void mem_cgroup_put(struct mem_cgroup *memcg)
4895{ 4839{
4896 __mem_cgroup_put(mem, 1); 4840 __mem_cgroup_put(memcg, 1);
4897} 4841}
4898 4842
4899/* 4843/*
4900 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4844 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
4901 */ 4845 */
4902static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4846static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4903{ 4847{
4904 if (!mem->res.parent) 4848 if (!memcg->res.parent)
4905 return NULL; 4849 return NULL;
4906 return mem_cgroup_from_res_counter(mem->res.parent, res); 4850 return mem_cgroup_from_res_counter(memcg->res.parent, res);
4907} 4851}
4908 4852
4909#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4853#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4946,16 +4890,16 @@ static int mem_cgroup_soft_limit_tree_init(void)
4946static struct cgroup_subsys_state * __ref 4890static struct cgroup_subsys_state * __ref
4947mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4891mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4948{ 4892{
4949 struct mem_cgroup *mem, *parent; 4893 struct mem_cgroup *memcg, *parent;
4950 long error = -ENOMEM; 4894 long error = -ENOMEM;
4951 int node; 4895 int node;
4952 4896
4953 mem = mem_cgroup_alloc(); 4897 memcg = mem_cgroup_alloc();
4954 if (!mem) 4898 if (!memcg)
4955 return ERR_PTR(error); 4899 return ERR_PTR(error);
4956 4900
4957 for_each_node_state(node, N_POSSIBLE) 4901 for_each_node_state(node, N_POSSIBLE)
4958 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4902 if (alloc_mem_cgroup_per_zone_info(memcg, node))
4959 goto free_out; 4903 goto free_out;
4960 4904
4961 /* root ? */ 4905 /* root ? */
@@ -4963,7 +4907,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4963 int cpu; 4907 int cpu;
4964 enable_swap_cgroup(); 4908 enable_swap_cgroup();
4965 parent = NULL; 4909 parent = NULL;
4966 root_mem_cgroup = mem; 4910 root_mem_cgroup = memcg;
4967 if (mem_cgroup_soft_limit_tree_init()) 4911 if (mem_cgroup_soft_limit_tree_init())
4968 goto free_out; 4912 goto free_out;
4969 for_each_possible_cpu(cpu) { 4913 for_each_possible_cpu(cpu) {
@@ -4974,13 +4918,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4974 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4918 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4975 } else { 4919 } else {
4976 parent = mem_cgroup_from_cont(cont->parent); 4920 parent = mem_cgroup_from_cont(cont->parent);
4977 mem->use_hierarchy = parent->use_hierarchy; 4921 memcg->use_hierarchy = parent->use_hierarchy;
4978 mem->oom_kill_disable = parent->oom_kill_disable; 4922 memcg->oom_kill_disable = parent->oom_kill_disable;
4979 } 4923 }
4980 4924
4981 if (parent && parent->use_hierarchy) { 4925 if (parent && parent->use_hierarchy) {
4982 res_counter_init(&mem->res, &parent->res); 4926 res_counter_init(&memcg->res, &parent->res);
4983 res_counter_init(&mem->memsw, &parent->memsw); 4927 res_counter_init(&memcg->memsw, &parent->memsw);
4984 /* 4928 /*
4985 * We increment refcnt of the parent to ensure that we can 4929 * We increment refcnt of the parent to ensure that we can
4986 * safely access it on res_counter_charge/uncharge. 4930 * safely access it on res_counter_charge/uncharge.
@@ -4989,21 +4933,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4989 */ 4933 */
4990 mem_cgroup_get(parent); 4934 mem_cgroup_get(parent);
4991 } else { 4935 } else {
4992 res_counter_init(&mem->res, NULL); 4936 res_counter_init(&memcg->res, NULL);
4993 res_counter_init(&mem->memsw, NULL); 4937 res_counter_init(&memcg->memsw, NULL);
4994 } 4938 }
4995 mem->last_scanned_child = 0; 4939 memcg->last_scanned_child = 0;
4996 mem->last_scanned_node = MAX_NUMNODES; 4940 memcg->last_scanned_node = MAX_NUMNODES;
4997 INIT_LIST_HEAD(&mem->oom_notify); 4941 INIT_LIST_HEAD(&memcg->oom_notify);
4998 4942
4999 if (parent) 4943 if (parent)
5000 mem->swappiness = get_swappiness(parent); 4944 memcg->swappiness = mem_cgroup_swappiness(parent);
5001 atomic_set(&mem->refcnt, 1); 4945 atomic_set(&memcg->refcnt, 1);
5002 mem->move_charge_at_immigrate = 0; 4946 memcg->move_charge_at_immigrate = 0;
5003 mutex_init(&mem->thresholds_lock); 4947 mutex_init(&memcg->thresholds_lock);
5004 return &mem->css; 4948 return &memcg->css;
5005free_out: 4949free_out:
5006 __mem_cgroup_free(mem); 4950 __mem_cgroup_free(memcg);
5007 root_mem_cgroup = NULL; 4951 root_mem_cgroup = NULL;
5008 return ERR_PTR(error); 4952 return ERR_PTR(error);
5009} 4953}
@@ -5011,17 +4955,17 @@ free_out:
5011static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4955static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
5012 struct cgroup *cont) 4956 struct cgroup *cont)
5013{ 4957{
5014 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4958 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5015 4959
5016 return mem_cgroup_force_empty(mem, false); 4960 return mem_cgroup_force_empty(memcg, false);
5017} 4961}
5018 4962
5019static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4963static void mem_cgroup_destroy(struct cgroup_subsys *ss,
5020 struct cgroup *cont) 4964 struct cgroup *cont)
5021{ 4965{
5022 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4966 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5023 4967
5024 mem_cgroup_put(mem); 4968 mem_cgroup_put(memcg);
5025} 4969}
5026 4970
5027static int mem_cgroup_populate(struct cgroup_subsys *ss, 4971static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -5044,9 +4988,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
5044{ 4988{
5045 int ret = 0; 4989 int ret = 0;
5046 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4990 int batch_count = PRECHARGE_COUNT_AT_ONCE;
5047 struct mem_cgroup *mem = mc.to; 4991 struct mem_cgroup *memcg = mc.to;
5048 4992
5049 if (mem_cgroup_is_root(mem)) { 4993 if (mem_cgroup_is_root(memcg)) {
5050 mc.precharge += count; 4994 mc.precharge += count;
5051 /* we don't need css_get for root */ 4995 /* we don't need css_get for root */
5052 return ret; 4996 return ret;
@@ -5055,16 +4999,16 @@ static int mem_cgroup_do_precharge(unsigned long count)
5055 if (count > 1) { 4999 if (count > 1) {
5056 struct res_counter *dummy; 5000 struct res_counter *dummy;
5057 /* 5001 /*
5058 * "mem" cannot be under rmdir() because we've already checked 5002 * "memcg" cannot be under rmdir() because we've already checked
5059 * by cgroup_lock_live_cgroup() that it is not removed and we 5003 * by cgroup_lock_live_cgroup() that it is not removed and we
5060 * are still under the same cgroup_mutex. So we can postpone 5004 * are still under the same cgroup_mutex. So we can postpone
5061 * css_get(). 5005 * css_get().
5062 */ 5006 */
5063 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 5007 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
5064 goto one_by_one; 5008 goto one_by_one;
5065 if (do_swap_account && res_counter_charge(&mem->memsw, 5009 if (do_swap_account && res_counter_charge(&memcg->memsw,
5066 PAGE_SIZE * count, &dummy)) { 5010 PAGE_SIZE * count, &dummy)) {
5067 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 5011 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
5068 goto one_by_one; 5012 goto one_by_one;
5069 } 5013 }
5070 mc.precharge += count; 5014 mc.precharge += count;
@@ -5081,8 +5025,9 @@ one_by_one:
5081 batch_count = PRECHARGE_COUNT_AT_ONCE; 5025 batch_count = PRECHARGE_COUNT_AT_ONCE;
5082 cond_resched(); 5026 cond_resched();
5083 } 5027 }
5084 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 5028 ret = __mem_cgroup_try_charge(NULL,
5085 if (ret || !mem) 5029 GFP_KERNEL, 1, &memcg, false);
5030 if (ret || !memcg)
5086 /* mem_cgroup_clear_mc() will do uncharge later */ 5031 /* mem_cgroup_clear_mc() will do uncharge later */
5087 return -ENOMEM; 5032 return -ENOMEM;
5088 mc.precharge++; 5033 mc.precharge++;
@@ -5181,15 +5126,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5181 pgoff = pte_to_pgoff(ptent); 5126 pgoff = pte_to_pgoff(ptent);
5182 5127
5183 /* page is moved even if it's not RSS of this task(page-faulted). */ 5128 /* page is moved even if it's not RSS of this task(page-faulted). */
5184 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 5129 page = find_get_page(mapping, pgoff);
5185 page = find_get_page(mapping, pgoff); 5130
5186 } else { /* shmem/tmpfs file. we should take account of swap too. */ 5131#ifdef CONFIG_SWAP
5187 swp_entry_t ent; 5132 /* shmem/tmpfs may report page out on swap: account for that too. */
5188 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 5133 if (radix_tree_exceptional_entry(page)) {
5134 swp_entry_t swap = radix_to_swp_entry(page);
5189 if (do_swap_account) 5135 if (do_swap_account)
5190 entry->val = ent.val; 5136 *entry = swap;
5137 page = find_get_page(&swapper_space, swap.val);
5191 } 5138 }
5192 5139#endif
5193 return page; 5140 return page;
5194} 5141}
5195 5142
@@ -5354,13 +5301,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5354 struct task_struct *p) 5301 struct task_struct *p)
5355{ 5302{
5356 int ret = 0; 5303 int ret = 0;
5357 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5304 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
5358 5305
5359 if (mem->move_charge_at_immigrate) { 5306 if (memcg->move_charge_at_immigrate) {
5360 struct mm_struct *mm; 5307 struct mm_struct *mm;
5361 struct mem_cgroup *from = mem_cgroup_from_task(p); 5308 struct mem_cgroup *from = mem_cgroup_from_task(p);
5362 5309
5363 VM_BUG_ON(from == mem); 5310 VM_BUG_ON(from == memcg);
5364 5311
5365 mm = get_task_mm(p); 5312 mm = get_task_mm(p);
5366 if (!mm) 5313 if (!mm)
@@ -5375,7 +5322,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5375 mem_cgroup_start_move(from); 5322 mem_cgroup_start_move(from);
5376 spin_lock(&mc.lock); 5323 spin_lock(&mc.lock);
5377 mc.from = from; 5324 mc.from = from;
5378 mc.to = mem; 5325 mc.to = memcg;
5379 spin_unlock(&mc.lock); 5326 spin_unlock(&mc.lock);
5380 /* We set mc.moving_task later */ 5327 /* We set mc.moving_task later */
5381 5328
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059c..06d3479513aa 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -42,6 +42,7 @@
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/ksm.h> 43#include <linux/ksm.h>
44#include <linux/rmap.h> 44#include <linux/rmap.h>
45#include <linux/export.h>
45#include <linux/pagemap.h> 46#include <linux/pagemap.h>
46#include <linux/swap.h> 47#include <linux/swap.h>
47#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
@@ -53,6 +54,7 @@
53#include <linux/hugetlb.h> 54#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h> 55#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h> 56#include <linux/mm_inline.h>
57#include <linux/kfifo.h>
56#include "internal.h" 58#include "internal.h"
57 59
58int sysctl_memory_failure_early_kill __read_mostly = 0; 60int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1178,6 +1180,97 @@ void memory_failure(unsigned long pfn, int trapno)
1178 __memory_failure(pfn, trapno, 0); 1180 __memory_failure(pfn, trapno, 0);
1179} 1181}
1180 1182
1183#define MEMORY_FAILURE_FIFO_ORDER 4
1184#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1185
1186struct memory_failure_entry {
1187 unsigned long pfn;
1188 int trapno;
1189 int flags;
1190};
1191
1192struct memory_failure_cpu {
1193 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1194 MEMORY_FAILURE_FIFO_SIZE);
1195 spinlock_t lock;
1196 struct work_struct work;
1197};
1198
1199static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1200
1201/**
1202 * memory_failure_queue - Schedule handling memory failure of a page.
1203 * @pfn: Page Number of the corrupted page
1204 * @trapno: Trap number reported in the signal to user space.
1205 * @flags: Flags for memory failure handling
1206 *
1207 * This function is called by the low level hardware error handler
1208 * when it detects hardware memory corruption of a page. It schedules
1209 * the recovery of the error page, including dropping pages, killing
1210 * processes etc.
1211 *
1212 * The function is primarily of use for corruptions that
1213 * happen outside the current execution context (e.g. when
1214 * detected by a background scrubber)
1215 *
1216 * Can run in IRQ context.
1217 */
1218void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1219{
1220 struct memory_failure_cpu *mf_cpu;
1221 unsigned long proc_flags;
1222 struct memory_failure_entry entry = {
1223 .pfn = pfn,
1224 .trapno = trapno,
1225 .flags = flags,
1226 };
1227
1228 mf_cpu = &get_cpu_var(memory_failure_cpu);
1229 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1230 if (kfifo_put(&mf_cpu->fifo, &entry))
1231 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1232 else
1233 pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
1234 pfn);
1235 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1236 put_cpu_var(memory_failure_cpu);
1237}
1238EXPORT_SYMBOL_GPL(memory_failure_queue);
1239
1240static void memory_failure_work_func(struct work_struct *work)
1241{
1242 struct memory_failure_cpu *mf_cpu;
1243 struct memory_failure_entry entry = { 0, };
1244 unsigned long proc_flags;
1245 int gotten;
1246
1247 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1248 for (;;) {
1249 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1250 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1251 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1252 if (!gotten)
1253 break;
1254 __memory_failure(entry.pfn, entry.trapno, entry.flags);
1255 }
1256}
1257
1258static int __init memory_failure_init(void)
1259{
1260 struct memory_failure_cpu *mf_cpu;
1261 int cpu;
1262
1263 for_each_possible_cpu(cpu) {
1264 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1265 spin_lock_init(&mf_cpu->lock);
1266 INIT_KFIFO(mf_cpu->fifo);
1267 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1268 }
1269
1270 return 0;
1271}
1272core_initcall(memory_failure_init);
1273
1181/** 1274/**
1182 * unpoison_memory - Unpoison a previously poisoned page 1275 * unpoison_memory - Unpoison a previously poisoned page
1183 * @pfn: Page number of the to be unpoisoned page 1276 * @pfn: Page number of the to be unpoisoned page
@@ -1218,7 +1311,7 @@ int unpoison_memory(unsigned long pfn)
1218 * to the end. 1311 * to the end.
1219 */ 1312 */
1220 if (PageHuge(page)) { 1313 if (PageHuge(page)) {
1221 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); 1314 pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1222 return 0; 1315 return 0;
1223 } 1316 }
1224 if (TestClearPageHWPoison(p)) 1317 if (TestClearPageHWPoison(p))
@@ -1327,7 +1420,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1327 1420
1328 if (PageHWPoison(hpage)) { 1421 if (PageHWPoison(hpage)) {
1329 put_page(hpage); 1422 put_page(hpage);
1330 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); 1423 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1331 return -EBUSY; 1424 return -EBUSY;
1332 } 1425 }
1333 1426
@@ -1341,8 +1434,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
1341 list_for_each_entry_safe(page1, page2, &pagelist, lru) 1434 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1342 put_page(page1); 1435 put_page(page1);
1343 1436
1344 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1345 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1346 if (ret > 0) 1439 if (ret > 0)
1347 ret = -EIO; 1440 ret = -EIO;
1348 return ret; 1441 return ret;
@@ -1413,7 +1506,7 @@ int soft_offline_page(struct page *page, int flags)
1413 } 1506 }
1414 if (!PageLRU(page)) { 1507 if (!PageLRU(page)) {
1415 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", 1508 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1416 pfn, page->flags); 1509 pfn, page->flags);
1417 return -EIO; 1510 return -EIO;
1418 } 1511 }
1419 1512
@@ -1474,7 +1567,7 @@ int soft_offline_page(struct page *page, int flags)
1474 } 1567 }
1475 } else { 1568 } else {
1476 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1569 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1477 pfn, ret, page_count(page), page->flags); 1570 pfn, ret, page_count(page), page->flags);
1478 } 1571 }
1479 if (ret) 1572 if (ret)
1480 return ret; 1573 return ret;
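
memory_failure_queue() above is the piece intended for callers that detect corruption outside the faulting context and may be running in IRQ context: the entry is pushed onto a per-CPU kfifo and the heavy lifting happens later in the per-CPU work item. A hypothetical caller in a background scrubber might look roughly like this (scrubber_report_bad_page() is an invented name; only memory_failure_queue() itself comes from this patch):

#include <linux/mm.h>

/* Report a corrupted pfn found by a scrubber.  trapno and flags are 0:
 * there is no trap to report to user space and no special handling. */
static void scrubber_report_bad_page(unsigned long pfn)
{
	memory_failure_queue(pfn, 0, 0);
}
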
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d941cb..829d43735402 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,7 +47,7 @@
47#include <linux/pagemap.h> 47#include <linux/pagemap.h>
48#include <linux/ksm.h> 48#include <linux/ksm.h>
49#include <linux/rmap.h> 49#include <linux/rmap.h>
50#include <linux/module.h> 50#include <linux/export.h>
51#include <linux/delayacct.h> 51#include <linux/delayacct.h>
52#include <linux/init.h> 52#include <linux/init.h>
53#include <linux/writeback.h> 53#include <linux/writeback.h>
@@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1290 return addr; 1290 return addr;
1291} 1291}
1292 1292
1293#ifdef CONFIG_PREEMPT
1294# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
1295#else
1296/* No preempt: go for improved straight-line efficiency */
1297# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
1298#endif
1299
1300/** 1293/**
1301 * unmap_vmas - unmap a range of memory covered by a list of vma's 1294 * unmap_vmas - unmap a range of memory covered by a list of vma's
1302 * @tlb: address of the caller's struct mmu_gather 1295 * @tlb: address of the caller's struct mmu_gather
@@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1310 * 1303 *
1311 * Unmap all pages in the vma list. 1304 * Unmap all pages in the vma list.
1312 * 1305 *
1313 * We aim to not hold locks for too long (for scheduling latency reasons).
1314 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
1315 * return the ending mmu_gather to the caller.
1316 *
1317 * Only addresses between `start' and `end' will be unmapped. 1306 * Only addresses between `start' and `end' will be unmapped.
1318 * 1307 *
1319 * The VMA list must be sorted in ascending virtual address order. 1308 * The VMA list must be sorted in ascending virtual address order.
@@ -1514,7 +1503,7 @@ split_fallthrough:
1514 } 1503 }
1515 1504
1516 if (flags & FOLL_GET) 1505 if (flags & FOLL_GET)
1517 get_page(page); 1506 get_page_foll(page);
1518 if (flags & FOLL_TOUCH) { 1507 if (flags & FOLL_TOUCH) {
1519 if ((flags & FOLL_WRITE) && 1508 if ((flags & FOLL_WRITE) &&
1520 !pte_dirty(pte) && !PageDirty(page)) 1509 !pte_dirty(pte) && !PageDirty(page))
@@ -1816,7 +1805,63 @@ next_page:
1816} 1805}
1817EXPORT_SYMBOL(__get_user_pages); 1806EXPORT_SYMBOL(__get_user_pages);
1818 1807
1819/** 1808/*
1809 * fixup_user_fault() - manually resolve a user page fault
1810 * @tsk: the task_struct to use for page fault accounting, or
1811 * NULL if faults are not to be recorded.
1812 * @mm: mm_struct of target mm
1813 * @address: user address
1814 * @fault_flags: flags to pass down to handle_mm_fault()
1815 *
1816 * This is meant to be called in the specific scenario where for locking reasons
1817 * we try to access user memory in atomic context (within a pagefault_disable()
1818 * section), this returns -EFAULT, and we want to resolve the user fault before
1819 * trying again.
1820 *
1821 * Typically this is meant to be used by the futex code.
1822 *
1823 * The main difference with get_user_pages() is that this function will
1824 * unconditionally call handle_mm_fault() which will in turn perform all the
1825 * necessary SW fixup of the dirty and young bits in the PTE, while
1826 * get_user_pages() only guarantees to update these in the struct page.
1827 *
1828 * This is important for some architectures where those bits also gate the
1829 * access permission to the page because they are maintained in software. On
1830 * such architectures, gup() will not be enough to make a subsequent access
1831 * succeed.
1832 *
1833 * This should be called with the mmap_sem held for read.
1834 */
1835int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1836 unsigned long address, unsigned int fault_flags)
1837{
1838 struct vm_area_struct *vma;
1839 int ret;
1840
1841 vma = find_extend_vma(mm, address);
1842 if (!vma || address < vma->vm_start)
1843 return -EFAULT;
1844
1845 ret = handle_mm_fault(mm, vma, address, fault_flags);
1846 if (ret & VM_FAULT_ERROR) {
1847 if (ret & VM_FAULT_OOM)
1848 return -ENOMEM;
1849 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1850 return -EHWPOISON;
1851 if (ret & VM_FAULT_SIGBUS)
1852 return -EFAULT;
1853 BUG();
1854 }
1855 if (tsk) {
1856 if (ret & VM_FAULT_MAJOR)
1857 tsk->maj_flt++;
1858 else
1859 tsk->min_flt++;
1860 }
1861 return 0;
1862}
1863
1864/*
1820 * get_user_pages() - pin user pages in memory 1865 * get_user_pages() - pin user pages in memory
1821 * @tsk: the task_struct to use for page fault accounting, or 1866 * @tsk: the task_struct to use for page fault accounting, or
1822 * NULL if faults are not to be recorded. 1867 * NULL if faults are not to be recorded.
@@ -3104,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3104 pte_t *page_table; 3149 pte_t *page_table;
3105 spinlock_t *ptl; 3150 spinlock_t *ptl;
3106 struct page *page; 3151 struct page *page;
3152 struct page *cow_page;
3107 pte_t entry; 3153 pte_t entry;
3108 int anon = 0; 3154 int anon = 0;
3109 int charged = 0;
3110 struct page *dirty_page = NULL; 3155 struct page *dirty_page = NULL;
3111 struct vm_fault vmf; 3156 struct vm_fault vmf;
3112 int ret; 3157 int ret;
3113 int page_mkwrite = 0; 3158 int page_mkwrite = 0;
3114 3159
3160 /*
3161 * If we do COW later, allocate the page before taking lock_page()
3162 * on the file cache page. This will reduce lock holding time.
3163 */
3164 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3165
3166 if (unlikely(anon_vma_prepare(vma)))
3167 return VM_FAULT_OOM;
3168
3169 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3170 if (!cow_page)
3171 return VM_FAULT_OOM;
3172
3173 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3174 page_cache_release(cow_page);
3175 return VM_FAULT_OOM;
3176 }
3177 } else
3178 cow_page = NULL;
3179
3115 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 3180 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3116 vmf.pgoff = pgoff; 3181 vmf.pgoff = pgoff;
3117 vmf.flags = flags; 3182 vmf.flags = flags;
@@ -3120,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3120 ret = vma->vm_ops->fault(vma, &vmf); 3185 ret = vma->vm_ops->fault(vma, &vmf);
3121 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3186 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3122 VM_FAULT_RETRY))) 3187 VM_FAULT_RETRY)))
3123 return ret; 3188 goto uncharge_out;
3124 3189
3125 if (unlikely(PageHWPoison(vmf.page))) { 3190 if (unlikely(PageHWPoison(vmf.page))) {
3126 if (ret & VM_FAULT_LOCKED) 3191 if (ret & VM_FAULT_LOCKED)
3127 unlock_page(vmf.page); 3192 unlock_page(vmf.page);
3128 return VM_FAULT_HWPOISON; 3193 ret = VM_FAULT_HWPOISON;
3194 goto uncharge_out;
3129 } 3195 }
3130 3196
3131 /* 3197 /*
@@ -3143,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3143 page = vmf.page; 3209 page = vmf.page;
3144 if (flags & FAULT_FLAG_WRITE) { 3210 if (flags & FAULT_FLAG_WRITE) {
3145 if (!(vma->vm_flags & VM_SHARED)) { 3211 if (!(vma->vm_flags & VM_SHARED)) {
3212 page = cow_page;
3146 anon = 1; 3213 anon = 1;
3147 if (unlikely(anon_vma_prepare(vma))) {
3148 ret = VM_FAULT_OOM;
3149 goto out;
3150 }
3151 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
3152 vma, address);
3153 if (!page) {
3154 ret = VM_FAULT_OOM;
3155 goto out;
3156 }
3157 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
3158 ret = VM_FAULT_OOM;
3159 page_cache_release(page);
3160 goto out;
3161 }
3162 charged = 1;
3163 copy_user_highpage(page, vmf.page, address, vma); 3214 copy_user_highpage(page, vmf.page, address, vma);
3164 __SetPageUptodate(page); 3215 __SetPageUptodate(page);
3165 } else { 3216 } else {
@@ -3228,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 /* no need to invalidate: a not-present page won't be cached */ 3279 /* no need to invalidate: a not-present page won't be cached */
3229 update_mmu_cache(vma, address, page_table); 3280 update_mmu_cache(vma, address, page_table);
3230 } else { 3281 } else {
3231 if (charged) 3282 if (cow_page)
3232 mem_cgroup_uncharge_page(page); 3283 mem_cgroup_uncharge_page(cow_page);
3233 if (anon) 3284 if (anon)
3234 page_cache_release(page); 3285 page_cache_release(page);
3235 else 3286 else
@@ -3238,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3238 3289
3239 pte_unmap_unlock(page_table, ptl); 3290 pte_unmap_unlock(page_table, ptl);
3240 3291
3241out:
3242 if (dirty_page) { 3292 if (dirty_page) {
3243 struct address_space *mapping = page->mapping; 3293 struct address_space *mapping = page->mapping;
3244 3294
@@ -3268,6 +3318,13 @@ out:
3268unwritable_page: 3318unwritable_page:
3269 page_cache_release(page); 3319 page_cache_release(page);
3270 return ret; 3320 return ret;
3321uncharge_out:
3322 /* the fs's fault handler returned an error */
3323 if (cow_page) {
3324 mem_cgroup_uncharge_page(cow_page);
3325 page_cache_release(cow_page);
3326 }
3327 return ret;
3271} 3328}
3272 3329
3273static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3330static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
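
fixup_user_fault() is aimed at callers, such as the futex code, that access user memory inside a pagefault_disable() section and need to resolve the resulting -EFAULT before retrying. A rough sketch of that retry pattern, with mmap_sem held for read around the call as the comment requests (put_user_word() is an invented helper; fixup_user_fault() and FAULT_FLAG_WRITE are the kernel interfaces):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

/* Try an atomic user-space store; on fault, fault the page in so the
 * caller can retry the atomic path. */
static int put_user_word(u32 __user *uaddr, u32 val)
{
	int ret;

	pagefault_disable();
	ret = __put_user(val, uaddr);
	pagefault_enable();
	if (!ret)
		return 0;

	down_read(&current->mm->mmap_sem);
	ret = fixup_user_fault(current, current->mm,
			       (unsigned long)uaddr, FAULT_FLAG_WRITE);
	up_read(&current->mm->mmap_sem);
	return ret;	/* 0 on success; caller retries the store */
}
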
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11e..2168489c0bc9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -11,7 +11,7 @@
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/bootmem.h> 12#include <linux/bootmem.h>
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/writeback.h> 16#include <linux/writeback.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
@@ -34,6 +34,17 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37/*
38 * online_page_callback contains pointer to current page onlining function.
39 * Initially it is generic_online_page(). If it is required it could be
40 * changed by calling set_online_page_callback() for callback registration
41 * and restore_online_page_callback() for generic callback restore.
42 */
43
44static void generic_online_page(struct page *page);
45
46static online_page_callback_t online_page_callback = generic_online_page;
47
37DEFINE_MUTEX(mem_hotplug_mutex); 48DEFINE_MUTEX(mem_hotplug_mutex);
38 49
39void lock_memory_hotplug(void) 50void lock_memory_hotplug(void)
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
361} 372}
362EXPORT_SYMBOL_GPL(__remove_pages); 373EXPORT_SYMBOL_GPL(__remove_pages);
363 374
364void online_page(struct page *page) 375int set_online_page_callback(online_page_callback_t callback)
376{
377 int rc = -EINVAL;
378
379 lock_memory_hotplug();
380
381 if (online_page_callback == generic_online_page) {
382 online_page_callback = callback;
383 rc = 0;
384 }
385
386 unlock_memory_hotplug();
387
388 return rc;
389}
390EXPORT_SYMBOL_GPL(set_online_page_callback);
391
392int restore_online_page_callback(online_page_callback_t callback)
393{
394 int rc = -EINVAL;
395
396 lock_memory_hotplug();
397
398 if (online_page_callback == callback) {
399 online_page_callback = generic_online_page;
400 rc = 0;
401 }
402
403 unlock_memory_hotplug();
404
405 return rc;
406}
407EXPORT_SYMBOL_GPL(restore_online_page_callback);
408
409void __online_page_set_limits(struct page *page)
365{ 410{
366 unsigned long pfn = page_to_pfn(page); 411 unsigned long pfn = page_to_pfn(page);
367 412
368 totalram_pages++;
369 if (pfn >= num_physpages) 413 if (pfn >= num_physpages)
370 num_physpages = pfn + 1; 414 num_physpages = pfn + 1;
415}
416EXPORT_SYMBOL_GPL(__online_page_set_limits);
417
418void __online_page_increment_counters(struct page *page)
419{
420 totalram_pages++;
371 421
372#ifdef CONFIG_HIGHMEM 422#ifdef CONFIG_HIGHMEM
373 if (PageHighMem(page)) 423 if (PageHighMem(page))
374 totalhigh_pages++; 424 totalhigh_pages++;
375#endif 425#endif
426}
427EXPORT_SYMBOL_GPL(__online_page_increment_counters);
376 428
429void __online_page_free(struct page *page)
430{
377 ClearPageReserved(page); 431 ClearPageReserved(page);
378 init_page_count(page); 432 init_page_count(page);
379 __free_page(page); 433 __free_page(page);
380} 434}
435EXPORT_SYMBOL_GPL(__online_page_free);
436
437static void generic_online_page(struct page *page)
438{
439 __online_page_set_limits(page);
440 __online_page_increment_counters(page);
441 __online_page_free(page);
442}
381 443
382static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 444static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
383 void *arg) 445 void *arg)
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
388 if (PageReserved(pfn_to_page(start_pfn))) 450 if (PageReserved(pfn_to_page(start_pfn)))
389 for (i = 0; i < nr_pages; i++) { 451 for (i = 0; i < nr_pages; i++) {
390 page = pfn_to_page(start_pfn + i); 452 page = pfn_to_page(start_pfn + i);
391 online_page(page); 453 (*online_page_callback)(page);
392 onlined_pages++; 454 onlined_pages++;
393 } 455 }
394 *(unsigned long *)arg = onlined_pages; 456 *(unsigned long *)arg = onlined_pages;
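
The set_online_page_callback()/restore_online_page_callback() pair lets a driver (the Xen balloon driver is the expected first user) claim hot-added pages instead of handing them straight to the page allocator. A hypothetical registration sketch (balloon_keep_page() and the init/exit names are invented; the __online_page_*() helpers are the ones exported above):

#include <linux/init.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/module.h>

/* Placeholder for the driver's own bookkeeping of captured pages. */
static void balloon_keep_page(struct page *page)
{
	/* e.g. add the page to a driver-private list */
}

/* Only the physical limits are updated here; the ram counters and the
 * actual freeing are deferred until the driver releases the page. */
static void balloon_online_page(struct page *page)
{
	__online_page_set_limits(page);
	balloon_keep_page(page);
}

static int __init balloon_hotplug_init(void)
{
	/* Fails with -EINVAL if another callback is already registered. */
	return set_online_page_callback(&balloon_online_page);
}
module_init(balloon_hotplug_init);

static void __exit balloon_hotplug_exit(void)
{
	restore_online_page_callback(&balloon_online_page);
}
module_exit(balloon_hotplug_exit);
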
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e7fb9d25c54e..adc395481813 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -75,7 +75,7 @@
75#include <linux/cpuset.h> 75#include <linux/cpuset.h>
76#include <linux/slab.h> 76#include <linux/slab.h>
77#include <linux/string.h> 77#include <linux/string.h>
78#include <linux/module.h> 78#include <linux/export.h>
79#include <linux/nsproxy.h> 79#include <linux/nsproxy.h>
80#include <linux/interrupt.h> 80#include <linux/interrupt.h>
81#include <linux/init.h> 81#include <linux/init.h>
@@ -93,6 +93,7 @@
93 93
94#include <asm/tlbflush.h> 94#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 95#include <asm/uaccess.h>
96#include <linux/random.h>
96 97
97#include "internal.h" 98#include "internal.h"
98 99
@@ -110,7 +111,7 @@ enum zone_type policy_zone = 0;
110/* 111/*
111 * run-time system-wide default policy => local allocation 112 * run-time system-wide default policy => local allocation
112 */ 113 */
113struct mempolicy default_policy = { 114static struct mempolicy default_policy = {
114 .refcnt = ATOMIC_INIT(1), /* never free it */ 115 .refcnt = ATOMIC_INIT(1), /* never free it */
115 .mode = MPOL_PREFERRED, 116 .mode = MPOL_PREFERRED,
116 .flags = MPOL_F_LOCAL, 117 .flags = MPOL_F_LOCAL,
@@ -635,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
635 struct vm_area_struct *prev; 636 struct vm_area_struct *prev;
636 struct vm_area_struct *vma; 637 struct vm_area_struct *vma;
637 int err = 0; 638 int err = 0;
638 pgoff_t pgoff;
639 unsigned long vmstart; 639 unsigned long vmstart;
640 unsigned long vmend; 640 unsigned long vmend;
641 641
@@ -648,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
648 vmstart = max(start, vma->vm_start); 648 vmstart = max(start, vma->vm_start);
649 vmend = min(end, vma->vm_end); 649 vmend = min(end, vma->vm_end);
650 650
651 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
652 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 651 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
653 vma->anon_vma, vma->vm_file, pgoff, new_pol); 652 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
653 new_pol);
654 if (prev) { 654 if (prev) {
655 vma = prev; 655 vma = prev;
656 next = vma->vm_next; 656 next = vma->vm_next;
@@ -1411,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1411 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 1411 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1412 1412
1413 if (!err && nmask) { 1413 if (!err && nmask) {
1414 err = copy_from_user(bm, nm, alloc_size); 1414 unsigned long copy_size;
1415 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1416 err = copy_from_user(bm, nm, copy_size);
1415 /* ensure entire bitmap is zeroed */ 1417 /* ensure entire bitmap is zeroed */
1416 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 1418 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1417 err |= compat_put_bitmap(nmask, bm, nr_bits); 1419 err |= compat_put_bitmap(nmask, bm, nr_bits);
@@ -1645,6 +1647,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1645 return interleave_nodes(pol); 1647 return interleave_nodes(pol);
1646} 1648}
1647 1649
1650/*
1651 * Return the bit number of a random bit set in the nodemask.
1652 * (returns -1 if nodemask is empty)
1653 */
1654int node_random(const nodemask_t *maskp)
1655{
1656 int w, bit = -1;
1657
1658 w = nodes_weight(*maskp);
1659 if (w)
1660 bit = bitmap_ord_to_pos(maskp->bits,
1661 get_random_int() % w, MAX_NUMNODES);
1662 return bit;
1663}
1664
1648#ifdef CONFIG_HUGETLBFS 1665#ifdef CONFIG_HUGETLBFS
1649/* 1666/*
1650 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1667 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
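
node_random() above returns the number of a randomly chosen node set in the mask, or -1 when the mask is empty; its intended consumer is the MPOL_INTERLEAVE huge-page path. A small illustrative wrapper (pick_random_node() is an invented name):

#include <linux/nodemask.h>
#include <linux/topology.h>

/* Pick a random allowed node, falling back to the local node when the
 * mask is empty. */
static int pick_random_node(const nodemask_t *allowed)
{
	int nid = node_random(allowed);

	if (nid < 0)
		nid = numa_node_id();
	return nid;
}
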
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a3bc3d4d554..e73641b79bb5 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -10,7 +10,7 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/mempool.h> 14#include <linux/mempool.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/writeback.h> 16#include <linux/writeback.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 666e4e677414..578e29174fa6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -13,7 +13,7 @@
13 */ 13 */
14 14
15#include <linux/migrate.h> 15#include <linux/migrate.h>
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/swapops.h> 18#include <linux/swapops.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 120
121 ptep = pte_offset_map(pmd, addr); 121 ptep = pte_offset_map(pmd, addr);
122 122
123 if (!is_swap_pte(*ptep)) { 123 /*
124 pte_unmap(ptep); 124 * Peek to check is_swap_pte() before taking ptlock? No, we
125 goto out; 125 * can race mremap's move_ptes(), which skips anon_vma lock.
126 } 126 */
127 127
128 ptl = pte_lockptr(mm, pmd); 128 ptl = pte_lockptr(mm, pmd);
129 } 129 }
@@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
621 return rc; 621 return rc;
622} 622}
623 623
624/* 624static int __unmap_and_move(struct page *page, struct page *newpage,
625 * Obtain the lock on page, remove all ptes and migrate the page 625 int force, bool offlining, bool sync)
626 * to the newly allocated page in newpage.
627 */
628static int unmap_and_move(new_page_t get_new_page, unsigned long private,
629 struct page *page, int force, bool offlining, bool sync)
630{ 626{
631 int rc = 0; 627 int rc = -EAGAIN;
632 int *result = NULL;
633 struct page *newpage = get_new_page(page, private, &result);
634 int remap_swapcache = 1; 628 int remap_swapcache = 1;
635 int charge = 0; 629 int charge = 0;
636 struct mem_cgroup *mem; 630 struct mem_cgroup *mem;
637 struct anon_vma *anon_vma = NULL; 631 struct anon_vma *anon_vma = NULL;
638 632
639 if (!newpage)
640 return -ENOMEM;
641
642 if (page_count(page) == 1) {
643 /* page was freed from under us. So we are done. */
644 goto move_newpage;
645 }
646 if (unlikely(PageTransHuge(page)))
647 if (unlikely(split_huge_page(page)))
648 goto move_newpage;
649
650 /* prepare cgroup just returns 0 or -ENOMEM */
651 rc = -EAGAIN;
652
653 if (!trylock_page(page)) { 633 if (!trylock_page(page)) {
654 if (!force || !sync) 634 if (!force || !sync)
655 goto move_newpage; 635 goto out;
656 636
657 /* 637 /*
658 * It's not safe for direct compaction to call lock_page. 638 * It's not safe for direct compaction to call lock_page.
@@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
668 * altogether. 648 * altogether.
669 */ 649 */
670 if (current->flags & PF_MEMALLOC) 650 if (current->flags & PF_MEMALLOC)
671 goto move_newpage; 651 goto out;
672 652
673 lock_page(page); 653 lock_page(page);
674 } 654 }
@@ -785,27 +765,52 @@ uncharge:
785 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 765 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
786unlock: 766unlock:
787 unlock_page(page); 767 unlock_page(page);
768out:
769 return rc;
770}
788 771
789move_newpage: 772/*
773 * Obtain the lock on page, remove all ptes and migrate the page
774 * to the newly allocated page in newpage.
775 */
776static int unmap_and_move(new_page_t get_new_page, unsigned long private,
777 struct page *page, int force, bool offlining, bool sync)
778{
779 int rc = 0;
780 int *result = NULL;
781 struct page *newpage = get_new_page(page, private, &result);
782
783 if (!newpage)
784 return -ENOMEM;
785
786 if (page_count(page) == 1) {
787 /* page was freed from under us. So we are done. */
788 goto out;
789 }
790
791 if (unlikely(PageTransHuge(page)))
792 if (unlikely(split_huge_page(page)))
793 goto out;
794
795 rc = __unmap_and_move(page, newpage, force, offlining, sync);
796out:
790 if (rc != -EAGAIN) { 797 if (rc != -EAGAIN) {
791 /* 798 /*
792 * A page that has been migrated has all references 799 * A page that has been migrated has all references
793 * removed and will be freed. A page that has not been 800 * removed and will be freed. A page that has not been
794 * migrated will have kept its references and be 801 * migrated will have kept its references and be
795 * restored. 802 * restored.
796 */ 803 */
797 list_del(&page->lru); 804 list_del(&page->lru);
798 dec_zone_page_state(page, NR_ISOLATED_ANON + 805 dec_zone_page_state(page, NR_ISOLATED_ANON +
799 page_is_file_cache(page)); 806 page_is_file_cache(page));
800 putback_lru_page(page); 807 putback_lru_page(page);
801 } 808 }
802
803 /* 809 /*
804 * Move the new page to the LRU. If migration was not successful 810 * Move the new page to the LRU. If migration was not successful
805 * then this will free the page. 811 * then this will free the page.
806 */ 812 */
807 putback_lru_page(newpage); 813 putback_lru_page(newpage);
808
809 if (result) { 814 if (result) {
810 if (rc) 815 if (rc)
811 *result = rc; 816 *result = rc;
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c76..636a86876ff2 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
69 * file will not get a swp_entry_t in its pte, but rather it is like 69 * file will not get a swp_entry_t in its pte, but rather it is like
70 * any other file mapping (ie. marked !present and faulted in with 70 * any other file mapping (ie. marked !present and faulted in with
71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
72 *
73 * However when tmpfs moves the page from pagecache and into swapcache,
74 * it is still in core, but the find_get_page below won't find it.
75 * No big deal, but make a note of it.
76 */ 72 */
77 page = find_get_page(mapping, pgoff); 73 page = find_get_page(mapping, pgoff);
74#ifdef CONFIG_SWAP
75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val);
79 }
80#endif
78 if (page) { 81 if (page) {
79 present = PageUptodate(page); 82 present = PageUptodate(page);
80 page_cache_release(page); 83 page_cache_release(page);
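The mincore change relies on find_get_page() returning a radix-tree "exceptional" entry for shmem pages that have moved into swapcache. A toy sketch of that tagging convention, where a low bit distinguishes an encoded swap value from a real pointer (the bit layout here is illustrative; the kernel's actual encoding lives in radix-tree.h and swapops.h):

#include <stdio.h>
#include <stdint.h>

#define EXCEPTIONAL_BIT 0x2UL   /* illustrative tag bit, not the kernel's exact layout */

static void *encode_swap(unsigned long swp_val)
{
        return (void *)((swp_val << 2) | EXCEPTIONAL_BIT);
}

static int is_exceptional(void *entry)
{
        return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long decode_swap(void *entry)
{
        return (uintptr_t)entry >> 2;
}

int main(void)
{
        void *entry = encode_swap(12345);

        if (is_exceptional(entry))
                printf("swap entry %lu: look it up in the swap cache instead\n",
                       decode_swap(entry));
        return 0;
}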
diff --git a/mm/mlock.c b/mm/mlock.c
index 048260c4e02e..4f4f53bdc65d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,7 +14,7 @@
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/rmap.h> 18#include <linux/rmap.h>
19#include <linux/mmzone.h> 19#include <linux/mmzone.h>
20#include <linux/hugetlb.h> 20#include <linux/hugetlb.h>
@@ -110,7 +110,15 @@ void munlock_vma_page(struct page *page)
110 if (TestClearPageMlocked(page)) { 110 if (TestClearPageMlocked(page)) {
111 dec_zone_page_state(page, NR_MLOCK); 111 dec_zone_page_state(page, NR_MLOCK);
112 if (!isolate_lru_page(page)) { 112 if (!isolate_lru_page(page)) {
113 int ret = try_to_munlock(page); 113 int ret = SWAP_AGAIN;
114
115 /*
116 * Optimization: if the page was mapped just once,
117 * that's our mapping and we don't need to check all the
118 * other vmas.
119 */
120 if (page_mapcount(page) > 1)
121 ret = try_to_munlock(page);
114 /* 122 /*
115 * did try_to_munlock() succeed or punt? 123 * did try_to_munlock() succeed or punt?
116 */ 124 */
@@ -549,7 +557,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
549 if (!can_do_mlock()) 557 if (!can_do_mlock())
550 goto out; 558 goto out;
551 559
552 lru_add_drain_all(); /* flush pagevec */ 560 if (flags & MCL_CURRENT)
561 lru_add_drain_all(); /* flush pagevec */
553 562
554 down_write(&current->mm->mmap_sem); 563 down_write(&current->mm->mmap_sem);
555 564
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4e0e26591dfa..1ffd97ae26d7 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -8,7 +8,7 @@
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/module.h> 11#include <linux/export.h>
12#include "internal.h" 12#include "internal.h"
13 13
14#ifdef CONFIG_DEBUG_MEMORY_INIT 14#ifdef CONFIG_DEBUG_MEMORY_INIT
diff --git a/mm/mmap.c b/mm/mmap.c
index d49736ff8a8d..eae90af60ea6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -22,7 +22,7 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/hugetlb.h> 23#include <linux/hugetlb.h>
24#include <linux/profile.h> 24#include <linux/profile.h>
25#include <linux/module.h> 25#include <linux/export.h>
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
122 return 0; 122 return 0;
123 123
124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
125 unsigned long n; 125 free = global_page_state(NR_FREE_PAGES);
126 free += global_page_state(NR_FILE_PAGES);
127
128 /*
129 * shmem pages shouldn't be counted as free in this
130 * case, they can't be purged, only swapped out, and
131 * that won't affect the overall amount of available
132 * memory in the system.
133 */
134 free -= global_page_state(NR_SHMEM);
126 135
127 free = global_page_state(NR_FILE_PAGES);
128 free += nr_swap_pages; 136 free += nr_swap_pages;
129 137
130 /* 138 /*
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
136 free += global_page_state(NR_SLAB_RECLAIMABLE); 144 free += global_page_state(NR_SLAB_RECLAIMABLE);
137 145
138 /* 146 /*
139 * Leave the last 3% for root
140 */
141 if (!cap_sys_admin)
142 free -= free / 32;
143
144 if (free > pages)
145 return 0;
146
147 /*
148 * nr_free_pages() is very expensive on large systems,
149 * only call if we're about to fail.
150 */
151 n = nr_free_pages();
152
153 /*
154 * Leave reserved pages. The pages are not for anonymous pages. 147 * Leave reserved pages. The pages are not for anonymous pages.
155 */ 148 */
156 if (n <= totalreserve_pages) 149 if (free <= totalreserve_pages)
157 goto error; 150 goto error;
158 else 151 else
159 n -= totalreserve_pages; 152 free -= totalreserve_pages;
160 153
161 /* 154 /*
162 * Leave the last 3% for root 155 * Leave the last 3% for root
163 */ 156 */
164 if (!cap_sys_admin) 157 if (!cap_sys_admin)
165 n -= n / 32; 158 free -= free / 32;
166 free += n;
167 159
168 if (free > pages) 160 if (free > pages)
169 return 0; 161 return 0;
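After this rework the OVERCOMMIT_GUESS path builds a single "free" estimate and never falls back to the expensive nr_free_pages() call. A compact sketch of the resulting heuristic, with plain arguments standing in for the global_page_state() reads (all names and numbers are illustrative):

#include <stdbool.h>
#include <stdio.h>

/* All quantities are in pages. Returns true if the allocation is allowed. */
static bool guess_enough_memory(long pages, bool cap_sys_admin,
                                long nr_free, long nr_file, long nr_shmem,
                                long nr_swap, long slab_reclaimable,
                                long totalreserve)
{
        long free = nr_free + nr_file;

        free -= nr_shmem;             /* shmem can only be swapped out, not dropped */
        free += nr_swap;
        free += slab_reclaimable;

        if (free <= totalreserve)     /* reserved pages are not for anonymous memory */
                return false;
        free -= totalreserve;

        if (!cap_sys_admin)           /* leave the last 3% for root */
                free -= free / 32;

        return free > pages;
}

int main(void)
{
        printf("%d\n", guess_enough_memory(1000, false,
                                           5000, 3000, 500, 2000, 400, 1000));
        return 0;
}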
@@ -2566,7 +2558,6 @@ int mm_take_all_locks(struct mm_struct *mm)
2566{ 2558{
2567 struct vm_area_struct *vma; 2559 struct vm_area_struct *vma;
2568 struct anon_vma_chain *avc; 2560 struct anon_vma_chain *avc;
2569 int ret = -EINTR;
2570 2561
2571 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2562 BUG_ON(down_read_trylock(&mm->mmap_sem));
2572 2563
@@ -2587,13 +2578,11 @@ int mm_take_all_locks(struct mm_struct *mm)
2587 vm_lock_anon_vma(mm, avc->anon_vma); 2578 vm_lock_anon_vma(mm, avc->anon_vma);
2588 } 2579 }
2589 2580
2590 ret = 0; 2581 return 0;
2591 2582
2592out_unlock: 2583out_unlock:
2593 if (ret) 2584 mm_drop_all_locks(mm);
2594 mm_drop_all_locks(mm); 2585 return -EINTR;
2595
2596 return ret;
2597} 2586}
2598 2587
2599static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2588static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 9e82e937000e..cf332bc0080a 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,7 +5,7 @@
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmu_context.h> 7#include <linux/mmu_context.h>
8#include <linux/module.h> 8#include <linux/export.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10 10
11#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de4088e..9a611d3a1848 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -11,7 +11,7 @@
11 11
12#include <linux/rculist.h> 12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h> 13#include <linux/mmu_notifier.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d1760213..7cf7b7ddc7c5 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -8,7 +8,6 @@
8#include <linux/stddef.h> 8#include <linux/stddef.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/module.h>
12 11
13struct pglist_data *first_online_pgdat(void) 12struct pglist_data *first_online_pgdat(void)
14{ 13{
diff --git a/mm/mremap.c b/mm/mremap.c
index 506fa44403df..d6959cb4df58 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd); 44 if (pmd_none(*pmd))
45 if (pmd_none_or_clear_bad(pmd))
46 return NULL; 45 return NULL;
47 46
48 return pmd; 47 return pmd;
@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
65 return NULL; 64 return NULL;
66 65
67 VM_BUG_ON(pmd_trans_huge(*pmd)); 66 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
69 return NULL;
70 67
71 return pmd; 68 return pmd;
72} 69}
@@ -80,11 +77,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
80 struct mm_struct *mm = vma->vm_mm; 77 struct mm_struct *mm = vma->vm_mm;
81 pte_t *old_pte, *new_pte, pte; 78 pte_t *old_pte, *new_pte, pte;
82 spinlock_t *old_ptl, *new_ptl; 79 spinlock_t *old_ptl, *new_ptl;
83 unsigned long old_start;
84 80
85 old_start = old_addr;
86 mmu_notifier_invalidate_range_start(vma->vm_mm,
87 old_start, old_end);
88 if (vma->vm_file) { 81 if (vma->vm_file) {
89 /* 82 /*
90 * Subtle point from Rajesh Venkatasubramanian: before 83 * Subtle point from Rajesh Venkatasubramanian: before
@@ -111,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
111 new_pte++, new_addr += PAGE_SIZE) { 104 new_pte++, new_addr += PAGE_SIZE) {
112 if (pte_none(*old_pte)) 105 if (pte_none(*old_pte))
113 continue; 106 continue;
114 pte = ptep_clear_flush(vma, old_addr, old_pte); 107 pte = ptep_get_and_clear(mm, old_addr, old_pte);
115 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); 108 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
116 set_pte_at(mm, new_addr, new_pte, pte); 109 set_pte_at(mm, new_addr, new_pte, pte);
117 } 110 }
@@ -123,7 +116,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
123 pte_unmap_unlock(old_pte - 1, old_ptl); 116 pte_unmap_unlock(old_pte - 1, old_ptl);
124 if (mapping) 117 if (mapping)
125 mutex_unlock(&mapping->i_mmap_mutex); 118 mutex_unlock(&mapping->i_mmap_mutex);
126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
127} 119}
128 120
129#define LATENCY_LIMIT (64 * PAGE_SIZE) 121#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -134,22 +126,43 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
134{ 126{
135 unsigned long extent, next, old_end; 127 unsigned long extent, next, old_end;
136 pmd_t *old_pmd, *new_pmd; 128 pmd_t *old_pmd, *new_pmd;
129 bool need_flush = false;
137 130
138 old_end = old_addr + len; 131 old_end = old_addr + len;
139 flush_cache_range(vma, old_addr, old_end); 132 flush_cache_range(vma, old_addr, old_end);
140 133
134 mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
135
141 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 136 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
142 cond_resched(); 137 cond_resched();
143 next = (old_addr + PMD_SIZE) & PMD_MASK; 138 next = (old_addr + PMD_SIZE) & PMD_MASK;
144 if (next - 1 > old_end) 139 /* even if next overflowed, extent below will be ok */
145 next = old_end;
146 extent = next - old_addr; 140 extent = next - old_addr;
141 if (extent > old_end - old_addr)
142 extent = old_end - old_addr;
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 143 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 144 if (!old_pmd)
149 continue; 145 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); 146 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 147 if (!new_pmd)
152 break; 148 break;
149 if (pmd_trans_huge(*old_pmd)) {
150 int err = 0;
151 if (extent == HPAGE_PMD_SIZE)
152 err = move_huge_pmd(vma, new_vma, old_addr,
153 new_addr, old_end,
154 old_pmd, new_pmd);
155 if (err > 0) {
156 need_flush = true;
157 continue;
158 } else if (!err) {
159 split_huge_page_pmd(vma->vm_mm, old_pmd);
160 }
161 VM_BUG_ON(pmd_trans_huge(*old_pmd));
162 }
163 if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
164 new_pmd, new_addr))
165 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 166 next = (new_addr + PMD_SIZE) & PMD_MASK;
154 if (extent > next - new_addr) 167 if (extent > next - new_addr)
155 extent = next - new_addr; 168 extent = next - new_addr;
@@ -157,7 +170,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
157 extent = LATENCY_LIMIT; 170 extent = LATENCY_LIMIT;
158 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 171 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
159 new_vma, new_pmd, new_addr); 172 new_vma, new_pmd, new_addr);
173 need_flush = true;
160 } 174 }
175 if (likely(need_flush))
176 flush_tlb_range(vma, old_end-len, old_addr);
177
178 mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
161 179
162 return len + old_addr - old_end; /* how much done */ 180 return len + old_addr - old_end; /* how much done */
163} 181}
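The new extent computation in move_page_tables() clamps against old_end - old_addr, so it stays correct even if "next" wraps past the top of the address space. A small standalone check with made-up addresses and an illustrative 2MB PMD_SIZE:

#include <stdio.h>

#define PMD_SIZE (1UL << 21)                 /* illustrative 2MB PMD */
#define PMD_MASK (~(PMD_SIZE - 1))

int main(void)
{
        /* old_addr sits in the very last PMD below the top of the address space */
        unsigned long old_addr = (0UL - PMD_SIZE) + 4096;
        unsigned long old_end  = old_addr + 8192;

        unsigned long next = (old_addr + PMD_SIZE) & PMD_MASK;  /* wraps past zero */
        unsigned long extent = next - old_addr;                 /* huge wrapped value */

        if (extent > old_end - old_addr)
                extent = old_end - old_addr;                    /* clamped to 8192 */

        printf("next=%#lx extent=%lu\n", next, extent);
        return 0;
}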
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 29d948ce6d0f..24f0fc1a56d6 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -12,7 +12,7 @@
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h> 18#include <linux/memblock.h>
diff --git a/mm/nommu.c b/mm/nommu.c
index 9edc897a3970..b982290fd962 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -13,7 +13,7 @@
13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/mman.h> 18#include <linux/mman.h>
19#include <linux/swap.h> 19#include <linux/swap.h>
@@ -22,7 +22,6 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 25#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
@@ -455,7 +454,7 @@ void __attribute__((weak)) vmalloc_sync_all(void)
455 * between processes, it syncs the pagetable across all 454 * between processes, it syncs the pagetable across all
456 * processes. 455 * processes.
457 */ 456 */
458struct vm_struct *alloc_vm_area(size_t size) 457struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
459{ 458{
460 BUG(); 459 BUG();
461 return NULL; 460 return NULL;
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
1087 * it's being traced - otherwise breakpoints set in it may interfere 1086 * it's being traced - otherwise breakpoints set in it may interfere
1088 * with another untraced process 1087 * with another untraced process
1089 */ 1088 */
1090 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) 1089 if ((flags & MAP_PRIVATE) && current->ptrace)
1091 vm_flags &= ~VM_MAYSHARE; 1090 vm_flags &= ~VM_MAYSHARE;
1092 1091
1093 return vm_flags; 1092 return vm_flags;
@@ -1885,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1885 return 0; 1884 return 0;
1886 1885
1887 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 1886 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1888 unsigned long n; 1887 free = global_page_state(NR_FREE_PAGES);
1888 free += global_page_state(NR_FILE_PAGES);
1889
1890 /*
1891 * shmem pages shouldn't be counted as free in this
1892 * case, they can't be purged, only swapped out, and
1893 * that won't affect the overall amount of available
1894 * memory in the system.
1895 */
1896 free -= global_page_state(NR_SHMEM);
1889 1897
1890 free = global_page_state(NR_FILE_PAGES);
1891 free += nr_swap_pages; 1898 free += nr_swap_pages;
1892 1899
1893 /* 1900 /*
@@ -1899,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1899 free += global_page_state(NR_SLAB_RECLAIMABLE); 1906 free += global_page_state(NR_SLAB_RECLAIMABLE);
1900 1907
1901 /* 1908 /*
1902 * Leave the last 3% for root
1903 */
1904 if (!cap_sys_admin)
1905 free -= free / 32;
1906
1907 if (free > pages)
1908 return 0;
1909
1910 /*
1911 * nr_free_pages() is very expensive on large systems,
1912 * only call if we're about to fail.
1913 */
1914 n = nr_free_pages();
1915
1916 /*
1917 * Leave reserved pages. The pages are not for anonymous pages. 1909 * Leave reserved pages. The pages are not for anonymous pages.
1918 */ 1910 */
1919 if (n <= totalreserve_pages) 1911 if (free <= totalreserve_pages)
1920 goto error; 1912 goto error;
1921 else 1913 else
1922 n -= totalreserve_pages; 1914 free -= totalreserve_pages;
1923 1915
1924 /* 1916 /*
1925 * Leave the last 3% for root 1917 * Leave the last 3% for root
1926 */ 1918 */
1927 if (!cap_sys_admin) 1919 if (!cap_sys_admin)
1928 n -= n / 32; 1920 free -= free / 32;
1929 free += n;
1930 1921
1931 if (free > pages) 1922 if (free > pages)
1932 return 0; 1923 return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca351..76f2c5ae908e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,18 +26,38 @@
26#include <linux/timex.h> 26#include <linux/timex.h>
27#include <linux/jiffies.h> 27#include <linux/jiffies.h>
28#include <linux/cpuset.h> 28#include <linux/cpuset.h>
29#include <linux/module.h> 29#include <linux/export.h>
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/ptrace.h> 34#include <linux/ptrace.h>
35#include <linux/freezer.h>
35 36
36int sysctl_panic_on_oom; 37int sysctl_panic_on_oom;
37int sysctl_oom_kill_allocating_task; 38int sysctl_oom_kill_allocating_task;
38int sysctl_oom_dump_tasks = 1; 39int sysctl_oom_dump_tasks = 1;
39static DEFINE_SPINLOCK(zone_scan_lock); 40static DEFINE_SPINLOCK(zone_scan_lock);
40 41
42/*
43 * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
44 * @old_val: old oom_score_adj for compare
45 * @new_val: new oom_score_adj for swap
46 *
47 * Sets the oom_score_adj value for current to @new_val iff its present value is
48 * @old_val. Usually used to reinstate a previous value to prevent racing with
49 * userspace tuning the value in the interim.
50 */
51void compare_swap_oom_score_adj(int old_val, int new_val)
52{
53 struct sighand_struct *sighand = current->sighand;
54
55 spin_lock_irq(&sighand->siglock);
56 if (current->signal->oom_score_adj == old_val)
57 current->signal->oom_score_adj = new_val;
58 spin_unlock_irq(&sighand->siglock);
59}
60
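compare_swap_oom_score_adj() restores a value only if nothing changed it in the meantime, which pairs with test_set_oom_score_adj(). A hypothetical userspace analogue of that set-then-restore pattern (not taken from this patch), using a mutex in place of siglock; build with -lpthread:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int oom_score_adj;                 /* stands in for current->signal->oom_score_adj */

static int test_set(int new_val)
{
        pthread_mutex_lock(&lock);
        int old_val = oom_score_adj;
        oom_score_adj = new_val;
        pthread_mutex_unlock(&lock);
        return old_val;
}

static void compare_swap(int old_val, int new_val)
{
        pthread_mutex_lock(&lock);
        if (oom_score_adj == old_val)     /* untouched since we set it */
                oom_score_adj = new_val;
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        int saved = test_set(-1000);      /* e.g. temporarily disable OOM killing */
        /* ... critical section ... */
        compare_swap(-1000, saved);       /* restore unless someone retuned it meanwhile */
        printf("final value: %d\n", oom_score_adj);
        return 0;
}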
41/** 61/**
42 * test_set_oom_score_adj() - set current's oom_score_adj and return old value 62 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
43 * @new_val: new oom_score_adj value 63 * @new_val: new oom_score_adj value
@@ -53,13 +73,7 @@ int test_set_oom_score_adj(int new_val)
53 73
54 spin_lock_irq(&sighand->siglock); 74 spin_lock_irq(&sighand->siglock);
55 old_val = current->signal->oom_score_adj; 75 old_val = current->signal->oom_score_adj;
56 if (new_val != old_val) { 76 current->signal->oom_score_adj = new_val;
57 if (new_val == OOM_SCORE_ADJ_MIN)
58 atomic_inc(&current->mm->oom_disable_count);
59 else if (old_val == OOM_SCORE_ADJ_MIN)
60 atomic_dec(&current->mm->oom_disable_count);
61 current->signal->oom_score_adj = new_val;
62 }
63 spin_unlock_irq(&sighand->siglock); 77 spin_unlock_irq(&sighand->siglock);
64 78
65 return old_val; 79 return old_val;
@@ -171,12 +185,7 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
171 if (!p) 185 if (!p)
172 return 0; 186 return 0;
173 187
174 /* 188 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
175 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
176 * so the entire heuristic doesn't need to be executed for something
177 * that cannot be killed.
178 */
179 if (atomic_read(&p->mm->oom_disable_count)) {
180 task_unlock(p); 189 task_unlock(p);
181 return 0; 190 return 0;
182 } 191 }
@@ -303,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
303 do_each_thread(g, p) { 312 do_each_thread(g, p) {
304 unsigned int points; 313 unsigned int points;
305 314
306 if (!p->mm) 315 if (p->exit_state)
307 continue; 316 continue;
308 if (oom_unkillable_task(p, mem, nodemask)) 317 if (oom_unkillable_task(p, mem, nodemask))
309 continue; 318 continue;
@@ -317,8 +326,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
317 * blocked waiting for another task which itself is waiting 326 * blocked waiting for another task which itself is waiting
318 * for memory. Is there a better alternative? 327 * for memory. Is there a better alternative?
319 */ 328 */
320 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 329 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
330 if (unlikely(frozen(p)))
331 thaw_process(p);
321 return ERR_PTR(-1UL); 332 return ERR_PTR(-1UL);
333 }
334 if (!p->mm)
335 continue;
322 336
323 if (p->flags & PF_EXITING) { 337 if (p->flags & PF_EXITING) {
324 /* 338 /*
@@ -339,8 +353,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
339 * then wait for it to finish before killing 353 * then wait for it to finish before killing
340 * some other task unnecessarily. 354 * some other task unnecessarily.
341 */ 355 */
342 if (!(task_ptrace(p->group_leader) & 356 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
343 PT_TRACE_EXIT))
344 return ERR_PTR(-1UL); 357 return ERR_PTR(-1UL);
345 } 358 }
346 } 359 }
@@ -434,7 +447,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
434 task_unlock(p); 447 task_unlock(p);
435 448
436 /* 449 /*
437 * Kill all processes sharing p->mm in other thread groups, if any. 450 * Kill all user processes sharing p->mm in other thread groups, if any.
438 * They don't get access to memory reserves or a higher scheduler 451 * They don't get access to memory reserves or a higher scheduler
439 * priority, though, to avoid depletion of all memory or task 452 * priority, though, to avoid depletion of all memory or task
440 * starvation. This prevents mm->mmap_sem livelock when an oom killed 453 * starvation. This prevents mm->mmap_sem livelock when an oom killed
@@ -444,7 +457,11 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
444 * signal. 457 * signal.
445 */ 458 */
446 for_each_process(q) 459 for_each_process(q)
447 if (q->mm == mm && !same_thread_group(q, p)) { 460 if (q->mm == mm && !same_thread_group(q, p) &&
461 !(q->flags & PF_KTHREAD)) {
462 if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
463 continue;
464
448 task_lock(q); /* Protect ->comm from prctl() */ 465 task_lock(q); /* Protect ->comm from prctl() */
449 pr_err("Kill process %d (%s) sharing same memory\n", 466 pr_err("Kill process %d (%s) sharing same memory\n",
450 task_pid_nr(q), q->comm); 467 task_pid_nr(q), q->comm);
@@ -488,7 +505,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
488 505
489 /* 506 /*
490 * If any of p's children has a different mm and is eligible for kill, 507 * If any of p's children has a different mm and is eligible for kill,
491 * the one with the highest badness() score is sacrificed for its 508 * the one with the highest oom_badness() score is sacrificed for its
492 * parent. This attempts to lose the minimal amount of work done while 509 * parent. This attempts to lose the minimal amount of work done while
493 * still freeing memory. 510 * still freeing memory.
494 */ 511 */
@@ -721,7 +738,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
721 read_lock(&tasklist_lock); 738 read_lock(&tasklist_lock);
722 if (sysctl_oom_kill_allocating_task && 739 if (sysctl_oom_kill_allocating_task &&
723 !oom_unkillable_task(current, NULL, nodemask) && 740 !oom_unkillable_task(current, NULL, nodemask) &&
724 current->mm && !atomic_read(&current->mm->oom_disable_count)) { 741 current->mm) {
725 /* 742 /*
726 * oom_kill_process() needs tasklist_lock held. If it returns 743 * oom_kill_process() needs tasklist_lock held. If it returns
727 * non-zero, current could not be killed so we must fallback to 744 * non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f698862420..71252486bc6f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -12,7 +12,7 @@
12 */ 12 */
13 13
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
@@ -37,24 +37,22 @@
37#include <trace/events/writeback.h> 37#include <trace/events/writeback.h>
38 38
39/* 39/*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 * will look to see if it needs to force writeback or throttling.
42 */ 41 */
43static long ratelimit_pages = 32; 42#define MAX_PAUSE max(HZ/5, 1)
44 43
45/* 44/*
46 * When balance_dirty_pages decides that the caller needs to perform some 45 * Estimate write bandwidth at 200ms intervals.
47 * non-background writeback, this is how many pages it will attempt to write.
48 * It should be somewhat larger than dirtied pages to ensure that reasonably
49 * large amounts of I/O are submitted.
50 */ 46 */
51static inline long sync_writeback_pages(unsigned long dirtied) 47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
52{
53 if (dirtied < ratelimit_pages)
54 dirtied = ratelimit_pages;
55 48
56 return dirtied + dirtied / 2; 49#define RATELIMIT_CALC_SHIFT 10
57} 50
51/*
52 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
53 * will look to see if it needs to force writeback or throttling.
54 */
55static long ratelimit_pages = 32;
58 56
59/* The following parameters are exported via /proc/sys/vm */ 57/* The following parameters are exported via /proc/sys/vm */
60 58
@@ -111,6 +109,7 @@ EXPORT_SYMBOL(laptop_mode);
111 109
112/* End of sysctl-exported parameters */ 110/* End of sysctl-exported parameters */
113 111
112unsigned long global_dirty_limit;
114 113
115/* 114/*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 115 * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -129,7 +128,6 @@ EXPORT_SYMBOL(laptop_mode);
129 * 128 *
130 */ 129 */
131static struct prop_descriptor vm_completions; 130static struct prop_descriptor vm_completions;
132static struct prop_descriptor vm_dirties;
133 131
134/* 132/*
135 * couple the period to the dirty_ratio: 133 * couple the period to the dirty_ratio:
@@ -155,7 +153,8 @@ static void update_completion_period(void)
155{ 153{
156 int shift = calc_period_shift(); 154 int shift = calc_period_shift();
157 prop_change_shift(&vm_completions, shift); 155 prop_change_shift(&vm_completions, shift);
158 prop_change_shift(&vm_dirties, shift); 156
157 writeback_set_ratelimit();
159} 158}
160 159
161int dirty_background_ratio_handler(struct ctl_table *table, int write, 160int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -219,6 +218,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
219 */ 218 */
220static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 219static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221{ 220{
221 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 222 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 223 bdi->max_prop_frac);
224} 224}
@@ -233,65 +233,20 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
233} 233}
234EXPORT_SYMBOL_GPL(bdi_writeout_inc); 234EXPORT_SYMBOL_GPL(bdi_writeout_inc);
235 235
236void task_dirty_inc(struct task_struct *tsk)
237{
238 prop_inc_single(&vm_dirties, &tsk->dirties);
239}
240
241/* 236/*
242 * Obtain an accurate fraction of the BDI's portion. 237 * Obtain an accurate fraction of the BDI's portion.
243 */ 238 */
244static void bdi_writeout_fraction(struct backing_dev_info *bdi, 239static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 240 long *numerator, long *denominator)
246{ 241{
247 if (bdi_cap_writeback_dirty(bdi)) { 242 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 243 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254}
255
256static inline void task_dirties_fraction(struct task_struct *tsk,
257 long *numerator, long *denominator)
258{
259 prop_fraction_single(&vm_dirties, &tsk->dirties,
260 numerator, denominator);
261}
262
263/*
264 * task_dirty_limit - scale down dirty throttling threshold for one task
265 *
266 * task specific dirty limit:
267 *
268 * dirty -= (dirty/8) * p_{t}
269 *
270 * To protect light/slow dirtying tasks from heavier/fast ones, we start
271 * throttling individual tasks before reaching the bdi dirty limit.
272 * Relatively low thresholds will be allocated to heavy dirtiers. So when
273 * dirty pages grow large, heavy dirtiers will be throttled first, which will
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled.
276 */
277static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty)
279{
280 long numerator, denominator;
281 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3;
283
284 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator;
286 do_div(inv, denominator);
287
288 dirty -= inv;
289
290 return max(dirty, bdi_dirty/2);
291} 244}
292 245
293/* 246/*
294 * 247 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
248 * registered backing devices, which, for obvious reasons, can not
249 * exceed 100%.
295 */ 250 */
296static unsigned int bdi_min_ratio; 251static unsigned int bdi_min_ratio;
297 252
@@ -397,6 +352,17 @@ unsigned long determine_dirtyable_memory(void)
397 return x + 1; /* Ensure that we never return 0 */ 352 return x + 1; /* Ensure that we never return 0 */
398} 353}
399 354
355static unsigned long dirty_freerun_ceiling(unsigned long thresh,
356 unsigned long bg_thresh)
357{
358 return (thresh + bg_thresh) / 2;
359}
360
361static unsigned long hard_dirty_limit(unsigned long thresh)
362{
363 return max(thresh, global_dirty_limit);
364}
365
400/* 366/*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 367 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 368 *
@@ -435,12 +401,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
435 } 401 }
436 *pbackground = background; 402 *pbackground = background;
437 *pdirty = dirty; 403 *pdirty = dirty;
404 trace_global_dirty_state(background, dirty);
438} 405}
439 406
440/* 407/**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 408 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
409 * @bdi: the backing_dev_info to query
410 * @dirty: global dirty limit in pages
442 * 411 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 412 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
413 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
414 * And the "limit" in the name is not seriously taken as a hard limit in
415 * balance_dirty_pages().
416 *
417 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 418 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 419 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 420 *
@@ -469,36 +443,587 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
469} 443}
470 444
471/* 445/*
446 * Dirty position control.
447 *
448 * (o) global/bdi setpoints
449 *
450 * We want the dirty pages be balanced around the global/bdi setpoints.
451 * When the number of dirty pages is higher/lower than the setpoint, the
452 * dirty position control ratio (and hence task dirty ratelimit) will be
453 * decreased/increased to bring the dirty pages back to the setpoint.
454 *
455 * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
456 *
457 * if (dirty < setpoint) scale up pos_ratio
458 * if (dirty > setpoint) scale down pos_ratio
459 *
460 * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
461 * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
462 *
463 * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
464 *
465 * (o) global control line
466 *
467 * ^ pos_ratio
468 * |
469 * | |<===== global dirty control scope ======>|
470 * 2.0 .............*
471 * | .*
472 * | . *
473 * | . *
474 * | . *
475 * | . *
476 * | . *
477 * 1.0 ................................*
478 * | . . *
479 * | . . *
480 * | . . *
481 * | . . *
482 * | . . *
483 * 0 +------------.------------------.----------------------*------------->
484 * freerun^ setpoint^ limit^ dirty pages
485 *
486 * (o) bdi control line
487 *
488 * ^ pos_ratio
489 * |
490 * | *
491 * | *
492 * | *
493 * | *
494 * | * |<=========== span ============>|
495 * 1.0 .......................*
496 * | . *
497 * | . *
498 * | . *
499 * | . *
500 * | . *
501 * | . *
502 * | . *
503 * | . *
504 * | . *
505 * | . *
506 * | . *
507 * 1/4 ...............................................* * * * * * * * * * * *
508 * | . .
509 * | . .
510 * | . .
511 * 0 +----------------------.-------------------------------.------------->
512 * bdi_setpoint^ x_intercept^
513 *
514 * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
515 * be smoothly throttled down to normal if it starts high in situations like
516 * - start writing to a slow SD card and a fast disk at the same time. The SD
517 * card's bdi_dirty may rush to many times higher than bdi_setpoint.
518 * - the bdi dirty thresh drops quickly due to change of JBOD workload
519 */
520static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
521 unsigned long thresh,
522 unsigned long bg_thresh,
523 unsigned long dirty,
524 unsigned long bdi_thresh,
525 unsigned long bdi_dirty)
526{
527 unsigned long write_bw = bdi->avg_write_bandwidth;
528 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
529 unsigned long limit = hard_dirty_limit(thresh);
530 unsigned long x_intercept;
531 unsigned long setpoint; /* dirty pages' target balance point */
532 unsigned long bdi_setpoint;
533 unsigned long span;
534 long long pos_ratio; /* for scaling up/down the rate limit */
535 long x;
536
537 if (unlikely(dirty >= limit))
538 return 0;
539
540 /*
541 * global setpoint
542 *
543 * setpoint - dirty 3
544 * f(dirty) := 1.0 + (----------------)
545 * limit - setpoint
546 *
547 * it's a 3rd order polynomial subject to
548 *
549 * (1) f(freerun) = 2.0 => ramp up dirty_ratelimit reasonably fast
550 * (2) f(setpoint) = 1.0 => the balance point
551 * (3) f(limit) = 0 => the hard limit
552 * (4) df/dx <= 0 => negative feedback control
553 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
554 * => fast response on large errors; small oscillation near setpoint
555 */
556 setpoint = (freerun + limit) / 2;
557 x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
558 limit - setpoint + 1);
559 pos_ratio = x;
560 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
561 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
562 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
563
564 /*
565 * We have computed basic pos_ratio above based on global situation. If
566 * the bdi is over/under its share of dirty pages, we want to scale
567 * pos_ratio further down/up. That is done by the following mechanism.
568 */
569
570 /*
571 * bdi setpoint
572 *
573 * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
574 *
575 * x_intercept - bdi_dirty
576 * := --------------------------
577 * x_intercept - bdi_setpoint
578 *
579 * The main bdi control line is a linear function subject to
580 *
581 * (1) f(bdi_setpoint) = 1.0
582 * (2) k = - 1 / (8 * write_bw) (in single bdi case)
583 * or equally: x_intercept = bdi_setpoint + 8 * write_bw
584 *
585 * For single bdi case, the dirty pages are observed to fluctuate
586 * regularly within range
587 * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
588 * for various filesystems, where (2) can yield a reasonable 12.5%
589 * fluctuation range for pos_ratio.
590 *
591 * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
592 * own size, so move the slope over accordingly and choose a slope that
593 * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
594 */
595 if (unlikely(bdi_thresh > thresh))
596 bdi_thresh = thresh;
597 bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
598 /*
599 * scale global setpoint to bdi's:
600 * bdi_setpoint = setpoint * bdi_thresh / thresh
601 */
602 x = div_u64((u64)bdi_thresh << 16, thresh + 1);
603 bdi_setpoint = setpoint * (u64)x >> 16;
604 /*
605 * Use span=(8*write_bw) in single bdi case as indicated by
606 * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
607 *
608 * bdi_thresh thresh - bdi_thresh
609 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
610 * thresh thresh
611 */
612 span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
613 x_intercept = bdi_setpoint + span;
614
615 if (bdi_dirty < x_intercept - span / 4) {
616 pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
617 x_intercept - bdi_setpoint + 1);
618 } else
619 pos_ratio /= 4;
620
621 /*
622 * bdi reserve area, safeguard against dirty pool underrun and disk idle
623 * It may push the desired control point of global dirty pages higher
624 * than setpoint.
625 */
626 x_intercept = bdi_thresh / 2;
627 if (bdi_dirty < x_intercept) {
628 if (bdi_dirty > x_intercept / 8)
629 pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
630 else
631 pos_ratio *= 8;
632 }
633
634 return pos_ratio;
635}
636
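For the global control line above, pos_ratio is the cubic 1 + ((setpoint - dirty) / (limit - setpoint))^3 evaluated in fixed point, with 1 << RATELIMIT_CALC_SHIFT == 1024 representing 1.0. A standalone sketch reproducing that arithmetic with arbitrary page counts, checking the anchor points f(freerun) ~= 2.0, f(setpoint) = 1.0 and f(limit) ~= 0; like the kernel code, it assumes arithmetic right shift of negative intermediates (as gcc provides):

#include <stdio.h>

#define RATELIMIT_CALC_SHIFT 10

/* Fixed-point cubic mirroring the global part of bdi_position_ratio(). */
static long long global_pos_ratio(unsigned long freerun, unsigned long limit,
                                  unsigned long dirty)
{
        unsigned long setpoint = (freerun + limit) / 2;
        long long x, pos_ratio;

        if (dirty >= limit)
                return 0;

        x = ((long long)setpoint - (long long)dirty) * (1 << RATELIMIT_CALC_SHIFT) /
            (long long)(limit - setpoint + 1);
        pos_ratio = x;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
        pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
        return pos_ratio;
}

int main(void)
{
        unsigned long freerun = 10000, limit = 30000;   /* pages, arbitrary */

        printf("f(freerun)   = %lld/1024\n", global_pos_ratio(freerun, limit, freerun));
        printf("f(setpoint)  = %lld/1024\n", global_pos_ratio(freerun, limit, 20000));
        printf("f(limit - 1) = %lld/1024\n", global_pos_ratio(freerun, limit, limit - 1));
        return 0;
}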
637static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
638 unsigned long elapsed,
639 unsigned long written)
640{
641 const unsigned long period = roundup_pow_of_two(3 * HZ);
642 unsigned long avg = bdi->avg_write_bandwidth;
643 unsigned long old = bdi->write_bandwidth;
644 u64 bw;
645
646 /*
647 * bw = written * HZ / elapsed
648 *
649 * bw * elapsed + write_bandwidth * (period - elapsed)
650 * write_bandwidth = ---------------------------------------------------
651 * period
652 */
653 bw = written - bdi->written_stamp;
654 bw *= HZ;
655 if (unlikely(elapsed > period)) {
656 do_div(bw, elapsed);
657 avg = bw;
658 goto out;
659 }
660 bw += (u64)bdi->write_bandwidth * (period - elapsed);
661 bw >>= ilog2(period);
662
663 /*
664 * one more level of smoothing, for filtering out sudden spikes
665 */
666 if (avg > old && old >= (unsigned long)bw)
667 avg -= (avg - old) >> 3;
668
669 if (avg < old && old <= (unsigned long)bw)
670 avg += (old - avg) >> 3;
671
672out:
673 bdi->write_bandwidth = bw;
674 bdi->avg_write_bandwidth = avg;
675}
676
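bdi_update_write_bandwidth() above keeps a roughly 3-second weighted average: write_bandwidth = (bw * elapsed + write_bandwidth * (period - elapsed)) / period, with bw = written * HZ / elapsed. A worked example with HZ = 1000 and invented numbers (the extra >>3 smoothing of avg_write_bandwidth is omitted):

#include <stdio.h>

#define HZ 1000UL

int main(void)
{
        unsigned long period  = 4096;         /* roundup_pow_of_two(3 * HZ) at HZ=1000 */
        unsigned long elapsed = 200;          /* jiffies since the last update (~200ms) */
        unsigned long written = 5000;         /* pages completed in that window */
        unsigned long old_bw  = 20000;        /* previous estimate, pages/second */

        /* written * HZ == instantaneous bw * elapsed, so no division by elapsed needed */
        unsigned long long bw = (unsigned long long)written * HZ;
        bw += (unsigned long long)old_bw * (period - elapsed);
        bw >>= 12;                            /* ilog2(4096): divide by the period */

        printf("instantaneous: %lu pages/s\n", written * HZ / elapsed);   /* 25000 */
        printf("new estimate : %llu pages/s\n", bw);   /* ~20244, nudged toward 25000 */
        return 0;
}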
677/*
678 * The global dirtyable memory and dirty threshold could be suddenly knocked
679 * down by a large amount (eg. on the startup of KVM in a swapless system).
680 * This may throw the system into deep dirty exceeded state and throttle
681 * heavy/light dirtiers alike. To retain good responsiveness, maintain
682 * global_dirty_limit so that it only tracks slowly down to the knocked-down
683 * dirty threshold.
684 */
685static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
686{
687 unsigned long limit = global_dirty_limit;
688
689 /*
690 * Follow up in one step.
691 */
692 if (limit < thresh) {
693 limit = thresh;
694 goto update;
695 }
696
697 /*
698 * Follow down slowly. Use the higher one as the target, because thresh
699 * may drop below dirty. This is exactly the reason to introduce
700 * global_dirty_limit which is guaranteed to lie above the dirty pages.
701 */
702 thresh = max(thresh, dirty);
703 if (limit > thresh) {
704 limit -= (limit - thresh) >> 5;
705 goto update;
706 }
707 return;
708update:
709 global_dirty_limit = limit;
710}
711
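update_dirty_limit() follows a falling threshold by 1/32 of the remaining gap per ~200ms update, so the old limit decays geometrically rather than dropping at once. A quick numeric sketch with arbitrary page counts (the gap halves after roughly 22 steps, i.e. about 4.5 seconds):

#include <stdio.h>

int main(void)
{
        unsigned long limit  = 400000;    /* old global_dirty_limit, pages */
        unsigned long thresh = 100000;    /* new, much lower dirty threshold */

        /* one step per BANDWIDTH_INTERVAL (~200ms): close 1/32 of the gap */
        for (int step = 1; step <= 5; step++) {
                limit -= (limit - thresh) >> 5;
                printf("after %d step(s): %lu\n", step, limit);
        }
        return 0;
}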
712static void global_update_bandwidth(unsigned long thresh,
713 unsigned long dirty,
714 unsigned long now)
715{
716 static DEFINE_SPINLOCK(dirty_lock);
717 static unsigned long update_time;
718
719 /*
720 * check locklessly first to optimize away locking for the most time
721 */
722 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
723 return;
724
725 spin_lock(&dirty_lock);
726 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
727 update_dirty_limit(thresh, dirty);
728 update_time = now;
729 }
730 spin_unlock(&dirty_lock);
731}
732
733/*
734 * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
735 *
736 * Normal bdi tasks will be curbed at or below it in long term.
737 * Obviously it should be around (write_bw / N) when there are N dd tasks.
738 */
739static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
740 unsigned long thresh,
741 unsigned long bg_thresh,
742 unsigned long dirty,
743 unsigned long bdi_thresh,
744 unsigned long bdi_dirty,
745 unsigned long dirtied,
746 unsigned long elapsed)
747{
748 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
749 unsigned long limit = hard_dirty_limit(thresh);
750 unsigned long setpoint = (freerun + limit) / 2;
751 unsigned long write_bw = bdi->avg_write_bandwidth;
752 unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
753 unsigned long dirty_rate;
754 unsigned long task_ratelimit;
755 unsigned long balanced_dirty_ratelimit;
756 unsigned long pos_ratio;
757 unsigned long step;
758 unsigned long x;
759
760 /*
761 * The dirty rate will match the writeout rate in long term, except
762 * when dirty pages are truncated by userspace or re-dirtied by FS.
763 */
764 dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
765
766 pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
767 bdi_thresh, bdi_dirty);
768 /*
769 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
770 */
771 task_ratelimit = (u64)dirty_ratelimit *
772 pos_ratio >> RATELIMIT_CALC_SHIFT;
773 task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
774
775 /*
776 * A linear estimation of the "balanced" throttle rate. The theory is,
777 * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
778 * dirty_rate will be measured to be (N * task_ratelimit). So the below
779 * formula will yield the balanced rate limit (write_bw / N).
780 *
781 * Note that the expanded form is not a pure rate feedback:
782 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
783 * but also takes pos_ratio into account:
784 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
785 *
786 * (1) is not realistic because pos_ratio also takes part in balancing
787 * the dirty rate. Consider the state
788 * pos_ratio = 0.5 (3)
789 * rate = 2 * (write_bw / N) (4)
790 * If (1) is used, it will get stuck in that state! Because each dd will
791 * be throttled at
792 * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
793 * yielding
794 * dirty_rate = N * task_ratelimit = write_bw (6)
795 * put (6) into (1) we get
796 * rate_(i+1) = rate_(i) (7)
797 *
798 * So we end up using (2) to always keep
799 * rate_(i+1) ~= (write_bw / N) (8)
800 * regardless of the value of pos_ratio. As long as (8) is satisfied,
801 * pos_ratio is able to drive itself to 1.0, which is not only where
802 * the dirty count meet the setpoint, but also where the slope of
803 * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
804 */
805 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
806 dirty_rate | 1);
807
808 /*
809 * We could safely do this and return immediately:
810 *
811 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
812 *
813 * However to get a more stable dirty_ratelimit, the below elaborated
814 * code makes use of task_ratelimit to filter out singular points and
815 * limit the step size.
816 *
817 * The below code essentially only uses the relative value of
818 *
819 * task_ratelimit - dirty_ratelimit
820 * = (pos_ratio - 1) * dirty_ratelimit
821 *
822 * which reflects the direction and size of dirty position error.
823 */
824
825 /*
826 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
827 * task_ratelimit is on the same side of dirty_ratelimit, too.
828 * For example, when
829 * - dirty_ratelimit > balanced_dirty_ratelimit
830 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
831 * lowering dirty_ratelimit will help meet both the position and rate
832 * control targets. Otherwise, don't update dirty_ratelimit if it will
833 * only help meet the rate target. After all, what the users ultimately
834 * feel and care are stable dirty rate and small position error.
835 *
836 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
837 * and filter out the singular points of balanced_dirty_ratelimit, which
838 * keeps jumping around randomly and can even leap far away at times
839 * due to the small 200ms estimation period of dirty_rate (we want to
840 * keep that period small to reduce time lags).
841 */
842 step = 0;
843 if (dirty < setpoint) {
844 x = min(bdi->balanced_dirty_ratelimit,
845 min(balanced_dirty_ratelimit, task_ratelimit));
846 if (dirty_ratelimit < x)
847 step = x - dirty_ratelimit;
848 } else {
849 x = max(bdi->balanced_dirty_ratelimit,
850 max(balanced_dirty_ratelimit, task_ratelimit));
851 if (dirty_ratelimit > x)
852 step = dirty_ratelimit - x;
853 }
854
855 /*
856 * Don't pursue 100% rate matching. It's impossible since the balanced
857 * rate itself is constantly fluctuating. So decrease the track speed
858 * when it gets close to the target. Helps eliminate pointless tremors.
859 */
860 step >>= dirty_ratelimit / (2 * step + 1);
861 /*
862 * Limit the tracking speed to avoid overshooting.
863 */
864 step = (step + 7) / 8;
865
866 if (dirty_ratelimit < balanced_dirty_ratelimit)
867 dirty_ratelimit += step;
868 else
869 dirty_ratelimit -= step;
870
871 bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
872 bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
873
874 trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
875}
876
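The heart of bdi_update_dirty_ratelimit() is balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate; with N throttled tasks the measured dirty_rate is about N * task_ratelimit, so the estimate settles at write_bw / N. A tiny simulation of that fixed point (pos_ratio and the step limiting are deliberately left out; all numbers are invented, and it settles immediately here only because the simulated measurement is exact):

#include <stdio.h>

int main(void)
{
        const unsigned long write_bw = 25000;   /* pages/s the device sustains */
        const int N = 4;                        /* number of concurrent dirtiers */
        unsigned long rate = 1000;              /* initial dirty_ratelimit guess */

        for (int i = 0; i < 8; i++) {
                /* each task is throttled at 'rate', so the measured dirty rate is N*rate */
                unsigned long dirty_rate = (unsigned long)N * rate;

                rate = (unsigned long)((unsigned long long)rate * write_bw /
                                       (dirty_rate | 1));
                printf("iteration %d: dirty_ratelimit = %lu pages/s\n", i, rate);
        }
        /* converges to write_bw / N = 6250 pages/s */
        return 0;
}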
877void __bdi_update_bandwidth(struct backing_dev_info *bdi,
878 unsigned long thresh,
879 unsigned long bg_thresh,
880 unsigned long dirty,
881 unsigned long bdi_thresh,
882 unsigned long bdi_dirty,
883 unsigned long start_time)
884{
885 unsigned long now = jiffies;
886 unsigned long elapsed = now - bdi->bw_time_stamp;
887 unsigned long dirtied;
888 unsigned long written;
889
890 /*
891 * rate-limit, only update once every 200ms.
892 */
893 if (elapsed < BANDWIDTH_INTERVAL)
894 return;
895
896 dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
897 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
898
899 /*
900 * Skip quiet periods when disk bandwidth is under-utilized.
901 * (at least 1s idle time between two flusher runs)
902 */
903 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
904 goto snapshot;
905
906 if (thresh) {
907 global_update_bandwidth(thresh, dirty, now);
908 bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
909 bdi_thresh, bdi_dirty,
910 dirtied, elapsed);
911 }
912 bdi_update_write_bandwidth(bdi, elapsed, written);
913
914snapshot:
915 bdi->dirtied_stamp = dirtied;
916 bdi->written_stamp = written;
917 bdi->bw_time_stamp = now;
918}
919
920static void bdi_update_bandwidth(struct backing_dev_info *bdi,
921 unsigned long thresh,
922 unsigned long bg_thresh,
923 unsigned long dirty,
924 unsigned long bdi_thresh,
925 unsigned long bdi_dirty,
926 unsigned long start_time)
927{
928 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
929 return;
930 spin_lock(&bdi->wb.list_lock);
931 __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
932 bdi_thresh, bdi_dirty, start_time);
933 spin_unlock(&bdi->wb.list_lock);
934}
935
936/*
937 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
938 * will look to see if it needs to start dirty throttling.
939 *
940 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
941 * global_page_state() too often. So scale it near-sqrt to the safety margin
942 * (the number of pages we may dirty without exceeding the dirty limits).
943 */
944static unsigned long dirty_poll_interval(unsigned long dirty,
945 unsigned long thresh)
946{
947 if (thresh > dirty)
948 return 1UL << (ilog2(thresh - dirty) >> 1);
949
950 return 1;
951}
952
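dirty_poll_interval() returns roughly the square root of the remaining dirty margin: 1 << (ilog2(thresh - dirty) >> 1). A quick check with two margins; ilog2() is emulated with __builtin_clzl(), assuming a 64-bit long:

#include <stdio.h>

static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh)
{
        if (thresh > dirty)
                /* 63 - __builtin_clzl(x) is ilog2(x) for x > 0 on a 64-bit long */
                return 1UL << ((63 - __builtin_clzl(thresh - dirty)) >> 1);
        return 1;
}

int main(void)
{
        printf("%lu\n", dirty_poll_interval(0, 16384));     /* margin 16384 -> 128 pages */
        printf("%lu\n", dirty_poll_interval(16000, 16384)); /* margin 384   -> 16 pages */
        return 0;
}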
953static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
954 unsigned long bdi_dirty)
955{
956 unsigned long bw = bdi->avg_write_bandwidth;
957 unsigned long hi = ilog2(bw);
958 unsigned long lo = ilog2(bdi->dirty_ratelimit);
959 unsigned long t;
960
961 /* target for 20ms max pause on 1-dd case */
962 t = HZ / 50;
963
964 /*
965 * Scale up pause time for concurrent dirtiers in order to reduce CPU
966 * overheads.
967 *
968 * (N * 20ms) on 2^N concurrent tasks.
969 */
970 if (hi > lo)
971 t += (hi - lo) * (20 * HZ) / 1024;
972
973 /*
974 * Limit pause time for small memory systems. If sleeping for too long,
975 * a small pool of dirty/writeback pages may go empty and the disk may go
976 * idle.
977 *
978 * 8 serves as the safety ratio.
979 */
980 if (bdi_dirty)
981 t = min(t, bdi_dirty * HZ / (8 * bw + 1));
982
983 /*
984 * The pause time will settle within the range (max_pause/4, max_pause).
985 * Apply a minimal value of 4 to get a non-zero max_pause/4.
986 */
987 return clamp_val(t, 4, MAX_PAUSE);
988}
989
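bdi_max_pause() starts from a 20ms target, adds about 20ms per doubling of concurrency (log2(write_bw) - log2(dirty_ratelimit)), then caps the result by both bdi_dirty * HZ / (8 * write_bw) and MAX_PAUSE. A numeric sketch assuming HZ = 1000; the values are arbitrary and ilog2() is emulated with __builtin_clzl() on a 64-bit long:

#include <stdio.h>

#define HZ        1000UL
#define MAX_PAUSE (HZ / 5)                     /* 200 jiffies = 200ms */

static unsigned long ilog2_ul(unsigned long x)
{
        return 63 - __builtin_clzl(x);
}

static unsigned long max_pause(unsigned long write_bw, unsigned long ratelimit,
                               unsigned long bdi_dirty)
{
        unsigned long hi = ilog2_ul(write_bw);
        unsigned long lo = ilog2_ul(ratelimit);
        unsigned long t = HZ / 50;             /* 20ms target for the 1-dd case */

        if (hi > lo)                           /* ~20ms more per doubling of dirtiers */
                t += (hi - lo) * (20 * HZ) / 1024;
        if (bdi_dirty) {                       /* don't drain a small dirty pool dry */
                unsigned long cap = bdi_dirty * HZ / (8 * write_bw + 1);
                if (t > cap)
                        t = cap;
        }
        return t < 4 ? 4 : (t > MAX_PAUSE ? MAX_PAUSE : t);
}

int main(void)
{
        /* 25000 pages/s device, per-task ratelimit ~3125 pages/s, plenty of dirty pages */
        printf("max pause: %lu jiffies (~%lu ms at HZ=1000)\n",
               max_pause(25000, 3125, 100000), max_pause(25000, 3125, 100000));
        return 0;
}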
990/*
472 * balance_dirty_pages() must be called by processes which are generating dirty 991 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 992 * data. It looks at the number of dirty pages in the machine and will force
474 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 993 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
475 * If we're over `background_thresh' then the writeback threads are woken to 994 * If we're over `background_thresh' then the writeback threads are woken to
476 * perform some writeout. 995 * perform some writeout.
477 */ 996 */
478static void balance_dirty_pages(struct address_space *mapping, 997static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 998 unsigned long pages_dirtied)
480{ 999{
481 long nr_reclaimable, bdi_nr_reclaimable; 1000 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
482 long nr_writeback, bdi_nr_writeback; 1001 unsigned long bdi_reclaimable;
1002 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
1003 unsigned long bdi_dirty;
1004 unsigned long freerun;
483 unsigned long background_thresh; 1005 unsigned long background_thresh;
484 unsigned long dirty_thresh; 1006 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 1007 unsigned long bdi_thresh;
486 unsigned long pages_written = 0; 1008 long pause = 0;
487 unsigned long pause = 1; 1009 long uninitialized_var(max_pause);
488 bool dirty_exceeded = false; 1010 bool dirty_exceeded = false;
1011 unsigned long task_ratelimit;
1012 unsigned long uninitialized_var(dirty_ratelimit);
1013 unsigned long pos_ratio;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 1014 struct backing_dev_info *bdi = mapping->backing_dev_info;
1015 unsigned long start_time = jiffies;
490 1016
491 for (;;) { 1017 for (;;) {
492 struct writeback_control wbc = { 1018 /*
493 .sync_mode = WB_SYNC_NONE, 1019 * Unstable writes are a feature of certain networked
494 .older_than_this = NULL, 1020 * filesystems (i.e. NFS) in which data may have been
495 .nr_to_write = write_chunk, 1021 * written to the server's write cache, but has not yet
496 .range_cyclic = 1, 1022 * been flushed to permanent storage.
497 }; 1023 */
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 1024 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 1025 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 1026 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 1027
503 global_dirty_limits(&background_thresh, &dirty_thresh); 1028 global_dirty_limits(&background_thresh, &dirty_thresh);
504 1029
@@ -507,12 +1032,28 @@ static void balance_dirty_pages(struct address_space *mapping,
507 * catch-up. This avoids (excessively) small writeouts 1032 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 1033 * when the bdi limits are ramping up.
509 */ 1034 */
510 if (nr_reclaimable + nr_writeback <= 1035 freerun = dirty_freerun_ceiling(dirty_thresh,
511 (background_thresh + dirty_thresh) / 2) 1036 background_thresh);
1037 if (nr_dirty <= freerun)
512 break; 1038 break;
513 1039
1040 if (unlikely(!writeback_in_progress(bdi)))
1041 bdi_start_background_writeback(bdi);
1042
1043 /*
 1044 * bdi_thresh is not treated as a hard limiting factor in the way
 1045 * dirty_thresh is, for two reasons:
1046 * - in JBOD setup, bdi_thresh can fluctuate a lot
1047 * - in a system with HDD and USB key, the USB key may somehow
1048 * go into state (bdi_dirty >> bdi_thresh) either because
1049 * bdi_dirty starts high, or because bdi_thresh drops low.
1050 * In this case we don't want to hard throttle the USB key
1051 * dirtiers for 100 seconds until bdi_dirty drops under
1052 * bdi_thresh. Instead the auxiliary bdi control line in
1053 * bdi_position_ratio() will let the dirtier task progress
1054 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1055 */
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 1056 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 1057
517 /* 1058 /*
518 * In order to avoid the stacked BDI deadlock we need 1059 * In order to avoid the stacked BDI deadlock we need
@@ -524,63 +1065,101 @@ static void balance_dirty_pages(struct address_space *mapping,
524 * actually dirty; with m+n sitting in the percpu 1065 * actually dirty; with m+n sitting in the percpu
525 * deltas. 1066 * deltas.
526 */ 1067 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 1068 if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 1069 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 1070 bdi_dirty = bdi_reclaimable +
1071 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 1072 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 1073 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 1074 bdi_dirty = bdi_reclaimable +
1075 bdi_stat(bdi, BDI_WRITEBACK);
533 } 1076 }
534 1077
535 /* 1078 dirty_exceeded = (bdi_dirty > bdi_thresh) ||
536 * The bdi thresh is somehow "soft" limit derived from the 1079 (nr_dirty > dirty_thresh);
537 * global "hard" limit. The former helps to prevent heavy IO 1080 if (dirty_exceeded && !bdi->dirty_exceeded)
538 * bdi or process from holding back light ones; The latter is
539 * the last resort safeguard.
540 */
541 dirty_exceeded =
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
543 || (nr_reclaimable + nr_writeback > dirty_thresh);
544
545 if (!dirty_exceeded)
546 break;
547
548 if (!bdi->dirty_exceeded)
549 bdi->dirty_exceeded = 1; 1081 bdi->dirty_exceeded = 1;
550 1082
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 1083 bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
552 * Unstable writes are a feature of certain networked 1084 nr_dirty, bdi_thresh, bdi_dirty,
553 * filesystems (i.e. NFS) in which data may have been 1085 start_time);
554 * written to the server's write cache, but has not yet 1086
555 * been flushed to permanent storage. 1087 max_pause = bdi_max_pause(bdi, bdi_dirty);
556 * Only move pages to writeback if this bdi is over its 1088
557 * threshold otherwise wait until the disk writes catch 1089 dirty_ratelimit = bdi->dirty_ratelimit;
558 * up. 1090 pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
559 */ 1091 background_thresh, nr_dirty,
560 trace_wbc_balance_dirty_start(&wbc, bdi); 1092 bdi_thresh, bdi_dirty);
561 if (bdi_nr_reclaimable > bdi_thresh) { 1093 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
562 writeback_inodes_wb(&bdi->wb, &wbc); 1094 RATELIMIT_CALC_SHIFT;
563 pages_written += write_chunk - wbc.nr_to_write; 1095 if (unlikely(task_ratelimit == 0)) {
564 trace_wbc_balance_dirty_written(&wbc, bdi); 1096 pause = max_pause;
565 if (pages_written >= write_chunk) 1097 goto pause;
566 break; /* We've done our duty */
567 } 1098 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi); 1099 pause = HZ * pages_dirtied / task_ratelimit;
569 __set_current_state(TASK_UNINTERRUPTIBLE); 1100 if (unlikely(pause <= 0)) {
1101 trace_balance_dirty_pages(bdi,
1102 dirty_thresh,
1103 background_thresh,
1104 nr_dirty,
1105 bdi_thresh,
1106 bdi_dirty,
1107 dirty_ratelimit,
1108 task_ratelimit,
1109 pages_dirtied,
1110 pause,
1111 start_time);
1112 pause = 1; /* avoid resetting nr_dirtied_pause below */
1113 break;
1114 }
1115 pause = min(pause, max_pause);
1116
1117pause:
1118 trace_balance_dirty_pages(bdi,
1119 dirty_thresh,
1120 background_thresh,
1121 nr_dirty,
1122 bdi_thresh,
1123 bdi_dirty,
1124 dirty_ratelimit,
1125 task_ratelimit,
1126 pages_dirtied,
1127 pause,
1128 start_time);
1129 __set_current_state(TASK_KILLABLE);
570 io_schedule_timeout(pause); 1130 io_schedule_timeout(pause);
571 1131
572 /* 1132 /*
573 * Increase the delay for each loop, up to our previous 1133 * This is typically equal to (nr_dirty < dirty_thresh) and can
574 * default of taking a 100ms nap. 1134 * also keep "1000+ dd on a slow USB stick" under control.
575 */ 1135 */
576 pause <<= 1; 1136 if (task_ratelimit)
577 if (pause > HZ / 10) 1137 break;
578 pause = HZ / 10; 1138
1139 if (fatal_signal_pending(current))
1140 break;
579 } 1141 }
580 1142
581 if (!dirty_exceeded && bdi->dirty_exceeded) 1143 if (!dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 1144 bdi->dirty_exceeded = 0;
583 1145
1146 current->nr_dirtied = 0;
1147 if (pause == 0) { /* in freerun area */
1148 current->nr_dirtied_pause =
1149 dirty_poll_interval(nr_dirty, dirty_thresh);
1150 } else if (pause <= max_pause / 4 &&
1151 pages_dirtied >= current->nr_dirtied_pause) {
1152 current->nr_dirtied_pause = clamp_val(
1153 dirty_ratelimit * (max_pause / 2) / HZ,
1154 pages_dirtied + pages_dirtied / 8,
1155 pages_dirtied * 4);
1156 } else if (pause >= max_pause) {
1157 current->nr_dirtied_pause = 1 | clamp_val(
1158 dirty_ratelimit * (max_pause / 2) / HZ,
1159 pages_dirtied / 4,
1160 pages_dirtied - pages_dirtied / 8);
1161 }
1162
584 if (writeback_in_progress(bdi)) 1163 if (writeback_in_progress(bdi))
585 return; 1164 return;
586 1165
@@ -592,8 +1171,10 @@ static void balance_dirty_pages(struct address_space *mapping,
592 * In normal mode, we start background writeout at the lower 1171 * In normal mode, we start background writeout at the lower
593 * background_thresh, to keep the amount of dirty memory low. 1172 * background_thresh, to keep the amount of dirty memory low.
594 */ 1173 */
595 if ((laptop_mode && pages_written) || 1174 if (laptop_mode)
596 (!laptop_mode && (nr_reclaimable > background_thresh))) 1175 return;
1176
1177 if (nr_reclaimable > background_thresh)
597 bdi_start_background_writeback(bdi); 1178 bdi_start_background_writeback(bdi);
598} 1179}
599 1180
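The heart of the rewritten loop is the pause computation: the per-bdi dirty_ratelimit is scaled by pos_ratio (a fixed-point factor describing how far the bdi is from its setpoint) and the task then sleeps just long enough that the pages it recently dirtied match that rate. A minimal userspace model of that arithmetic follows; the HZ value, the shift width and the sample inputs are assumptions, not values read from a running system.

#include <stdio.h>

#define HZ			250	/* assumed tick rate */
#define RATELIMIT_CALC_SHIFT	10	/* assumed fixed-point shift */

/* mirrors the pause computation in the rewritten balance_dirty_pages() */
static long dirty_pause(unsigned long dirty_ratelimit,	/* pages/s */
			unsigned long pos_ratio,	/* fixed point, 1024 == 1.0 */
			unsigned long pages_dirtied,
			long max_pause)
{
	unsigned long task_ratelimit =
		(unsigned long)(((unsigned long long)dirty_ratelimit *
				 pos_ratio) >> RATELIMIT_CALC_SHIFT);
	long pause;

	if (task_ratelimit == 0)
		return max_pause;	/* bdi far beyond its setpoint: sleep the maximum */

	pause = HZ * pages_dirtied / task_ratelimit;
	if (pause <= 0)
		return 0;		/* dirtied too few pages to need a sleep */
	return pause < max_pause ? pause : max_pause;
}

int main(void)
{
	/* task dirtied 32 pages; bdi allows ~1000 pages/s at 75% position ratio */
	printf("pause = %ld jiffies\n", dirty_pause(1000, 768, 32, HZ / 5));
	return 0;
}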
@@ -607,7 +1188,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
607 } 1188 }
608} 1189}
609 1190
610static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; 1191static DEFINE_PER_CPU(int, bdp_ratelimits);
611 1192
612/** 1193/**
613 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1194 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -626,28 +1207,40 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
626void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 1207void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 1208 unsigned long nr_pages_dirtied)
628{ 1209{
629 unsigned long ratelimit; 1210 struct backing_dev_info *bdi = mapping->backing_dev_info;
630 unsigned long *p; 1211 int ratelimit;
1212 int *p;
1213
1214 if (!bdi_cap_account_dirty(bdi))
1215 return;
631 1216
632 ratelimit = ratelimit_pages; 1217 ratelimit = current->nr_dirtied_pause;
633 if (mapping->backing_dev_info->dirty_exceeded) 1218 if (bdi->dirty_exceeded)
634 ratelimit = 8; 1219 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
635 1220
1221 current->nr_dirtied += nr_pages_dirtied;
1222
1223 preempt_disable();
636 /* 1224 /*
 637 * Check the rate limiting. Also, we do not want to throttle real-time 1225 * This prevents one CPU from accumulating too many dirtied pages without
 638 * tasks in balance_dirty_pages(). Period. 1226 * calling into balance_dirty_pages(), which can happen when there are
 1227 * 1000+ tasks that all start dirtying pages at exactly the same
 1228 * time and hence all honour a too-large initial task->nr_dirtied_pause.
639 */ 1229 */
640 preempt_disable();
641 p = &__get_cpu_var(bdp_ratelimits); 1230 p = &__get_cpu_var(bdp_ratelimits);
642 *p += nr_pages_dirtied; 1231 if (unlikely(current->nr_dirtied >= ratelimit))
643 if (unlikely(*p >= ratelimit)) {
644 ratelimit = sync_writeback_pages(*p);
645 *p = 0; 1232 *p = 0;
646 preempt_enable(); 1233 else {
647 balance_dirty_pages(mapping, ratelimit); 1234 *p += nr_pages_dirtied;
648 return; 1235 if (unlikely(*p >= ratelimit_pages)) {
1236 *p = 0;
1237 ratelimit = 0;
1238 }
649 } 1239 }
650 preempt_enable(); 1240 preempt_enable();
1241
1242 if (unlikely(current->nr_dirtied >= ratelimit))
1243 balance_dirty_pages(mapping, current->nr_dirtied);
651} 1244}
652EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1245EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
653 1246
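The ratelimited entry point now has two triggers: the per-task nr_dirtied counter measured against its adaptive nr_dirtied_pause, and the per-CPU bdp_ratelimits counter that acts as a safety net when very many tasks each hold a large pause target. The standalone model below (all thresholds assumed) mirrors that decision; in the single-task demo only the per-task trigger ever fires.

#include <stdio.h>

/* assumed values, not taken from a real system */
static unsigned long ratelimit_pages = 1024;	/* per-CPU safety limit */

struct task {
	int nr_dirtied;
	int nr_dirtied_pause;
};

/*
 * Mirrors balance_dirty_pages_ratelimited_nr(): returns 1 when the task
 * should drop into balance_dirty_pages(), either because it reached its own
 * pause target or because this CPU accumulated too many dirtied pages.
 */
static int should_throttle(struct task *t, unsigned long *cpu_count,
			   int nr_pages_dirtied, int dirty_exceeded)
{
	int ratelimit = t->nr_dirtied_pause;

	if (dirty_exceeded && ratelimit > 8)
		ratelimit = 8;		/* 32KB worth, assuming 4KB pages */

	t->nr_dirtied += nr_pages_dirtied;

	if (t->nr_dirtied >= ratelimit)
		*cpu_count = 0;
	else {
		*cpu_count += nr_pages_dirtied;
		if (*cpu_count >= ratelimit_pages) {
			*cpu_count = 0;
			ratelimit = 0;	/* per-CPU safety net fires */
		}
	}
	return t->nr_dirtied >= ratelimit;
}

int main(void)
{
	struct task t = { .nr_dirtied = 0, .nr_dirtied_pause = 256 };
	unsigned long cpu_count = 0;
	int i, calls = 0;

	for (i = 0; i < 10000; i++)	/* dirty one page at a time */
		if (should_throttle(&t, &cpu_count, 1, 0)) {
			calls++;
			t.nr_dirtied = 0;	/* balance_dirty_pages() resets it */
		}
	printf("balance_dirty_pages() entered %d times\n", calls);
	return 0;
}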
@@ -703,7 +1296,8 @@ void laptop_mode_timer_fn(unsigned long data)
703 * threshold 1296 * threshold
704 */ 1297 */
705 if (bdi_has_dirty_io(&q->backing_dev_info)) 1298 if (bdi_has_dirty_io(&q->backing_dev_info))
706 bdi_start_writeback(&q->backing_dev_info, nr_pages); 1299 bdi_start_writeback(&q->backing_dev_info, nr_pages,
1300 WB_REASON_LAPTOP_TIMER);
707} 1301}
708 1302
709/* 1303/*
@@ -742,22 +1336,17 @@ void laptop_sync_completion(void)
742 * 1336 *
743 * Here we set ratelimit_pages to a level which ensures that when all CPUs are 1337 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
744 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory 1338 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
745 * thresholds before writeback cuts in. 1339 * thresholds.
746 *
747 * But the limit should not be set too high. Because it also controls the
748 * amount of memory which the balance_dirty_pages() caller has to write back.
749 * If this is too large then the caller will block on the IO queue all the
750 * time. So limit it to four megabytes - the balance_dirty_pages() caller
751 * will write six megabyte chunks, max.
752 */ 1340 */
753 1341
754void writeback_set_ratelimit(void) 1342void writeback_set_ratelimit(void)
755{ 1343{
756 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); 1344 unsigned long background_thresh;
1345 unsigned long dirty_thresh;
1346 global_dirty_limits(&background_thresh, &dirty_thresh);
1347 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
757 if (ratelimit_pages < 16) 1348 if (ratelimit_pages < 16)
758 ratelimit_pages = 16; 1349 ratelimit_pages = 16;
759 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
760 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
761} 1350}
762 1351
763static int __cpuinit 1352static int __cpuinit
@@ -799,7 +1388,6 @@ void __init page_writeback_init(void)
799 1388
800 shift = calc_period_shift(); 1389 shift = calc_period_shift();
801 prop_descriptor_init(&vm_completions, shift); 1390 prop_descriptor_init(&vm_completions, shift);
802 prop_descriptor_init(&vm_dirties, shift);
803} 1391}
804 1392
805/** 1393/**
@@ -892,12 +1480,12 @@ int write_cache_pages(struct address_space *mapping,
892 range_whole = 1; 1480 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1481 cycled = 1; /* ignore range_cyclic tests */
894 } 1482 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1483 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1484 tag = PAGECACHE_TAG_TOWRITE;
897 else 1485 else
898 tag = PAGECACHE_TAG_DIRTY; 1486 tag = PAGECACHE_TAG_DIRTY;
899retry: 1487retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1488 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1489 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1490 done_index = index;
903 while (!done && (index <= end)) { 1491 while (!done && (index <= end)) {
@@ -1127,7 +1715,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1127 __inc_zone_page_state(page, NR_FILE_DIRTY); 1715 __inc_zone_page_state(page, NR_FILE_DIRTY);
1128 __inc_zone_page_state(page, NR_DIRTIED); 1716 __inc_zone_page_state(page, NR_DIRTIED);
1129 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1717 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1130 task_dirty_inc(current); 1718 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
1131 task_io_account_write(PAGE_CACHE_SIZE); 1719 task_io_account_write(PAGE_CACHE_SIZE);
1132 } 1720 }
1133} 1721}
@@ -1141,7 +1729,6 @@ EXPORT_SYMBOL(account_page_dirtied);
1141void account_page_writeback(struct page *page) 1729void account_page_writeback(struct page *page)
1142{ 1730{
1143 inc_zone_page_state(page, NR_WRITEBACK); 1731 inc_zone_page_state(page, NR_WRITEBACK);
1144 inc_zone_page_state(page, NR_WRITTEN);
1145} 1732}
1146EXPORT_SYMBOL(account_page_writeback); 1733EXPORT_SYMBOL(account_page_writeback);
1147 1734
@@ -1358,8 +1945,10 @@ int test_clear_page_writeback(struct page *page)
1358 } else { 1945 } else {
1359 ret = TestClearPageWriteback(page); 1946 ret = TestClearPageWriteback(page);
1360 } 1947 }
1361 if (ret) 1948 if (ret) {
1362 dec_zone_page_state(page, NR_WRITEBACK); 1949 dec_zone_page_state(page, NR_WRITEBACK);
1950 inc_zone_page_state(page, NR_WRITTEN);
1951 }
1363 return ret; 1952 return ret;
1364} 1953}
1365 1954
@@ -1405,10 +1994,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
1405 */ 1994 */
1406int mapping_tagged(struct address_space *mapping, int tag) 1995int mapping_tagged(struct address_space *mapping, int tag)
1407{ 1996{
1408 int ret; 1997 return radix_tree_tagged(&mapping->page_tree, tag);
1409 rcu_read_lock();
1410 ret = radix_tree_tagged(&mapping->page_tree, tag);
1411 rcu_read_unlock();
1412 return ret;
1413} 1998}
1414EXPORT_SYMBOL(mapping_tagged); 1999EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c7ea45ffba9..6ce27331834c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -321,6 +321,7 @@ static void bad_page(struct page *page)
321 current->comm, page_to_pfn(page)); 321 current->comm, page_to_pfn(page));
322 dump_page(page); 322 dump_page(page);
323 323
324 print_modules();
324 dump_stack(); 325 dump_stack();
325out: 326out:
326 /* Leave bad fields for debug, except PageBuddy could make trouble */ 327 /* Leave bad fields for debug, except PageBuddy could make trouble */
@@ -1373,21 +1374,12 @@ failed:
1373 1374
1374#ifdef CONFIG_FAIL_PAGE_ALLOC 1375#ifdef CONFIG_FAIL_PAGE_ALLOC
1375 1376
1376static struct fail_page_alloc_attr { 1377static struct {
1377 struct fault_attr attr; 1378 struct fault_attr attr;
1378 1379
1379 u32 ignore_gfp_highmem; 1380 u32 ignore_gfp_highmem;
1380 u32 ignore_gfp_wait; 1381 u32 ignore_gfp_wait;
1381 u32 min_order; 1382 u32 min_order;
1382
1383#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1384
1385 struct dentry *ignore_gfp_highmem_file;
1386 struct dentry *ignore_gfp_wait_file;
1387 struct dentry *min_order_file;
1388
1389#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1390
1391} fail_page_alloc = { 1383} fail_page_alloc = {
1392 .attr = FAULT_ATTR_INITIALIZER, 1384 .attr = FAULT_ATTR_INITIALIZER,
1393 .ignore_gfp_wait = 1, 1385 .ignore_gfp_wait = 1,
@@ -1421,36 +1413,27 @@ static int __init fail_page_alloc_debugfs(void)
1421{ 1413{
1422 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1414 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1423 struct dentry *dir; 1415 struct dentry *dir;
1424 int err;
1425
1426 err = init_fault_attr_dentries(&fail_page_alloc.attr,
1427 "fail_page_alloc");
1428 if (err)
1429 return err;
1430 dir = fail_page_alloc.attr.dentries.dir;
1431
1432 fail_page_alloc.ignore_gfp_wait_file =
1433 debugfs_create_bool("ignore-gfp-wait", mode, dir,
1434 &fail_page_alloc.ignore_gfp_wait);
1435
1436 fail_page_alloc.ignore_gfp_highmem_file =
1437 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1438 &fail_page_alloc.ignore_gfp_highmem);
1439 fail_page_alloc.min_order_file =
1440 debugfs_create_u32("min-order", mode, dir,
1441 &fail_page_alloc.min_order);
1442
1443 if (!fail_page_alloc.ignore_gfp_wait_file ||
1444 !fail_page_alloc.ignore_gfp_highmem_file ||
1445 !fail_page_alloc.min_order_file) {
1446 err = -ENOMEM;
1447 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
1448 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
1449 debugfs_remove(fail_page_alloc.min_order_file);
1450 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1451 }
1452 1416
1453 return err; 1417 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1418 &fail_page_alloc.attr);
1419 if (IS_ERR(dir))
1420 return PTR_ERR(dir);
1421
1422 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1423 &fail_page_alloc.ignore_gfp_wait))
1424 goto fail;
1425 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1426 &fail_page_alloc.ignore_gfp_highmem))
1427 goto fail;
1428 if (!debugfs_create_u32("min-order", mode, dir,
1429 &fail_page_alloc.min_order))
1430 goto fail;
1431
1432 return 0;
1433fail:
1434 debugfs_remove_recursive(dir);
1435
1436 return -ENOMEM;
1454} 1437}
1455 1438
1456late_initcall(fail_page_alloc_debugfs); 1439late_initcall(fail_page_alloc_debugfs);
@@ -1619,6 +1602,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1619 set_bit(i, zlc->fullzones); 1602 set_bit(i, zlc->fullzones);
1620} 1603}
1621 1604
1605/*
1606 * clear all zones full, called after direct reclaim makes progress so that
1607 * a zone that was recently full is not skipped over for up to a second
1608 */
1609static void zlc_clear_zones_full(struct zonelist *zonelist)
1610{
1611 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1612
1613 zlc = zonelist->zlcache_ptr;
1614 if (!zlc)
1615 return;
1616
1617 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1618}
1619
1622#else /* CONFIG_NUMA */ 1620#else /* CONFIG_NUMA */
1623 1621
1624static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1622static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1635,6 +1633,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1635static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1633static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1636{ 1634{
1637} 1635}
1636
1637static void zlc_clear_zones_full(struct zonelist *zonelist)
1638{
1639}
1638#endif /* CONFIG_NUMA */ 1640#endif /* CONFIG_NUMA */
1639 1641
1640/* 1642/*
@@ -1667,7 +1669,7 @@ zonelist_scan:
1667 continue; 1669 continue;
1668 if ((alloc_flags & ALLOC_CPUSET) && 1670 if ((alloc_flags & ALLOC_CPUSET) &&
1669 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1671 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1670 goto try_next_zone; 1672 continue;
1671 1673
1672 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1674 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1673 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1675 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1679,17 +1681,36 @@ zonelist_scan:
1679 classzone_idx, alloc_flags)) 1681 classzone_idx, alloc_flags))
1680 goto try_this_zone; 1682 goto try_this_zone;
1681 1683
1684 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1685 /*
1686 * we do zlc_setup if there are multiple nodes
1687 * and before considering the first zone allowed
1688 * by the cpuset.
1689 */
1690 allowednodes = zlc_setup(zonelist, alloc_flags);
1691 zlc_active = 1;
1692 did_zlc_setup = 1;
1693 }
1694
1682 if (zone_reclaim_mode == 0) 1695 if (zone_reclaim_mode == 0)
1683 goto this_zone_full; 1696 goto this_zone_full;
1684 1697
1698 /*
1699 * As we may have just activated ZLC, check if the first
1700 * eligible zone has failed zone_reclaim recently.
1701 */
1702 if (NUMA_BUILD && zlc_active &&
1703 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1704 continue;
1705
1685 ret = zone_reclaim(zone, gfp_mask, order); 1706 ret = zone_reclaim(zone, gfp_mask, order);
1686 switch (ret) { 1707 switch (ret) {
1687 case ZONE_RECLAIM_NOSCAN: 1708 case ZONE_RECLAIM_NOSCAN:
1688 /* did not scan */ 1709 /* did not scan */
1689 goto try_next_zone; 1710 continue;
1690 case ZONE_RECLAIM_FULL: 1711 case ZONE_RECLAIM_FULL:
1691 /* scanned but unreclaimable */ 1712 /* scanned but unreclaimable */
1692 goto this_zone_full; 1713 continue;
1693 default: 1714 default:
1694 /* did we reclaim enough */ 1715 /* did we reclaim enough */
1695 if (!zone_watermark_ok(zone, order, mark, 1716 if (!zone_watermark_ok(zone, order, mark,
@@ -1706,16 +1727,6 @@ try_this_zone:
1706this_zone_full: 1727this_zone_full:
1707 if (NUMA_BUILD) 1728 if (NUMA_BUILD)
1708 zlc_mark_zone_full(zonelist, z); 1729 zlc_mark_zone_full(zonelist, z);
1709try_next_zone:
1710 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1711 /*
1712 * we do zlc_setup after the first zone is tried but only
1713 * if there are multiple nodes make it worthwhile
1714 */
1715 allowednodes = zlc_setup(zonelist, alloc_flags);
1716 zlc_active = 1;
1717 did_zlc_setup = 1;
1718 }
1719 } 1730 }
1720 1731
1721 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1732 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1746,7 +1757,6 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
1746 1757
1747void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 1758void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1748{ 1759{
1749 va_list args;
1750 unsigned int filter = SHOW_MEM_FILTER_NODES; 1760 unsigned int filter = SHOW_MEM_FILTER_NODES;
1751 1761
1752 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 1762 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
@@ -1765,14 +1775,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1765 filter &= ~SHOW_MEM_FILTER_NODES; 1775 filter &= ~SHOW_MEM_FILTER_NODES;
1766 1776
1767 if (fmt) { 1777 if (fmt) {
1768 printk(KERN_WARNING); 1778 struct va_format vaf;
1779 va_list args;
1780
1769 va_start(args, fmt); 1781 va_start(args, fmt);
1770 vprintk(fmt, args); 1782
1783 vaf.fmt = fmt;
1784 vaf.va = &args;
1785
1786 pr_warn("%pV", &vaf);
1787
1771 va_end(args); 1788 va_end(args);
1772 } 1789 }
1773 1790
1774 pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", 1791 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
1775 current->comm, order, gfp_mask); 1792 current->comm, order, gfp_mask);
1776 1793
1777 dump_stack(); 1794 dump_stack();
1778 if (!should_suppress_show_mem()) 1795 if (!should_suppress_show_mem())
@@ -1957,6 +1974,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1957 if (unlikely(!(*did_some_progress))) 1974 if (unlikely(!(*did_some_progress)))
1958 return NULL; 1975 return NULL;
1959 1976
1977 /* After successful reclaim, reconsider all zones for allocation */
1978 if (NUMA_BUILD)
1979 zlc_clear_zones_full(zonelist);
1980
1960retry: 1981retry:
1961 page = get_page_from_freelist(gfp_mask, nodemask, order, 1982 page = get_page_from_freelist(gfp_mask, nodemask, order,
1962 zonelist, high_zoneidx, 1983 zonelist, high_zoneidx,
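The zonelist-cache changes above are easier to follow as a toy model: a zone that fails its watermark is remembered in a "full" bitmap so later scans skip it cheaply, and the bitmap is wiped once direct reclaim reports progress so recently-full zones are reconsidered. The sketch below is a deliberately simplified userspace model with made-up zone counts, not kernel code.

#include <stdio.h>
#include <string.h>

#define NR_ZONES 4

static int zone_full[NR_ZONES];			/* models zlc->fullzones */
static int free_pages[NR_ZONES] = { 0, 0, 2, 0 };
static const int watermark = 2;

/* models get_page_from_freelist(): skip zones cached as full */
static int alloc_from_zonelist(void)
{
	int z;

	for (z = 0; z < NR_ZONES; z++) {
		if (zone_full[z])
			continue;		/* zlc_zone_worth_trying() said no */
		if (free_pages[z] >= watermark) {
			free_pages[z]--;
			return z;		/* allocated from this zone */
		}
		zone_full[z] = 1;		/* zlc_mark_zone_full() */
	}
	return -1;				/* every zone cached as full */
}

/* models zlc_clear_zones_full(): reconsider every zone after reclaim progress */
static void reclaim_made_progress(void)
{
	memset(zone_full, 0, sizeof(zone_full));
	free_pages[0] = 5;			/* pretend reclaim freed pages in zone 0 */
}

int main(void)
{
	printf("first attempt:  zone %d\n", alloc_from_zonelist());
	printf("second attempt: zone %d\n", alloc_from_zonelist());
	reclaim_made_progress();
	printf("after reclaim:  zone %d\n", alloc_from_zonelist());
	return 0;
}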
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 53bffc6c293e..2d123f94a8df 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc)
133static void *__meminit alloc_page_cgroup(size_t size, int nid) 133static void *__meminit alloc_page_cgroup(size_t size, int nid)
134{ 134{
135 void *addr = NULL; 135 void *addr = NULL;
136 gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
136 137
137 addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); 138 addr = alloc_pages_exact_nid(nid, size, flags);
138 if (addr) 139 if (addr) {
140 kmemleak_alloc(addr, size, 1, flags);
139 return addr; 141 return addr;
142 }
140 143
141 if (node_state(nid, N_HIGH_MEMORY)) 144 if (node_state(nid, N_HIGH_MEMORY))
142 addr = vmalloc_node(size, nid); 145 addr = vmalloc_node(size, nid);
@@ -225,8 +228,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
225 unsigned long start, end, pfn; 228 unsigned long start, end, pfn;
226 int fail = 0; 229 int fail = 0;
227 230
228 start = start_pfn & ~(PAGES_PER_SECTION - 1); 231 start = SECTION_ALIGN_DOWN(start_pfn);
229 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 232 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
230 233
231 if (nid == -1) { 234 if (nid == -1) {
232 /* 235 /*
@@ -258,8 +261,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn,
258{ 261{
259 unsigned long start, end, pfn; 262 unsigned long start, end, pfn;
260 263
261 start = start_pfn & ~(PAGES_PER_SECTION - 1); 264 start = SECTION_ALIGN_DOWN(start_pfn);
262 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 265 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
263 266
264 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) 267 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
265 __free_page_cgroup(pfn); 268 __free_page_cgroup(pfn);
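The SECTION_ALIGN_DOWN()/SECTION_ALIGN_UP() helpers adopted above are plain power-of-two mask arithmetic. The tiny model below (PAGES_PER_SECTION assumed to be 32768, i.e. 128MB sections with 4KB pages) shows what the open-coded expressions they replace were computing.

#include <stdio.h>

#define PAGES_PER_SECTION 32768UL	/* assumed: 128MB sections, 4KB pages */

/* round a pfn down/up to a section boundary, as the new helpers do */
static unsigned long section_align_down(unsigned long pfn)
{
	return pfn & ~(PAGES_PER_SECTION - 1);
}

static unsigned long section_align_up(unsigned long pfn)
{
	return (pfn + PAGES_PER_SECTION - 1) & ~(PAGES_PER_SECTION - 1);
}

int main(void)
{
	unsigned long start_pfn = 40000, nr_pages = 1000;

	printf("start = %lu, end = %lu\n",
	       section_align_down(start_pfn),
	       section_align_up(start_pfn + nr_pages));
	return 0;
}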
@@ -357,7 +360,7 @@ struct swap_cgroup_ctrl {
357 spinlock_t lock; 360 spinlock_t lock;
358}; 361};
359 362
360struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 363static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
361 364
362struct swap_cgroup { 365struct swap_cgroup {
363 unsigned short id; 366 unsigned short id;
@@ -513,11 +516,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
513 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); 516 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
514 array_size = length * sizeof(void *); 517 array_size = length * sizeof(void *);
515 518
516 array = vmalloc(array_size); 519 array = vzalloc(array_size);
517 if (!array) 520 if (!array)
518 goto nomem; 521 goto nomem;
519 522
520 memset(array, 0, array_size);
521 ctrl = &swap_cgroup_ctrl[type]; 523 ctrl = &swap_cgroup_ctrl[type];
522 mutex_lock(&swap_cgroup_mutex); 524 mutex_lock(&swap_cgroup_mutex);
523 ctrl->length = length; 525 ctrl->length = length;
@@ -537,7 +539,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
537nomem: 539nomem:
538 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); 540 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
539 printk(KERN_INFO 541 printk(KERN_INFO
540 "swap_cgroup can be disabled by noswapaccount boot option\n"); 542 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
541 return -ENOMEM; 543 return -ENOMEM;
542} 544}
543 545
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d533611..2f5cf10ff660 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
126 126
127 return 0; 127 return 0;
128} 128}
129#endif 129
130static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
131{
132 struct vm_area_struct *vma;
133
134 /* We don't need vma lookup at all. */
135 if (!walk->hugetlb_entry)
136 return NULL;
137
138 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
139 vma = find_vma(walk->mm, addr);
140 if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
141 return vma;
142
143 return NULL;
144}
145
146#else /* CONFIG_HUGETLB_PAGE */
147static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
148{
149 return NULL;
150}
151
152static int walk_hugetlb_range(struct vm_area_struct *vma,
153 unsigned long addr, unsigned long end,
154 struct mm_walk *walk)
155{
156 return 0;
157}
158
159#endif /* CONFIG_HUGETLB_PAGE */
160
161
130 162
131/** 163/**
132 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
@@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
144 * associated range, and a copy of the original mm_walk for access to 176 * associated range, and a copy of the original mm_walk for access to
145 * the ->private or ->mm fields. 177 * the ->private or ->mm fields.
146 * 178 *
 147 * No locks are taken, but the bottom level iterator will map PTE 179 * Usually no locks are taken, but splitting a transparent huge page
 180 * may take the page table lock. And the bottom level iterator will map PTE
148 * directories from highmem if necessary. 181 * directories from highmem if necessary.
149 * 182 *
150 * If any callback returns a non-zero value, the walk is aborted and 183 * If any callback returns a non-zero value, the walk is aborted and
151 * the return value is propagated back to the caller. Otherwise 0 is returned. 184 * the return value is propagated back to the caller. Otherwise 0 is returned.
185 *
186 * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
187 * is !NULL.
152 */ 188 */
153int walk_page_range(unsigned long addr, unsigned long end, 189int walk_page_range(unsigned long addr, unsigned long end,
154 struct mm_walk *walk) 190 struct mm_walk *walk)
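With the hugetlb handling now hidden behind hugetlb_vma(), a caller's only obligation is the one the updated comment spells out: hold mmap_sem around the walk. A hypothetical kernel-context fragment (not a standalone program; count_present() and count_range() are invented names for illustration) showing the expected calling pattern:

#include <linux/mm.h>	/* fragment: assumes it is built inside the kernel tree */

/* example callback: count present ptes in a range */
static int count_present(pte_t *pte, unsigned long addr,
			 unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;	/* a non-zero return would abort the walk */
}

static unsigned long count_range(struct mm_struct *mm,
				 unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pte_entry	= count_present,
		.mm		= mm,
		.private	= &count,
	};

	/* mmap_sem must be held, at least for read, around the walk */
	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);

	return count;
}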
@@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
165 201
166 pgd = pgd_offset(walk->mm, addr); 202 pgd = pgd_offset(walk->mm, addr);
167 do { 203 do {
168 struct vm_area_struct *uninitialized_var(vma); 204 struct vm_area_struct *vma;
169 205
170 next = pgd_addr_end(addr, end); 206 next = pgd_addr_end(addr, end);
171 207
172#ifdef CONFIG_HUGETLB_PAGE
173 /* 208 /*
174 * handle hugetlb vma individually because pagetable walk for 209 * handle hugetlb vma individually because pagetable walk for
175 * the hugetlb page is dependent on the architecture and 210 * the hugetlb page is dependent on the architecture and
 176 * we can't handle it in the same manner as non-huge pages. 211 * we can't handle it in the same manner as non-huge pages.
177 */ 212 */
178 vma = find_vma(walk->mm, addr); 213 vma = hugetlb_vma(addr, walk);
179 if (vma && is_vm_hugetlb_page(vma)) { 214 if (vma) {
180 if (vma->vm_end < next) 215 if (vma->vm_end < next)
181 next = vma->vm_end; 216 next = vma->vm_end;
182 /* 217 /*
@@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
189 pgd = pgd_offset(walk->mm, next); 224 pgd = pgd_offset(walk->mm, next);
190 continue; 225 continue;
191 } 226 }
192#endif 227
193 if (pgd_none_or_clear_bad(pgd)) { 228 if (pgd_none_or_clear_bad(pgd)) {
194 if (walk->pte_hole) 229 if (walk->pte_hole)
195 err = walk->pte_hole(addr, next, walk); 230 err = walk->pte_hole(addr, next, walk);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
new file mode 100644
index 000000000000..e920aa3ce104
--- /dev/null
+++ b/mm/process_vm_access.c
@@ -0,0 +1,496 @@
1/*
2 * linux/mm/process_vm_access.c
3 *
4 * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/uio.h>
14#include <linux/sched.h>
15#include <linux/highmem.h>
16#include <linux/ptrace.h>
17#include <linux/slab.h>
18#include <linux/syscalls.h>
19
20#ifdef CONFIG_COMPAT
21#include <linux/compat.h>
22#endif
23
24/**
25 * process_vm_rw_pages - read/write pages from task specified
26 * @task: task to read/write from
27 * @mm: mm for task
28 * @process_pages: struct pages area that can store at least
29 * nr_pages_to_copy struct page pointers
30 * @pa: address of page in task to start copying from/to
31 * @start_offset: offset in page to start copying from/to
32 * @len: number of bytes to copy
33 * @lvec: iovec array specifying where to copy to/from
34 * @lvec_cnt: number of elements in iovec array
35 * @lvec_current: index in iovec array we are up to
36 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
37 * @vm_write: 0 means copy from, 1 means copy to
38 * @nr_pages_to_copy: number of pages to copy
39 * @bytes_copied: returns number of bytes successfully copied
40 * Returns 0 on success, error code otherwise
41 */
42static int process_vm_rw_pages(struct task_struct *task,
43 struct mm_struct *mm,
44 struct page **process_pages,
45 unsigned long pa,
46 unsigned long start_offset,
47 unsigned long len,
48 const struct iovec *lvec,
49 unsigned long lvec_cnt,
50 unsigned long *lvec_current,
51 size_t *lvec_offset,
52 int vm_write,
53 unsigned int nr_pages_to_copy,
54 ssize_t *bytes_copied)
55{
56 int pages_pinned;
57 void *target_kaddr;
58 int pgs_copied = 0;
59 int j;
60 int ret;
61 ssize_t bytes_to_copy;
62 ssize_t rc = 0;
63
64 *bytes_copied = 0;
65
66 /* Get the pages we're interested in */
67 down_read(&mm->mmap_sem);
68 pages_pinned = get_user_pages(task, mm, pa,
69 nr_pages_to_copy,
70 vm_write, 0, process_pages, NULL);
71 up_read(&mm->mmap_sem);
72
73 if (pages_pinned != nr_pages_to_copy) {
74 rc = -EFAULT;
75 goto end;
76 }
77
78 /* Do the copy for each page */
79 for (pgs_copied = 0;
80 (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
81 pgs_copied++) {
82 /* Make sure we have a non zero length iovec */
83 while (*lvec_current < lvec_cnt
84 && lvec[*lvec_current].iov_len == 0)
85 (*lvec_current)++;
86 if (*lvec_current == lvec_cnt)
87 break;
88
89 /*
90 * Will copy smallest of:
91 * - bytes remaining in page
92 * - bytes remaining in destination iovec
93 */
94 bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
95 len - *bytes_copied);
96 bytes_to_copy = min_t(ssize_t, bytes_to_copy,
97 lvec[*lvec_current].iov_len
98 - *lvec_offset);
99
100 target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
101
102 if (vm_write)
103 ret = copy_from_user(target_kaddr,
104 lvec[*lvec_current].iov_base
105 + *lvec_offset,
106 bytes_to_copy);
107 else
108 ret = copy_to_user(lvec[*lvec_current].iov_base
109 + *lvec_offset,
110 target_kaddr, bytes_to_copy);
111 kunmap(process_pages[pgs_copied]);
112 if (ret) {
113 *bytes_copied += bytes_to_copy - ret;
114 pgs_copied++;
115 rc = -EFAULT;
116 goto end;
117 }
118 *bytes_copied += bytes_to_copy;
119 *lvec_offset += bytes_to_copy;
120 if (*lvec_offset == lvec[*lvec_current].iov_len) {
121 /*
122 * Need to copy remaining part of page into the
123 * next iovec if there are any bytes left in page
124 */
125 (*lvec_current)++;
126 *lvec_offset = 0;
127 start_offset = (start_offset + bytes_to_copy)
128 % PAGE_SIZE;
129 if (start_offset)
130 pgs_copied--;
131 } else {
132 start_offset = 0;
133 }
134 }
135
136end:
137 if (vm_write) {
138 for (j = 0; j < pages_pinned; j++) {
139 if (j < pgs_copied)
140 set_page_dirty_lock(process_pages[j]);
141 put_page(process_pages[j]);
142 }
143 } else {
144 for (j = 0; j < pages_pinned; j++)
145 put_page(process_pages[j]);
146 }
147
148 return rc;
149}
150
151/* Maximum number of pages kmalloc'd to hold struct page's during copy */
152#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
153
154/**
155 * process_vm_rw_single_vec - read/write pages from task specified
156 * @addr: start memory address of target process
157 * @len: size of area to copy to/from
158 * @lvec: iovec array specifying where to copy to/from locally
159 * @lvec_cnt: number of elements in iovec array
160 * @lvec_current: index in iovec array we are up to
161 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
162 * @process_pages: struct pages area that can store at least
163 * nr_pages_to_copy struct page pointers
164 * @mm: mm for task
165 * @task: task to read/write from
166 * @vm_write: 0 means copy from, 1 means copy to
167 * @bytes_copied: returns number of bytes successfully copied
168 * Returns 0 on success or on failure error code
169 */
170static int process_vm_rw_single_vec(unsigned long addr,
171 unsigned long len,
172 const struct iovec *lvec,
173 unsigned long lvec_cnt,
174 unsigned long *lvec_current,
175 size_t *lvec_offset,
176 struct page **process_pages,
177 struct mm_struct *mm,
178 struct task_struct *task,
179 int vm_write,
180 ssize_t *bytes_copied)
181{
182 unsigned long pa = addr & PAGE_MASK;
183 unsigned long start_offset = addr - pa;
184 unsigned long nr_pages;
185 ssize_t bytes_copied_loop;
186 ssize_t rc = 0;
187 unsigned long nr_pages_copied = 0;
188 unsigned long nr_pages_to_copy;
189 unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
190 / sizeof(struct pages *);
191
192 *bytes_copied = 0;
193
194 /* Work out address and page range required */
195 if (len == 0)
196 return 0;
197 nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
198
199 while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
200 nr_pages_to_copy = min(nr_pages - nr_pages_copied,
201 max_pages_per_loop);
202
203 rc = process_vm_rw_pages(task, mm, process_pages, pa,
204 start_offset, len,
205 lvec, lvec_cnt,
206 lvec_current, lvec_offset,
207 vm_write, nr_pages_to_copy,
208 &bytes_copied_loop);
209 start_offset = 0;
210 *bytes_copied += bytes_copied_loop;
211
212 if (rc < 0) {
213 return rc;
214 } else {
215 len -= bytes_copied_loop;
216 nr_pages_copied += nr_pages_to_copy;
217 pa += nr_pages_to_copy * PAGE_SIZE;
218 }
219 }
220
221 return rc;
222}
223
224/* Maximum number of entries for process pages array
225 which lives on stack */
226#define PVM_MAX_PP_ARRAY_COUNT 16
227
228/**
229 * process_vm_rw_core - core of reading/writing pages from task specified
230 * @pid: PID of process to read/write from/to
231 * @lvec: iovec array specifying where to copy to/from locally
232 * @liovcnt: size of lvec array
233 * @rvec: iovec array specifying where to copy to/from in the other process
234 * @riovcnt: size of rvec array
235 * @flags: currently unused
236 * @vm_write: 0 if reading from other process, 1 if writing to other process
237 * Returns the number of bytes read/written or error code. May
238 * return less bytes than expected if an error occurs during the copying
239 * process.
240 */
241static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
242 unsigned long liovcnt,
243 const struct iovec *rvec,
244 unsigned long riovcnt,
245 unsigned long flags, int vm_write)
246{
247 struct task_struct *task;
248 struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
249 struct page **process_pages = pp_stack;
250 struct mm_struct *mm;
251 unsigned long i;
252 ssize_t rc = 0;
253 ssize_t bytes_copied_loop;
254 ssize_t bytes_copied = 0;
255 unsigned long nr_pages = 0;
256 unsigned long nr_pages_iov;
257 unsigned long iov_l_curr_idx = 0;
258 size_t iov_l_curr_offset = 0;
259 ssize_t iov_len;
260
261 /*
262 * Work out how many pages of struct pages we're going to need
263 * when eventually calling get_user_pages
264 */
265 for (i = 0; i < riovcnt; i++) {
266 iov_len = rvec[i].iov_len;
267 if (iov_len > 0) {
268 nr_pages_iov = ((unsigned long)rvec[i].iov_base
269 + iov_len)
270 / PAGE_SIZE - (unsigned long)rvec[i].iov_base
271 / PAGE_SIZE + 1;
272 nr_pages = max(nr_pages, nr_pages_iov);
273 }
274 }
275
276 if (nr_pages == 0)
277 return 0;
278
279 if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
280 /* For reliability don't try to kmalloc more than
281 2 pages worth */
282 process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
283 sizeof(struct pages *)*nr_pages),
284 GFP_KERNEL);
285
286 if (!process_pages)
287 return -ENOMEM;
288 }
289
290 /* Get process information */
291 rcu_read_lock();
292 task = find_task_by_vpid(pid);
293 if (task)
294 get_task_struct(task);
295 rcu_read_unlock();
296 if (!task) {
297 rc = -ESRCH;
298 goto free_proc_pages;
299 }
300
301 task_lock(task);
302 if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
303 task_unlock(task);
304 rc = -EPERM;
305 goto put_task_struct;
306 }
307 mm = task->mm;
308
309 if (!mm || (task->flags & PF_KTHREAD)) {
310 task_unlock(task);
311 rc = -EINVAL;
312 goto put_task_struct;
313 }
314
315 atomic_inc(&mm->mm_users);
316 task_unlock(task);
317
318 for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
319 rc = process_vm_rw_single_vec(
320 (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
321 lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
322 process_pages, mm, task, vm_write, &bytes_copied_loop);
323 bytes_copied += bytes_copied_loop;
324 if (rc != 0) {
325 /* If we have managed to copy any data at all then
326 we return the number of bytes copied. Otherwise
327 we return the error code */
328 if (bytes_copied)
329 rc = bytes_copied;
330 goto put_mm;
331 }
332 }
333
334 rc = bytes_copied;
335put_mm:
336 mmput(mm);
337
338put_task_struct:
339 put_task_struct(task);
340
341free_proc_pages:
342 if (process_pages != pp_stack)
343 kfree(process_pages);
344 return rc;
345}
346
347/**
348 * process_vm_rw - check iovecs before calling core routine
349 * @pid: PID of process to read/write from/to
350 * @lvec: iovec array specifying where to copy to/from locally
351 * @liovcnt: size of lvec array
352 * @rvec: iovec array specifying where to copy to/from in the other process
353 * @riovcnt: size of rvec array
354 * @flags: currently unused
355 * @vm_write: 0 if reading from other process, 1 if writing to other process
356 * Returns the number of bytes read/written or error code. May
357 * return less bytes than expected if an error occurs during the copying
358 * process.
359 */
360static ssize_t process_vm_rw(pid_t pid,
361 const struct iovec __user *lvec,
362 unsigned long liovcnt,
363 const struct iovec __user *rvec,
364 unsigned long riovcnt,
365 unsigned long flags, int vm_write)
366{
367 struct iovec iovstack_l[UIO_FASTIOV];
368 struct iovec iovstack_r[UIO_FASTIOV];
369 struct iovec *iov_l = iovstack_l;
370 struct iovec *iov_r = iovstack_r;
371 ssize_t rc;
372
373 if (flags != 0)
374 return -EINVAL;
375
376 /* Check iovecs */
377 if (vm_write)
378 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
379 iovstack_l, &iov_l, 1);
380 else
381 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
382 iovstack_l, &iov_l, 1);
383 if (rc <= 0)
384 goto free_iovecs;
385
386 rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV,
387 iovstack_r, &iov_r, 0);
388 if (rc <= 0)
389 goto free_iovecs;
390
391 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
392 vm_write);
393
394free_iovecs:
395 if (iov_r != iovstack_r)
396 kfree(iov_r);
397 if (iov_l != iovstack_l)
398 kfree(iov_l);
399
400 return rc;
401}
402
403SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
404 unsigned long, liovcnt, const struct iovec __user *, rvec,
405 unsigned long, riovcnt, unsigned long, flags)
406{
407 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
408}
409
410SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
411 const struct iovec __user *, lvec,
412 unsigned long, liovcnt, const struct iovec __user *, rvec,
413 unsigned long, riovcnt, unsigned long, flags)
414{
415 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
416}
417
418#ifdef CONFIG_COMPAT
419
420asmlinkage ssize_t
421compat_process_vm_rw(compat_pid_t pid,
422 const struct compat_iovec __user *lvec,
423 unsigned long liovcnt,
424 const struct compat_iovec __user *rvec,
425 unsigned long riovcnt,
426 unsigned long flags, int vm_write)
427{
428 struct iovec iovstack_l[UIO_FASTIOV];
429 struct iovec iovstack_r[UIO_FASTIOV];
430 struct iovec *iov_l = iovstack_l;
431 struct iovec *iov_r = iovstack_r;
432 ssize_t rc = -EFAULT;
433
434 if (flags != 0)
435 return -EINVAL;
436
437 if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
438 goto out;
439
440 if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
441 goto out;
442
443 if (vm_write)
444 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
445 UIO_FASTIOV, iovstack_l,
446 &iov_l, 1);
447 else
448 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
449 UIO_FASTIOV, iovstack_l,
450 &iov_l, 1);
451 if (rc <= 0)
452 goto free_iovecs;
453 rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt,
454 UIO_FASTIOV, iovstack_r,
455 &iov_r, 0);
456 if (rc <= 0)
457 goto free_iovecs;
458
459 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
460 vm_write);
461
462free_iovecs:
463 if (iov_r != iovstack_r)
464 kfree(iov_r);
465 if (iov_l != iovstack_l)
466 kfree(iov_l);
467
468out:
469 return rc;
470}
471
472asmlinkage ssize_t
473compat_sys_process_vm_readv(compat_pid_t pid,
474 const struct compat_iovec __user *lvec,
475 unsigned long liovcnt,
476 const struct compat_iovec __user *rvec,
477 unsigned long riovcnt,
478 unsigned long flags)
479{
480 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
481 riovcnt, flags, 0);
482}
483
484asmlinkage ssize_t
485compat_sys_process_vm_writev(compat_pid_t pid,
486 const struct compat_iovec __user *lvec,
487 unsigned long liovcnt,
488 const struct compat_iovec __user *rvec,
489 unsigned long riovcnt,
490 unsigned long flags)
491{
492 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
493 riovcnt, flags, 1);
494}
495
496#endif
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 2876349339a7..942212970529 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -17,7 +17,6 @@
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/mmzone.h> 19#include <linux/mmzone.h>
20#include <linux/module.h>
21#include <linux/quicklist.h> 20#include <linux/quicklist.h>
22 21
23DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); 22DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
diff --git a/mm/readahead.c b/mm/readahead.c
index 867f9dd82dcd..cbcbb02f3e28 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -11,7 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f65ae43..a4fd3680038b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem (vmtruncate_range)
25 * mm->mmap_sem 24 * mm->mmap_sem
26 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
27 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
@@ -32,11 +31,11 @@
32 * mmlist_lock (in mmput, drain_mmlist and others) 31 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mapping->private_lock (in __set_page_dirty_buffers)
34 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) 34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
36 * sb_lock (within inode_lock in fs/fs-writeback.c) 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
37 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
38 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
39 * within inode_wb_list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
40 * 39 *
41 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
42 * ->tasklist_lock 41 * ->tasklist_lock
@@ -52,7 +51,7 @@
52#include <linux/ksm.h> 51#include <linux/ksm.h>
53#include <linux/rmap.h> 52#include <linux/rmap.h>
54#include <linux/rcupdate.h> 53#include <linux/rcupdate.h>
55#include <linux/module.h> 54#include <linux/export.h>
56#include <linux/memcontrol.h> 55#include <linux/memcontrol.h>
57#include <linux/mmu_notifier.h> 56#include <linux/mmu_notifier.h>
58#include <linux/migrate.h> 57#include <linux/migrate.h>
@@ -870,11 +869,11 @@ int page_referenced(struct page *page,
870 vm_flags); 869 vm_flags);
871 if (we_locked) 870 if (we_locked)
872 unlock_page(page); 871 unlock_page(page);
872
873 if (page_test_and_clear_young(page_to_pfn(page)))
874 referenced++;
873 } 875 }
874out: 876out:
875 if (page_test_and_clear_young(page_to_pfn(page)))
876 referenced++;
877
878 return referenced; 877 return referenced;
879} 878}
880 879
@@ -1165,7 +1164,7 @@ void page_remove_rmap(struct page *page)
1165 1164
1166/* 1165/*
1167 * Subfunctions of try_to_unmap: try_to_unmap_one called 1166 * Subfunctions of try_to_unmap: try_to_unmap_one called
1168 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 1167 * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
1169 */ 1168 */
1170int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1169int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1171 unsigned long address, enum ttu_flags flags) 1170 unsigned long address, enum ttu_flags flags)
diff --git a/mm/shmem.c b/mm/shmem.c
index fcedf5464eb7..d6722506d2da 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins. 9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 * 13 *
@@ -27,8 +28,7 @@
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
30#include <linux/module.h> 31#include <linux/export.h>
31#include <linux/percpu_counter.h>
32#include <linux/swap.h> 32#include <linux/swap.h>
33 33
34static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -51,6 +51,9 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h>
56#include <linux/splice.h>
54#include <linux/security.h> 57#include <linux/security.h>
55#include <linux/swapops.h> 58#include <linux/swapops.h>
56#include <linux/mempolicy.h> 59#include <linux/mempolicy.h>
@@ -62,43 +65,17 @@ static struct vfsmount *shm_mnt;
62#include <linux/magic.h> 65#include <linux/magic.h>
63 66
64#include <asm/uaccess.h> 67#include <asm/uaccess.h>
65#include <asm/div64.h>
66#include <asm/pgtable.h> 68#include <asm/pgtable.h>
67 69
68/*
69 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
70 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
71 *
72 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
73 * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
74 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
75 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
76 *
77 * We use / and * instead of shifts in the definitions below, so that the swap
78 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
79 */
80#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
81#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
82
83#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
84#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
85
86#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
87#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
88
89#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 70#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
90#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 71#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
91 72
92/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
93#define SHMEM_PAGEIN VM_READ
94#define SHMEM_TRUNCATE VM_WRITE
95
96/* Definition to limit shmem_truncate's steps between cond_rescheds */
97#define LATENCY_LIMIT 64
98
99/* Pretend that each entry is of this size in directory's i_size */ 73/* Pretend that each entry is of this size in directory's i_size */
100#define BOGO_DIRENT_SIZE 20 74#define BOGO_DIRENT_SIZE 20
101 75
76/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
77#define SHORT_SYMLINK_LEN 128
78
102struct shmem_xattr { 79struct shmem_xattr {
103 struct list_head list; /* anchored by shmem_inode_info->xattr_list */ 80 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
104 char *name; /* xattr name */ 81 char *name; /* xattr name */
@@ -106,7 +83,7 @@ struct shmem_xattr {
106 char value[0]; 83 char value[0];
107}; 84};
108 85
109/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 86/* Flag allocation requirements to shmem_getpage */
110enum sgp_type { 87enum sgp_type {
111 SGP_READ, /* don't exceed i_size, don't allocate page */ 88 SGP_READ, /* don't exceed i_size, don't allocate page */
112 SGP_CACHE, /* don't exceed i_size, may allocate page */ 89 SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -126,57 +103,14 @@ static unsigned long shmem_default_max_inodes(void)
126} 103}
127#endif 104#endif
128 105
129static int shmem_getpage(struct inode *inode, unsigned long idx, 106static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
130 struct page **pagep, enum sgp_type sgp, int *type); 107 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
131
132static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
133{
134 /*
135 * The above definition of ENTRIES_PER_PAGE, and the use of
136 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
137 * might be reconsidered if it ever diverges from PAGE_SIZE.
138 *
139 * Mobility flags are masked out as swap vectors cannot move
140 */
141 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
142 PAGE_CACHE_SHIFT-PAGE_SHIFT);
143}
144
145static inline void shmem_dir_free(struct page *page)
146{
147 __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
148}
149
150static struct page **shmem_dir_map(struct page *page)
151{
152 return (struct page **)kmap_atomic(page, KM_USER0);
153}
154
155static inline void shmem_dir_unmap(struct page **dir)
156{
157 kunmap_atomic(dir, KM_USER0);
158}
159
160static swp_entry_t *shmem_swp_map(struct page *page)
161{
162 return (swp_entry_t *)kmap_atomic(page, KM_USER1);
163}
164
165static inline void shmem_swp_balance_unmap(void)
166{
167 /*
168 * When passing a pointer to an i_direct entry, to code which
169 * also handles indirect entries and so will shmem_swp_unmap,
170 * we must arrange for the preempt count to remain in balance.
171 * What kmap_atomic of a lowmem page does depends on config
172 * and architecture, so pretend to kmap_atomic some lowmem page.
173 */
174 (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
175}
176 108
177static inline void shmem_swp_unmap(swp_entry_t *entry) 109static inline int shmem_getpage(struct inode *inode, pgoff_t index,
110 struct page **pagep, enum sgp_type sgp, int *fault_type)
178{ 111{
179 kunmap_atomic(entry, KM_USER1); 112 return shmem_getpage_gfp(inode, index, pagep, sgp,
113 mapping_gfp_mask(inode->i_mapping), fault_type);
180} 114}
181 115
182static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 116static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -236,17 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
236static LIST_HEAD(shmem_swaplist); 170static LIST_HEAD(shmem_swaplist);
237static DEFINE_MUTEX(shmem_swaplist_mutex); 171static DEFINE_MUTEX(shmem_swaplist_mutex);
238 172
239static void shmem_free_blocks(struct inode *inode, long pages)
240{
241 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
242 if (sbinfo->max_blocks) {
243 percpu_counter_add(&sbinfo->used_blocks, -pages);
244 spin_lock(&inode->i_lock);
245 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
246 spin_unlock(&inode->i_lock);
247 }
248}
249
250static int shmem_reserve_inode(struct super_block *sb) 173static int shmem_reserve_inode(struct super_block *sb)
251{ 174{
252 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 175 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -273,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)
273} 196}
274 197
275/** 198/**
276 * shmem_recalc_inode - recalculate the size of an inode 199 * shmem_recalc_inode - recalculate the block usage of an inode
277 * @inode: inode to recalc 200 * @inode: inode to recalc
278 * 201 *
279 * We have to calculate the free blocks since the mm can drop 202 * We have to calculate the free blocks since the mm can drop
@@ -291,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode)
291 214
292 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 215 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
293 if (freed > 0) { 216 if (freed > 0) {
217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
218 if (sbinfo->max_blocks)
219 percpu_counter_add(&sbinfo->used_blocks, -freed);
294 info->alloced -= freed; 220 info->alloced -= freed;
221 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
295 shmem_unacct_blocks(info->flags, freed); 222 shmem_unacct_blocks(info->flags, freed);
296 shmem_free_blocks(inode, freed);
297 } 223 }
298} 224}
299 225
300/** 226/*
301 * shmem_swp_entry - find the swap vector position in the info structure 227 * Replace item expected in radix tree by a new item, while holding tree lock.
302 * @info: info structure for the inode
303 * @index: index of the page to find
304 * @page: optional page to add to the structure. Has to be preset to
305 * all zeros
306 *
307 * If there is no space allocated yet it will return NULL when
308 * page is NULL, else it will use the page for the needed block,
309 * setting it to NULL on return to indicate that it has been used.
310 *
311 * The swap vector is organized the following way:
312 *
313 * There are SHMEM_NR_DIRECT entries directly stored in the
314 * shmem_inode_info structure. So small files do not need an addional
315 * allocation.
316 *
317 * For pages with index > SHMEM_NR_DIRECT there is the pointer
318 * i_indirect which points to a page which holds in the first half
319 * doubly indirect blocks, in the second half triple indirect blocks:
320 *
321 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
322 * following layout (for SHMEM_NR_DIRECT == 16):
323 *
324 * i_indirect -> dir --> 16-19
325 * | +-> 20-23
326 * |
327 * +-->dir2 --> 24-27
328 * | +-> 28-31
329 * | +-> 32-35
330 * | +-> 36-39
331 * |
332 * +-->dir3 --> 40-43
333 * +-> 44-47
334 * +-> 48-51
335 * +-> 52-55
336 */ 228 */
337static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) 229static int shmem_radix_tree_replace(struct address_space *mapping,
338{ 230 pgoff_t index, void *expected, void *replacement)
339 unsigned long offset; 231{
340 struct page **dir; 232 void **pslot;
341 struct page *subdir; 233 void *item = NULL;
234
235 VM_BUG_ON(!expected);
236 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
237 if (pslot)
238 item = radix_tree_deref_slot_protected(pslot,
239 &mapping->tree_lock);
240 if (item != expected)
241 return -ENOENT;
242 if (replacement)
243 radix_tree_replace_slot(pslot, replacement);
244 else
245 radix_tree_delete(&mapping->page_tree, index);
246 return 0;
247}
342 248
343 if (index < SHMEM_NR_DIRECT) { 249/*
344 shmem_swp_balance_unmap(); 250 * Like add_to_page_cache_locked, but error if expected item has gone.
345 return info->i_direct+index; 251 */
346 } 252static int shmem_add_to_page_cache(struct page *page,
347 if (!info->i_indirect) { 253 struct address_space *mapping,
348 if (page) { 254 pgoff_t index, gfp_t gfp, void *expected)
349 info->i_indirect = *page; 255{
350 *page = NULL; 256 int error = 0;
351 }
352 return NULL; /* need another page */
353 }
354 257
355 index -= SHMEM_NR_DIRECT; 258 VM_BUG_ON(!PageLocked(page));
356 offset = index % ENTRIES_PER_PAGE; 259 VM_BUG_ON(!PageSwapBacked(page));
357 index /= ENTRIES_PER_PAGE;
358 dir = shmem_dir_map(info->i_indirect);
359
360 if (index >= ENTRIES_PER_PAGE/2) {
361 index -= ENTRIES_PER_PAGE/2;
362 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
363 index %= ENTRIES_PER_PAGE;
364 subdir = *dir;
365 if (!subdir) {
366 if (page) {
367 *dir = *page;
368 *page = NULL;
369 }
370 shmem_dir_unmap(dir);
371 return NULL; /* need another page */
372 }
373 shmem_dir_unmap(dir);
374 dir = shmem_dir_map(subdir);
375 }
376 260
377 dir += index; 261 if (!expected)
378 subdir = *dir; 262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
379 if (!subdir) { 263 if (!error) {
380 if (!page || !(subdir = *page)) { 264 page_cache_get(page);
381 shmem_dir_unmap(dir); 265 page->mapping = mapping;
382 return NULL; /* need a page */ 266 page->index = index;
267
268 spin_lock_irq(&mapping->tree_lock);
269 if (!expected)
270 error = radix_tree_insert(&mapping->page_tree,
271 index, page);
272 else
273 error = shmem_radix_tree_replace(mapping, index,
274 expected, page);
275 if (!error) {
276 mapping->nrpages++;
277 __inc_zone_page_state(page, NR_FILE_PAGES);
278 __inc_zone_page_state(page, NR_SHMEM);
279 spin_unlock_irq(&mapping->tree_lock);
280 } else {
281 page->mapping = NULL;
282 spin_unlock_irq(&mapping->tree_lock);
283 page_cache_release(page);
383 } 284 }
384 *dir = subdir; 285 if (!expected)
385 *page = NULL; 286 radix_tree_preload_end();
386 } 287 }
387 shmem_dir_unmap(dir); 288 if (error)
388 return shmem_swp_map(subdir) + offset; 289 mem_cgroup_uncharge_cache_page(page);
290 return error;
389} 291}
390 292
391static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) 293/*
294 * Like delete_from_page_cache, but substitutes swap for page.
295 */
296static void shmem_delete_from_page_cache(struct page *page, void *radswap)
392{ 297{
393 long incdec = value? 1: -1; 298 struct address_space *mapping = page->mapping;
299 int error;
394 300
395 entry->val = value; 301 spin_lock_irq(&mapping->tree_lock);
396 info->swapped += incdec; 302 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
397 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { 303 page->mapping = NULL;
398 struct page *page = kmap_atomic_to_page(entry); 304 mapping->nrpages--;
399 set_page_private(page, page_private(page) + incdec); 305 __dec_zone_page_state(page, NR_FILE_PAGES);
400 } 306 __dec_zone_page_state(page, NR_SHMEM);
307 spin_unlock_irq(&mapping->tree_lock);
308 page_cache_release(page);
309 BUG_ON(error);
401} 310}
402 311
403/** 312/*
404 * shmem_swp_alloc - get the position of the swap entry for the page. 313 * Like find_get_pages, but collecting swap entries as well as pages.
405 * @info: info structure for the inode
406 * @index: index of the page to find
407 * @sgp: check and recheck i_size? skip allocation?
408 *
409 * If the entry does not exist, allocate it.
410 */ 314 */
411static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) 315static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
412{ 316 pgoff_t start, unsigned int nr_pages,
413 struct inode *inode = &info->vfs_inode; 317 struct page **pages, pgoff_t *indices)
414 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 318{
415 struct page *page = NULL; 319 unsigned int i;
416 swp_entry_t *entry; 320 unsigned int ret;
417 321 unsigned int nr_found;
418 if (sgp != SGP_WRITE && 322
419 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 323 rcu_read_lock();
420 return ERR_PTR(-EINVAL); 324restart:
421 325 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
422 while (!(entry = shmem_swp_entry(info, index, &page))) { 326 (void ***)pages, indices, start, nr_pages);
423 if (sgp == SGP_READ) 327 ret = 0;
424 return shmem_swp_map(ZERO_PAGE(0)); 328 for (i = 0; i < nr_found; i++) {
425 /* 329 struct page *page;
426 * Test used_blocks against 1 less max_blocks, since we have 1 data 330repeat:
427 * page (and perhaps indirect index pages) yet to allocate: 331 page = radix_tree_deref_slot((void **)pages[i]);
428 * a waste to allocate index if we cannot allocate data. 332 if (unlikely(!page))
429 */ 333 continue;
430 if (sbinfo->max_blocks) { 334 if (radix_tree_exception(page)) {
431 if (percpu_counter_compare(&sbinfo->used_blocks, 335 if (radix_tree_deref_retry(page))
432 sbinfo->max_blocks - 1) >= 0) 336 goto restart;
433 return ERR_PTR(-ENOSPC); 337 /*
434 percpu_counter_inc(&sbinfo->used_blocks); 338 * Otherwise, we must be storing a swap entry
435 spin_lock(&inode->i_lock); 339 * here as an exceptional entry: so return it
436 inode->i_blocks += BLOCKS_PER_PAGE; 340 * without attempting to raise page count.
437 spin_unlock(&inode->i_lock); 341 */
342 goto export;
438 } 343 }
344 if (!page_cache_get_speculative(page))
345 goto repeat;
439 346
440 spin_unlock(&info->lock); 347 /* Has the page moved? */
441 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 348 if (unlikely(page != *((void **)pages[i]))) {
442 spin_lock(&info->lock); 349 page_cache_release(page);
443 350 goto repeat;
444 if (!page) {
445 shmem_free_blocks(inode, 1);
446 return ERR_PTR(-ENOMEM);
447 }
448 if (sgp != SGP_WRITE &&
449 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
450 entry = ERR_PTR(-EINVAL);
451 break;
452 } 351 }
453 if (info->next_index <= index) 352export:
454 info->next_index = index + 1; 353 indices[ret] = indices[i];
455 } 354 pages[ret] = page;
456 if (page) { 355 ret++;
457 /* another task gave its page, or truncated the file */ 356 }
458 shmem_free_blocks(inode, 1); 357 if (unlikely(!ret && nr_found))
459 shmem_dir_free(page); 358 goto restart;
460 } 359 rcu_read_unlock();
461 if (info->next_index <= index && !IS_ERR(entry)) 360 return ret;
462 info->next_index = index + 1;
463 return entry;
464} 361}
465 362
466/** 363/*
467 * shmem_free_swp - free some swap entries in a directory 364 * Remove swap entry from radix tree, free the swap and its page cache.
468 * @dir: pointer to the directory
469 * @edir: pointer after last entry of the directory
470 * @punch_lock: pointer to spinlock when needed for the holepunch case
471 */ 365 */
472static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, 366static int shmem_free_swap(struct address_space *mapping,
473 spinlock_t *punch_lock) 367 pgoff_t index, void *radswap)
474{ 368{
475 spinlock_t *punch_unlock = NULL; 369 int error;
476 swp_entry_t *ptr; 370
477 int freed = 0; 371 spin_lock_irq(&mapping->tree_lock);
478 372 error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
479 for (ptr = dir; ptr < edir; ptr++) { 373 spin_unlock_irq(&mapping->tree_lock);
480 if (ptr->val) { 374 if (!error)
481 if (unlikely(punch_lock)) { 375 free_swap_and_cache(radix_to_swp_entry(radswap));
482 punch_unlock = punch_lock; 376 return error;
483 punch_lock = NULL;
484 spin_lock(punch_unlock);
485 if (!ptr->val)
486 continue;
487 }
488 free_swap_and_cache(*ptr);
489 *ptr = (swp_entry_t){0};
490 freed++;
491 }
492 }
493 if (punch_unlock)
494 spin_unlock(punch_unlock);
495 return freed;
496}
497
498static int shmem_map_and_free_swp(struct page *subdir, int offset,
499 int limit, struct page ***dir, spinlock_t *punch_lock)
500{
501 swp_entry_t *ptr;
502 int freed = 0;
503
504 ptr = shmem_swp_map(subdir);
505 for (; offset < limit; offset += LATENCY_LIMIT) {
506 int size = limit - offset;
507 if (size > LATENCY_LIMIT)
508 size = LATENCY_LIMIT;
509 freed += shmem_free_swp(ptr+offset, ptr+offset+size,
510 punch_lock);
511 if (need_resched()) {
512 shmem_swp_unmap(ptr);
513 if (*dir) {
514 shmem_dir_unmap(*dir);
515 *dir = NULL;
516 }
517 cond_resched();
518 ptr = shmem_swp_map(subdir);
519 }
520 }
521 shmem_swp_unmap(ptr);
522 return freed;
523} 377}
524 378
525static void shmem_free_pages(struct list_head *next) 379/*
380 * Pagevec may contain swap entries, so shuffle up pages before releasing.
381 */
382static void shmem_pagevec_release(struct pagevec *pvec)
526{ 383{
527 struct page *page; 384 int i, j;
528 int freed = 0; 385
529 386 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
530 do { 387 struct page *page = pvec->pages[i];
531 page = container_of(next, struct page, lru); 388 if (!radix_tree_exceptional_entry(page))
532 next = next->next; 389 pvec->pages[j++] = page;
533 shmem_dir_free(page); 390 }
534 freed++; 391 pvec->nr = j;
535 if (freed >= LATENCY_LIMIT) { 392 pagevec_release(pvec);
536 cond_resched();
537 freed = 0;
538 }
539 } while (next);
540} 393}
541 394
542void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 395/*
396 * Remove range of pages and swap entries from radix tree, and free them.
397 */
398void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
543{ 399{
400 struct address_space *mapping = inode->i_mapping;
544 struct shmem_inode_info *info = SHMEM_I(inode); 401 struct shmem_inode_info *info = SHMEM_I(inode);
545 unsigned long idx; 402 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
546 unsigned long size; 403 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
547 unsigned long limit; 404 pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
548 unsigned long stage; 405 struct pagevec pvec;
549 unsigned long diroff; 406 pgoff_t indices[PAGEVEC_SIZE];
550 struct page **dir;
551 struct page *topdir;
552 struct page *middir;
553 struct page *subdir;
554 swp_entry_t *ptr;
555 LIST_HEAD(pages_to_free);
556 long nr_pages_to_free = 0;
557 long nr_swaps_freed = 0; 407 long nr_swaps_freed = 0;
558 int offset; 408 pgoff_t index;
559 int freed; 409 int i;
560 int punch_hole;
561 spinlock_t *needs_lock;
562 spinlock_t *punch_lock;
563 unsigned long upper_limit;
564 410
565 truncate_inode_pages_range(inode->i_mapping, start, end); 411 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
566 412
567 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 413 pagevec_init(&pvec, 0);
568 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 414 index = start;
569 if (idx >= info->next_index) 415 while (index <= end) {
570 return; 416 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
417 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
418 pvec.pages, indices);
419 if (!pvec.nr)
420 break;
421 mem_cgroup_uncharge_start();
422 for (i = 0; i < pagevec_count(&pvec); i++) {
423 struct page *page = pvec.pages[i];
571 424
572 spin_lock(&info->lock); 425 index = indices[i];
573 info->flags |= SHMEM_TRUNCATE; 426 if (index > end)
574 if (likely(end == (loff_t) -1)) { 427 break;
575 limit = info->next_index;
576 upper_limit = SHMEM_MAX_INDEX;
577 info->next_index = idx;
578 needs_lock = NULL;
579 punch_hole = 0;
580 } else {
581 if (end + 1 >= inode->i_size) { /* we may free a little more */
582 limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
583 PAGE_CACHE_SHIFT;
584 upper_limit = SHMEM_MAX_INDEX;
585 } else {
586 limit = (end + 1) >> PAGE_CACHE_SHIFT;
587 upper_limit = limit;
588 }
589 needs_lock = &info->lock;
590 punch_hole = 1;
591 }
592 428
593 topdir = info->i_indirect; 429 if (radix_tree_exceptional_entry(page)) {
594 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { 430 nr_swaps_freed += !shmem_free_swap(mapping,
595 info->i_indirect = NULL; 431 index, page);
596 nr_pages_to_free++; 432 continue;
597 list_add(&topdir->lru, &pages_to_free); 433 }
434
435 if (!trylock_page(page))
436 continue;
437 if (page->mapping == mapping) {
438 VM_BUG_ON(PageWriteback(page));
439 truncate_inode_page(mapping, page);
440 }
441 unlock_page(page);
442 }
443 shmem_pagevec_release(&pvec);
444 mem_cgroup_uncharge_end();
445 cond_resched();
446 index++;
598 } 447 }
599 spin_unlock(&info->lock);
600 448
601 if (info->swapped && idx < SHMEM_NR_DIRECT) { 449 if (partial) {
602 ptr = info->i_direct; 450 struct page *page = NULL;
603 size = limit; 451 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
604 if (size > SHMEM_NR_DIRECT) 452 if (page) {
605 size = SHMEM_NR_DIRECT; 453 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
606 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); 454 set_page_dirty(page);
455 unlock_page(page);
456 page_cache_release(page);
457 }
607 } 458 }
608 459
609 /* 460 index = start;
610 * If there are no indirect blocks or we are punching a hole 461 for ( ; ; ) {
611 * below indirect blocks, nothing to be done. 462 cond_resched();
612 */ 463 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
613 if (!topdir || limit <= SHMEM_NR_DIRECT) 464 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
614 goto done2; 465 pvec.pages, indices);
466 if (!pvec.nr) {
467 if (index == start)
468 break;
469 index = start;
470 continue;
471 }
472 if (index == start && indices[0] > end) {
473 shmem_pagevec_release(&pvec);
474 break;
475 }
476 mem_cgroup_uncharge_start();
477 for (i = 0; i < pagevec_count(&pvec); i++) {
478 struct page *page = pvec.pages[i];
615 479
616 /* 480 index = indices[i];
617 * The truncation case has already dropped info->lock, and we're safe 481 if (index > end)
618 * because i_size and next_index have already been lowered, preventing 482 break;
619 * access beyond. But in the punch_hole case, we still need to take
620 * the lock when updating the swap directory, because there might be
621 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
622 * shmem_writepage. However, whenever we find we can remove a whole
623 * directory page (not at the misaligned start or end of the range),
624 * we first NULLify its pointer in the level above, and then have no
625 * need to take the lock when updating its contents: needs_lock and
626 * punch_lock (either pointing to info->lock or NULL) manage this.
627 */
628 483
629 upper_limit -= SHMEM_NR_DIRECT; 484 if (radix_tree_exceptional_entry(page)) {
630 limit -= SHMEM_NR_DIRECT; 485 nr_swaps_freed += !shmem_free_swap(mapping,
631 idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; 486 index, page);
632 offset = idx % ENTRIES_PER_PAGE; 487 continue;
633 idx -= offset;
634
635 dir = shmem_dir_map(topdir);
636 stage = ENTRIES_PER_PAGEPAGE/2;
637 if (idx < ENTRIES_PER_PAGEPAGE/2) {
638 middir = topdir;
639 diroff = idx/ENTRIES_PER_PAGE;
640 } else {
641 dir += ENTRIES_PER_PAGE/2;
642 dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
643 while (stage <= idx)
644 stage += ENTRIES_PER_PAGEPAGE;
645 middir = *dir;
646 if (*dir) {
647 diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
648 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
649 if (!diroff && !offset && upper_limit >= stage) {
650 if (needs_lock) {
651 spin_lock(needs_lock);
652 *dir = NULL;
653 spin_unlock(needs_lock);
654 needs_lock = NULL;
655 } else
656 *dir = NULL;
657 nr_pages_to_free++;
658 list_add(&middir->lru, &pages_to_free);
659 } 488 }
660 shmem_dir_unmap(dir);
661 dir = shmem_dir_map(middir);
662 } else {
663 diroff = 0;
664 offset = 0;
665 idx = stage;
666 }
667 }
668 489
669 for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { 490 lock_page(page);
670 if (unlikely(idx == stage)) { 491 if (page->mapping == mapping) {
671 shmem_dir_unmap(dir); 492 VM_BUG_ON(PageWriteback(page));
672 dir = shmem_dir_map(topdir) + 493 truncate_inode_page(mapping, page);
673 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
674 while (!*dir) {
675 dir++;
676 idx += ENTRIES_PER_PAGEPAGE;
677 if (idx >= limit)
678 goto done1;
679 } 494 }
680 stage = idx + ENTRIES_PER_PAGEPAGE; 495 unlock_page(page);
681 middir = *dir;
682 if (punch_hole)
683 needs_lock = &info->lock;
684 if (upper_limit >= stage) {
685 if (needs_lock) {
686 spin_lock(needs_lock);
687 *dir = NULL;
688 spin_unlock(needs_lock);
689 needs_lock = NULL;
690 } else
691 *dir = NULL;
692 nr_pages_to_free++;
693 list_add(&middir->lru, &pages_to_free);
694 }
695 shmem_dir_unmap(dir);
696 cond_resched();
697 dir = shmem_dir_map(middir);
698 diroff = 0;
699 }
700 punch_lock = needs_lock;
701 subdir = dir[diroff];
702 if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
703 if (needs_lock) {
704 spin_lock(needs_lock);
705 dir[diroff] = NULL;
706 spin_unlock(needs_lock);
707 punch_lock = NULL;
708 } else
709 dir[diroff] = NULL;
710 nr_pages_to_free++;
711 list_add(&subdir->lru, &pages_to_free);
712 }
713 if (subdir && page_private(subdir) /* has swap entries */) {
714 size = limit - idx;
715 if (size > ENTRIES_PER_PAGE)
716 size = ENTRIES_PER_PAGE;
717 freed = shmem_map_and_free_swp(subdir,
718 offset, size, &dir, punch_lock);
719 if (!dir)
720 dir = shmem_dir_map(middir);
721 nr_swaps_freed += freed;
722 if (offset || punch_lock) {
723 spin_lock(&info->lock);
724 set_page_private(subdir,
725 page_private(subdir) - freed);
726 spin_unlock(&info->lock);
727 } else
728 BUG_ON(page_private(subdir) != freed);
729 } 496 }
730 offset = 0; 497 shmem_pagevec_release(&pvec);
731 } 498 mem_cgroup_uncharge_end();
732done1: 499 index++;
733 shmem_dir_unmap(dir);
734done2:
735 if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
736 /*
737 * Call truncate_inode_pages again: racing shmem_unuse_inode
738 * may have swizzled a page in from swap since
739 * truncate_pagecache or generic_delete_inode did it, before we
740 * lowered next_index. Also, though shmem_getpage checks
741 * i_size before adding to cache, no recheck after: so fix the
742 * narrow window there too.
743 */
744 truncate_inode_pages_range(inode->i_mapping, start, end);
745 } 500 }
746 501
747 spin_lock(&info->lock); 502 spin_lock(&info->lock);
748 info->flags &= ~SHMEM_TRUNCATE;
749 info->swapped -= nr_swaps_freed; 503 info->swapped -= nr_swaps_freed;
750 if (nr_pages_to_free)
751 shmem_free_blocks(inode, nr_pages_to_free);
752 shmem_recalc_inode(inode); 504 shmem_recalc_inode(inode);
753 spin_unlock(&info->lock); 505 spin_unlock(&info->lock);
754 506
755 /* 507 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
756 * Empty swap vector directory pages to be freed?
757 */
758 if (!list_empty(&pages_to_free)) {
759 pages_to_free.prev->next = NULL;
760 shmem_free_pages(pages_to_free.next);
761 }
762} 508}
763EXPORT_SYMBOL_GPL(shmem_truncate_range); 509EXPORT_SYMBOL_GPL(shmem_truncate_range);
764 510
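The rewritten code above stores swap entries directly in the mapping's radix tree as "exceptional" entries, distinguished from page pointers by a low tag bit (the radix_to_swp_entry()/swp_to_radix_entry() helpers and radix_tree_exceptional_entry() tests visible in the hunk), so shmem_truncate_range() can walk a single tree and free either kind of slot. The program below is a stand-alone user-space sketch of that tagging trick, not the kernel implementation: it packs a small value into a pointer-sized word with bit 0 set and tells such tagged values apart from ordinary (aligned) pointers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Encode a small value as a "tagged" word: shift it up and set bit 0.
 * Real page pointers are at least word-aligned, so bit 0 is never set
 * on them -- the same property the exceptional entries rely on. */
static void *value_to_entry(unsigned long val)
{
        return (void *)(uintptr_t)((val << 1) | 1UL);
}

static int entry_is_value(const void *entry)
{
        return ((uintptr_t)entry & 1UL) != 0;
}

static unsigned long entry_to_value(const void *entry)
{
        return (unsigned long)((uintptr_t)entry >> 1);
}

int main(void)
{
        int page = 42;                          /* stands in for a struct page */
        void *slots[2];

        slots[0] = &page;                       /* ordinary "page" slot */
        slots[1] = value_to_entry(12345);       /* "swap entry" slot */

        for (int i = 0; i < 2; i++) {
                if (entry_is_value(slots[i]))
                        printf("slot %d: swap-like value %lu\n",
                               i, entry_to_value(slots[i]));
                else
                        printf("slot %d: page with contents %d\n",
                               i, *(int *)slots[i]);
        }
        assert(entry_to_value(value_to_entry(12345)) == 12345);
        return 0;
}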
@@ -774,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
774 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 520 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
775 loff_t oldsize = inode->i_size; 521 loff_t oldsize = inode->i_size;
776 loff_t newsize = attr->ia_size; 522 loff_t newsize = attr->ia_size;
777 struct page *page = NULL;
778 523
779 if (newsize < oldsize) {
780 /*
781 * If truncating down to a partial page, then
782 * if that page is already allocated, hold it
783 * in memory until the truncation is over, so
784 * truncate_partial_page cannot miss it were
785 * it assigned to swap.
786 */
787 if (newsize & (PAGE_CACHE_SIZE-1)) {
788 (void) shmem_getpage(inode,
789 newsize >> PAGE_CACHE_SHIFT,
790 &page, SGP_READ, NULL);
791 if (page)
792 unlock_page(page);
793 }
794 /*
795 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
796 * detect if any pages might have been added to cache
797 * after truncate_inode_pages. But we needn't bother
798 * if it's being fully truncated to zero-length: the
799 * nrpages check is efficient enough in that case.
800 */
801 if (newsize) {
802 struct shmem_inode_info *info = SHMEM_I(inode);
803 spin_lock(&info->lock);
804 info->flags &= ~SHMEM_PAGEIN;
805 spin_unlock(&info->lock);
806 }
807 }
808 if (newsize != oldsize) { 524 if (newsize != oldsize) {
809 i_size_write(inode, newsize); 525 i_size_write(inode, newsize);
810 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 526 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -816,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
816 /* unmap again to remove racily COWed private pages */ 532 /* unmap again to remove racily COWed private pages */
817 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 533 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
818 } 534 }
819 if (page)
820 page_cache_release(page);
821 } 535 }
822 536
823 setattr_copy(inode, attr); 537 setattr_copy(inode, attr);
@@ -842,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)
842 list_del_init(&info->swaplist); 556 list_del_init(&info->swaplist);
843 mutex_unlock(&shmem_swaplist_mutex); 557 mutex_unlock(&shmem_swaplist_mutex);
844 } 558 }
845 } 559 } else
560 kfree(info->symlink);
846 561
847 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 562 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
848 kfree(xattr->name); 563 kfree(xattr->name);
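This hunk frees info->symlink when a non-regular inode is evicted, matching the new SHORT_SYMLINK_LEN rule earlier in the file: link targets up to 128 bytes are kept in a kmalloc'ed buffer instead of a swappable page. The exact boundary check lives in shmem_symlink(), which is outside this excerpt; the rough user-space analogue below only illustrates the size-based choice, with plain malloc standing in for kmalloc and a page-sized buffer standing in for the pagecache path.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SHORT_SYMLINK_LEN 128
#define PAGE_SIZE 4096

/* Store a link target: small ones get an exact-sized heap copy,
 * large ones get a whole "page" (the swappable-page path in shmem). */
static char *store_link(const char *target, int *used_page)
{
        size_t len = strlen(target) + 1;
        char *buf;

        if (len <= SHORT_SYMLINK_LEN) {
                buf = malloc(len);              /* kmalloc'ed short symlink */
                *used_page = 0;
        } else {
                buf = malloc(PAGE_SIZE);        /* page-backed long symlink */
                *used_page = 1;
        }
        if (buf)
                memcpy(buf, target, len);
        return buf;
}

int main(void)
{
        int used_page;
        char *link = store_link("/tmp/short-target", &used_page);

        if (!link)
                return 1;
        printf("stored \"%s\" using %s\n", link,
               used_page ? "a whole page" : "a small buffer");
        free(link);     /* like the kfree(info->symlink) on eviction above */
        return 0;
}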
@@ -853,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode)
853 end_writeback(inode); 568 end_writeback(inode);
854} 569}
855 570
856static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 571/*
857{ 572 * If swap found in inode, free it and move page from swapcache to filecache.
858 swp_entry_t *ptr; 573 */
859 574static int shmem_unuse_inode(struct shmem_inode_info *info,
860 for (ptr = dir; ptr < edir; ptr++) { 575 swp_entry_t swap, struct page *page)
861 if (ptr->val == entry.val)
862 return ptr - dir;
863 }
864 return -1;
865}
866
867static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
868{ 576{
869 struct address_space *mapping; 577 struct address_space *mapping = info->vfs_inode.i_mapping;
870 unsigned long idx; 578 void *radswap;
871 unsigned long size; 579 pgoff_t index;
872 unsigned long limit;
873 unsigned long stage;
874 struct page **dir;
875 struct page *subdir;
876 swp_entry_t *ptr;
877 int offset;
878 int error; 580 int error;
879 581
880 idx = 0; 582 radswap = swp_to_radix_entry(swap);
881 ptr = info->i_direct; 583 index = radix_tree_locate_item(&mapping->page_tree, radswap);
882 spin_lock(&info->lock); 584 if (index == -1)
883 if (!info->swapped) { 585 return 0;
884 list_del_init(&info->swaplist);
885 goto lost2;
886 }
887 limit = info->next_index;
888 size = limit;
889 if (size > SHMEM_NR_DIRECT)
890 size = SHMEM_NR_DIRECT;
891 offset = shmem_find_swp(entry, ptr, ptr+size);
892 if (offset >= 0) {
893 shmem_swp_balance_unmap();
894 goto found;
895 }
896 if (!info->i_indirect)
897 goto lost2;
898
899 dir = shmem_dir_map(info->i_indirect);
900 stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
901
902 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
903 if (unlikely(idx == stage)) {
904 shmem_dir_unmap(dir-1);
905 if (cond_resched_lock(&info->lock)) {
906 /* check it has not been truncated */
907 if (limit > info->next_index) {
908 limit = info->next_index;
909 if (idx >= limit)
910 goto lost2;
911 }
912 }
913 dir = shmem_dir_map(info->i_indirect) +
914 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
915 while (!*dir) {
916 dir++;
917 idx += ENTRIES_PER_PAGEPAGE;
918 if (idx >= limit)
919 goto lost1;
920 }
921 stage = idx + ENTRIES_PER_PAGEPAGE;
922 subdir = *dir;
923 shmem_dir_unmap(dir);
924 dir = shmem_dir_map(subdir);
925 }
926 subdir = *dir;
927 if (subdir && page_private(subdir)) {
928 ptr = shmem_swp_map(subdir);
929 size = limit - idx;
930 if (size > ENTRIES_PER_PAGE)
931 size = ENTRIES_PER_PAGE;
932 offset = shmem_find_swp(entry, ptr, ptr+size);
933 shmem_swp_unmap(ptr);
934 if (offset >= 0) {
935 shmem_dir_unmap(dir);
936 ptr = shmem_swp_map(subdir);
937 goto found;
938 }
939 }
940 }
941lost1:
942 shmem_dir_unmap(dir-1);
943lost2:
944 spin_unlock(&info->lock);
945 return 0;
946found:
947 idx += offset;
948 ptr += offset;
949 586
950 /* 587 /*
951 * Move _head_ to start search for next from here. 588 * Move _head_ to start search for next from here.
952 * But be careful: shmem_evict_inode checks list_empty without taking 589 * But be careful: shmem_evict_inode checks list_empty without taking
953 * mutex, and there's an instant in list_move_tail when info->swaplist 590 * mutex, and there's an instant in list_move_tail when info->swaplist
954 * would appear empty, if it were the only one on shmem_swaplist. We 591 * would appear empty, if it were the only one on shmem_swaplist.
955 * could avoid doing it if inode NULL; or use this minor optimization.
956 */ 592 */
957 if (shmem_swaplist.next != &info->swaplist) 593 if (shmem_swaplist.next != &info->swaplist)
958 list_move_tail(&shmem_swaplist, &info->swaplist); 594 list_move_tail(&shmem_swaplist, &info->swaplist);
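Above, shmem_unuse_inode() replaces the old hand-rolled walk over i_direct and the indirect directory pages with a single radix_tree_locate_item() call that returns the file index at which a given swap entry sits, or -1 if it has already gone. Conceptually that is just "search the index structure for this value and report its position"; the toy below sketches the same contract in user-space C over a flat array, with invented names, and is not the kernel helper.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for radix_tree_locate_item(): return the index holding
 * the wanted entry, or -1 when it has been truncated or replaced. */
static long locate_item(void *const *slots, long nr, const void *wanted)
{
        for (long i = 0; i < nr; i++)
                if (slots[i] == wanted)
                        return i;
        return -1;
}

int main(void)
{
        int page_a, page_b;
        void *swap_entry = (void *)(uintptr_t)((123UL << 1) | 1); /* tagged */
        void *slots[] = { &page_a, swap_entry, &page_b };

        long index = locate_item(slots, 3, swap_entry);
        if (index < 0)
                printf("entry already gone, nothing to unuse\n");
        else
                printf("swap entry found at index %ld\n", index);
        return 0;
}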
@@ -962,42 +598,34 @@ found:
962 * but also to hold up shmem_evict_inode(): so inode cannot be freed 598 * but also to hold up shmem_evict_inode(): so inode cannot be freed
963 * beneath us (pagelock doesn't help until the page is in pagecache). 599 * beneath us (pagelock doesn't help until the page is in pagecache).
964 */ 600 */
965 mapping = info->vfs_inode.i_mapping; 601 error = shmem_add_to_page_cache(page, mapping, index,
966 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 602 GFP_NOWAIT, radswap);
967 /* which does mem_cgroup_uncharge_cache_page on error */ 603 /* which does mem_cgroup_uncharge_cache_page on error */
968 604
969 if (error == -EEXIST) { 605 if (error != -ENOMEM) {
970 struct page *filepage = find_get_page(mapping, idx); 606 /*
971 error = 1; 607 * Truncation and eviction use free_swap_and_cache(), which
972 if (filepage) { 608 * only does trylock page: if we raced, best clean up here.
973 /* 609 */
974 * There might be a more uptodate page coming down
975 * from a stacked writepage: forget our swappage if so.
976 */
977 if (PageUptodate(filepage))
978 error = 0;
979 page_cache_release(filepage);
980 }
981 }
982 if (!error) {
983 delete_from_swap_cache(page); 610 delete_from_swap_cache(page);
984 set_page_dirty(page); 611 set_page_dirty(page);
985 info->flags |= SHMEM_PAGEIN; 612 if (!error) {
986 shmem_swp_set(info, ptr, 0); 613 spin_lock(&info->lock);
987 swap_free(entry); 614 info->swapped--;
615 spin_unlock(&info->lock);
616 swap_free(swap);
617 }
988 error = 1; /* not an error, but entry was found */ 618 error = 1; /* not an error, but entry was found */
989 } 619 }
990 shmem_swp_unmap(ptr);
991 spin_unlock(&info->lock);
992 return error; 620 return error;
993} 621}
994 622
995/* 623/*
996 * shmem_unuse() search for an eventually swapped out shmem page. 624 * Search through swapped inodes to find and replace swap by page.
997 */ 625 */
998int shmem_unuse(swp_entry_t entry, struct page *page) 626int shmem_unuse(swp_entry_t swap, struct page *page)
999{ 627{
1000 struct list_head *p, *next; 628 struct list_head *this, *next;
1001 struct shmem_inode_info *info; 629 struct shmem_inode_info *info;
1002 int found = 0; 630 int found = 0;
1003 int error; 631 int error;
@@ -1006,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1006 * Charge page using GFP_KERNEL while we can wait, before taking 634 * Charge page using GFP_KERNEL while we can wait, before taking
1007 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 635 * the shmem_swaplist_mutex which might hold up shmem_writepage().
1008 * Charged back to the user (not to caller) when swap account is used. 636 * Charged back to the user (not to caller) when swap account is used.
1009 * add_to_page_cache() will be called with GFP_NOWAIT.
1010 */ 637 */
1011 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 638 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1012 if (error) 639 if (error)
1013 goto out; 640 goto out;
1014 /* 641 /* No radix_tree_preload: swap entry keeps a place for page in tree */
1015 * Try to preload while we can wait, to not make a habit of
1016 * draining atomic reserves; but don't latch on to this cpu,
1017 * it's okay if sometimes we get rescheduled after this.
1018 */
1019 error = radix_tree_preload(GFP_KERNEL);
1020 if (error)
1021 goto uncharge;
1022 radix_tree_preload_end();
1023 642
1024 mutex_lock(&shmem_swaplist_mutex); 643 mutex_lock(&shmem_swaplist_mutex);
1025 list_for_each_safe(p, next, &shmem_swaplist) { 644 list_for_each_safe(this, next, &shmem_swaplist) {
1026 info = list_entry(p, struct shmem_inode_info, swaplist); 645 info = list_entry(this, struct shmem_inode_info, swaplist);
1027 found = shmem_unuse_inode(info, entry, page); 646 if (info->swapped)
647 found = shmem_unuse_inode(info, swap, page);
648 else
649 list_del_init(&info->swaplist);
1028 cond_resched(); 650 cond_resched();
1029 if (found) 651 if (found)
1030 break; 652 break;
1031 } 653 }
1032 mutex_unlock(&shmem_swaplist_mutex); 654 mutex_unlock(&shmem_swaplist_mutex);
1033 655
1034uncharge:
1035 if (!found) 656 if (!found)
1036 mem_cgroup_uncharge_cache_page(page); 657 mem_cgroup_uncharge_cache_page(page);
1037 if (found < 0) 658 if (found < 0)
@@ -1048,10 +669,10 @@ out:
1048static int shmem_writepage(struct page *page, struct writeback_control *wbc) 669static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1049{ 670{
1050 struct shmem_inode_info *info; 671 struct shmem_inode_info *info;
1051 swp_entry_t *entry, swap;
1052 struct address_space *mapping; 672 struct address_space *mapping;
1053 unsigned long index;
1054 struct inode *inode; 673 struct inode *inode;
674 swp_entry_t swap;
675 pgoff_t index;
1055 676
1056 BUG_ON(!PageLocked(page)); 677 BUG_ON(!PageLocked(page));
1057 mapping = page->mapping; 678 mapping = page->mapping;
@@ -1066,69 +687,46 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1066 /* 687 /*
1067 * shmem_backing_dev_info's capabilities prevent regular writeback or 688 * shmem_backing_dev_info's capabilities prevent regular writeback or
1068 * sync from ever calling shmem_writepage; but a stacking filesystem 689 * sync from ever calling shmem_writepage; but a stacking filesystem
1069 * may use the ->writepage of its underlying filesystem, in which case 690 * might use ->writepage of its underlying filesystem, in which case
1070 * tmpfs should write out to swap only in response to memory pressure, 691 * tmpfs should write out to swap only in response to memory pressure,
1071 * and not for the writeback threads or sync. However, in those cases, 692 * and not for the writeback threads or sync.
1072 * we do still want to check if there's a redundant swappage to be
1073 * discarded.
1074 */ 693 */
1075 if (wbc->for_reclaim) 694 if (!wbc->for_reclaim) {
1076 swap = get_swap_page(); 695 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1077 else 696 goto redirty;
1078 swap.val = 0; 697 }
698 swap = get_swap_page();
699 if (!swap.val)
700 goto redirty;
1079 701
1080 /* 702 /*
1081 * Add inode to shmem_unuse()'s list of swapped-out inodes, 703 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1082 * if it's not already there. Do it now because we cannot take 704 * if it's not already there. Do it now before the page is
1083 * mutex while holding spinlock, and must do so before the page 705 * moved to swap cache, when its pagelock no longer protects
1084 * is moved to swap cache, when its pagelock no longer protects
1085 * the inode from eviction. But don't unlock the mutex until 706 * the inode from eviction. But don't unlock the mutex until
1086 * we've taken the spinlock, because shmem_unuse_inode() will 707 * we've incremented swapped, because shmem_unuse_inode() will
1087 * prune a !swapped inode from the swaplist under both locks. 708 * prune a !swapped inode from the swaplist under this mutex.
1088 */ 709 */
1089 if (swap.val) { 710 mutex_lock(&shmem_swaplist_mutex);
1090 mutex_lock(&shmem_swaplist_mutex); 711 if (list_empty(&info->swaplist))
1091 if (list_empty(&info->swaplist)) 712 list_add_tail(&info->swaplist, &shmem_swaplist);
1092 list_add_tail(&info->swaplist, &shmem_swaplist);
1093 }
1094
1095 spin_lock(&info->lock);
1096 if (swap.val)
1097 mutex_unlock(&shmem_swaplist_mutex);
1098
1099 if (index >= info->next_index) {
1100 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1101 goto unlock;
1102 }
1103 entry = shmem_swp_entry(info, index, NULL);
1104 if (entry->val) {
1105 /*
1106 * The more uptodate page coming down from a stacked
1107 * writepage should replace our old swappage.
1108 */
1109 free_swap_and_cache(*entry);
1110 shmem_swp_set(info, entry, 0);
1111 }
1112 shmem_recalc_inode(inode);
1113 713
1114 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 714 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1115 delete_from_page_cache(page);
1116 shmem_swp_set(info, entry, swap.val);
1117 shmem_swp_unmap(entry);
1118 swap_shmem_alloc(swap); 715 swap_shmem_alloc(swap);
716 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
717
718 spin_lock(&info->lock);
719 info->swapped++;
720 shmem_recalc_inode(inode);
1119 spin_unlock(&info->lock); 721 spin_unlock(&info->lock);
722
723 mutex_unlock(&shmem_swaplist_mutex);
1120 BUG_ON(page_mapped(page)); 724 BUG_ON(page_mapped(page));
1121 swap_writepage(page, wbc); 725 swap_writepage(page, wbc);
1122 return 0; 726 return 0;
1123 } 727 }
1124 728
1125 shmem_swp_unmap(entry); 729 mutex_unlock(&shmem_swaplist_mutex);
1126unlock:
1127 spin_unlock(&info->lock);
1128 /*
1129 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1130 * clear SWAP_HAS_CACHE flag.
1131 */
1132 swapcache_free(swap, NULL); 730 swapcache_free(swap, NULL);
1133redirty: 731redirty:
1134 set_page_dirty(page); 732 set_page_dirty(page);
@@ -1165,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1165} 763}
1166#endif /* CONFIG_TMPFS */ 764#endif /* CONFIG_TMPFS */
1167 765
1168static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 766static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1169 struct shmem_inode_info *info, unsigned long idx) 767 struct shmem_inode_info *info, pgoff_t index)
1170{ 768{
1171 struct mempolicy mpol, *spol; 769 struct mempolicy mpol, *spol;
1172 struct vm_area_struct pvma; 770 struct vm_area_struct pvma;
1173 struct page *page;
1174 771
1175 spol = mpol_cond_copy(&mpol, 772 spol = mpol_cond_copy(&mpol,
1176 mpol_shared_policy_lookup(&info->policy, idx)); 773 mpol_shared_policy_lookup(&info->policy, index));
1177 774
1178 /* Create a pseudo vma that just contains the policy */ 775 /* Create a pseudo vma that just contains the policy */
1179 pvma.vm_start = 0; 776 pvma.vm_start = 0;
1180 pvma.vm_pgoff = idx; 777 pvma.vm_pgoff = index;
1181 pvma.vm_ops = NULL; 778 pvma.vm_ops = NULL;
1182 pvma.vm_policy = spol; 779 pvma.vm_policy = spol;
1183 page = swapin_readahead(entry, gfp, &pvma, 0); 780 return swapin_readahead(swap, gfp, &pvma, 0);
1184 return page;
1185} 781}
1186 782
1187static struct page *shmem_alloc_page(gfp_t gfp, 783static struct page *shmem_alloc_page(gfp_t gfp,
1188 struct shmem_inode_info *info, unsigned long idx) 784 struct shmem_inode_info *info, pgoff_t index)
1189{ 785{
1190 struct vm_area_struct pvma; 786 struct vm_area_struct pvma;
1191 787
1192 /* Create a pseudo vma that just contains the policy */ 788 /* Create a pseudo vma that just contains the policy */
1193 pvma.vm_start = 0; 789 pvma.vm_start = 0;
1194 pvma.vm_pgoff = idx; 790 pvma.vm_pgoff = index;
1195 pvma.vm_ops = NULL; 791 pvma.vm_ops = NULL;
1196 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 792 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1197 793
1198 /* 794 /*
1199 * alloc_page_vma() will drop the shared policy reference 795 * alloc_page_vma() will drop the shared policy reference
@@ -1202,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1202} 798}
1203#else /* !CONFIG_NUMA */ 799#else /* !CONFIG_NUMA */
1204#ifdef CONFIG_TMPFS 800#ifdef CONFIG_TMPFS
1205static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) 801static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1206{ 802{
1207} 803}
1208#endif /* CONFIG_TMPFS */ 804#endif /* CONFIG_TMPFS */
1209 805
1210static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 806static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1211 struct shmem_inode_info *info, unsigned long idx) 807 struct shmem_inode_info *info, pgoff_t index)
1212{ 808{
1213 return swapin_readahead(entry, gfp, NULL, 0); 809 return swapin_readahead(swap, gfp, NULL, 0);
1214} 810}
1215 811
1216static inline struct page *shmem_alloc_page(gfp_t gfp, 812static inline struct page *shmem_alloc_page(gfp_t gfp,
1217 struct shmem_inode_info *info, unsigned long idx) 813 struct shmem_inode_info *info, pgoff_t index)
1218{ 814{
1219 return alloc_page(gfp); 815 return alloc_page(gfp);
1220} 816}
@@ -1228,311 +824,195 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1228#endif 824#endif
1229 825
1230/* 826/*
1231 * shmem_getpage - either get the page from swap or allocate a new one 827 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1232 * 828 *
1233 * If we allocate a new one we do not mark it dirty. That's up to the 829 * If we allocate a new one we do not mark it dirty. That's up to the
1234 * vm. If we swap it in we mark it dirty since we also free the swap 830 * vm. If we swap it in we mark it dirty since we also free the swap
1235 * entry since a page cannot live in both the swap and page cache 831 * entry since a page cannot live in both the swap and page cache
1236 */ 832 */
1237static int shmem_getpage(struct inode *inode, unsigned long idx, 833static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1238 struct page **pagep, enum sgp_type sgp, int *type) 834 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1239{ 835{
1240 struct address_space *mapping = inode->i_mapping; 836 struct address_space *mapping = inode->i_mapping;
1241 struct shmem_inode_info *info = SHMEM_I(inode); 837 struct shmem_inode_info *info;
1242 struct shmem_sb_info *sbinfo; 838 struct shmem_sb_info *sbinfo;
1243 struct page *filepage = *pagep; 839 struct page *page;
1244 struct page *swappage;
1245 struct page *prealloc_page = NULL;
1246 swp_entry_t *entry;
1247 swp_entry_t swap; 840 swp_entry_t swap;
1248 gfp_t gfp;
1249 int error; 841 int error;
842 int once = 0;
1250 843
1251 if (idx >= SHMEM_MAX_INDEX) 844 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1252 return -EFBIG; 845 return -EFBIG;
846repeat:
847 swap.val = 0;
848 page = find_lock_page(mapping, index);
849 if (radix_tree_exceptional_entry(page)) {
850 swap = radix_to_swp_entry(page);
851 page = NULL;
852 }
1253 853
1254 if (type) 854 if (sgp != SGP_WRITE &&
1255 *type = 0; 855 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
856 error = -EINVAL;
857 goto failed;
858 }
1256 859
1257 /* 860 if (page || (sgp == SGP_READ && !swap.val)) {
1258 * Normally, filepage is NULL on entry, and either found
1259 * uptodate immediately, or allocated and zeroed, or read
1260 * in under swappage, which is then assigned to filepage.
1261 * But shmem_readpage (required for splice) passes in a locked
1262 * filepage, which may be found not uptodate by other callers
1263 * too, and may need to be copied from the swappage read in.
1264 */
1265repeat:
1266 if (!filepage)
1267 filepage = find_lock_page(mapping, idx);
1268 if (filepage && PageUptodate(filepage))
1269 goto done;
1270 gfp = mapping_gfp_mask(mapping);
1271 if (!filepage) {
1272 /* 861 /*
1273 * Try to preload while we can wait, to not make a habit of 862 * Once we can get the page lock, it must be uptodate:
1274 * draining atomic reserves; but don't latch on to this cpu. 863 * if there were an error in reading back from swap,
864 * the page would not be inserted into the filecache.
1275 */ 865 */
1276 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 866 BUG_ON(page && !PageUptodate(page));
1277 if (error) 867 *pagep = page;
1278 goto failed; 868 return 0;
1279 radix_tree_preload_end();
1280 if (sgp != SGP_READ && !prealloc_page) {
1281 /* We don't care if this fails */
1282 prealloc_page = shmem_alloc_page(gfp, info, idx);
1283 if (prealloc_page) {
1284 if (mem_cgroup_cache_charge(prealloc_page,
1285 current->mm, GFP_KERNEL)) {
1286 page_cache_release(prealloc_page);
1287 prealloc_page = NULL;
1288 }
1289 }
1290 }
1291 } 869 }
1292 error = 0;
1293 870
1294 spin_lock(&info->lock); 871 /*
1295 shmem_recalc_inode(inode); 872 * Fast cache lookup did not find it:
1296 entry = shmem_swp_alloc(info, idx, sgp); 873 * bring it back from swap or allocate.
1297 if (IS_ERR(entry)) { 874 */
1298 spin_unlock(&info->lock); 875 info = SHMEM_I(inode);
1299 error = PTR_ERR(entry); 876 sbinfo = SHMEM_SB(inode->i_sb);
1300 goto failed;
1301 }
1302 swap = *entry;
1303 877
1304 if (swap.val) { 878 if (swap.val) {
1305 /* Look it up and read it in.. */ 879 /* Look it up and read it in.. */
1306 swappage = lookup_swap_cache(swap); 880 page = lookup_swap_cache(swap);
1307 if (!swappage) { 881 if (!page) {
1308 shmem_swp_unmap(entry);
1309 spin_unlock(&info->lock);
1310 /* here we actually do the io */ 882 /* here we actually do the io */
1311 if (type) 883 if (fault_type)
1312 *type |= VM_FAULT_MAJOR; 884 *fault_type |= VM_FAULT_MAJOR;
1313 swappage = shmem_swapin(swap, gfp, info, idx); 885 page = shmem_swapin(swap, gfp, info, index);
1314 if (!swappage) { 886 if (!page) {
1315 spin_lock(&info->lock); 887 error = -ENOMEM;
1316 entry = shmem_swp_alloc(info, idx, sgp); 888 goto failed;
1317 if (IS_ERR(entry))
1318 error = PTR_ERR(entry);
1319 else {
1320 if (entry->val == swap.val)
1321 error = -ENOMEM;
1322 shmem_swp_unmap(entry);
1323 }
1324 spin_unlock(&info->lock);
1325 if (error)
1326 goto failed;
1327 goto repeat;
1328 } 889 }
1329 wait_on_page_locked(swappage);
1330 page_cache_release(swappage);
1331 goto repeat;
1332 } 890 }
1333 891
1334 /* We have to do this with page locked to prevent races */ 892 /* We have to do this with page locked to prevent races */
1335 if (!trylock_page(swappage)) { 893 lock_page(page);
1336 shmem_swp_unmap(entry); 894 if (!PageUptodate(page)) {
1337 spin_unlock(&info->lock);
1338 wait_on_page_locked(swappage);
1339 page_cache_release(swappage);
1340 goto repeat;
1341 }
1342 if (PageWriteback(swappage)) {
1343 shmem_swp_unmap(entry);
1344 spin_unlock(&info->lock);
1345 wait_on_page_writeback(swappage);
1346 unlock_page(swappage);
1347 page_cache_release(swappage);
1348 goto repeat;
1349 }
1350 if (!PageUptodate(swappage)) {
1351 shmem_swp_unmap(entry);
1352 spin_unlock(&info->lock);
1353 unlock_page(swappage);
1354 page_cache_release(swappage);
1355 error = -EIO; 895 error = -EIO;
1356 goto failed; 896 goto failed;
1357 } 897 }
1358 898 wait_on_page_writeback(page);
1359 if (filepage) { 899
1360 shmem_swp_set(info, entry, 0); 900 /* Someone may have already done it for us */
1361 shmem_swp_unmap(entry); 901 if (page->mapping) {
1362 delete_from_swap_cache(swappage); 902 if (page->mapping == mapping &&
1363 spin_unlock(&info->lock); 903 page->index == index)
1364 copy_highpage(filepage, swappage); 904 goto done;
1365 unlock_page(swappage); 905 error = -EEXIST;
1366 page_cache_release(swappage); 906 goto failed;
1367 flush_dcache_page(filepage);
1368 SetPageUptodate(filepage);
1369 set_page_dirty(filepage);
1370 swap_free(swap);
1371 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1372 idx, GFP_NOWAIT))) {
1373 info->flags |= SHMEM_PAGEIN;
1374 shmem_swp_set(info, entry, 0);
1375 shmem_swp_unmap(entry);
1376 delete_from_swap_cache(swappage);
1377 spin_unlock(&info->lock);
1378 filepage = swappage;
1379 set_page_dirty(filepage);
1380 swap_free(swap);
1381 } else {
1382 shmem_swp_unmap(entry);
1383 spin_unlock(&info->lock);
1384 if (error == -ENOMEM) {
1385 /*
1386 * reclaim from proper memory cgroup and
1387 * call memcg's OOM if needed.
1388 */
1389 error = mem_cgroup_shmem_charge_fallback(
1390 swappage,
1391 current->mm,
1392 gfp);
1393 if (error) {
1394 unlock_page(swappage);
1395 page_cache_release(swappage);
1396 goto failed;
1397 }
1398 }
1399 unlock_page(swappage);
1400 page_cache_release(swappage);
1401 goto repeat;
1402 }
1403 } else if (sgp == SGP_READ && !filepage) {
1404 shmem_swp_unmap(entry);
1405 filepage = find_get_page(mapping, idx);
1406 if (filepage &&
1407 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1408 spin_unlock(&info->lock);
1409 wait_on_page_locked(filepage);
1410 page_cache_release(filepage);
1411 filepage = NULL;
1412 goto repeat;
1413 } 907 }
908
909 error = mem_cgroup_cache_charge(page, current->mm,
910 gfp & GFP_RECLAIM_MASK);
911 if (!error)
912 error = shmem_add_to_page_cache(page, mapping, index,
913 gfp, swp_to_radix_entry(swap));
914 if (error)
915 goto failed;
916
917 spin_lock(&info->lock);
918 info->swapped--;
919 shmem_recalc_inode(inode);
1414 spin_unlock(&info->lock); 920 spin_unlock(&info->lock);
921
922 delete_from_swap_cache(page);
923 set_page_dirty(page);
924 swap_free(swap);
925
1415 } else { 926 } else {
1416 shmem_swp_unmap(entry); 927 if (shmem_acct_block(info->flags)) {
1417 sbinfo = SHMEM_SB(inode->i_sb); 928 error = -ENOSPC;
929 goto failed;
930 }
1418 if (sbinfo->max_blocks) { 931 if (sbinfo->max_blocks) {
1419 if (percpu_counter_compare(&sbinfo->used_blocks, 932 if (percpu_counter_compare(&sbinfo->used_blocks,
1420 sbinfo->max_blocks) >= 0 || 933 sbinfo->max_blocks) >= 0) {
1421 shmem_acct_block(info->flags)) 934 error = -ENOSPC;
1422 goto nospace; 935 goto unacct;
1423 percpu_counter_inc(&sbinfo->used_blocks);
1424 spin_lock(&inode->i_lock);
1425 inode->i_blocks += BLOCKS_PER_PAGE;
1426 spin_unlock(&inode->i_lock);
1427 } else if (shmem_acct_block(info->flags))
1428 goto nospace;
1429
1430 if (!filepage) {
1431 int ret;
1432
1433 if (!prealloc_page) {
1434 spin_unlock(&info->lock);
1435 filepage = shmem_alloc_page(gfp, info, idx);
1436 if (!filepage) {
1437 shmem_unacct_blocks(info->flags, 1);
1438 shmem_free_blocks(inode, 1);
1439 error = -ENOMEM;
1440 goto failed;
1441 }
1442 SetPageSwapBacked(filepage);
1443
1444 /*
1445 * Precharge page while we can wait, compensate
1446 * after
1447 */
1448 error = mem_cgroup_cache_charge(filepage,
1449 current->mm, GFP_KERNEL);
1450 if (error) {
1451 page_cache_release(filepage);
1452 shmem_unacct_blocks(info->flags, 1);
1453 shmem_free_blocks(inode, 1);
1454 filepage = NULL;
1455 goto failed;
1456 }
1457
1458 spin_lock(&info->lock);
1459 } else {
1460 filepage = prealloc_page;
1461 prealloc_page = NULL;
1462 SetPageSwapBacked(filepage);
1463 } 936 }
937 percpu_counter_inc(&sbinfo->used_blocks);
938 }
1464 939
1465 entry = shmem_swp_alloc(info, idx, sgp); 940 page = shmem_alloc_page(gfp, info, index);
1466 if (IS_ERR(entry)) 941 if (!page) {
1467 error = PTR_ERR(entry); 942 error = -ENOMEM;
1468 else { 943 goto decused;
1469 swap = *entry;
1470 shmem_swp_unmap(entry);
1471 }
1472 ret = error || swap.val;
1473 if (ret)
1474 mem_cgroup_uncharge_cache_page(filepage);
1475 else
1476 ret = add_to_page_cache_lru(filepage, mapping,
1477 idx, GFP_NOWAIT);
1478 /*
1479 * At add_to_page_cache_lru() failure, uncharge will
1480 * be done automatically.
1481 */
1482 if (ret) {
1483 spin_unlock(&info->lock);
1484 page_cache_release(filepage);
1485 shmem_unacct_blocks(info->flags, 1);
1486 shmem_free_blocks(inode, 1);
1487 filepage = NULL;
1488 if (error)
1489 goto failed;
1490 goto repeat;
1491 }
1492 info->flags |= SHMEM_PAGEIN;
1493 } 944 }
1494 945
946 SetPageSwapBacked(page);
947 __set_page_locked(page);
948 error = mem_cgroup_cache_charge(page, current->mm,
949 gfp & GFP_RECLAIM_MASK);
950 if (!error)
951 error = shmem_add_to_page_cache(page, mapping, index,
952 gfp, NULL);
953 if (error)
954 goto decused;
955 lru_cache_add_anon(page);
956
957 spin_lock(&info->lock);
1495 info->alloced++; 958 info->alloced++;
959 inode->i_blocks += BLOCKS_PER_PAGE;
960 shmem_recalc_inode(inode);
1496 spin_unlock(&info->lock); 961 spin_unlock(&info->lock);
1497 clear_highpage(filepage); 962
1498 flush_dcache_page(filepage); 963 clear_highpage(page);
1499 SetPageUptodate(filepage); 964 flush_dcache_page(page);
965 SetPageUptodate(page);
1500 if (sgp == SGP_DIRTY) 966 if (sgp == SGP_DIRTY)
1501 set_page_dirty(filepage); 967 set_page_dirty(page);
1502 } 968 }
1503done: 969done:
1504 *pagep = filepage; 970 /* Perhaps the file has been truncated since we checked */
1505 error = 0; 971 if (sgp != SGP_WRITE &&
1506 goto out; 972 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
973 error = -EINVAL;
974 goto trunc;
975 }
976 *pagep = page;
977 return 0;
1507 978
1508nospace:
1509 /* 979 /*
1510 * Perhaps the page was brought in from swap between find_lock_page 980 * Error recovery.
1511 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512 * but must also avoid reporting a spurious ENOSPC while working on a
1513 * full tmpfs. (When filepage has been passed in to shmem_getpage, it
1514 * is already in page cache, which prevents this race from occurring.)
1515 */ 981 */
1516 if (!filepage) { 982trunc:
1517 struct page *page = find_get_page(mapping, idx); 983 ClearPageDirty(page);
1518 if (page) { 984 delete_from_page_cache(page);
1519 spin_unlock(&info->lock); 985 spin_lock(&info->lock);
1520 page_cache_release(page); 986 info->alloced--;
1521 goto repeat; 987 inode->i_blocks -= BLOCKS_PER_PAGE;
1522 }
1523 }
1524 spin_unlock(&info->lock); 988 spin_unlock(&info->lock);
1525 error = -ENOSPC; 989decused:
990 if (sbinfo->max_blocks)
991 percpu_counter_add(&sbinfo->used_blocks, -1);
992unacct:
993 shmem_unacct_blocks(info->flags, 1);
1526failed: 994failed:
1527 if (*pagep != filepage) { 995 if (swap.val && error != -EINVAL) {
1528 unlock_page(filepage); 996 struct page *test = find_get_page(mapping, index);
1529 page_cache_release(filepage); 997 if (test && !radix_tree_exceptional_entry(test))
998 page_cache_release(test);
999 /* Have another try if the entry has changed */
1000 if (test != swp_to_radix_entry(swap))
1001 error = -EEXIST;
1530 } 1002 }
1531out: 1003 if (page) {
1532 if (prealloc_page) { 1004 unlock_page(page);
1533 mem_cgroup_uncharge_cache_page(prealloc_page); 1005 page_cache_release(page);
1534 page_cache_release(prealloc_page);
1535 } 1006 }
1007 if (error == -ENOSPC && !once++) {
1008 info = SHMEM_I(inode);
1009 spin_lock(&info->lock);
1010 shmem_recalc_inode(inode);
1011 spin_unlock(&info->lock);
1012 goto repeat;
1013 }
1014 if (error == -EEXIST)
1015 goto repeat;
1536 return error; 1016 return error;
1537} 1017}
1538 1018
@@ -1540,36 +1020,34 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1540{ 1020{
1541 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1021 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1542 int error; 1022 int error;
1543 int ret; 1023 int ret = VM_FAULT_LOCKED;
1544
1545 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1546 return VM_FAULT_SIGBUS;
1547 1024
1548 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1025 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1549 if (error) 1026 if (error)
1550 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1027 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1028
1551 if (ret & VM_FAULT_MAJOR) { 1029 if (ret & VM_FAULT_MAJOR) {
1552 count_vm_event(PGMAJFAULT); 1030 count_vm_event(PGMAJFAULT);
1553 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1031 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 } 1032 }
1555 return ret | VM_FAULT_LOCKED; 1033 return ret;
1556} 1034}
1557 1035
1558#ifdef CONFIG_NUMA 1036#ifdef CONFIG_NUMA
1559static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1037static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1560{ 1038{
1561 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1039 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1562 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1040 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1563} 1041}
1564 1042
1565static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1043static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1566 unsigned long addr) 1044 unsigned long addr)
1567{ 1045{
1568 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1046 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1569 unsigned long idx; 1047 pgoff_t index;
1570 1048
1571 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1049 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1572 return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); 1050 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1573} 1051}
1574#endif 1052#endif
1575 1053
@@ -1590,6 +1068,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
1590 user_shm_unlock(inode->i_size, user); 1068 user_shm_unlock(inode->i_size, user);
1591 info->flags &= ~VM_LOCKED; 1069 info->flags &= ~VM_LOCKED;
1592 mapping_clear_unevictable(file->f_mapping); 1070 mapping_clear_unevictable(file->f_mapping);
1071 /*
1072 * Ensure that a racing putback_lru_page() can see that
1073 * the pages of this mapping are evictable when we
1074 * skip them due to !PageLRU during the scan.
1075 */
1076 smp_mb__after_clear_bit();
1593 scan_mapping_unevictable_pages(file->f_mapping); 1077 scan_mapping_unevictable_pages(file->f_mapping);
1594 } 1078 }
1595 retval = 0; 1079 retval = 0;
@@ -1667,20 +1151,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1667 1151
1668#ifdef CONFIG_TMPFS 1152#ifdef CONFIG_TMPFS
1669static const struct inode_operations shmem_symlink_inode_operations; 1153static const struct inode_operations shmem_symlink_inode_operations;
1670static const struct inode_operations shmem_symlink_inline_operations; 1154static const struct inode_operations shmem_short_symlink_operations;
1671
1672/*
1673 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1674 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1675 * below the loop driver, in the generic fashion that many filesystems support.
1676 */
1677static int shmem_readpage(struct file *file, struct page *page)
1678{
1679 struct inode *inode = page->mapping->host;
1680 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1681 unlock_page(page);
1682 return error;
1683}
1684 1155
1685static int 1156static int
1686shmem_write_begin(struct file *file, struct address_space *mapping, 1157shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1689,7 +1160,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1689{ 1160{
1690 struct inode *inode = mapping->host; 1161 struct inode *inode = mapping->host;
1691 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1162 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1692 *pagep = NULL;
1693 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1163 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1694} 1164}
1695 1165
@@ -1714,7 +1184,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1714{ 1184{
1715 struct inode *inode = filp->f_path.dentry->d_inode; 1185 struct inode *inode = filp->f_path.dentry->d_inode;
1716 struct address_space *mapping = inode->i_mapping; 1186 struct address_space *mapping = inode->i_mapping;
1717 unsigned long index, offset; 1187 pgoff_t index;
1188 unsigned long offset;
1718 enum sgp_type sgp = SGP_READ; 1189 enum sgp_type sgp = SGP_READ;
1719 1190
1720 /* 1191 /*
@@ -1730,7 +1201,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1730 1201
1731 for (;;) { 1202 for (;;) {
1732 struct page *page = NULL; 1203 struct page *page = NULL;
1733 unsigned long end_index, nr, ret; 1204 pgoff_t end_index;
1205 unsigned long nr, ret;
1734 loff_t i_size = i_size_read(inode); 1206 loff_t i_size = i_size_read(inode);
1735 1207
1736 end_index = i_size >> PAGE_CACHE_SHIFT; 1208 end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1846,6 +1318,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1846 return retval; 1318 return retval;
1847} 1319}
1848 1320
1321static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1322 struct pipe_inode_info *pipe, size_t len,
1323 unsigned int flags)
1324{
1325 struct address_space *mapping = in->f_mapping;
1326 struct inode *inode = mapping->host;
1327 unsigned int loff, nr_pages, req_pages;
1328 struct page *pages[PIPE_DEF_BUFFERS];
1329 struct partial_page partial[PIPE_DEF_BUFFERS];
1330 struct page *page;
1331 pgoff_t index, end_index;
1332 loff_t isize, left;
1333 int error, page_nr;
1334 struct splice_pipe_desc spd = {
1335 .pages = pages,
1336 .partial = partial,
1337 .flags = flags,
1338 .ops = &page_cache_pipe_buf_ops,
1339 .spd_release = spd_release_page,
1340 };
1341
1342 isize = i_size_read(inode);
1343 if (unlikely(*ppos >= isize))
1344 return 0;
1345
1346 left = isize - *ppos;
1347 if (unlikely(left < len))
1348 len = left;
1349
1350 if (splice_grow_spd(pipe, &spd))
1351 return -ENOMEM;
1352
1353 index = *ppos >> PAGE_CACHE_SHIFT;
1354 loff = *ppos & ~PAGE_CACHE_MASK;
1355 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1356 nr_pages = min(req_pages, pipe->buffers);
1357
1358 spd.nr_pages = find_get_pages_contig(mapping, index,
1359 nr_pages, spd.pages);
1360 index += spd.nr_pages;
1361 error = 0;
1362
1363 while (spd.nr_pages < nr_pages) {
1364 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1365 if (error)
1366 break;
1367 unlock_page(page);
1368 spd.pages[spd.nr_pages++] = page;
1369 index++;
1370 }
1371
1372 index = *ppos >> PAGE_CACHE_SHIFT;
1373 nr_pages = spd.nr_pages;
1374 spd.nr_pages = 0;
1375
1376 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1377 unsigned int this_len;
1378
1379 if (!len)
1380 break;
1381
1382 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1383 page = spd.pages[page_nr];
1384
1385 if (!PageUptodate(page) || page->mapping != mapping) {
1386 error = shmem_getpage(inode, index, &page,
1387 SGP_CACHE, NULL);
1388 if (error)
1389 break;
1390 unlock_page(page);
1391 page_cache_release(spd.pages[page_nr]);
1392 spd.pages[page_nr] = page;
1393 }
1394
1395 isize = i_size_read(inode);
1396 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1397 if (unlikely(!isize || index > end_index))
1398 break;
1399
1400 if (end_index == index) {
1401 unsigned int plen;
1402
1403 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1404 if (plen <= loff)
1405 break;
1406
1407 this_len = min(this_len, plen - loff);
1408 len = this_len;
1409 }
1410
1411 spd.partial[page_nr].offset = loff;
1412 spd.partial[page_nr].len = this_len;
1413 len -= this_len;
1414 loff = 0;
1415 spd.nr_pages++;
1416 index++;
1417 }
1418
1419 while (page_nr < nr_pages)
1420 page_cache_release(spd.pages[page_nr++]);
1421
1422 if (spd.nr_pages)
1423 error = splice_to_pipe(pipe, &spd);
1424
1425 splice_shrink_spd(pipe, &spd);
1426
1427 if (error > 0) {
1428 *ppos += error;
1429 file_accessed(in);
1430 }
1431 return error;
1432}
1433
1849static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1434static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1850{ 1435{
1851 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1436 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
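The index/offset arithmetic at the top of the new shmem_file_splice_read() above converts a byte position into a first page index, an offset within that page, and the number of pages the request spans. A minimal userspace sketch of the same computation, assuming a 4096-byte page; the SKETCH_* names are stand-ins for the kernel's PAGE_CACHE_* macros, not real symbols:

#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12
#define SKETCH_PAGE_SIZE  (1UL << SKETCH_PAGE_SHIFT)
#define SKETCH_PAGE_MASK  (~(SKETCH_PAGE_SIZE - 1))

int main(void)
{
	unsigned long long ppos = 10000;	/* byte offset, like *ppos */
	unsigned long len = 9000;		/* bytes requested by splice */

	/* Mirrors: index = *ppos >> PAGE_CACHE_SHIFT;
	 *          loff  = *ppos & ~PAGE_CACHE_MASK;
	 *          req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	 */
	unsigned long index = ppos >> SKETCH_PAGE_SHIFT;
	unsigned long loff = ppos & ~SKETCH_PAGE_MASK;
	unsigned long req_pages =
		(len + loff + SKETCH_PAGE_SIZE - 1) >> SKETCH_PAGE_SHIFT;

	printf("index=%lu loff=%lu req_pages=%lu\n", index, loff, req_pages);
	return 0;	/* prints index=2 loff=1808 req_pages=3 */
}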
@@ -1855,8 +1440,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1855 buf->f_namelen = NAME_MAX; 1440 buf->f_namelen = NAME_MAX;
1856 if (sbinfo->max_blocks) { 1441 if (sbinfo->max_blocks) {
1857 buf->f_blocks = sbinfo->max_blocks; 1442 buf->f_blocks = sbinfo->max_blocks;
1858 buf->f_bavail = buf->f_bfree = 1443 buf->f_bavail =
1859 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); 1444 buf->f_bfree = sbinfo->max_blocks -
1445 percpu_counter_sum(&sbinfo->used_blocks);
1860 } 1446 }
1861 if (sbinfo->max_inodes) { 1447 if (sbinfo->max_inodes) {
1862 buf->f_files = sbinfo->max_inodes; 1448 buf->f_files = sbinfo->max_inodes;
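shmem_statfs() above reports f_bfree as max_blocks minus percpu_counter_sum(&sbinfo->used_blocks), the exact read that folds every cpu's delta back in. A rough userspace model of that kind of counter, with invented names, only to show why the exact sum has to walk all cpus while updates stay cheap:

#include <stdio.h>

#define NCPUS 4

struct pcpu_counter {
	long shared;		/* already folded-in portion */
	long delta[NCPUS];	/* per-cpu unfolded deltas */
};

static void pcpu_add(struct pcpu_counter *c, int cpu, long amount)
{
	c->delta[cpu] += amount;	/* cheap: touches only this cpu's slot */
}

static long pcpu_sum(const struct pcpu_counter *c)
{
	long sum = c->shared;

	for (int cpu = 0; cpu < NCPUS; cpu++)
		sum += c->delta[cpu];	/* exact, but walks every cpu */
	return sum;
}

int main(void)
{
	struct pcpu_counter used = { 0, { 0 } };
	long max_blocks = 1024;

	pcpu_add(&used, 0, 10);
	pcpu_add(&used, 2, 5);
	printf("f_blocks=%ld f_bfree=%ld\n",
	       max_blocks, max_blocks - pcpu_sum(&used));
	return 0;
}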
@@ -1878,7 +1464,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1878 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1464 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1879 if (inode) { 1465 if (inode) {
1880 error = security_inode_init_security(inode, dir, 1466 error = security_inode_init_security(inode, dir,
1881 &dentry->d_name, NULL, 1467 &dentry->d_name,
1882 NULL, NULL); 1468 NULL, NULL);
1883 if (error) { 1469 if (error) {
1884 if (error != -EOPNOTSUPP) { 1470 if (error != -EOPNOTSUPP) {
@@ -2006,7 +1592,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2006 int error; 1592 int error;
2007 int len; 1593 int len;
2008 struct inode *inode; 1594 struct inode *inode;
2009 struct page *page = NULL; 1595 struct page *page;
2010 char *kaddr; 1596 char *kaddr;
2011 struct shmem_inode_info *info; 1597 struct shmem_inode_info *info;
2012 1598
@@ -2018,7 +1604,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2018 if (!inode) 1604 if (!inode)
2019 return -ENOSPC; 1605 return -ENOSPC;
2020 1606
2021 error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, 1607 error = security_inode_init_security(inode, dir, &dentry->d_name,
2022 NULL, NULL); 1608 NULL, NULL);
2023 if (error) { 1609 if (error) {
2024 if (error != -EOPNOTSUPP) { 1610 if (error != -EOPNOTSUPP) {
@@ -2030,10 +1616,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2030 1616
2031 info = SHMEM_I(inode); 1617 info = SHMEM_I(inode);
2032 inode->i_size = len-1; 1618 inode->i_size = len-1;
2033 if (len <= SHMEM_SYMLINK_INLINE_LEN) { 1619 if (len <= SHORT_SYMLINK_LEN) {
2034 /* do it inline */ 1620 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2035 memcpy(info->inline_symlink, symname, len); 1621 if (!info->symlink) {
2036 inode->i_op = &shmem_symlink_inline_operations; 1622 iput(inode);
1623 return -ENOMEM;
1624 }
1625 inode->i_op = &shmem_short_symlink_operations;
2037 } else { 1626 } else {
2038 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 1627 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2039 if (error) { 1628 if (error) {
@@ -2056,17 +1645,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2056 return 0; 1645 return 0;
2057} 1646}
2058 1647
2059static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1648static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2060{ 1649{
2061 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); 1650 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2062 return NULL; 1651 return NULL;
2063} 1652}
2064 1653
2065static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1654static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2066{ 1655{
2067 struct page *page = NULL; 1656 struct page *page = NULL;
2068 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1657 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2069 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1658 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2070 if (page) 1659 if (page)
2071 unlock_page(page); 1660 unlock_page(page);
2072 return page; 1661 return page;
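The symlink change earlier in this patch stores short targets with kmemdup() and sends longer ones through the page cache, with shmem_follow_short_symlink() simply handing the copied string to nd_set_link(). A userspace sketch of that length-based split; SHORT_LINK_MAX and the function name are illustrative only, and unlike this sketch the real code reports allocation failure as -ENOMEM rather than falling back:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SHORT_LINK_MAX 128	/* stand-in for SHORT_SYMLINK_LEN */

/* Heap-copy a short symlink target (including its NUL), as kmemdup() does;
 * return NULL when the caller should use the page-backed path instead. */
static char *dup_short_target(const char *symname)
{
	size_t len = strlen(symname) + 1;
	char *copy;

	if (len > SHORT_LINK_MAX)
		return NULL;		/* long target: store it in a page */
	copy = malloc(len);
	if (copy)
		memcpy(copy, symname, len);
	return copy;
}

int main(void)
{
	char *target = dup_short_target("../lib/libexample.so");

	puts(target ? target : "(page-backed or out of memory)");
	free(target);
	return 0;
}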
@@ -2177,7 +1766,6 @@ out:
2177 return err; 1766 return err;
2178} 1767}
2179 1768
2180
2181static const struct xattr_handler *shmem_xattr_handlers[] = { 1769static const struct xattr_handler *shmem_xattr_handlers[] = {
2182#ifdef CONFIG_TMPFS_POSIX_ACL 1770#ifdef CONFIG_TMPFS_POSIX_ACL
2183 &generic_acl_access_handler, 1771 &generic_acl_access_handler,
@@ -2307,9 +1895,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2307} 1895}
2308#endif /* CONFIG_TMPFS_XATTR */ 1896#endif /* CONFIG_TMPFS_XATTR */
2309 1897
2310static const struct inode_operations shmem_symlink_inline_operations = { 1898static const struct inode_operations shmem_short_symlink_operations = {
2311 .readlink = generic_readlink, 1899 .readlink = generic_readlink,
2312 .follow_link = shmem_follow_link_inline, 1900 .follow_link = shmem_follow_short_symlink,
2313#ifdef CONFIG_TMPFS_XATTR 1901#ifdef CONFIG_TMPFS_XATTR
2314 .setxattr = shmem_setxattr, 1902 .setxattr = shmem_setxattr,
2315 .getxattr = shmem_getxattr, 1903 .getxattr = shmem_getxattr,
@@ -2509,8 +2097,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2509 if (config.max_inodes < inodes) 2097 if (config.max_inodes < inodes)
2510 goto out; 2098 goto out;
2511 /* 2099 /*
2512 * Those tests also disallow limited->unlimited while any are in 2100 * Those tests disallow limited->unlimited while any are in use;
2513 * use, so i_blocks will always be zero when max_blocks is zero;
2514 * but we must separately disallow unlimited->limited, because 2101 * but we must separately disallow unlimited->limited, because
2515 * in that case we have no record of how much is already in use. 2102 * in that case we have no record of how much is already in use.
2516 */ 2103 */
@@ -2602,7 +2189,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2602 goto failed; 2189 goto failed;
2603 sbinfo->free_inodes = sbinfo->max_inodes; 2190 sbinfo->free_inodes = sbinfo->max_inodes;
2604 2191
2605 sb->s_maxbytes = SHMEM_MAX_BYTES; 2192 sb->s_maxbytes = MAX_LFS_FILESIZE;
2606 sb->s_blocksize = PAGE_CACHE_SIZE; 2193 sb->s_blocksize = PAGE_CACHE_SIZE;
2607 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2194 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2608 sb->s_magic = TMPFS_MAGIC; 2195 sb->s_magic = TMPFS_MAGIC;
@@ -2637,14 +2224,14 @@ static struct kmem_cache *shmem_inode_cachep;
2637 2224
2638static struct inode *shmem_alloc_inode(struct super_block *sb) 2225static struct inode *shmem_alloc_inode(struct super_block *sb)
2639{ 2226{
2640 struct shmem_inode_info *p; 2227 struct shmem_inode_info *info;
2641 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2228 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2642 if (!p) 2229 if (!info)
2643 return NULL; 2230 return NULL;
2644 return &p->vfs_inode; 2231 return &info->vfs_inode;
2645} 2232}
2646 2233
2647static void shmem_i_callback(struct rcu_head *head) 2234static void shmem_destroy_callback(struct rcu_head *head)
2648{ 2235{
2649 struct inode *inode = container_of(head, struct inode, i_rcu); 2236 struct inode *inode = container_of(head, struct inode, i_rcu);
2650 INIT_LIST_HEAD(&inode->i_dentry); 2237 INIT_LIST_HEAD(&inode->i_dentry);
@@ -2653,29 +2240,26 @@ static void shmem_i_callback(struct rcu_head *head)
2653 2240
2654static void shmem_destroy_inode(struct inode *inode) 2241static void shmem_destroy_inode(struct inode *inode)
2655{ 2242{
2656 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2243 if ((inode->i_mode & S_IFMT) == S_IFREG)
2657 /* only struct inode is valid if it's an inline symlink */
2658 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2244 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2659 } 2245 call_rcu(&inode->i_rcu, shmem_destroy_callback);
2660 call_rcu(&inode->i_rcu, shmem_i_callback);
2661} 2246}
2662 2247
2663static void init_once(void *foo) 2248static void shmem_init_inode(void *foo)
2664{ 2249{
2665 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2250 struct shmem_inode_info *info = foo;
2666 2251 inode_init_once(&info->vfs_inode);
2667 inode_init_once(&p->vfs_inode);
2668} 2252}
2669 2253
2670static int init_inodecache(void) 2254static int shmem_init_inodecache(void)
2671{ 2255{
2672 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2256 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2673 sizeof(struct shmem_inode_info), 2257 sizeof(struct shmem_inode_info),
2674 0, SLAB_PANIC, init_once); 2258 0, SLAB_PANIC, shmem_init_inode);
2675 return 0; 2259 return 0;
2676} 2260}
2677 2261
2678static void destroy_inodecache(void) 2262static void shmem_destroy_inodecache(void)
2679{ 2263{
2680 kmem_cache_destroy(shmem_inode_cachep); 2264 kmem_cache_destroy(shmem_inode_cachep);
2681} 2265}
@@ -2684,7 +2268,6 @@ static const struct address_space_operations shmem_aops = {
2684 .writepage = shmem_writepage, 2268 .writepage = shmem_writepage,
2685 .set_page_dirty = __set_page_dirty_no_writeback, 2269 .set_page_dirty = __set_page_dirty_no_writeback,
2686#ifdef CONFIG_TMPFS 2270#ifdef CONFIG_TMPFS
2687 .readpage = shmem_readpage,
2688 .write_begin = shmem_write_begin, 2271 .write_begin = shmem_write_begin,
2689 .write_end = shmem_write_end, 2272 .write_end = shmem_write_end,
2690#endif 2273#endif
@@ -2701,7 +2284,7 @@ static const struct file_operations shmem_file_operations = {
2701 .aio_read = shmem_file_aio_read, 2284 .aio_read = shmem_file_aio_read,
2702 .aio_write = generic_file_aio_write, 2285 .aio_write = generic_file_aio_write,
2703 .fsync = noop_fsync, 2286 .fsync = noop_fsync,
2704 .splice_read = generic_file_splice_read, 2287 .splice_read = shmem_file_splice_read,
2705 .splice_write = generic_file_splice_write, 2288 .splice_write = generic_file_splice_write,
2706#endif 2289#endif
2707}; 2290};
@@ -2715,10 +2298,6 @@ static const struct inode_operations shmem_inode_operations = {
2715 .listxattr = shmem_listxattr, 2298 .listxattr = shmem_listxattr,
2716 .removexattr = shmem_removexattr, 2299 .removexattr = shmem_removexattr,
2717#endif 2300#endif
2718#ifdef CONFIG_TMPFS_POSIX_ACL
2719 .check_acl = generic_check_acl,
2720#endif
2721
2722}; 2301};
2723 2302
2724static const struct inode_operations shmem_dir_inode_operations = { 2303static const struct inode_operations shmem_dir_inode_operations = {
@@ -2741,7 +2320,6 @@ static const struct inode_operations shmem_dir_inode_operations = {
2741#endif 2320#endif
2742#ifdef CONFIG_TMPFS_POSIX_ACL 2321#ifdef CONFIG_TMPFS_POSIX_ACL
2743 .setattr = shmem_setattr, 2322 .setattr = shmem_setattr,
2744 .check_acl = generic_check_acl,
2745#endif 2323#endif
2746}; 2324};
2747 2325
@@ -2754,7 +2332,6 @@ static const struct inode_operations shmem_special_inode_operations = {
2754#endif 2332#endif
2755#ifdef CONFIG_TMPFS_POSIX_ACL 2333#ifdef CONFIG_TMPFS_POSIX_ACL
2756 .setattr = shmem_setattr, 2334 .setattr = shmem_setattr,
2757 .check_acl = generic_check_acl,
2758#endif 2335#endif
2759}; 2336};
2760 2337
@@ -2779,21 +2356,20 @@ static const struct vm_operations_struct shmem_vm_ops = {
2779#endif 2356#endif
2780}; 2357};
2781 2358
2782
2783static struct dentry *shmem_mount(struct file_system_type *fs_type, 2359static struct dentry *shmem_mount(struct file_system_type *fs_type,
2784 int flags, const char *dev_name, void *data) 2360 int flags, const char *dev_name, void *data)
2785{ 2361{
2786 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2362 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2787} 2363}
2788 2364
2789static struct file_system_type tmpfs_fs_type = { 2365static struct file_system_type shmem_fs_type = {
2790 .owner = THIS_MODULE, 2366 .owner = THIS_MODULE,
2791 .name = "tmpfs", 2367 .name = "tmpfs",
2792 .mount = shmem_mount, 2368 .mount = shmem_mount,
2793 .kill_sb = kill_litter_super, 2369 .kill_sb = kill_litter_super,
2794}; 2370};
2795 2371
2796int __init init_tmpfs(void) 2372int __init shmem_init(void)
2797{ 2373{
2798 int error; 2374 int error;
2799 2375
@@ -2801,18 +2377,18 @@ int __init init_tmpfs(void)
2801 if (error) 2377 if (error)
2802 goto out4; 2378 goto out4;
2803 2379
2804 error = init_inodecache(); 2380 error = shmem_init_inodecache();
2805 if (error) 2381 if (error)
2806 goto out3; 2382 goto out3;
2807 2383
2808 error = register_filesystem(&tmpfs_fs_type); 2384 error = register_filesystem(&shmem_fs_type);
2809 if (error) { 2385 if (error) {
2810 printk(KERN_ERR "Could not register tmpfs\n"); 2386 printk(KERN_ERR "Could not register tmpfs\n");
2811 goto out2; 2387 goto out2;
2812 } 2388 }
2813 2389
2814 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, 2390 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
2815 tmpfs_fs_type.name, NULL); 2391 shmem_fs_type.name, NULL);
2816 if (IS_ERR(shm_mnt)) { 2392 if (IS_ERR(shm_mnt)) {
2817 error = PTR_ERR(shm_mnt); 2393 error = PTR_ERR(shm_mnt);
2818 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2394 printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2821,9 +2397,9 @@ int __init init_tmpfs(void)
2821 return 0; 2397 return 0;
2822 2398
2823out1: 2399out1:
2824 unregister_filesystem(&tmpfs_fs_type); 2400 unregister_filesystem(&shmem_fs_type);
2825out2: 2401out2:
2826 destroy_inodecache(); 2402 shmem_destroy_inodecache();
2827out3: 2403out3:
2828 bdi_destroy(&shmem_backing_dev_info); 2404 bdi_destroy(&shmem_backing_dev_info);
2829out4: 2405out4:
@@ -2831,45 +2407,6 @@ out4:
2831 return error; 2407 return error;
2832} 2408}
2833 2409
2834#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2835/**
2836 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2837 * @inode: the inode to be searched
2838 * @pgoff: the offset to be searched
2839 * @pagep: the pointer for the found page to be stored
2840 * @ent: the pointer for the found swap entry to be stored
2841 *
2842 * If a page is found, refcount of it is incremented. Callers should handle
2843 * these refcount.
2844 */
2845void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2846 struct page **pagep, swp_entry_t *ent)
2847{
2848 swp_entry_t entry = { .val = 0 }, *ptr;
2849 struct page *page = NULL;
2850 struct shmem_inode_info *info = SHMEM_I(inode);
2851
2852 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2853 goto out;
2854
2855 spin_lock(&info->lock);
2856 ptr = shmem_swp_entry(info, pgoff, NULL);
2857#ifdef CONFIG_SWAP
2858 if (ptr && ptr->val) {
2859 entry.val = ptr->val;
2860 page = find_get_page(&swapper_space, entry.val);
2861 } else
2862#endif
2863 page = find_get_page(inode->i_mapping, pgoff);
2864 if (ptr)
2865 shmem_swp_unmap(ptr);
2866 spin_unlock(&info->lock);
2867out:
2868 *pagep = page;
2869 *ent = entry;
2870}
2871#endif
2872
2873#else /* !CONFIG_SHMEM */ 2410#else /* !CONFIG_SHMEM */
2874 2411
2875/* 2412/*
@@ -2883,23 +2420,23 @@ out:
2883 2420
2884#include <linux/ramfs.h> 2421#include <linux/ramfs.h>
2885 2422
2886static struct file_system_type tmpfs_fs_type = { 2423static struct file_system_type shmem_fs_type = {
2887 .name = "tmpfs", 2424 .name = "tmpfs",
2888 .mount = ramfs_mount, 2425 .mount = ramfs_mount,
2889 .kill_sb = kill_litter_super, 2426 .kill_sb = kill_litter_super,
2890}; 2427};
2891 2428
2892int __init init_tmpfs(void) 2429int __init shmem_init(void)
2893{ 2430{
2894 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2431 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2895 2432
2896 shm_mnt = kern_mount(&tmpfs_fs_type); 2433 shm_mnt = kern_mount(&shmem_fs_type);
2897 BUG_ON(IS_ERR(shm_mnt)); 2434 BUG_ON(IS_ERR(shm_mnt));
2898 2435
2899 return 0; 2436 return 0;
2900} 2437}
2901 2438
2902int shmem_unuse(swp_entry_t entry, struct page *page) 2439int shmem_unuse(swp_entry_t swap, struct page *page)
2903{ 2440{
2904 return 0; 2441 return 0;
2905} 2442}
@@ -2909,43 +2446,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2909 return 0; 2446 return 0;
2910} 2447}
2911 2448
2912void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 2449void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2913{ 2450{
2914 truncate_inode_pages_range(inode->i_mapping, start, end); 2451 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2915} 2452}
2916EXPORT_SYMBOL_GPL(shmem_truncate_range); 2453EXPORT_SYMBOL_GPL(shmem_truncate_range);
2917 2454
2918#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2919/**
2920 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2921 * @inode: the inode to be searched
2922 * @pgoff: the offset to be searched
2923 * @pagep: the pointer for the found page to be stored
2924 * @ent: the pointer for the found swap entry to be stored
2925 *
2926 * If a page is found, refcount of it is incremented. Callers should handle
2927 * these refcount.
2928 */
2929void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2930 struct page **pagep, swp_entry_t *ent)
2931{
2932 struct page *page = NULL;
2933
2934 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2935 goto out;
2936 page = find_get_page(inode->i_mapping, pgoff);
2937out:
2938 *pagep = page;
2939 *ent = (swp_entry_t){ .val = 0 };
2940}
2941#endif
2942
2943#define shmem_vm_ops generic_file_vm_ops 2455#define shmem_vm_ops generic_file_vm_ops
2944#define shmem_file_operations ramfs_file_operations 2456#define shmem_file_operations ramfs_file_operations
2945#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2457#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2946#define shmem_acct_size(flags, size) 0 2458#define shmem_acct_size(flags, size) 0
2947#define shmem_unacct_size(flags, size) do {} while (0) 2459#define shmem_unacct_size(flags, size) do {} while (0)
2948#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
2949 2460
2950#endif /* CONFIG_SHMEM */ 2461#endif /* CONFIG_SHMEM */
2951 2462
@@ -2969,7 +2480,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2969 if (IS_ERR(shm_mnt)) 2480 if (IS_ERR(shm_mnt))
2970 return (void *)shm_mnt; 2481 return (void *)shm_mnt;
2971 2482
2972 if (size < 0 || size > SHMEM_MAX_BYTES) 2483 if (size < 0 || size > MAX_LFS_FILESIZE)
2973 return ERR_PTR(-EINVAL); 2484 return ERR_PTR(-EINVAL);
2974 2485
2975 if (shmem_acct_size(flags, size)) 2486 if (shmem_acct_size(flags, size))
@@ -2992,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2992 2503
2993 d_instantiate(path.dentry, inode); 2504 d_instantiate(path.dentry, inode);
2994 inode->i_size = size; 2505 inode->i_size = size;
2995 inode->i_nlink = 0; /* It is unlinked */ 2506 clear_nlink(inode); /* It is unlinked */
2996#ifndef CONFIG_MMU 2507#ifndef CONFIG_MMU
2997 error = ramfs_nommu_expand_for_mapping(inode, size); 2508 error = ramfs_nommu_expand_for_mapping(inode, size);
2998 if (error) 2509 if (error)
@@ -3048,13 +2559,29 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those 2559 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 2560 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 * 2561 *
3051 * Provide a stub for those callers to start using now, then later 2562 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when 2563 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */ 2564 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 2565struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp) 2566 pgoff_t index, gfp_t gfp)
3057{ 2567{
2568#ifdef CONFIG_SHMEM
2569 struct inode *inode = mapping->host;
2570 struct page *page;
2571 int error;
2572
2573 BUG_ON(mapping->a_ops != &shmem_aops);
2574 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
2575 if (error)
2576 page = ERR_PTR(error);
2577 else
2578 unlock_page(page);
2579 return page;
2580#else
2581 /*
2582 * The tiny !SHMEM case uses ramfs without swap
2583 */
3058 return read_cache_page_gfp(mapping, index, gfp); 2584 return read_cache_page_gfp(mapping, index, gfp);
2585#endif
3059} 2586}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 2587EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
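The CONFIG_SHMEM branch of shmem_read_mapping_page_gfp() above hands failures back as ERR_PTR(error) rather than NULL, so callers such as the drm users mentioned in the comment can recover the errno from the pointer itself. A small userspace model of that convention; the helpers below imitate, but are not, the kernel's ERR_PTR()/IS_ERR()/PTR_ERR():

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long error)	{ return (void *)error; }
static inline long ptr_err(const void *ptr)	{ return (long)ptr; }
static inline int is_err(const void *ptr)
{
	/* Small negative errnos land in the top 4095 addresses. */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *lookup_or_populate(int want_failure)
{
	static int cached_object = 42;

	if (want_failure)
		return err_ptr(-ENOMEM);	/* mirrors page = ERR_PTR(error) */
	return &cached_object;
}

int main(void)
{
	void *p = lookup_or_populate(1);

	if (is_err(p))
		printf("error %ld\n", ptr_err(p));
	else
		printf("value %d\n", *(int *)p);
	return 0;
}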
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de775..708efe886154 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
575 575
576/* internal cache of cache description objs */ 576/* internal cache of cache description objs */
577static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
577static struct kmem_cache cache_cache = { 578static struct kmem_cache cache_cache = {
579 .nodelists = cache_cache_nodelists,
578 .batchcount = 1, 580 .batchcount = 1,
579 .limit = BOOT_CPUCACHE_ENTRIES, 581 .limit = BOOT_CPUCACHE_ENTRIES,
580 .shared = 1, 582 .shared = 1,
@@ -620,6 +622,51 @@ int slab_is_available(void)
620static struct lock_class_key on_slab_l3_key; 622static struct lock_class_key on_slab_l3_key;
621static struct lock_class_key on_slab_alc_key; 623static struct lock_class_key on_slab_alc_key;
622 624
625static struct lock_class_key debugobj_l3_key;
626static struct lock_class_key debugobj_alc_key;
627
628static void slab_set_lock_classes(struct kmem_cache *cachep,
629 struct lock_class_key *l3_key, struct lock_class_key *alc_key,
630 int q)
631{
632 struct array_cache **alc;
633 struct kmem_list3 *l3;
634 int r;
635
636 l3 = cachep->nodelists[q];
637 if (!l3)
638 return;
639
640 lockdep_set_class(&l3->list_lock, l3_key);
641 alc = l3->alien;
642 /*
643 * FIXME: This check for BAD_ALIEN_MAGIC
644 * should go away when common slab code is taught to
645 * work even without alien caches.
646 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 * for alloc_alien_cache,
648 */
649 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 return;
651 for_each_node(r) {
652 if (alc[r])
653 lockdep_set_class(&alc[r]->lock, alc_key);
654 }
655}
656
657static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
658{
659 slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
660}
661
662static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
663{
664 int node;
665
666 for_each_online_node(node)
667 slab_set_debugobj_lock_classes_node(cachep, node);
668}
669
623static void init_node_lock_keys(int q) 670static void init_node_lock_keys(int q)
624{ 671{
625 struct cache_sizes *s = malloc_sizes; 672 struct cache_sizes *s = malloc_sizes;
@@ -628,29 +675,14 @@ static void init_node_lock_keys(int q)
628 return; 675 return;
629 676
630 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 677 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
631 struct array_cache **alc;
632 struct kmem_list3 *l3; 678 struct kmem_list3 *l3;
633 int r;
634 679
635 l3 = s->cs_cachep->nodelists[q]; 680 l3 = s->cs_cachep->nodelists[q];
636 if (!l3 || OFF_SLAB(s->cs_cachep)) 681 if (!l3 || OFF_SLAB(s->cs_cachep))
637 continue; 682 continue;
638 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 683
639 alc = l3->alien; 684 slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
640 /* 685 &on_slab_alc_key, q);
641 * FIXME: This check for BAD_ALIEN_MAGIC
642 * should go away when common slab code is taught to
643 * work even without alien caches.
644 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
645 * for alloc_alien_cache,
646 */
647 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
648 continue;
649 for_each_node(r) {
650 if (alc[r])
651 lockdep_set_class(&alc[r]->lock,
652 &on_slab_alc_key);
653 }
654 } 686 }
655} 687}
656 688
@@ -669,6 +701,14 @@ static void init_node_lock_keys(int q)
669static inline void init_lock_keys(void) 701static inline void init_lock_keys(void)
670{ 702{
671} 703}
704
705static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
706{
707}
708
709static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
710{
711}
672#endif 712#endif
673 713
674/* 714/*
@@ -1262,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1262 spin_unlock_irq(&l3->list_lock); 1302 spin_unlock_irq(&l3->list_lock);
1263 kfree(shared); 1303 kfree(shared);
1264 free_alien_cache(alien); 1304 free_alien_cache(alien);
1305 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1306 slab_set_debugobj_lock_classes_node(cachep, node);
1265 } 1307 }
1266 init_node_lock_keys(node); 1308 init_node_lock_keys(node);
1267 1309
@@ -1492,11 +1534,10 @@ void __init kmem_cache_init(void)
1492 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1534 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1493 1535
1494 /* 1536 /*
1495 * struct kmem_cache size depends on nr_node_ids, which 1537 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1496 * can be less than MAX_NUMNODES.
1497 */ 1538 */
1498 cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + 1539 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1499 nr_node_ids * sizeof(struct kmem_list3 *); 1540 nr_node_ids * sizeof(struct kmem_list3 *);
1500#if DEBUG 1541#if DEBUG
1501 cache_cache.obj_size = cache_cache.buffer_size; 1542 cache_cache.obj_size = cache_cache.buffer_size;
1502#endif 1543#endif
@@ -1625,6 +1666,9 @@ void __init kmem_cache_init_late(void)
1625{ 1666{
1626 struct kmem_cache *cachep; 1667 struct kmem_cache *cachep;
1627 1668
1669 /* Annotate slab for lockdep -- annotate the malloc caches */
1670 init_lock_keys();
1671
1628 /* 6) resize the head arrays to their final sizes */ 1672 /* 6) resize the head arrays to their final sizes */
1629 mutex_lock(&cache_chain_mutex); 1673 mutex_lock(&cache_chain_mutex);
1630 list_for_each_entry(cachep, &cache_chain, next) 1674 list_for_each_entry(cachep, &cache_chain, next)
@@ -1635,9 +1679,6 @@ void __init kmem_cache_init_late(void)
1635 /* Done! */ 1679 /* Done! */
1636 g_cpucache_up = FULL; 1680 g_cpucache_up = FULL;
1637 1681
1638 /* Annotate slab for lockdep -- annotate the malloc caches */
1639 init_lock_keys();
1640
1641 /* 1682 /*
1642 * Register a cpu startup notifier callback that initializes 1683 * Register a cpu startup notifier callback that initializes
1643 * cpu_cache_get for all new cpus 1684 * cpu_cache_get for all new cpus
@@ -1810,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit)
1810 unsigned char error = 0; 1851 unsigned char error = 0;
1811 int bad_count = 0; 1852 int bad_count = 0;
1812 1853
1813 printk(KERN_ERR "%03x:", offset); 1854 printk(KERN_ERR "%03x: ", offset);
1814 for (i = 0; i < limit; i++) { 1855 for (i = 0; i < limit; i++) {
1815 if (data[offset + i] != POISON_FREE) { 1856 if (data[offset + i] != POISON_FREE) {
1816 error = data[offset + i]; 1857 error = data[offset + i];
1817 bad_count++; 1858 bad_count++;
1818 } 1859 }
1819 printk(" %02x", (unsigned char)data[offset + i]);
1820 } 1860 }
1821 printk("\n"); 1861 print_hex_dump(KERN_CONT, "", 0, 16, 1,
1862 &data[offset], limit, 1);
1822 1863
1823 if (bad_count == 1) { 1864 if (bad_count == 1) {
1824 error ^= POISON_FREE; 1865 error ^= POISON_FREE;
@@ -2308,6 +2349,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2308 if (!cachep) 2349 if (!cachep)
2309 goto oops; 2350 goto oops;
2310 2351
2352 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2311#if DEBUG 2353#if DEBUG
2312 cachep->obj_size = size; 2354 cachep->obj_size = size;
2313 2355
@@ -2424,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2424 goto oops; 2466 goto oops;
2425 } 2467 }
2426 2468
2469 if (flags & SLAB_DEBUG_OBJECTS) {
2470 /*
2471 * Would deadlock through slab_destroy()->call_rcu()->
2472 * debug_object_activate()->kmem_cache_alloc().
2473 */
2474 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2475
2476 slab_set_debugobj_lock_classes(cachep);
2477 }
2478
2427 /* cache setup completed, link it into the list */ 2479 /* cache setup completed, link it into the list */
2428 list_add(&cachep->next, &cache_chain); 2480 list_add(&cachep->next, &cache_chain);
2429oops: 2481oops:
@@ -2987,14 +3039,9 @@ bad:
2987 printk(KERN_ERR "slab: Internal list corruption detected in " 3039 printk(KERN_ERR "slab: Internal list corruption detected in "
2988 "cache '%s'(%d), slabp %p(%d). Hexdump:\n", 3040 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2989 cachep->name, cachep->num, slabp, slabp->inuse); 3041 cachep->name, cachep->num, slabp, slabp->inuse);
2990 for (i = 0; 3042 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
2991 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 3043 sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
2992 i++) { 3044 1);
2993 if (i % 16 == 0)
2994 printk("\n%03x:", i);
2995 printk(" %02x", ((unsigned char *)slabp)[i]);
2996 }
2997 printk("\n");
2998 BUG(); 3045 BUG();
2999 } 3046 }
3000} 3047}
@@ -3153,12 +3200,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3153 objp += obj_offset(cachep); 3200 objp += obj_offset(cachep);
3154 if (cachep->ctor && cachep->flags & SLAB_POISON) 3201 if (cachep->ctor && cachep->flags & SLAB_POISON)
3155 cachep->ctor(objp); 3202 cachep->ctor(objp);
3156#if ARCH_SLAB_MINALIGN 3203 if (ARCH_SLAB_MINALIGN &&
3157 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3204 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
3158 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3205 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3159 objp, ARCH_SLAB_MINALIGN); 3206 objp, (int)ARCH_SLAB_MINALIGN);
3160 } 3207 }
3161#endif
3162 return objp; 3208 return objp;
3163} 3209}
3164#else 3210#else
@@ -3402,7 +3448,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3402 cache_alloc_debugcheck_before(cachep, flags); 3448 cache_alloc_debugcheck_before(cachep, flags);
3403 local_irq_save(save_flags); 3449 local_irq_save(save_flags);
3404 3450
3405 if (nodeid == -1) 3451 if (nodeid == NUMA_NO_NODE)
3406 nodeid = slab_node; 3452 nodeid = slab_node;
3407 3453
3408 if (unlikely(!cachep->nodelists[nodeid])) { 3454 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3933,7 +3979,7 @@ fail:
3933 3979
3934struct ccupdate_struct { 3980struct ccupdate_struct {
3935 struct kmem_cache *cachep; 3981 struct kmem_cache *cachep;
3936 struct array_cache *new[NR_CPUS]; 3982 struct array_cache *new[0];
3937}; 3983};
3938 3984
3939static void do_ccupdate_local(void *info) 3985static void do_ccupdate_local(void *info)
@@ -3955,7 +4001,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3955 struct ccupdate_struct *new; 4001 struct ccupdate_struct *new;
3956 int i; 4002 int i;
3957 4003
3958 new = kzalloc(sizeof(*new), gfp); 4004 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
4005 gfp);
3959 if (!new) 4006 if (!new)
3960 return -ENOMEM; 4007 return -ENOMEM;
3961 4008
@@ -4532,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = {
4532 4579
4533static int __init slab_proc_init(void) 4580static int __init slab_proc_init(void)
4534{ 4581{
4535 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4582 proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
4536#ifdef CONFIG_DEBUG_SLAB_LEAK 4583#ifdef CONFIG_DEBUG_SLAB_LEAK
4537 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4584 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4538#endif 4585#endif
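The ccupdate_struct change above replaces a fixed new[NR_CPUS] array with a trailing array sized from nr_cpu_ids at allocation time, so small-cpu systems stop paying for the worst case. A userspace sketch of that "struct header plus trailing array" pattern using a C99 flexible array member (all names here are invented):

#include <stdio.h>
#include <stdlib.h>

struct percpu_update {
	const char *name;
	void *slot[];		/* one pointer per cpu, sized at allocation */
};

static struct percpu_update *percpu_update_alloc(const char *name, int nr_cpus)
{
	struct percpu_update *u;

	/* Same shape as kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(...)). */
	u = calloc(1, sizeof(*u) + (size_t)nr_cpus * sizeof(u->slot[0]));
	if (!u)
		return NULL;
	u->name = name;
	return u;
}

int main(void)
{
	int nr_cpus = 8;	/* stands in for nr_cpu_ids */
	struct percpu_update *u = percpu_update_alloc("demo", nr_cpus);

	if (!u)
		return 1;
	printf("%s: %d slots, %zu bytes\n", u->name, nr_cpus,
	       sizeof(*u) + nr_cpus * sizeof(u->slot[0]));
	free(u);
	return 0;
}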
diff --git a/mm/slob.c b/mm/slob.c
index 46e0aee33a23..8105be42cad1 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -63,14 +63,14 @@
63#include <linux/swap.h> /* struct reclaim_state */ 63#include <linux/swap.h> /* struct reclaim_state */
64#include <linux/cache.h> 64#include <linux/cache.h>
65#include <linux/init.h> 65#include <linux/init.h>
66#include <linux/module.h> 66#include <linux/export.h>
67#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
68#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/kmemleak.h> 69#include <linux/kmemleak.h>
70 70
71#include <trace/events/kmem.h> 71#include <trace/events/kmem.h>
72 72
73#include <asm/atomic.h> 73#include <linux/atomic.h>
74 74
75/* 75/*
76 * slob_block has a field 'units', which indicates size of block if +ve, 76 * slob_block has a field 'units', which indicates size of block if +ve,
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
483 void *ret; 483 void *ret;
484 484
485 gfp &= gfp_allowed_mask;
486
485 lockdep_trace_alloc(gfp); 487 lockdep_trace_alloc(gfp);
486 488
487 if (size < PAGE_SIZE - align) { 489 if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
608{ 610{
609 void *b; 611 void *b;
610 612
613 flags &= gfp_allowed_mask;
614
615 lockdep_trace_alloc(flags);
616
611 if (c->size < PAGE_SIZE) { 617 if (c->size < PAGE_SIZE) {
612 b = slob_alloc(c->size, flags, c->align, node); 618 b = slob_alloc(c->size, flags, c->align, node);
613 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 619 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
diff --git a/mm/slub.c b/mm/slub.c
index 35f351f26193..7d2a996c307e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2,10 +2,11 @@
2 * SLUB: A slab allocator that limits cache line use instead of queuing 2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists. 3 * objects in per cpu and per node lists.
4 * 4 *
5 * The allocator synchronizes using per slab locks and only 5 * The allocator synchronizes using per slab locks or atomic operations
6 * uses a centralized lock to manage a pool of partial slabs. 6 * and only uses a centralized lock to manage a pool of partial slabs.
7 * 7 *
8 * (C) 2007 SGI, Christoph Lameter 8 * (C) 2007 SGI, Christoph Lameter
9 * (C) 2011 Linux Foundation, Christoph Lameter
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -27,20 +28,33 @@
27#include <linux/memory.h> 28#include <linux/memory.h>
28#include <linux/math64.h> 29#include <linux/math64.h>
29#include <linux/fault-inject.h> 30#include <linux/fault-inject.h>
31#include <linux/stacktrace.h>
30 32
31#include <trace/events/kmem.h> 33#include <trace/events/kmem.h>
32 34
33/* 35/*
34 * Lock order: 36 * Lock order:
35 * 1. slab_lock(page) 37 * 1. slub_lock (Global Semaphore)
36 * 2. slab->list_lock 38 * 2. node->list_lock
39 * 3. slab_lock(page) (Only on some arches and for debugging)
37 * 40 *
38 * The slab_lock protects operations on the object of a particular 41 * slub_lock
39 * slab and its metadata in the page struct. If the slab lock 42 *
40 * has been taken then no allocations nor frees can be performed 43 * The role of the slub_lock is to protect the list of all the slabs
41 * on the objects in the slab nor can the slab be added or removed 44 * and to synchronize major metadata changes to slab cache structures.
42 * from the partial or full lists since this would mean modifying 45 *
43 * the page_struct of the slab. 46 * The slab_lock is only used for debugging and on arches that do not
47 * have the ability to do a cmpxchg_double. It only protects the second
48 * double word in the page struct. Meaning:
49 * A. page->freelist -> List of free objects in a page
50 * B. page->counters -> Counters of objects
51 * C. page->frozen -> frozen state
52 *
53 * If a slab is frozen then it is exempt from list management. It is not
54 * on any list. The processor that froze the slab is the one who can
55 * perform list operations on the page. Other processors may put objects
56 * onto the freelist but the processor that froze the slab is the only
57 * one that can retrieve the objects from the page's freelist.
44 * 58 *
45 * The list_lock protects the partial and full list on each node and 59 * The list_lock protects the partial and full list on each node and
46 * the partial slab counter. If taken then no new slabs may be added or 60 * the partial slab counter. If taken then no new slabs may be added or
@@ -53,20 +67,6 @@
53 * slabs, operations can continue without any centralized lock. F.e. 67 * slabs, operations can continue without any centralized lock. F.e.
54 * allocating a long series of objects that fill up slabs does not require 68 * allocating a long series of objects that fill up slabs does not require
55 * the list lock. 69 * the list lock.
56 *
57 * The lock order is sometimes inverted when we are trying to get a slab
58 * off a list. We take the list_lock and then look for a page on the list
59 * to use. While we do that objects in the slabs may be freed. We can
60 * only operate on the slab if we have also taken the slab_lock. So we use
61 * a slab_trylock() on the slab. If trylock was successful then no frees
62 * can occur anymore and we can use the slab for allocations etc. If the
63 * slab_trylock() does not succeed then frees are in progress in the slab and
64 * we must stay away from it for a while since we may cause a bouncing
65 * cacheline if we try to acquire the lock. So go onto the next slab.
66 * If all pages are busy then we may allocate a new slab instead of reusing
67 * a partial slab. A new slab has no one operating on it and thus there is
68 * no danger of cacheline contention.
69 *
70 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
71 * make the slab allocator safe to use in the context of an irq. In addition 71 * make the slab allocator safe to use in the context of an irq. In addition
72 * interrupts are disabled to ensure that the processor does not change 72 * interrupts are disabled to ensure that the processor does not change
@@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
131/* Enable to test recovery from slab corruption on boot */ 131/* Enable to test recovery from slab corruption on boot */
132#undef SLUB_RESILIENCY_TEST 132#undef SLUB_RESILIENCY_TEST
133 133
134/* Enable to log cmpxchg failures */
135#undef SLUB_DEBUG_CMPXCHG
136
134/* 137/*
135 * Mininum number of partial slabs. These will be left on the partial 138 * Mininum number of partial slabs. These will be left on the partial
136 * lists even if they are empty. kmem_cache_shrink may reclaim them. 139 * lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -166,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
166 169
167#define OO_SHIFT 16 170#define OO_SHIFT 16
168#define OO_MASK ((1 << OO_SHIFT) - 1) 171#define OO_MASK ((1 << OO_SHIFT) - 1)
169#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 172#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
170 173
171/* Internal SLUB flags */ 174/* Internal SLUB flags */
172#define __OBJECT_POISON 0x80000000UL /* Poison object */ 175#define __OBJECT_POISON 0x80000000UL /* Poison object */
176#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
173 177
174static int kmem_size = sizeof(struct kmem_cache); 178static int kmem_size = sizeof(struct kmem_cache);
175 179
@@ -191,8 +195,12 @@ static LIST_HEAD(slab_caches);
191/* 195/*
192 * Tracking user of a slab. 196 * Tracking user of a slab.
193 */ 197 */
198#define TRACK_ADDRS_COUNT 16
194struct track { 199struct track {
195 unsigned long addr; /* Called from address */ 200 unsigned long addr; /* Called from address */
201#ifdef CONFIG_STACKTRACE
202 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
203#endif
196 int cpu; /* Was running on cpu */ 204 int cpu; /* Was running on cpu */
197 int pid; /* Pid context */ 205 int pid; /* Pid context */
198 unsigned long when; /* When did the operation occur */ 206 unsigned long when; /* When did the operation occur */
@@ -338,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
338 return x.x & OO_MASK; 346 return x.x & OO_MASK;
339} 347}
340 348
349/*
350 * Per slab locking using the pagelock
351 */
352static __always_inline void slab_lock(struct page *page)
353{
354 bit_spin_lock(PG_locked, &page->flags);
355}
356
357static __always_inline void slab_unlock(struct page *page)
358{
359 __bit_spin_unlock(PG_locked, &page->flags);
360}
361
362/* Interrupts must be disabled (for the fallback code to work right) */
363static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
364 void *freelist_old, unsigned long counters_old,
365 void *freelist_new, unsigned long counters_new,
366 const char *n)
367{
368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE
370 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist,
372 freelist_old, counters_old,
373 freelist_new, counters_new))
374 return 1;
375 } else
376#endif
377 {
378 slab_lock(page);
379 if (page->freelist == freelist_old && page->counters == counters_old) {
380 page->freelist = freelist_new;
381 page->counters = counters_new;
382 slab_unlock(page);
383 return 1;
384 }
385 slab_unlock(page);
386 }
387
388 cpu_relax();
389 stat(s, CMPXCHG_DOUBLE_FAIL);
390
391#ifdef SLUB_DEBUG_CMPXCHG
392 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
393#endif
394
395 return 0;
396}
397
398static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
399 void *freelist_old, unsigned long counters_old,
400 void *freelist_new, unsigned long counters_new,
401 const char *n)
402{
403#ifdef CONFIG_CMPXCHG_DOUBLE
404 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist,
406 freelist_old, counters_old,
407 freelist_new, counters_new))
408 return 1;
409 } else
410#endif
411 {
412 unsigned long flags;
413
414 local_irq_save(flags);
415 slab_lock(page);
416 if (page->freelist == freelist_old && page->counters == counters_old) {
417 page->freelist = freelist_new;
418 page->counters = counters_new;
419 slab_unlock(page);
420 local_irq_restore(flags);
421 return 1;
422 }
423 slab_unlock(page);
424 local_irq_restore(flags);
425 }
426
427 cpu_relax();
428 stat(s, CMPXCHG_DOUBLE_FAIL);
429
430#ifdef SLUB_DEBUG_CMPXCHG
431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
432#endif
433
434 return 0;
435}
436
341#ifdef CONFIG_SLUB_DEBUG 437#ifdef CONFIG_SLUB_DEBUG
342/* 438/*
343 * Determine a map of object in use on a page. 439 * Determine a map of object in use on a page.
344 * 440 *
345 * Slab lock or node listlock must be held to guarantee that the page does 441 * Node listlock must be held to guarantee that the page does
346 * not vanish from under us. 442 * not vanish from under us.
347 */ 443 */
348static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 444static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
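The cmpxchg_double_slab() helpers added above either swap the freelist/counters pair with cmpxchg_double() or fall back to the page lock, and their callers simply retry when the compare fails. A userspace sketch of that optimistic retry shape, with both halves packed into one 64-bit word so a plain C11 compare-exchange can stand in for the double-word variant; the packing and all names are illustrative, not the kernel layout:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* High 32 bits model the freelist (here just a count of free objects),
 * low 32 bits model the counters word. */
static _Atomic uint64_t slab_word;

static uint64_t pack(uint32_t freelist, uint32_t counters)
{
	return ((uint64_t)freelist << 32) | counters;
}

/* Claim one object: both halves must change together or not at all. */
static int try_take_object(void)
{
	uint64_t old = atomic_load(&slab_word);

	for (;;) {
		uint32_t freelist = (uint32_t)(old >> 32);
		uint32_t counters = (uint32_t)old;
		uint64_t new;

		if (!freelist)
			return 0;	/* nothing left to take */

		new = pack(freelist - 1, counters + 1);
		/* On failure 'old' is reloaded and we go around again --
		 * the same retry shape the lockless paths rely on. */
		if (atomic_compare_exchange_weak(&slab_word, &old, new))
			return 1;
	}
}

int main(void)
{
	atomic_store(&slab_word, pack(3, 0));	/* 3 free, 0 in use */
	while (try_take_object())
		;
	printf("in use after draining: %u\n",
	       (unsigned)atomic_load(&slab_word));	/* low 32 bits */
	return 0;
}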
@@ -371,34 +467,8 @@ static int disable_higher_order_debug;
371 */ 467 */
372static void print_section(char *text, u8 *addr, unsigned int length) 468static void print_section(char *text, u8 *addr, unsigned int length)
373{ 469{
374 int i, offset; 470 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
375 int newline = 1; 471 length, 1);
376 char ascii[17];
377
378 ascii[16] = 0;
379
380 for (i = 0; i < length; i++) {
381 if (newline) {
382 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
383 newline = 0;
384 }
385 printk(KERN_CONT " %02x", addr[i]);
386 offset = i % 16;
387 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
388 if (offset == 15) {
389 printk(KERN_CONT " %s\n", ascii);
390 newline = 1;
391 }
392 }
393 if (!newline) {
394 i %= 16;
395 while (i < 16) {
396 printk(KERN_CONT " ");
397 ascii[i] = ' ';
398 i++;
399 }
400 printk(KERN_CONT " %s\n", ascii);
401 }
402} 472}
403 473
404static struct track *get_track(struct kmem_cache *s, void *object, 474static struct track *get_track(struct kmem_cache *s, void *object,
@@ -420,6 +490,24 @@ static void set_track(struct kmem_cache *s, void *object,
420 struct track *p = get_track(s, object, alloc); 490 struct track *p = get_track(s, object, alloc);
421 491
422 if (addr) { 492 if (addr) {
493#ifdef CONFIG_STACKTRACE
494 struct stack_trace trace;
495 int i;
496
497 trace.nr_entries = 0;
498 trace.max_entries = TRACK_ADDRS_COUNT;
499 trace.entries = p->addrs;
500 trace.skip = 3;
501 save_stack_trace(&trace);
502
503 /* See rant in lockdep.c */
504 if (trace.nr_entries != 0 &&
505 trace.entries[trace.nr_entries - 1] == ULONG_MAX)
506 trace.nr_entries--;
507
508 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
509 p->addrs[i] = 0;
510#endif
423 p->addr = addr; 511 p->addr = addr;
424 p->cpu = smp_processor_id(); 512 p->cpu = smp_processor_id();
425 p->pid = current->pid; 513 p->pid = current->pid;
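The TRACK_ADDRS_COUNT addition above records up to 16 caller addresses per allocation or free via save_stack_trace(), leaving unused slots zeroed so print_track() knows where to stop. A rough userspace analogue using glibc's backtrace() — a different facility, used here only to show the shape of the record; the struct and function names are invented:

#include <execinfo.h>
#include <string.h>

#define TRACK_ADDRS_COUNT 16

struct track_sketch {
	void *addrs[TRACK_ADDRS_COUNT];
	int nr;
};

static void record_track(struct track_sketch *t)
{
	memset(t->addrs, 0, sizeof(t->addrs));		/* zero unused slots */
	t->nr = backtrace(t->addrs, TRACK_ADDRS_COUNT);	/* capture callers */
}

int main(void)
{
	struct track_sketch t;

	record_track(&t);
	/* Symbolise and print to stdout, loosely what print_track() does. */
	backtrace_symbols_fd(t.addrs, t.nr, 1);
	return 0;
}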
@@ -444,6 +532,16 @@ static void print_track(const char *s, struct track *t)
444 532
445 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 533 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
446 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 534 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
535#ifdef CONFIG_STACKTRACE
536 {
537 int i;
538 for (i = 0; i < TRACK_ADDRS_COUNT; i++)
539 if (t->addrs[i])
540 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
541 else
542 break;
543 }
544#endif
447} 545}
448 546
449static void print_tracking(struct kmem_cache *s, void *object) 547static void print_tracking(struct kmem_cache *s, void *object)
@@ -501,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
501 p, p - addr, get_freepointer(s, p)); 599 p, p - addr, get_freepointer(s, p));
502 600
503 if (p > addr + 16) 601 if (p > addr + 16)
504 print_section("Bytes b4", p - 16, 16); 602 print_section("Bytes b4 ", p - 16, 16);
505
506 print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
507 603
604 print_section("Object ", p, min_t(unsigned long, s->objsize,
605 PAGE_SIZE));
508 if (s->flags & SLAB_RED_ZONE) 606 if (s->flags & SLAB_RED_ZONE)
509 print_section("Redzone", p + s->objsize, 607 print_section("Redzone ", p + s->objsize,
510 s->inuse - s->objsize); 608 s->inuse - s->objsize);
511 609
512 if (s->offset) 610 if (s->offset)
@@ -519,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
519 617
520 if (off != s->size) 618 if (off != s->size)
521 /* Beginning of the filler is the free pointer */ 619 /* Beginning of the filler is the free pointer */
522 print_section("Padding", p + off, s->size - off); 620 print_section("Padding ", p + off, s->size - off);
523 621
524 dump_stack(); 622 dump_stack();
525} 623}
@@ -557,17 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
557 memset(p + s->objsize, val, s->inuse - s->objsize); 655 memset(p + s->objsize, val, s->inuse - s->objsize);
558} 656}
559 657
560static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
561{
562 while (bytes) {
563 if (*start != (u8)value)
564 return start;
565 start++;
566 bytes--;
567 }
568 return NULL;
569}
570
571static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 658static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
572 void *from, void *to) 659 void *from, void *to)
573{ 660{
@@ -582,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
582 u8 *fault; 669 u8 *fault;
583 u8 *end; 670 u8 *end;
584 671
585 fault = check_bytes(start, value, bytes); 672 fault = memchr_inv(start, value, bytes);
586 if (!fault) 673 if (!fault)
587 return 1; 674 return 1;
588 675
@@ -675,14 +762,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
675 if (!remainder) 762 if (!remainder)
676 return 1; 763 return 1;
677 764
678 fault = check_bytes(end - remainder, POISON_INUSE, remainder); 765 fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
679 if (!fault) 766 if (!fault)
680 return 1; 767 return 1;
681 while (end > fault && end[-1] == POISON_INUSE) 768 while (end > fault && end[-1] == POISON_INUSE)
682 end--; 769 end--;
683 770
684 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 771 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
685 print_section("Padding", end - remainder, remainder); 772 print_section("Padding ", end - remainder, remainder);
686 773
687 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 774 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
688 return 0; 775 return 0;
@@ -773,10 +860,11 @@ static int check_slab(struct kmem_cache *s, struct page *page)
773static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 860static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
774{ 861{
775 int nr = 0; 862 int nr = 0;
776 void *fp = page->freelist; 863 void *fp;
777 void *object = NULL; 864 void *object = NULL;
778 unsigned long max_objects; 865 unsigned long max_objects;
779 866
867 fp = page->freelist;
780 while (fp && nr <= page->objects) { 868 while (fp && nr <= page->objects) {
781 if (fp == search) 869 if (fp == search)
782 return 1; 870 return 1;
@@ -830,7 +918,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
830 page->freelist); 918 page->freelist);
831 919
832 if (!alloc) 920 if (!alloc)
833 print_section("Object", (void *)object, s->objsize); 921 print_section("Object ", (void *)object, s->objsize);
834 922
835 dump_stack(); 923 dump_stack();
836 } 924 }
@@ -881,26 +969,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
881 969
882/* 970/*
883 * Tracking of fully allocated slabs for debugging purposes. 971 * Tracking of fully allocated slabs for debugging purposes.
972 *
973 * list_lock must be held.
884 */ 974 */
885static void add_full(struct kmem_cache_node *n, struct page *page) 975static void add_full(struct kmem_cache *s,
976 struct kmem_cache_node *n, struct page *page)
886{ 977{
887 spin_lock(&n->list_lock); 978 if (!(s->flags & SLAB_STORE_USER))
979 return;
980
888 list_add(&page->lru, &n->full); 981 list_add(&page->lru, &n->full);
889 spin_unlock(&n->list_lock);
890} 982}
891 983
984/*
985 * list_lock must be held.
986 */
892static void remove_full(struct kmem_cache *s, struct page *page) 987static void remove_full(struct kmem_cache *s, struct page *page)
893{ 988{
894 struct kmem_cache_node *n;
895
896 if (!(s->flags & SLAB_STORE_USER)) 989 if (!(s->flags & SLAB_STORE_USER))
897 return; 990 return;
898 991
899 n = get_node(s, page_to_nid(page));
900
901 spin_lock(&n->list_lock);
902 list_del(&page->lru); 992 list_del(&page->lru);
903 spin_unlock(&n->list_lock);
904} 993}
905 994
906/* Tracking of the number of slabs for debugging purposes */ 995/* Tracking of the number of slabs for debugging purposes */
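Because add_full() and remove_full() no longer take the node's list_lock themselves, every caller is now expected to hold it around the call. A minimal usage sketch with a hypothetical caller (track_full_slab() is invented for illustration, the other identifiers are the slub.c ones used above):

#include <linux/spinlock.h>

static void track_full_slab(struct kmem_cache *s, struct kmem_cache_node *n,
                            struct page *page)
{
        unsigned long flags;

        spin_lock_irqsave(&n->list_lock, flags);
        add_full(s, n, page);           /* list_lock must be held */
        spin_unlock_irqrestore(&n->list_lock, flags);
}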
@@ -956,11 +1045,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa
956 if (!check_slab(s, page)) 1045 if (!check_slab(s, page))
957 goto bad; 1046 goto bad;
958 1047
959 if (!on_freelist(s, page, object)) {
960 object_err(s, page, object, "Object already allocated");
961 goto bad;
962 }
963
964 if (!check_valid_pointer(s, page, object)) { 1048 if (!check_valid_pointer(s, page, object)) {
965 object_err(s, page, object, "Freelist Pointer check fails"); 1049 object_err(s, page, object, "Freelist Pointer check fails");
966 goto bad; 1050 goto bad;
@@ -993,6 +1077,12 @@ bad:
993static noinline int free_debug_processing(struct kmem_cache *s, 1077static noinline int free_debug_processing(struct kmem_cache *s,
994 struct page *page, void *object, unsigned long addr) 1078 struct page *page, void *object, unsigned long addr)
995{ 1079{
1080 unsigned long flags;
1081 int rc = 0;
1082
1083 local_irq_save(flags);
1084 slab_lock(page);
1085
996 if (!check_slab(s, page)) 1086 if (!check_slab(s, page))
997 goto fail; 1087 goto fail;
998 1088
@@ -1007,7 +1097,7 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1007 } 1097 }
1008 1098
1009 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1099 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1010 return 0; 1100 goto out;
1011 1101
1012 if (unlikely(s != page->slab)) { 1102 if (unlikely(s != page->slab)) {
1013 if (!PageSlab(page)) { 1103 if (!PageSlab(page)) {
@@ -1024,18 +1114,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1024 goto fail; 1114 goto fail;
1025 } 1115 }
1026 1116
1027 /* Special debug activities for freeing objects */
1028 if (!PageSlubFrozen(page) && !page->freelist)
1029 remove_full(s, page);
1030 if (s->flags & SLAB_STORE_USER) 1117 if (s->flags & SLAB_STORE_USER)
1031 set_track(s, object, TRACK_FREE, addr); 1118 set_track(s, object, TRACK_FREE, addr);
1032 trace(s, page, object, 0); 1119 trace(s, page, object, 0);
1033 init_object(s, object, SLUB_RED_INACTIVE); 1120 init_object(s, object, SLUB_RED_INACTIVE);
1034 return 1; 1121 rc = 1;
1122out:
1123 slab_unlock(page);
1124 local_irq_restore(flags);
1125 return rc;
1035 1126
1036fail: 1127fail:
1037 slab_fix(s, "Object at 0x%p not freed", object); 1128 slab_fix(s, "Object at 0x%p not freed", object);
1038 return 0; 1129 goto out;
1039} 1130}
1040 1131
1041static int __init setup_slub_debug(char *str) 1132static int __init setup_slub_debug(char *str)
@@ -1135,7 +1226,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1135 { return 1; } 1226 { return 1; }
1136static inline int check_object(struct kmem_cache *s, struct page *page, 1227static inline int check_object(struct kmem_cache *s, struct page *page,
1137 void *object, u8 val) { return 1; } 1228 void *object, u8 val) { return 1; }
1138static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1229static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1230 struct page *page) {}
1231static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1139static inline unsigned long kmem_cache_flags(unsigned long objsize, 1232static inline unsigned long kmem_cache_flags(unsigned long objsize,
1140 unsigned long flags, const char *name, 1233 unsigned long flags, const char *name,
1141 void (*ctor)(void *)) 1234 void (*ctor)(void *))
@@ -1187,6 +1280,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1187 struct kmem_cache_order_objects oo = s->oo; 1280 struct kmem_cache_order_objects oo = s->oo;
1188 gfp_t alloc_gfp; 1281 gfp_t alloc_gfp;
1189 1282
1283 flags &= gfp_allowed_mask;
1284
1285 if (flags & __GFP_WAIT)
1286 local_irq_enable();
1287
1190 flags |= s->allocflags; 1288 flags |= s->allocflags;
1191 1289
1192 /* 1290 /*
@@ -1203,12 +1301,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1203 * Try a lower order alloc if possible 1301 * Try a lower order alloc if possible
1204 */ 1302 */
1205 page = alloc_slab_page(flags, node, oo); 1303 page = alloc_slab_page(flags, node, oo);
1206 if (!page)
1207 return NULL;
1208 1304
1209 stat(s, ORDER_FALLBACK); 1305 if (page)
1306 stat(s, ORDER_FALLBACK);
1210 } 1307 }
1211 1308
1309 if (flags & __GFP_WAIT)
1310 local_irq_disable();
1311
1312 if (!page)
1313 return NULL;
1314
1212 if (kmemcheck_enabled 1315 if (kmemcheck_enabled
1213 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1316 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1214 int pages = 1 << oo_order(oo); 1317 int pages = 1 << oo_order(oo);
@@ -1275,7 +1378,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1275 set_freepointer(s, last, NULL); 1378 set_freepointer(s, last, NULL);
1276 1379
1277 page->freelist = start; 1380 page->freelist = start;
1278 page->inuse = 0; 1381 page->inuse = page->objects;
1382 page->frozen = 1;
1279out: 1383out:
1280 return page; 1384 return page;
1281} 1385}
@@ -1353,79 +1457,80 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1353} 1457}
1354 1458
1355/* 1459/*
1356 * Per slab locking using the pagelock 1460 * Management of partially allocated slabs.
1357 */ 1461 *
1358static __always_inline void slab_lock(struct page *page) 1462 * list_lock must be held.
1359{
1360 bit_spin_lock(PG_locked, &page->flags);
1361}
1362
1363static __always_inline void slab_unlock(struct page *page)
1364{
1365 __bit_spin_unlock(PG_locked, &page->flags);
1366}
1367
1368static __always_inline int slab_trylock(struct page *page)
1369{
1370 int rc = 1;
1371
1372 rc = bit_spin_trylock(PG_locked, &page->flags);
1373 return rc;
1374}
1375
1376/*
1377 * Management of partially allocated slabs
1378 */ 1463 */
1379static void add_partial(struct kmem_cache_node *n, 1464static inline void add_partial(struct kmem_cache_node *n,
1380 struct page *page, int tail) 1465 struct page *page, int tail)
1381{ 1466{
1382 spin_lock(&n->list_lock);
1383 n->nr_partial++; 1467 n->nr_partial++;
1384 if (tail) 1468 if (tail == DEACTIVATE_TO_TAIL)
1385 list_add_tail(&page->lru, &n->partial); 1469 list_add_tail(&page->lru, &n->partial);
1386 else 1470 else
1387 list_add(&page->lru, &n->partial); 1471 list_add(&page->lru, &n->partial);
1388 spin_unlock(&n->list_lock);
1389} 1472}
1390 1473
1391static inline void __remove_partial(struct kmem_cache_node *n, 1474/*
1475 * list_lock must be held.
1476 */
1477static inline void remove_partial(struct kmem_cache_node *n,
1392 struct page *page) 1478 struct page *page)
1393{ 1479{
1394 list_del(&page->lru); 1480 list_del(&page->lru);
1395 n->nr_partial--; 1481 n->nr_partial--;
1396} 1482}
1397 1483
1398static void remove_partial(struct kmem_cache *s, struct page *page)
1399{
1400 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1401
1402 spin_lock(&n->list_lock);
1403 __remove_partial(n, page);
1404 spin_unlock(&n->list_lock);
1405}
1406
1407/* 1484/*
1408 * Lock slab and remove from the partial list. 1485 * Lock slab, remove from the partial list and put the object into the
1486 * per cpu freelist.
1487 *
1488 * Returns a list of objects or NULL if it fails.
1409 * 1489 *
1410 * Must hold list_lock. 1490 * Must hold list_lock.
1411 */ 1491 */
1412static inline int lock_and_freeze_slab(struct kmem_cache_node *n, 1492static inline void *acquire_slab(struct kmem_cache *s,
1413 struct page *page) 1493 struct kmem_cache_node *n, struct page *page,
1494 int mode)
1414{ 1495{
1415 if (slab_trylock(page)) { 1496 void *freelist;
1416 __remove_partial(n, page); 1497 unsigned long counters;
1417 __SetPageSlubFrozen(page); 1498 struct page new;
1418 return 1; 1499
1419 } 1500 /*
1420 return 0; 1501 * Zap the freelist and set the frozen bit.
1502 * The old freelist is the list of objects for the
1503 * per cpu allocation list.
1504 */
1505 do {
1506 freelist = page->freelist;
1507 counters = page->counters;
1508 new.counters = counters;
1509 if (mode)
1510 new.inuse = page->objects;
1511
1512 VM_BUG_ON(new.frozen);
1513 new.frozen = 1;
1514
1515 } while (!__cmpxchg_double_slab(s, page,
1516 freelist, counters,
1517 NULL, new.counters,
1518 "lock and freeze"));
1519
1520 remove_partial(n, page);
1521 return freelist;
1421} 1522}
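acquire_slab() above is one of several places where this patch replaces the old per-slab bit spinlock with a snapshot, modify-the-copy, compare-and-exchange retry loop. As a generic illustration of that idiom only (a plain cmpxchg() on one word rather than the paired freelist/counters update, so a simplification rather than the patch's code):

#include <linux/atomic.h>

static void add_sketch(unsigned long *counter, unsigned long delta)
{
        unsigned long old, new;

        do {
                old = *counter;         /* take a snapshot */
                new = old + delta;      /* compute the update on the copy */
        } while (cmpxchg(counter, old, new) != old);    /* retry if raced */
}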
1422 1523
1524static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
1525
1423/* 1526/*
1424 * Try to allocate a partial slab from a specific node. 1527 * Try to allocate a partial slab from a specific node.
1425 */ 1528 */
1426static struct page *get_partial_node(struct kmem_cache_node *n) 1529static void *get_partial_node(struct kmem_cache *s,
1530 struct kmem_cache_node *n, struct kmem_cache_cpu *c)
1427{ 1531{
1428 struct page *page; 1532 struct page *page, *page2;
1533 void *object = NULL;
1429 1534
1430 /* 1535 /*
1431 * Racy check. If we mistakenly see no partial slabs then we 1536 * Racy check. If we mistakenly see no partial slabs then we
@@ -1437,26 +1542,43 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
1437 return NULL; 1542 return NULL;
1438 1543
1439 spin_lock(&n->list_lock); 1544 spin_lock(&n->list_lock);
1440 list_for_each_entry(page, &n->partial, lru) 1545 list_for_each_entry_safe(page, page2, &n->partial, lru) {
1441 if (lock_and_freeze_slab(n, page)) 1546 void *t = acquire_slab(s, n, page, object == NULL);
1442 goto out; 1547 int available;
1443 page = NULL; 1548
1444out: 1549 if (!t)
1550 break;
1551
1552 if (!object) {
1553 c->page = page;
1554 c->node = page_to_nid(page);
1555 stat(s, ALLOC_FROM_PARTIAL);
1556 object = t;
1557 available = page->objects - page->inuse;
1558 } else {
1559 page->freelist = t;
1560 available = put_cpu_partial(s, page, 0);
1561 }
1562 if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
1563 break;
1564
1565 }
1445 spin_unlock(&n->list_lock); 1566 spin_unlock(&n->list_lock);
1446 return page; 1567 return object;
1447} 1568}
1448 1569
1449/* 1570/*
1450 * Get a page from somewhere. Search in increasing NUMA distances. 1571 * Get a page from somewhere. Search in increasing NUMA distances.
1451 */ 1572 */
1452static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1573static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1574 struct kmem_cache_cpu *c)
1453{ 1575{
1454#ifdef CONFIG_NUMA 1576#ifdef CONFIG_NUMA
1455 struct zonelist *zonelist; 1577 struct zonelist *zonelist;
1456 struct zoneref *z; 1578 struct zoneref *z;
1457 struct zone *zone; 1579 struct zone *zone;
1458 enum zone_type high_zoneidx = gfp_zone(flags); 1580 enum zone_type high_zoneidx = gfp_zone(flags);
1459 struct page *page; 1581 void *object;
1460 1582
1461 /* 1583 /*
1462 * The defrag ratio allows a configuration of the tradeoffs between 1584 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1489,10 +1611,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1489 1611
1490 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1612 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1491 n->nr_partial > s->min_partial) { 1613 n->nr_partial > s->min_partial) {
1492 page = get_partial_node(n); 1614 object = get_partial_node(s, n, c);
1493 if (page) { 1615 if (object) {
1494 put_mems_allowed(); 1616 put_mems_allowed();
1495 return page; 1617 return object;
1496 } 1618 }
1497 } 1619 }
1498 } 1620 }
@@ -1504,63 +1626,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1504/* 1626/*
1505 * Get a partial page, lock it and return it. 1627 * Get a partial page, lock it and return it.
1506 */ 1628 */
1507static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1629static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1630 struct kmem_cache_cpu *c)
1508{ 1631{
1509 struct page *page; 1632 void *object;
1510 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1633 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1511 1634
1512 page = get_partial_node(get_node(s, searchnode)); 1635 object = get_partial_node(s, get_node(s, searchnode), c);
1513 if (page || node != NUMA_NO_NODE) 1636 if (object || node != NUMA_NO_NODE)
1514 return page; 1637 return object;
1515
1516 return get_any_partial(s, flags);
1517}
1518
1519/*
1520 * Move a page back to the lists.
1521 *
1522 * Must be called with the slab lock held.
1523 *
1524 * On exit the slab lock will have been dropped.
1525 */
1526static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1527 __releases(bitlock)
1528{
1529 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1530
1531 __ClearPageSlubFrozen(page);
1532 if (page->inuse) {
1533 1638
1534 if (page->freelist) { 1639 return get_any_partial(s, flags, c);
1535 add_partial(n, page, tail);
1536 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1537 } else {
1538 stat(s, DEACTIVATE_FULL);
1539 if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1540 add_full(n, page);
1541 }
1542 slab_unlock(page);
1543 } else {
1544 stat(s, DEACTIVATE_EMPTY);
1545 if (n->nr_partial < s->min_partial) {
1546 /*
1547 * Adding an empty slab to the partial slabs in order
1548 * to avoid page allocator overhead. This slab needs
1549 * to come after the other slabs with objects in
1550 * so that the others get filled first. That way the
1551 * size of the partial list stays small.
1552 *
1553 * kmem_cache_shrink can reclaim any empty slabs from
1554 * the partial list.
1555 */
1556 add_partial(n, page, 1);
1557 slab_unlock(page);
1558 } else {
1559 slab_unlock(page);
1560 stat(s, FREE_SLAB);
1561 discard_slab(s, page);
1562 }
1563 }
1564} 1640}
1565 1641
1566#ifdef CONFIG_PREEMPT 1642#ifdef CONFIG_PREEMPT
@@ -1629,45 +1705,278 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1629 for_each_possible_cpu(cpu) 1705 for_each_possible_cpu(cpu)
1630 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1706 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1631} 1707}
1708
1632/* 1709/*
1633 * Remove the cpu slab 1710 * Remove the cpu slab
1634 */ 1711 */
1635static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1712static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1636 __releases(bitlock)
1637{ 1713{
1714 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1638 struct page *page = c->page; 1715 struct page *page = c->page;
1639 int tail = 1; 1716 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1640 1717 int lock = 0;
1641 if (page->freelist) 1718 enum slab_modes l = M_NONE, m = M_NONE;
1719 void *freelist;
1720 void *nextfree;
1721 int tail = DEACTIVATE_TO_HEAD;
1722 struct page new;
1723 struct page old;
1724
1725 if (page->freelist) {
1642 stat(s, DEACTIVATE_REMOTE_FREES); 1726 stat(s, DEACTIVATE_REMOTE_FREES);
1727 tail = DEACTIVATE_TO_TAIL;
1728 }
1729
1730 c->tid = next_tid(c->tid);
1731 c->page = NULL;
1732 freelist = c->freelist;
1733 c->freelist = NULL;
1734
1735 /*
1736 * Stage one: Free all available per cpu objects back
1737 * to the page freelist while it is still frozen. Leave the
1738 * last one.
1739 *
1740 * There is no need to take the list->lock because the page
1741 * is still frozen.
1742 */
1743 while (freelist && (nextfree = get_freepointer(s, freelist))) {
1744 void *prior;
1745 unsigned long counters;
1746
1747 do {
1748 prior = page->freelist;
1749 counters = page->counters;
1750 set_freepointer(s, freelist, prior);
1751 new.counters = counters;
1752 new.inuse--;
1753 VM_BUG_ON(!new.frozen);
1754
1755 } while (!__cmpxchg_double_slab(s, page,
1756 prior, counters,
1757 freelist, new.counters,
1758 "drain percpu freelist"));
1759
1760 freelist = nextfree;
1761 }
1762
1643 /* 1763 /*
1644 * Merge cpu freelist into slab freelist. Typically we get here 1764 * Stage two: Ensure that the page is unfrozen while the
1645 * because both freelists are empty. So this is unlikely 1765 * list presence reflects the actual number of objects
1646 * to occur. 1766 * during unfreeze.
1767 *
1768 * We set up the list membership and then perform a cmpxchg
1769 * with the count. If there is a mismatch then the page
1770 * is not unfrozen but the page is on the wrong list.
1771 *
1772 * Then we restart the process which may have to remove
1773 * the page from the list that we just put it on again
1774 * because the number of objects in the slab may have
1775 * changed.
1647 */ 1776 */
1648 while (unlikely(c->freelist)) { 1777redo:
1649 void **object; 1778
1779 old.freelist = page->freelist;
1780 old.counters = page->counters;
1781 VM_BUG_ON(!old.frozen);
1650 1782
1651 tail = 0; /* Hot objects. Put the slab first */ 1783 /* Determine target state of the slab */
1784 new.counters = old.counters;
1785 if (freelist) {
1786 new.inuse--;
1787 set_freepointer(s, freelist, old.freelist);
1788 new.freelist = freelist;
1789 } else
1790 new.freelist = old.freelist;
1652 1791
1653 /* Retrieve object from cpu_freelist */ 1792 new.frozen = 0;
1654 object = c->freelist;
1655 c->freelist = get_freepointer(s, c->freelist);
1656 1793
1657 /* And put onto the regular freelist */ 1794 if (!new.inuse && n->nr_partial > s->min_partial)
1658 set_freepointer(s, object, page->freelist); 1795 m = M_FREE;
1659 page->freelist = object; 1796 else if (new.freelist) {
1660 page->inuse--; 1797 m = M_PARTIAL;
1798 if (!lock) {
1799 lock = 1;
1800 /*
1801 * Taking the spinlock removes the possibility
1802 * that acquire_slab() will see a slab page that
1803 * is frozen
1804 */
1805 spin_lock(&n->list_lock);
1806 }
1807 } else {
1808 m = M_FULL;
1809 if (kmem_cache_debug(s) && !lock) {
1810 lock = 1;
1811 /*
1812 * This also ensures that the scanning of full
1813 * slabs from diagnostic functions will not see
1814 * any frozen slabs.
1815 */
1816 spin_lock(&n->list_lock);
1817 }
1818 }
1819
1820 if (l != m) {
1821
1822 if (l == M_PARTIAL)
1823
1824 remove_partial(n, page);
1825
1826 else if (l == M_FULL)
1827
1828 remove_full(s, page);
1829
1830 if (m == M_PARTIAL) {
1831
1832 add_partial(n, page, tail);
1833 stat(s, tail);
1834
1835 } else if (m == M_FULL) {
1836
1837 stat(s, DEACTIVATE_FULL);
1838 add_full(s, n, page);
1839
1840 }
1841 }
1842
1843 l = m;
1844 if (!__cmpxchg_double_slab(s, page,
1845 old.freelist, old.counters,
1846 new.freelist, new.counters,
1847 "unfreezing slab"))
1848 goto redo;
1849
1850 if (lock)
1851 spin_unlock(&n->list_lock);
1852
1853 if (m == M_FREE) {
1854 stat(s, DEACTIVATE_EMPTY);
1855 discard_slab(s, page);
1856 stat(s, FREE_SLAB);
1661 } 1857 }
1662 c->page = NULL; 1858}
1663 c->tid = next_tid(c->tid); 1859
1664 unfreeze_slab(s, page, tail); 1860/* Unfreeze all the cpu partial slabs */
1861static void unfreeze_partials(struct kmem_cache *s)
1862{
1863 struct kmem_cache_node *n = NULL;
1864 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1865 struct page *page;
1866
1867 while ((page = c->partial)) {
1868 enum slab_modes { M_PARTIAL, M_FREE };
1869 enum slab_modes l, m;
1870 struct page new;
1871 struct page old;
1872
1873 c->partial = page->next;
1874 l = M_FREE;
1875
1876 do {
1877
1878 old.freelist = page->freelist;
1879 old.counters = page->counters;
1880 VM_BUG_ON(!old.frozen);
1881
1882 new.counters = old.counters;
1883 new.freelist = old.freelist;
1884
1885 new.frozen = 0;
1886
1887 if (!new.inuse && (!n || n->nr_partial > s->min_partial))
1888 m = M_FREE;
1889 else {
1890 struct kmem_cache_node *n2 = get_node(s,
1891 page_to_nid(page));
1892
1893 m = M_PARTIAL;
1894 if (n != n2) {
1895 if (n)
1896 spin_unlock(&n->list_lock);
1897
1898 n = n2;
1899 spin_lock(&n->list_lock);
1900 }
1901 }
1902
1903 if (l != m) {
1904 if (l == M_PARTIAL)
1905 remove_partial(n, page);
1906 else
1907 add_partial(n, page, 1);
1908
1909 l = m;
1910 }
1911
1912 } while (!cmpxchg_double_slab(s, page,
1913 old.freelist, old.counters,
1914 new.freelist, new.counters,
1915 "unfreezing slab"));
1916
1917 if (m == M_FREE) {
1918 stat(s, DEACTIVATE_EMPTY);
1919 discard_slab(s, page);
1920 stat(s, FREE_SLAB);
1921 }
1922 }
1923
1924 if (n)
1925 spin_unlock(&n->list_lock);
1926}
1927
1928/*
1929 * Put a page that was just frozen (in __slab_free) into a partial page
1930 * slot if available. This is done without interrupts disabled and without
1931 * preemption disabled. The cmpxchg is racy and may put the partial page
1932 * onto a random cpu's partial slot.
1933 *
1934 * If we did not find a slot then simply move all the partials to the
1935 * per node partial list.
1936 */
1937int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1938{
1939 struct page *oldpage;
1940 int pages;
1941 int pobjects;
1942
1943 do {
1944 pages = 0;
1945 pobjects = 0;
1946 oldpage = this_cpu_read(s->cpu_slab->partial);
1947
1948 if (oldpage) {
1949 pobjects = oldpage->pobjects;
1950 pages = oldpage->pages;
1951 if (drain && pobjects > s->cpu_partial) {
1952 unsigned long flags;
1953 /*
1954 * partial array is full. Move the existing
1955 * set to the per node partial list.
1956 */
1957 local_irq_save(flags);
1958 unfreeze_partials(s);
1959 local_irq_restore(flags);
1960 pobjects = 0;
1961 pages = 0;
1962 }
1963 }
1964
1965 pages++;
1966 pobjects += page->objects - page->inuse;
1967
1968 page->pages = pages;
1969 page->pobjects = pobjects;
1970 page->next = oldpage;
1971
1972 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
1973 stat(s, CPU_PARTIAL_FREE);
1974 return pobjects;
1665} 1975}
1666 1976
1667static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1977static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1668{ 1978{
1669 stat(s, CPUSLAB_FLUSH); 1979 stat(s, CPUSLAB_FLUSH);
1670 slab_lock(c->page);
1671 deactivate_slab(s, c); 1980 deactivate_slab(s, c);
1672} 1981}
1673 1982
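put_cpu_partial() above pushes a just-frozen slab onto the cpu's partial list with nothing but this_cpu_cmpxchg(). Stripped of the page/object accounting, the underlying idea is an ordinary lock-free stack push; a hedged sketch with invented names:

#include <linux/atomic.h>

struct stack_node {                     /* illustrative type */
        struct stack_node *next;
};

static void push_sketch(struct stack_node **head, struct stack_node *n)
{
        struct stack_node *old;

        do {
                old = *head;            /* snapshot the current head */
                n->next = old;          /* chain the new node in front */
        } while (cmpxchg(head, old, n) != old);
}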
@@ -1680,8 +1989,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1680{ 1989{
1681 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 1990 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1682 1991
1683 if (likely(c && c->page)) 1992 if (likely(c)) {
1684 flush_slab(s, c); 1993 if (c->page)
1994 flush_slab(s, c);
1995
1996 unfreeze_partials(s);
1997 }
1685} 1998}
1686 1999
1687static void flush_cpu_slab(void *d) 2000static void flush_cpu_slab(void *d)
@@ -1772,12 +2085,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1772 } 2085 }
1773} 2086}
1774 2087
2088static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2089 int node, struct kmem_cache_cpu **pc)
2090{
2091 void *object;
2092 struct kmem_cache_cpu *c;
2093 struct page *page = new_slab(s, flags, node);
2094
2095 if (page) {
2096 c = __this_cpu_ptr(s->cpu_slab);
2097 if (c->page)
2098 flush_slab(s, c);
2099
2100 /*
2101 * No other reference to the page yet so we can
2102 * muck around with it freely without cmpxchg
2103 */
2104 object = page->freelist;
2105 page->freelist = NULL;
2106
2107 stat(s, ALLOC_SLAB);
2108 c->node = page_to_nid(page);
2109 c->page = page;
2110 *pc = c;
2111 } else
2112 object = NULL;
2113
2114 return object;
2115}
2116
1775/* 2117/*
1776 * Slow path. The lockless freelist is empty or we need to perform 2118 * Slow path. The lockless freelist is empty or we need to perform
1777 * debugging duties. 2119 * debugging duties.
1778 * 2120 *
1779 * Interrupts are disabled.
1780 *
1781 * Processing is still very fast if new objects have been freed to the 2121 * Processing is still very fast if new objects have been freed to the
1782 * regular freelist. In that case we simply take over the regular freelist 2122 * regular freelist. In that case we simply take over the regular freelist
1783 * as the lockless freelist and zap the regular freelist. 2123 * as the lockless freelist and zap the regular freelist.
@@ -1794,8 +2134,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1794 unsigned long addr, struct kmem_cache_cpu *c) 2134 unsigned long addr, struct kmem_cache_cpu *c)
1795{ 2135{
1796 void **object; 2136 void **object;
1797 struct page *page;
1798 unsigned long flags; 2137 unsigned long flags;
2138 struct page new;
2139 unsigned long counters;
1799 2140
1800 local_irq_save(flags); 2141 local_irq_save(flags);
1801#ifdef CONFIG_PREEMPT 2142#ifdef CONFIG_PREEMPT
@@ -1807,81 +2148,91 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1807 c = this_cpu_ptr(s->cpu_slab); 2148 c = this_cpu_ptr(s->cpu_slab);
1808#endif 2149#endif
1809 2150
1810 /* We handle __GFP_ZERO in the caller */ 2151 if (!c->page)
1811 gfpflags &= ~__GFP_ZERO; 2152 goto new_slab;
1812 2153redo:
1813 page = c->page; 2154 if (unlikely(!node_match(c, node))) {
1814 if (!page) 2155 stat(s, ALLOC_NODE_MISMATCH);
2156 deactivate_slab(s, c);
1815 goto new_slab; 2157 goto new_slab;
2158 }
1816 2159
1817 slab_lock(page); 2160 stat(s, ALLOC_SLOWPATH);
1818 if (unlikely(!node_match(c, node))) 2161
1819 goto another_slab; 2162 do {
2163 object = c->page->freelist;
2164 counters = c->page->counters;
2165 new.counters = counters;
2166 VM_BUG_ON(!new.frozen);
2167
2168 /*
2169 * If there is no object left then we use this loop to
2170 * deactivate the slab which is simple since no objects
2171 * are left in the slab and therefore we do not need to
2172 * put the page back onto the partial list.
2173 *
2174 * If there are objects left then we retrieve them
2175 * and use them to refill the per cpu queue.
2176 */
2177
2178 new.inuse = c->page->objects;
2179 new.frozen = object != NULL;
2180
2181 } while (!__cmpxchg_double_slab(s, c->page,
2182 object, counters,
2183 NULL, new.counters,
2184 "__slab_alloc"));
2185
2186 if (!object) {
2187 c->page = NULL;
2188 stat(s, DEACTIVATE_BYPASS);
2189 goto new_slab;
2190 }
1820 2191
1821 stat(s, ALLOC_REFILL); 2192 stat(s, ALLOC_REFILL);
1822 2193
1823load_freelist: 2194load_freelist:
1824 object = page->freelist;
1825 if (unlikely(!object))
1826 goto another_slab;
1827 if (kmem_cache_debug(s))
1828 goto debug;
1829
1830 c->freelist = get_freepointer(s, object); 2195 c->freelist = get_freepointer(s, object);
1831 page->inuse = page->objects;
1832 page->freelist = NULL;
1833
1834 slab_unlock(page);
1835 c->tid = next_tid(c->tid); 2196 c->tid = next_tid(c->tid);
1836 local_irq_restore(flags); 2197 local_irq_restore(flags);
1837 stat(s, ALLOC_SLOWPATH);
1838 return object; 2198 return object;
1839 2199
1840another_slab:
1841 deactivate_slab(s, c);
1842
1843new_slab: 2200new_slab:
1844 page = get_partial(s, gfpflags, node); 2201
1845 if (page) { 2202 if (c->partial) {
1846 stat(s, ALLOC_FROM_PARTIAL); 2203 c->page = c->partial;
1847 c->node = page_to_nid(page); 2204 c->partial = c->page->next;
1848 c->page = page; 2205 c->node = page_to_nid(c->page);
1849 goto load_freelist; 2206 stat(s, CPU_PARTIAL_ALLOC);
2207 c->freelist = NULL;
2208 goto redo;
1850 } 2209 }
1851 2210
1852 gfpflags &= gfp_allowed_mask; 2211 /* Then do expensive stuff like retrieving pages from the partial lists */
1853 if (gfpflags & __GFP_WAIT) 2212 object = get_partial(s, gfpflags, node, c);
1854 local_irq_enable();
1855 2213
1856 page = new_slab(s, gfpflags, node); 2214 if (unlikely(!object)) {
1857 2215
1858 if (gfpflags & __GFP_WAIT) 2216 object = new_slab_objects(s, gfpflags, node, &c);
1859 local_irq_disable();
1860 2217
1861 if (page) { 2218 if (unlikely(!object)) {
1862 c = __this_cpu_ptr(s->cpu_slab); 2219 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1863 stat(s, ALLOC_SLAB); 2220 slab_out_of_memory(s, gfpflags, node);
1864 if (c->page)
1865 flush_slab(s, c);
1866 2221
1867 slab_lock(page); 2222 local_irq_restore(flags);
1868 __SetPageSlubFrozen(page); 2223 return NULL;
1869 c->node = page_to_nid(page); 2224 }
1870 c->page = page;
1871 goto load_freelist;
1872 } 2225 }
1873 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1874 slab_out_of_memory(s, gfpflags, node);
1875 local_irq_restore(flags);
1876 return NULL;
1877debug:
1878 if (!alloc_debug_processing(s, page, object, addr))
1879 goto another_slab;
1880 2226
1881 page->inuse++; 2227 if (likely(!kmem_cache_debug(s)))
1882 page->freelist = get_freepointer(s, object); 2228 goto load_freelist;
2229
2230 /* Only entered in the debug case */
2231 if (!alloc_debug_processing(s, c->page, object, addr))
2232 goto new_slab; /* Slab failed checks. Next slab needed */
2233
2234 c->freelist = get_freepointer(s, object);
1883 deactivate_slab(s, c); 2235 deactivate_slab(s, c);
1884 c->page = NULL;
1885 c->node = NUMA_NO_NODE; 2236 c->node = NUMA_NO_NODE;
1886 local_irq_restore(flags); 2237 local_irq_restore(flags);
1887 return object; 2238 return object;
@@ -2031,52 +2382,110 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2031{ 2382{
2032 void *prior; 2383 void *prior;
2033 void **object = (void *)x; 2384 void **object = (void *)x;
2034 unsigned long flags; 2385 int was_frozen;
2386 int inuse;
2387 struct page new;
2388 unsigned long counters;
2389 struct kmem_cache_node *n = NULL;
2390 unsigned long uninitialized_var(flags);
2035 2391
2036 local_irq_save(flags);
2037 slab_lock(page);
2038 stat(s, FREE_SLOWPATH); 2392 stat(s, FREE_SLOWPATH);
2039 2393
2040 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2394 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2041 goto out_unlock; 2395 return;
2042 2396
2043 prior = page->freelist; 2397 do {
2044 set_freepointer(s, object, prior); 2398 prior = page->freelist;
2045 page->freelist = object; 2399 counters = page->counters;
2046 page->inuse--; 2400 set_freepointer(s, object, prior);
2401 new.counters = counters;
2402 was_frozen = new.frozen;
2403 new.inuse--;
2404 if ((!new.inuse || !prior) && !was_frozen && !n) {
2047 2405
2048 if (unlikely(PageSlubFrozen(page))) { 2406 if (!kmem_cache_debug(s) && !prior)
2049 stat(s, FREE_FROZEN); 2407
2050 goto out_unlock; 2408 /*
2051 } 2409 * Slab was on no list before and will be partially empty
2410 * We can defer the list move and instead freeze it.
2411 */
2412 new.frozen = 1;
2413
2414 else { /* Needs to be taken off a list */
2415
2416 n = get_node(s, page_to_nid(page));
2417 /*
2418 * Speculatively acquire the list_lock.
2419 * If the cmpxchg does not succeed then we may
2420 * drop the list_lock without any processing.
2421 *
2422 * Otherwise the list_lock will synchronize with
2423 * other processors updating the list of slabs.
2424 */
2425 spin_lock_irqsave(&n->list_lock, flags);
2426
2427 }
2428 }
2429 inuse = new.inuse;
2430
2431 } while (!cmpxchg_double_slab(s, page,
2432 prior, counters,
2433 object, new.counters,
2434 "__slab_free"));
2435
2436 if (likely(!n)) {
2437
2438 /*
2439 * If we just froze the page then put it onto the
2440 * per cpu partial list.
2441 */
2442 if (new.frozen && !was_frozen)
2443 put_cpu_partial(s, page, 1);
2052 2444
2053 if (unlikely(!page->inuse)) 2445 /*
2054 goto slab_empty; 2446 * The list lock was not taken therefore no list
2447 * activity can be necessary.
2448 */
2449 if (was_frozen)
2450 stat(s, FREE_FROZEN);
2451 return;
2452 }
2055 2453
2056 /* 2454 /*
2057 * Objects left in the slab. If it was not on the partial list before 2455 * was_frozen may have been set after we acquired the list_lock in
2058 * then add it. 2456 * an earlier loop. So we need to check it here again.
2059 */ 2457 */
2060 if (unlikely(!prior)) { 2458 if (was_frozen)
2061 add_partial(get_node(s, page_to_nid(page)), page, 1); 2459 stat(s, FREE_FROZEN);
2062 stat(s, FREE_ADD_PARTIAL); 2460 else {
2063 } 2461 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2462 goto slab_empty;
2064 2463
2065out_unlock: 2464 /*
2066 slab_unlock(page); 2465 * Objects left in the slab. If it was not on the partial list before
2067 local_irq_restore(flags); 2466 * then add it.
2467 */
2468 if (unlikely(!prior)) {
2469 remove_full(s, page);
2470 add_partial(n, page, DEACTIVATE_TO_TAIL);
2471 stat(s, FREE_ADD_PARTIAL);
2472 }
2473 }
2474 spin_unlock_irqrestore(&n->list_lock, flags);
2068 return; 2475 return;
2069 2476
2070slab_empty: 2477slab_empty:
2071 if (prior) { 2478 if (prior) {
2072 /* 2479 /*
2073 * Slab still on the partial list. 2480 * Slab on the partial list.
2074 */ 2481 */
2075 remove_partial(s, page); 2482 remove_partial(n, page);
2076 stat(s, FREE_REMOVE_PARTIAL); 2483 stat(s, FREE_REMOVE_PARTIAL);
2077 } 2484 } else
2078 slab_unlock(page); 2485 /* Slab must be on the full list */
2079 local_irq_restore(flags); 2486 remove_full(s, page);
2487
2488 spin_unlock_irqrestore(&n->list_lock, flags);
2080 stat(s, FREE_SLAB); 2489 stat(s, FREE_SLAB);
2081 discard_slab(s, page); 2490 discard_slab(s, page);
2082} 2491}
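The __slab_free() rewrite above speculatively takes the node's list_lock only when its snapshot suggests a list move may be needed, and may drop it again without doing any work if the cmpxchg shows otherwise. The same pattern in a generic, hedged form (type and names invented for illustration, not the patch's code):

#include <linux/spinlock.h>
#include <linux/atomic.h>

struct counted_set {                    /* illustrative type */
        spinlock_t lock;                /* protects list membership */
        unsigned long inuse;            /* updated locklessly */
};

static void put_object_sketch(struct counted_set *set)
{
        unsigned long old, new;
        bool locked = false;

        do {
                old = set->inuse;
                new = old - 1;
                if (new == 0 && !locked) {
                        /* emptying the set may require a list move */
                        spin_lock(&set->lock);
                        locked = true;
                }
        } while (cmpxchg(&set->inuse, old, new) != old);

        if (!locked)
                return;
        if (new == 0) {
                /* move or free the now-empty set while holding the lock */
        }
        spin_unlock(&set->lock);        /* possibly without any processing */
}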
@@ -2102,7 +2511,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
2102 slab_free_hook(s, x); 2511 slab_free_hook(s, x);
2103 2512
2104redo: 2513redo:
2105
2106 /* 2514 /*
2107 * Determine the currently cpus per cpu slab. 2515 * Determine the currently cpus per cpu slab.
2108 * The cpu may change afterward. However that does not matter since 2516 * The cpu may change afterward. However that does not matter since
@@ -2350,7 +2758,6 @@ static void early_kmem_cache_node_alloc(int node)
2350{ 2758{
2351 struct page *page; 2759 struct page *page;
2352 struct kmem_cache_node *n; 2760 struct kmem_cache_node *n;
2353 unsigned long flags;
2354 2761
2355 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2762 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2356 2763
@@ -2367,7 +2774,8 @@ static void early_kmem_cache_node_alloc(int node)
2367 n = page->freelist; 2774 n = page->freelist;
2368 BUG_ON(!n); 2775 BUG_ON(!n);
2369 page->freelist = get_freepointer(kmem_cache_node, n); 2776 page->freelist = get_freepointer(kmem_cache_node, n);
2370 page->inuse++; 2777 page->inuse = 1;
2778 page->frozen = 0;
2371 kmem_cache_node->node[node] = n; 2779 kmem_cache_node->node[node] = n;
2372#ifdef CONFIG_SLUB_DEBUG 2780#ifdef CONFIG_SLUB_DEBUG
2373 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2781 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
@@ -2376,14 +2784,7 @@ static void early_kmem_cache_node_alloc(int node)
2376 init_kmem_cache_node(n, kmem_cache_node); 2784 init_kmem_cache_node(n, kmem_cache_node);
2377 inc_slabs_node(kmem_cache_node, node, page->objects); 2785 inc_slabs_node(kmem_cache_node, node, page->objects);
2378 2786
2379 /* 2787 add_partial(n, page, DEACTIVATE_TO_HEAD);
2380 * lockdep requires consistent irq usage for each lock
2381 * so even though there cannot be a race this early in
2382 * the boot sequence, we still disable irqs.
2383 */
2384 local_irq_save(flags);
2385 add_partial(n, page, 0);
2386 local_irq_restore(flags);
2387} 2788}
2388 2789
2389static void free_kmem_cache_nodes(struct kmem_cache *s) 2790static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2589,11 +2990,44 @@ static int kmem_cache_open(struct kmem_cache *s,
2589 } 2990 }
2590 } 2991 }
2591 2992
2993#ifdef CONFIG_CMPXCHG_DOUBLE
2994 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
2995 /* Enable fast mode */
2996 s->flags |= __CMPXCHG_DOUBLE;
2997#endif
2998
2592 /* 2999 /*
2593 * The larger the object size is, the more pages we want on the partial 3000 * The larger the object size is, the more pages we want on the partial
2594 * list to avoid pounding the page allocator excessively. 3001 * list to avoid pounding the page allocator excessively.
2595 */ 3002 */
2596 set_min_partial(s, ilog2(s->size)); 3003 set_min_partial(s, ilog2(s->size) / 2);
3004
3005 /*
3006 * cpu_partial determines the maximum number of objects kept in the
3007 * per cpu partial lists of a processor.
3008 *
3009 * Per cpu partial lists mainly contain slabs that just have one
3010 * object freed. If they are used for allocation then they can be
3011 * filled up again with minimal effort. The slab will never hit the
3012 * per node partial lists and therefore no locking will be required.
3013 *
3014 * This setting also determines
3015 *
3016 * A) The number of objects from per cpu partial slabs dumped to the
3017 * per node list when we reach the limit.
3018 * B) The number of objects in cpu partial slabs to extract from the
3019 * per node list when we run out of per cpu objects. We only fetch 50%
3020 * to keep some capacity around for frees.
3021 */
3022 if (s->size >= PAGE_SIZE)
3023 s->cpu_partial = 2;
3024 else if (s->size >= 1024)
3025 s->cpu_partial = 6;
3026 else if (s->size >= 256)
3027 s->cpu_partial = 13;
3028 else
3029 s->cpu_partial = 30;
3030
2597 s->refcount = 1; 3031 s->refcount = 1;
2598#ifdef CONFIG_NUMA 3032#ifdef CONFIG_NUMA
2599 s->remote_node_defrag_ratio = 1000; 3033 s->remote_node_defrag_ratio = 1000;
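The cpu_partial defaults chosen in kmem_cache_open() above scale inversely with object size. Restated as a stand-alone helper purely for illustration (same thresholds as the hunk; the function name is made up): a 192-byte cache gets a per-cpu budget of 30 objects, while a PAGE_SIZE or larger cache gets only 2.

#include <asm/page.h>           /* PAGE_SIZE */

static unsigned int default_cpu_partial(unsigned int size)
{
        if (size >= PAGE_SIZE)
                return 2;
        if (size >= 1024)
                return 6;
        if (size >= 256)
                return 13;
        return 30;
}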
@@ -2652,23 +3086,22 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2652 3086
2653/* 3087/*
2654 * Attempt to free all partial slabs on a node. 3088 * Attempt to free all partial slabs on a node.
3089 * This is called from kmem_cache_close(). We must be the last thread
3090 * using the cache and therefore we do not need to lock anymore.
2655 */ 3091 */
2656static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 3092static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2657{ 3093{
2658 unsigned long flags;
2659 struct page *page, *h; 3094 struct page *page, *h;
2660 3095
2661 spin_lock_irqsave(&n->list_lock, flags);
2662 list_for_each_entry_safe(page, h, &n->partial, lru) { 3096 list_for_each_entry_safe(page, h, &n->partial, lru) {
2663 if (!page->inuse) { 3097 if (!page->inuse) {
2664 __remove_partial(n, page); 3098 remove_partial(n, page);
2665 discard_slab(s, page); 3099 discard_slab(s, page);
2666 } else { 3100 } else {
2667 list_slab_objects(s, page, 3101 list_slab_objects(s, page,
2668 "Objects remaining on kmem_cache_close()"); 3102 "Objects remaining on kmem_cache_close()");
2669 } 3103 }
2670 } 3104 }
2671 spin_unlock_irqrestore(&n->list_lock, flags);
2672} 3105}
2673 3106
2674/* 3107/*
@@ -2702,6 +3135,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
2702 s->refcount--; 3135 s->refcount--;
2703 if (!s->refcount) { 3136 if (!s->refcount) {
2704 list_del(&s->list); 3137 list_del(&s->list);
3138 up_write(&slub_lock);
2705 if (kmem_cache_close(s)) { 3139 if (kmem_cache_close(s)) {
2706 printk(KERN_ERR "SLUB %s: %s called for cache that " 3140 printk(KERN_ERR "SLUB %s: %s called for cache that "
2707 "still has objects.\n", s->name, __func__); 3141 "still has objects.\n", s->name, __func__);
@@ -2710,8 +3144,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2710 if (s->flags & SLAB_DESTROY_BY_RCU) 3144 if (s->flags & SLAB_DESTROY_BY_RCU)
2711 rcu_barrier(); 3145 rcu_barrier();
2712 sysfs_slab_remove(s); 3146 sysfs_slab_remove(s);
2713 } 3147 } else
2714 up_write(&slub_lock); 3148 up_write(&slub_lock);
2715} 3149}
2716EXPORT_SYMBOL(kmem_cache_destroy); 3150EXPORT_SYMBOL(kmem_cache_destroy);
2717 3151
@@ -2928,6 +3362,42 @@ size_t ksize(const void *object)
2928} 3362}
2929EXPORT_SYMBOL(ksize); 3363EXPORT_SYMBOL(ksize);
2930 3364
3365#ifdef CONFIG_SLUB_DEBUG
3366bool verify_mem_not_deleted(const void *x)
3367{
3368 struct page *page;
3369 void *object = (void *)x;
3370 unsigned long flags;
3371 bool rv;
3372
3373 if (unlikely(ZERO_OR_NULL_PTR(x)))
3374 return false;
3375
3376 local_irq_save(flags);
3377
3378 page = virt_to_head_page(x);
3379 if (unlikely(!PageSlab(page))) {
3380 /* maybe it was from stack? */
3381 rv = true;
3382 goto out_unlock;
3383 }
3384
3385 slab_lock(page);
3386 if (on_freelist(page->slab, page, object)) {
3387 object_err(page->slab, page, object, "Object is on free-list");
3388 rv = false;
3389 } else {
3390 rv = true;
3391 }
3392 slab_unlock(page);
3393
3394out_unlock:
3395 local_irq_restore(flags);
3396 return rv;
3397}
3398EXPORT_SYMBOL(verify_mem_not_deleted);
3399#endif
3400
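verify_mem_not_deleted() added above gives debugging code a way to assert that an object has not already been returned to SLUB before touching it. A hypothetical caller (struct my_struct, the helper and the warning text are invented for illustration):

#include <linux/printk.h>

struct my_struct {                      /* hypothetical object type */
        int payload;
};

static void check_before_use(const struct my_struct *obj)
{
        if (!verify_mem_not_deleted(obj))
                pr_warn("my_struct %p looks freed, not touching it\n", obj);
}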
2931void kfree(const void *x) 3401void kfree(const void *x)
2932{ 3402{
2933 struct page *page; 3403 struct page *page;
@@ -2993,29 +3463,23 @@ int kmem_cache_shrink(struct kmem_cache *s)
2993 * list_lock. page->inuse here is the upper limit. 3463 * list_lock. page->inuse here is the upper limit.
2994 */ 3464 */
2995 list_for_each_entry_safe(page, t, &n->partial, lru) { 3465 list_for_each_entry_safe(page, t, &n->partial, lru) {
2996 if (!page->inuse && slab_trylock(page)) { 3466 list_move(&page->lru, slabs_by_inuse + page->inuse);
2997 /* 3467 if (!page->inuse)
2998 * Must hold slab lock here because slab_free 3468 n->nr_partial--;
2999 * may have freed the last object and be
3000 * waiting to release the slab.
3001 */
3002 __remove_partial(n, page);
3003 slab_unlock(page);
3004 discard_slab(s, page);
3005 } else {
3006 list_move(&page->lru,
3007 slabs_by_inuse + page->inuse);
3008 }
3009 } 3469 }
3010 3470
3011 /* 3471 /*
3012 * Rebuild the partial list with the slabs filled up most 3472 * Rebuild the partial list with the slabs filled up most
3013 * first and the least used slabs at the end. 3473 * first and the least used slabs at the end.
3014 */ 3474 */
3015 for (i = objects - 1; i >= 0; i--) 3475 for (i = objects - 1; i > 0; i--)
3016 list_splice(slabs_by_inuse + i, n->partial.prev); 3476 list_splice(slabs_by_inuse + i, n->partial.prev);
3017 3477
3018 spin_unlock_irqrestore(&n->list_lock, flags); 3478 spin_unlock_irqrestore(&n->list_lock, flags);
3479
3480 /* Release empty slabs */
3481 list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
3482 discard_slab(s, page);
3019 } 3483 }
3020 3484
3021 kfree(slabs_by_inuse); 3485 kfree(slabs_by_inuse);
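kmem_cache_shrink() above now buckets the partial slabs by their inuse count under the list_lock, splices them back fullest first, and only afterwards releases the completely empty ones. The list manipulation pattern as a hedged generic sketch (invented item type and a fixed bucket count):

#include <linux/list.h>

#define MAX_OBJS 16                     /* illustrative upper bound */

struct item {                           /* illustrative type */
        struct list_head lru;
        int inuse;                      /* 0 .. MAX_OBJS */
};

static void sort_by_inuse(struct list_head *partial, struct list_head *empty)
{
        struct list_head buckets[MAX_OBJS + 1];
        struct item *it, *tmp;
        int i;

        for (i = 0; i <= MAX_OBJS; i++)
                INIT_LIST_HEAD(&buckets[i]);

        /* distribute every entry into the bucket matching its occupancy */
        list_for_each_entry_safe(it, tmp, partial, lru)
                list_move(&it->lru, &buckets[it->inuse]);

        /* rebuild: fullest first, completely empty items set aside */
        for (i = MAX_OBJS; i > 0; i--)
                list_splice(&buckets[i], partial->prev);

        list_splice(&buckets[0], empty);
}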
@@ -3588,12 +4052,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3588static void validate_slab_slab(struct kmem_cache *s, struct page *page, 4052static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3589 unsigned long *map) 4053 unsigned long *map)
3590{ 4054{
3591 if (slab_trylock(page)) { 4055 slab_lock(page);
3592 validate_slab(s, page, map); 4056 validate_slab(s, page, map);
3593 slab_unlock(page); 4057 slab_unlock(page);
3594 } else
3595 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3596 s->name, page);
3597} 4058}
3598 4059
3599static int validate_slab_node(struct kmem_cache *s, 4060static int validate_slab_node(struct kmem_cache *s,
@@ -3974,6 +4435,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3974 4435
3975 for_each_possible_cpu(cpu) { 4436 for_each_possible_cpu(cpu) {
3976 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4437 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4438 struct page *page;
3977 4439
3978 if (!c || c->node < 0) 4440 if (!c || c->node < 0)
3979 continue; 4441 continue;
@@ -3989,6 +4451,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3989 total += x; 4451 total += x;
3990 nodes[c->node] += x; 4452 nodes[c->node] += x;
3991 } 4453 }
4454 page = c->partial;
4455
4456 if (page) {
4457 x = page->pobjects;
4458 total += x;
4459 nodes[c->node] += x;
4460 }
3992 per_cpu[c->node]++; 4461 per_cpu[c->node]++;
3993 } 4462 }
3994 } 4463 }
@@ -4058,7 +4527,7 @@ static int any_slab_objects(struct kmem_cache *s)
4058#endif 4527#endif
4059 4528
4060#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4529#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4061#define to_slab(n) container_of(n, struct kmem_cache, kobj); 4530#define to_slab(n) container_of(n, struct kmem_cache, kobj)
4062 4531
4063struct slab_attribute { 4532struct slab_attribute {
4064 struct attribute attr; 4533 struct attribute attr;
@@ -4067,11 +4536,12 @@ struct slab_attribute {
4067}; 4536};
4068 4537
4069#define SLAB_ATTR_RO(_name) \ 4538#define SLAB_ATTR_RO(_name) \
4070 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 4539 static struct slab_attribute _name##_attr = \
4540 __ATTR(_name, 0400, _name##_show, NULL)
4071 4541
4072#define SLAB_ATTR(_name) \ 4542#define SLAB_ATTR(_name) \
4073 static struct slab_attribute _name##_attr = \ 4543 static struct slab_attribute _name##_attr = \
4074 __ATTR(_name, 0644, _name##_show, _name##_store) 4544 __ATTR(_name, 0600, _name##_show, _name##_store)
4075 4545
4076static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4546static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4077{ 4547{
@@ -4140,6 +4610,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4140} 4610}
4141SLAB_ATTR(min_partial); 4611SLAB_ATTR(min_partial);
4142 4612
4613static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4614{
4615 return sprintf(buf, "%u\n", s->cpu_partial);
4616}
4617
4618static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4619 size_t length)
4620{
4621 unsigned long objects;
4622 int err;
4623
4624 err = strict_strtoul(buf, 10, &objects);
4625 if (err)
4626 return err;
4627
4628 s->cpu_partial = objects;
4629 flush_all(s);
4630 return length;
4631}
4632SLAB_ATTR(cpu_partial);
4633
4143static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4634static ssize_t ctor_show(struct kmem_cache *s, char *buf)
4144{ 4635{
4145 if (!s->ctor) 4636 if (!s->ctor)
@@ -4178,6 +4669,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4178} 4669}
4179SLAB_ATTR_RO(objects_partial); 4670SLAB_ATTR_RO(objects_partial);
4180 4671
4672static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
4673{
4674 int objects = 0;
4675 int pages = 0;
4676 int cpu;
4677 int len;
4678
4679 for_each_online_cpu(cpu) {
4680 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4681
4682 if (page) {
4683 pages += page->pages;
4684 objects += page->pobjects;
4685 }
4686 }
4687
4688 len = sprintf(buf, "%d(%d)", objects, pages);
4689
4690#ifdef CONFIG_SMP
4691 for_each_online_cpu(cpu) {
4692 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4693
4694 if (page && len < PAGE_SIZE - 20)
4695 len += sprintf(buf + len, " C%d=%d(%d)", cpu,
4696 page->pobjects, page->pages);
4697 }
4698#endif
4699 return len + sprintf(buf + len, "\n");
4700}
4701SLAB_ATTR_RO(slabs_cpu_partial);
4702
4181static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4703static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4182{ 4704{
4183 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4705 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4241,8 +4763,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s,
4241 const char *buf, size_t length) 4763 const char *buf, size_t length)
4242{ 4764{
4243 s->flags &= ~SLAB_DEBUG_FREE; 4765 s->flags &= ~SLAB_DEBUG_FREE;
4244 if (buf[0] == '1') 4766 if (buf[0] == '1') {
4767 s->flags &= ~__CMPXCHG_DOUBLE;
4245 s->flags |= SLAB_DEBUG_FREE; 4768 s->flags |= SLAB_DEBUG_FREE;
4769 }
4246 return length; 4770 return length;
4247} 4771}
4248SLAB_ATTR(sanity_checks); 4772SLAB_ATTR(sanity_checks);
@@ -4256,8 +4780,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4256 size_t length) 4780 size_t length)
4257{ 4781{
4258 s->flags &= ~SLAB_TRACE; 4782 s->flags &= ~SLAB_TRACE;
4259 if (buf[0] == '1') 4783 if (buf[0] == '1') {
4784 s->flags &= ~__CMPXCHG_DOUBLE;
4260 s->flags |= SLAB_TRACE; 4785 s->flags |= SLAB_TRACE;
4786 }
4261 return length; 4787 return length;
4262} 4788}
4263SLAB_ATTR(trace); 4789SLAB_ATTR(trace);
@@ -4274,8 +4800,10 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4274 return -EBUSY; 4800 return -EBUSY;
4275 4801
4276 s->flags &= ~SLAB_RED_ZONE; 4802 s->flags &= ~SLAB_RED_ZONE;
4277 if (buf[0] == '1') 4803 if (buf[0] == '1') {
4804 s->flags &= ~__CMPXCHG_DOUBLE;
4278 s->flags |= SLAB_RED_ZONE; 4805 s->flags |= SLAB_RED_ZONE;
4806 }
4279 calculate_sizes(s, -1); 4807 calculate_sizes(s, -1);
4280 return length; 4808 return length;
4281} 4809}
@@ -4293,8 +4821,10 @@ static ssize_t poison_store(struct kmem_cache *s,
4293 return -EBUSY; 4821 return -EBUSY;
4294 4822
4295 s->flags &= ~SLAB_POISON; 4823 s->flags &= ~SLAB_POISON;
4296 if (buf[0] == '1') 4824 if (buf[0] == '1') {
4825 s->flags &= ~__CMPXCHG_DOUBLE;
4297 s->flags |= SLAB_POISON; 4826 s->flags |= SLAB_POISON;
4827 }
4298 calculate_sizes(s, -1); 4828 calculate_sizes(s, -1);
4299 return length; 4829 return length;
4300} 4830}
@@ -4312,8 +4842,10 @@ static ssize_t store_user_store(struct kmem_cache *s,
4312 return -EBUSY; 4842 return -EBUSY;
4313 4843
4314 s->flags &= ~SLAB_STORE_USER; 4844 s->flags &= ~SLAB_STORE_USER;
4315 if (buf[0] == '1') 4845 if (buf[0] == '1') {
4846 s->flags &= ~__CMPXCHG_DOUBLE;
4316 s->flags |= SLAB_STORE_USER; 4847 s->flags |= SLAB_STORE_USER;
4848 }
4317 calculate_sizes(s, -1); 4849 calculate_sizes(s, -1);
4318 return length; 4850 return length;
4319} 4851}
@@ -4478,6 +5010,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4478STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 5010STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4479STAT_ATTR(ALLOC_SLAB, alloc_slab); 5011STAT_ATTR(ALLOC_SLAB, alloc_slab);
4480STAT_ATTR(ALLOC_REFILL, alloc_refill); 5012STAT_ATTR(ALLOC_REFILL, alloc_refill);
5013STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4481STAT_ATTR(FREE_SLAB, free_slab); 5014STAT_ATTR(FREE_SLAB, free_slab);
4482STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 5015STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4483STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 5016STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
@@ -4485,7 +5018,12 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4485STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 5018STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4486STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 5019STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4487STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 5020STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
5021STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4488STAT_ATTR(ORDER_FALLBACK, order_fallback); 5022STAT_ATTR(ORDER_FALLBACK, order_fallback);
5023STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
5024STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
5025STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
5026STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
4489#endif 5027#endif
4490 5028
4491static struct attribute *slab_attrs[] = { 5029static struct attribute *slab_attrs[] = {
@@ -4494,6 +5032,7 @@ static struct attribute *slab_attrs[] = {
4494 &objs_per_slab_attr.attr, 5032 &objs_per_slab_attr.attr,
4495 &order_attr.attr, 5033 &order_attr.attr,
4496 &min_partial_attr.attr, 5034 &min_partial_attr.attr,
5035 &cpu_partial_attr.attr,
4497 &objects_attr.attr, 5036 &objects_attr.attr,
4498 &objects_partial_attr.attr, 5037 &objects_partial_attr.attr,
4499 &partial_attr.attr, 5038 &partial_attr.attr,
@@ -4506,6 +5045,7 @@ static struct attribute *slab_attrs[] = {
4506 &destroy_by_rcu_attr.attr, 5045 &destroy_by_rcu_attr.attr,
4507 &shrink_attr.attr, 5046 &shrink_attr.attr,
4508 &reserved_attr.attr, 5047 &reserved_attr.attr,
5048 &slabs_cpu_partial_attr.attr,
4509#ifdef CONFIG_SLUB_DEBUG 5049#ifdef CONFIG_SLUB_DEBUG
4510 &total_objects_attr.attr, 5050 &total_objects_attr.attr,
4511 &slabs_attr.attr, 5051 &slabs_attr.attr,
@@ -4535,6 +5075,7 @@ static struct attribute *slab_attrs[] = {
4535 &alloc_from_partial_attr.attr, 5075 &alloc_from_partial_attr.attr,
4536 &alloc_slab_attr.attr, 5076 &alloc_slab_attr.attr,
4537 &alloc_refill_attr.attr, 5077 &alloc_refill_attr.attr,
5078 &alloc_node_mismatch_attr.attr,
4538 &free_slab_attr.attr, 5079 &free_slab_attr.attr,
4539 &cpuslab_flush_attr.attr, 5080 &cpuslab_flush_attr.attr,
4540 &deactivate_full_attr.attr, 5081 &deactivate_full_attr.attr,
@@ -4542,7 +5083,12 @@ static struct attribute *slab_attrs[] = {
4542 &deactivate_to_head_attr.attr, 5083 &deactivate_to_head_attr.attr,
4543 &deactivate_to_tail_attr.attr, 5084 &deactivate_to_tail_attr.attr,
4544 &deactivate_remote_frees_attr.attr, 5085 &deactivate_remote_frees_attr.attr,
5086 &deactivate_bypass_attr.attr,
4545 &order_fallback_attr.attr, 5087 &order_fallback_attr.attr,
5088 &cmpxchg_double_fail_attr.attr,
5089 &cmpxchg_double_cpu_fail_attr.attr,
5090 &cpu_partial_alloc_attr.attr,
5091 &cpu_partial_free_attr.attr,
4546#endif 5092#endif
4547#ifdef CONFIG_FAILSLAB 5093#ifdef CONFIG_FAILSLAB
4548 &failslab_attr.attr, 5094 &failslab_attr.attr,
@@ -4894,7 +5440,7 @@ static const struct file_operations proc_slabinfo_operations = {
4894 5440
4895static int __init slab_proc_init(void) 5441static int __init slab_proc_init(void)
4896{ 5442{
4897 proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); 5443 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
4898 return 0; 5444 return 0;
4899} 5445}
4900module_init(slab_proc_init); 5446module_init(slab_proc_init);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 64b984091edb..1b7e22ab9b09 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -21,7 +21,6 @@
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/module.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/spinlock.h> 25#include <linux/spinlock.h>
27#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
diff --git a/mm/sparse.c b/mm/sparse.c
index aa64b12831a2..61d7cde23111 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -6,7 +6,7 @@
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/highmem.h> 8#include <linux/highmem.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include "internal.h" 12#include "internal.h"
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; 40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
41#endif 41#endif
42 42
43int page_to_nid(struct page *page) 43int page_to_nid(const struct page *page)
44{ 44{
45 return section_to_node_table[page_to_section(page)]; 45 return section_to_node_table[page_to_section(page)];
46} 46}
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b3..a91caf754d9b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -21,7 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/pagevec.h> 22#include <linux/pagevec.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/module.h> 24#include <linux/export.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/buffer_head.h> /* for try_to_release_page() */ 26#include <linux/buffer_head.h> /* for try_to_release_page() */
27#include <linux/percpu_counter.h> 27#include <linux/percpu_counter.h>
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
78{ 78{
79 if (unlikely(PageTail(page))) { 79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */ 80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page; 81 struct page *page_head = compound_trans_head(page);
82 smp_rmb(); 82
83 /* 83 if (likely(page != page_head &&
84 * If PageTail is still set after smp_rmb() we can be sure 84 get_page_unless_zero(page_head))) {
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags; 85 unsigned long flags;
90 /* 86 /*
91 * Verify that our page_head wasn't converted 87 * page_head wasn't a dangling pointer but it
92 * to a a regular page before we got a 88 * may not be a head page anymore by the time
93 * reference on it. 89 * we obtain the lock. That is ok as long as it
90 * can't be freed from under us.
94 */ 91 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head); 92 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) { 93 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */ 94 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags); 95 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head)); 96 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head)) 97 if (put_page_testzero(page_head))
115 __put_single_page(page_head); 98 __put_single_page(page_head);
116 out_put_single: 99 out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
121 VM_BUG_ON(page_head != page->first_page); 104 VM_BUG_ON(page_head != page->first_page);
122 /* 105 /*
123 * We can release the refcount taken by 106 * We can release the refcount taken by
124 * get_page_unless_zero now that 107 * get_page_unless_zero() now that
125 * split_huge_page_refcount is blocked on the 108 * __split_huge_page_refcount() is blocked on
126 * compound_lock. 109 * the compound_lock.
127 */ 110 */
128 if (put_page_testzero(page_head)) 111 if (put_page_testzero(page_head))
129 VM_BUG_ON(1); 112 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */ 113 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0); 114 VM_BUG_ON(page_mapcount(page) <= 0);
132 atomic_dec(&page->_count); 115 atomic_dec(&page->_mapcount);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 116 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
117 VM_BUG_ON(atomic_read(&page->_count) != 0);
134 compound_unlock_irqrestore(page_head, flags); 118 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) { 119 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head)) 120 if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
160} 144}
161EXPORT_SYMBOL(put_page); 145EXPORT_SYMBOL(put_page);
162 146
147/*
148 * This function is exported but must not be called by anything other
149 * than get_page(). It implements the slow path of get_page().
150 */
151bool __get_page_tail(struct page *page)
152{
153 /*
154 * This takes care of get_page() if run on a tail page
155 * returned by one of the get_user_pages/follow_page variants.
156 * get_user_pages/follow_page itself doesn't need the compound
157 * lock because it runs __get_page_tail_foll() under the
158 * proper PT lock that already serializes against
159 * split_huge_page().
160 */
161 unsigned long flags;
162 bool got = false;
163 struct page *page_head = compound_trans_head(page);
164
165 if (likely(page != page_head && get_page_unless_zero(page_head))) {
166 /*
167 * page_head wasn't a dangling pointer but it
168 * may not be a head page anymore by the time
169 * we obtain the lock. That is ok as long as it
170 * can't be freed from under us.
171 */
172 flags = compound_lock_irqsave(page_head);
173 /* here __split_huge_page_refcount won't run anymore */
174 if (likely(PageTail(page))) {
175 __get_page_tail_foll(page, false);
176 got = true;
177 }
178 compound_unlock_irqrestore(page_head, flags);
179 if (unlikely(!got))
180 put_page(page_head);
181 }
182 return got;
183}
184EXPORT_SYMBOL(__get_page_tail);
185
163/** 186/**
164 * put_pages_list() - release a list of pages 187 * put_pages_list() - release a list of pages
165 * @pages: list of pages threaded on page->lru 188 * @pages: list of pages threaded on page->lru
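
The put_compound_page() and __get_page_tail() hunks above both follow one pattern: pin the presumed head page with get_page_unless_zero(), take the compound lock so __split_huge_page_refcount() can no longer run, re-check PageTail() under the lock, and drop the pin if a split won the race. Below is a rough userspace sketch of that pin/lock/revalidate shape only; the struct, the mutex standing in for compound_lock, and the non-atomic counters are illustrative stand-ins, not kernel code:

#include <stdbool.h>
#include <pthread.h>

struct fake_page {
	int refcount;                 /* stands in for page->_count      */
	bool is_tail;                 /* stands in for PageTail(page)    */
	struct fake_page *head;       /* stands in for the compound head */
	pthread_mutex_t lock;         /* stands in for compound_lock     */
};

static bool get_ref_unless_zero(struct fake_page *p)
{
	if (p->refcount == 0)         /* already freed: nothing to pin */
		return false;
	p->refcount++;
	return true;
}

/* Take a reference on a tail page without racing a concurrent split. */
static bool get_tail_ref(struct fake_page *page)
{
	struct fake_page *head = page->head;
	bool got = false;

	if (page == head || !get_ref_unless_zero(head))
		return false;

	pthread_mutex_lock(&head->lock);    /* splits are now blocked    */
	if (page->is_tail) {                /* revalidate under the lock */
		page->refcount++;
		got = true;
	}
	pthread_mutex_unlock(&head->lock);

	if (!got)
		head->refcount--;           /* split won the race: drop the pin */
	return got;
}
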
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 46680461785b..78cc4d1f6cce 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -6,7 +6,6 @@
6 * 6 *
7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie 7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie
8 */ 8 */
9#include <linux/module.h>
10#include <linux/mm.h> 9#include <linux/mm.h>
11#include <linux/gfp.h> 10#include <linux/gfp.h>
12#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb4..b1cd12060723 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -21,7 +21,6 @@
21#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/ksm.h> 24#include <linux/ksm.h>
26#include <linux/rmap.h> 25#include <linux/rmap.h>
27#include <linux/security.h> 26#include <linux/security.h>
@@ -1617,7 +1616,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1617 1616
1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1617 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1619 err = try_to_unuse(type); 1618 err = try_to_unuse(type);
1620 test_set_oom_score_adj(oom_score_adj); 1619 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1621 1620
1622 if (err) { 1621 if (err) {
1623 /* 1622 /*
@@ -1681,19 +1680,14 @@ out:
1681} 1680}
1682 1681
1683#ifdef CONFIG_PROC_FS 1682#ifdef CONFIG_PROC_FS
1684struct proc_swaps {
1685 struct seq_file seq;
1686 int event;
1687};
1688
1689static unsigned swaps_poll(struct file *file, poll_table *wait) 1683static unsigned swaps_poll(struct file *file, poll_table *wait)
1690{ 1684{
1691 struct proc_swaps *s = file->private_data; 1685 struct seq_file *seq = file->private_data;
1692 1686
1693 poll_wait(file, &proc_poll_wait, wait); 1687 poll_wait(file, &proc_poll_wait, wait);
1694 1688
1695 if (s->event != atomic_read(&proc_poll_event)) { 1689 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1696 s->event = atomic_read(&proc_poll_event); 1690 seq->poll_event = atomic_read(&proc_poll_event);
1697 return POLLIN | POLLRDNORM | POLLERR | POLLPRI; 1691 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1698 } 1692 }
1699 1693
@@ -1783,24 +1777,16 @@ static const struct seq_operations swaps_op = {
1783 1777
1784static int swaps_open(struct inode *inode, struct file *file) 1778static int swaps_open(struct inode *inode, struct file *file)
1785{ 1779{
1786 struct proc_swaps *s; 1780 struct seq_file *seq;
1787 int ret; 1781 int ret;
1788 1782
1789 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1790 if (!s)
1791 return -ENOMEM;
1792
1793 file->private_data = s;
1794
1795 ret = seq_open(file, &swaps_op); 1783 ret = seq_open(file, &swaps_op);
1796 if (ret) { 1784 if (ret)
1797 kfree(s);
1798 return ret; 1785 return ret;
1799 }
1800 1786
1801 s->seq.private = s; 1787 seq = file->private_data;
1802 s->event = atomic_read(&proc_poll_event); 1788 seq->poll_event = atomic_read(&proc_poll_event);
1803 return ret; 1789 return 0;
1804} 1790}
1805 1791
1806static const struct file_operations proc_swaps_operations = { 1792static const struct file_operations proc_swaps_operations = {
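
The swaps_open()/swaps_poll() rework above is possible because seq_file now carries a poll_event field of its own, so the private proc_swaps wrapper can go. A sketch of the resulting open/poll idiom; my_seq_ops, my_poll_wait and my_poll_event are hypothetical placeholders for a driver's own objects:

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/seq_file.h>
#include <linux/atomic.h>
#include <linux/wait.h>

static const struct seq_operations my_seq_ops;          /* hypothetical */
static DECLARE_WAIT_QUEUE_HEAD(my_poll_wait);            /* hypothetical */
static atomic_t my_poll_event = ATOMIC_INIT(0);          /* hypothetical */

static int my_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int ret = seq_open(file, &my_seq_ops);

	if (ret)
		return ret;
	seq = file->private_data;                       /* set up by seq_open() */
	seq->poll_event = atomic_read(&my_poll_event);  /* remember current generation */
	return 0;
}

static unsigned my_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;

	poll_wait(file, &my_poll_wait, wait);
	if (seq->poll_event != atomic_read(&my_poll_event)) {
		/* generation changed since the last poll: report an event */
		seq->poll_event = atomic_read(&my_poll_event);
		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
	}
	return POLLIN | POLLRDNORM;
}
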
@@ -1937,20 +1923,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1937 1923
1938 /* 1924 /*
1939 * Find out how many pages are allowed for a single swap 1925 * Find out how many pages are allowed for a single swap
1940 * device. There are two limiting factors: 1) the number of 1926 * device. There are three limiting factors: 1) the number
1941 * bits for the swap offset in the swp_entry_t type and 1927 * of bits for the swap offset in the swp_entry_t type, and
1942 * 2) the number of bits in a swap pte as defined by 1928 * 2) the number of bits in the swap pte as defined by
1943 * the different architectures. In order to find the 1929 * the different architectures, and 3) the number of free bits
1944 * largest possible bit mask a swap entry with swap type 0 1930 * in an exceptional radix_tree entry. In order to find the
1931 * largest possible bit mask, a swap entry with swap type 0
1945 * and swap offset ~0UL is created, encoded to a swap pte, 1932 * and swap offset ~0UL is created, encoded to a swap pte,
1946 * decoded to a swp_entry_t again and finally the swap 1933 * decoded to a swp_entry_t again, and finally the swap
1947 * offset is extracted. This will mask all the bits from 1934 * offset is extracted. This will mask all the bits from
1948 * the initial ~0UL mask that can't be encoded in either 1935 * the initial ~0UL mask that can't be encoded in either
1949 * the swp_entry_t or the architecture definition of a 1936 * the swp_entry_t or the architecture definition of a
1950 * swap pte. 1937 * swap pte. Then the same is done for a radix_tree entry.
1951 */ 1938 */
1952 maxpages = swp_offset(pte_to_swp_entry( 1939 maxpages = swp_offset(pte_to_swp_entry(
1953 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 1940 swp_entry_to_pte(swp_entry(0, ~0UL))));
1941 maxpages = swp_offset(radix_to_swp_entry(
1942 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1943
1954 if (maxpages > swap_header->info.last_page) { 1944 if (maxpages > swap_header->info.last_page) {
1955 maxpages = swap_header->info.last_page + 1; 1945 maxpages = swap_header->info.last_page + 1;
1956 /* p->max is an unsigned int: don't overflow it */ 1946 /* p->max is an unsigned int: don't overflow it */
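
The updated read_swap_header() comment describes probing the largest encodable swap offset by round-tripping an all-ones offset through each lossy encoding and keeping only the bits that survive. The same idea as a standalone userspace sketch; the two bit widths are invented stand-ins for whatever the swap pte and radix-tree entry formats can really hold:

#include <stdio.h>

#define PTE_OFFSET_BITS   27   /* assumed offset width in a swap pte            */
#define RADIX_OFFSET_BITS 29   /* assumed offset width in an exceptional radix entry */

static unsigned long through_pte(unsigned long off)
{
	return off & ((1UL << PTE_OFFSET_BITS) - 1);   /* drop bits the pte can't hold */
}

static unsigned long through_radix(unsigned long off)
{
	return off & ((1UL << RADIX_OFFSET_BITS) - 1); /* drop bits the radix entry can't hold */
}

int main(void)
{
	/* Start from ~0UL; only bits representable in every encoding survive. */
	unsigned long maxpages = through_pte(~0UL);

	maxpages = through_radix(maxpages) + 1;
	printf("offsets per swap device: %lu\n", maxpages);
	return 0;
}
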
diff --git a/mm/thrash.c b/mm/thrash.c
index fabf2d0f5169..57ad495dbd54 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -6,7 +6,7 @@
6 * Released under the GPL, see the file COPYING for details. 6 * Released under the GPL, see the file COPYING for details.
7 * 7 *
8 * Simple token based thrashing protection, using the algorithm 8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf 9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 * 10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> 11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token: 12 * Improved algorithm to pass token:
@@ -29,9 +29,7 @@
29 29
30static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg; 32static struct mem_cgroup *swap_token_memcg;
33static unsigned int global_faults;
34static unsigned int last_aging;
35 33
36#ifdef CONFIG_CGROUP_MEM_RES_CTLR 34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
37static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) 35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
@@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm)
55{ 53{
56 int current_interval; 54 int current_interval;
57 unsigned int old_prio = mm->token_priority; 55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
58 58
59 global_faults++; 59 global_faults++;
60 60
@@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm)
67 if (!swap_token_mm) 67 if (!swap_token_mm)
68 goto replace_token; 68 goto replace_token;
69 69
70 /*
71 * Usually, we don't need priority aging because long-interval faults
72 * make the priority decrease quickly. But there is one exception: if the
73 * token owner task is sleeping, it never makes long-interval faults.
74 * Thus, we need a priority aging mechanism instead. The requirements
75 * for priority aging are:
76 * 1) The aging interval must be reasonably long. Too short an aging
77 * interval loses the swap token too quickly and decreases performance.
78 * 2) The swap token owner task has to get priority aging even while
79 * it is asleep.
80 */
70 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { 81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
71 swap_token_mm->token_priority /= 2; 82 swap_token_mm->token_priority /= 2;
72 last_aging = global_faults; 83 last_aging = global_faults;
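
The new grab_swap_token() comment amounts to: halve the token owner's priority whenever enough global faults have passed since the last aging, so even a sleeping owner eventually loses its advantage. A trimmed-down sketch of just that rule; the interval value and the names are made up for illustration:

#define TOKEN_AGING_INTERVAL 512         /* assumed value, for illustration */

static unsigned int token_priority = 64; /* stands in for swap_token_mm->token_priority */

static void note_global_fault(void)
{
	static unsigned int global_faults, last_aging;

	global_faults++;
	/* The check runs on every fault system-wide, so the token owner
	 * ages even while it sleeps and never faults itself. */
	if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
		token_priority /= 2;
		last_aging = global_faults;
	}
}
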
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad7..632b15e29f74 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,7 +12,7 @@
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
@@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page)
199 * The first pass will remove most pages, so the search cost of the second pass 199 * The first pass will remove most pages, so the search cost of the second pass
200 * is low. 200 * is low.
201 * 201 *
202 * When looking at page->index outside the page lock we need to be careful to
203 * copy it into a local to avoid races (it could change at any time).
204 *
205 * We pass down the cache-hot hint to the page freeing code. Even if the 202 * We pass down the cache-hot hint to the page freeing code. Even if the
206 * mapping is large, it is probably the case that the final pages are the most 203 * mapping is large, it is probably the case that the final pages are the most
207 * recently touched, and freeing happens in ascending file offset order. 204 * recently touched, and freeing happens in ascending file offset order.
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
210 loff_t lstart, loff_t lend) 207 loff_t lstart, loff_t lend)
211{ 208{
212 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 209 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
213 pgoff_t end;
214 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 210 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
215 struct pagevec pvec; 211 struct pagevec pvec;
216 pgoff_t next; 212 pgoff_t index;
213 pgoff_t end;
217 int i; 214 int i;
218 215
219 cleancache_flush_inode(mapping); 216 cleancache_flush_inode(mapping);
@@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping,
224 end = (lend >> PAGE_CACHE_SHIFT); 221 end = (lend >> PAGE_CACHE_SHIFT);
225 222
226 pagevec_init(&pvec, 0); 223 pagevec_init(&pvec, 0);
227 next = start; 224 index = start;
228 while (next <= end && 225 while (index <= end && pagevec_lookup(&pvec, mapping, index,
229 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 226 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
230 mem_cgroup_uncharge_start(); 227 mem_cgroup_uncharge_start();
231 for (i = 0; i < pagevec_count(&pvec); i++) { 228 for (i = 0; i < pagevec_count(&pvec); i++) {
232 struct page *page = pvec.pages[i]; 229 struct page *page = pvec.pages[i];
233 pgoff_t page_index = page->index;
234 230
235 if (page_index > end) { 231 /* We rely upon deletion not changing page->index */
236 next = page_index; 232 index = page->index;
233 if (index > end)
237 break; 234 break;
238 }
239 235
240 if (page_index > next)
241 next = page_index;
242 next++;
243 if (!trylock_page(page)) 236 if (!trylock_page(page))
244 continue; 237 continue;
238 WARN_ON(page->index != index);
245 if (PageWriteback(page)) { 239 if (PageWriteback(page)) {
246 unlock_page(page); 240 unlock_page(page);
247 continue; 241 continue;
@@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
252 pagevec_release(&pvec); 246 pagevec_release(&pvec);
253 mem_cgroup_uncharge_end(); 247 mem_cgroup_uncharge_end();
254 cond_resched(); 248 cond_resched();
249 index++;
255 } 250 }
256 251
257 if (partial) { 252 if (partial) {
@@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping,
264 } 259 }
265 } 260 }
266 261
267 next = start; 262 index = start;
268 for ( ; ; ) { 263 for ( ; ; ) {
269 cond_resched(); 264 cond_resched();
270 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 265 if (!pagevec_lookup(&pvec, mapping, index,
271 if (next == start) 266 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
267 if (index == start)
272 break; 268 break;
273 next = start; 269 index = start;
274 continue; 270 continue;
275 } 271 }
276 if (pvec.pages[0]->index > end) { 272 if (index == start && pvec.pages[0]->index > end) {
277 pagevec_release(&pvec); 273 pagevec_release(&pvec);
278 break; 274 break;
279 } 275 }
@@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping,
281 for (i = 0; i < pagevec_count(&pvec); i++) { 277 for (i = 0; i < pagevec_count(&pvec); i++) {
282 struct page *page = pvec.pages[i]; 278 struct page *page = pvec.pages[i];
283 279
284 if (page->index > end) 280 /* We rely upon deletion not changing page->index */
281 index = page->index;
282 if (index > end)
285 break; 283 break;
284
286 lock_page(page); 285 lock_page(page);
286 WARN_ON(page->index != index);
287 wait_on_page_writeback(page); 287 wait_on_page_writeback(page);
288 truncate_inode_page(mapping, page); 288 truncate_inode_page(mapping, page);
289 if (page->index > next)
290 next = page->index;
291 next++;
292 unlock_page(page); 289 unlock_page(page);
293 } 290 }
294 pagevec_release(&pvec); 291 pagevec_release(&pvec);
295 mem_cgroup_uncharge_end(); 292 mem_cgroup_uncharge_end();
293 index++;
296 } 294 }
297 cleancache_flush_inode(mapping); 295 cleancache_flush_inode(mapping);
298} 296}
@@ -333,35 +331,34 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
333 pgoff_t start, pgoff_t end) 331 pgoff_t start, pgoff_t end)
334{ 332{
335 struct pagevec pvec; 333 struct pagevec pvec;
336 pgoff_t next = start; 334 pgoff_t index = start;
337 unsigned long ret; 335 unsigned long ret;
338 unsigned long count = 0; 336 unsigned long count = 0;
339 int i; 337 int i;
340 338
339 /*
340 * Note: this function may get called on a shmem/tmpfs mapping:
341 * pagevec_lookup() might then return 0 prematurely (because it
342 * got a gangful of swap entries); but it's hardly worth worrying
343 * about - it can rarely have anything to free from such a mapping
344 * (most pages are dirty), and already skips over any difficulties.
345 */
346
341 pagevec_init(&pvec, 0); 347 pagevec_init(&pvec, 0);
342 while (next <= end && 348 while (index <= end && pagevec_lookup(&pvec, mapping, index,
343 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 349 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
344 mem_cgroup_uncharge_start(); 350 mem_cgroup_uncharge_start();
345 for (i = 0; i < pagevec_count(&pvec); i++) { 351 for (i = 0; i < pagevec_count(&pvec); i++) {
346 struct page *page = pvec.pages[i]; 352 struct page *page = pvec.pages[i];
347 pgoff_t index;
348 int lock_failed;
349 353
350 lock_failed = !trylock_page(page); 354 /* We rely upon deletion not changing page->index */
351
352 /*
353 * We really shouldn't be looking at the ->index of an
354 * unlocked page. But we're not allowed to lock these
355 * pages. So we rely upon nobody altering the ->index
356 * of this (pinned-by-us) page.
357 */
358 index = page->index; 355 index = page->index;
359 if (index > next) 356 if (index > end)
360 next = index; 357 break;
361 next++;
362 if (lock_failed)
363 continue;
364 358
359 if (!trylock_page(page))
360 continue;
361 WARN_ON(page->index != index);
365 ret = invalidate_inode_page(page); 362 ret = invalidate_inode_page(page);
366 unlock_page(page); 363 unlock_page(page);
367 /* 364 /*
@@ -371,12 +368,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
371 if (!ret) 368 if (!ret)
372 deactivate_page(page); 369 deactivate_page(page);
373 count += ret; 370 count += ret;
374 if (next > end)
375 break;
376 } 371 }
377 pagevec_release(&pvec); 372 pagevec_release(&pvec);
378 mem_cgroup_uncharge_end(); 373 mem_cgroup_uncharge_end();
379 cond_resched(); 374 cond_resched();
375 index++;
380 } 376 }
381 return count; 377 return count;
382} 378}
@@ -442,37 +438,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
442 pgoff_t start, pgoff_t end) 438 pgoff_t start, pgoff_t end)
443{ 439{
444 struct pagevec pvec; 440 struct pagevec pvec;
445 pgoff_t next; 441 pgoff_t index;
446 int i; 442 int i;
447 int ret = 0; 443 int ret = 0;
448 int ret2 = 0; 444 int ret2 = 0;
449 int did_range_unmap = 0; 445 int did_range_unmap = 0;
450 int wrapped = 0;
451 446
452 cleancache_flush_inode(mapping); 447 cleancache_flush_inode(mapping);
453 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
454 next = start; 449 index = start;
455 while (next <= end && !wrapped && 450 while (index <= end && pagevec_lookup(&pvec, mapping, index,
456 pagevec_lookup(&pvec, mapping, next, 451 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
457 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
458 mem_cgroup_uncharge_start(); 452 mem_cgroup_uncharge_start();
459 for (i = 0; i < pagevec_count(&pvec); i++) { 453 for (i = 0; i < pagevec_count(&pvec); i++) {
460 struct page *page = pvec.pages[i]; 454 struct page *page = pvec.pages[i];
461 pgoff_t page_index; 455
456 /* We rely upon deletion not changing page->index */
457 index = page->index;
458 if (index > end)
459 break;
462 460
463 lock_page(page); 461 lock_page(page);
462 WARN_ON(page->index != index);
464 if (page->mapping != mapping) { 463 if (page->mapping != mapping) {
465 unlock_page(page); 464 unlock_page(page);
466 continue; 465 continue;
467 } 466 }
468 page_index = page->index;
469 next = page_index + 1;
470 if (next == 0)
471 wrapped = 1;
472 if (page_index > end) {
473 unlock_page(page);
474 break;
475 }
476 wait_on_page_writeback(page); 467 wait_on_page_writeback(page);
477 if (page_mapped(page)) { 468 if (page_mapped(page)) {
478 if (!did_range_unmap) { 469 if (!did_range_unmap) {
@@ -480,9 +471,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
480 * Zap the rest of the file in one hit. 471 * Zap the rest of the file in one hit.
481 */ 472 */
482 unmap_mapping_range(mapping, 473 unmap_mapping_range(mapping,
483 (loff_t)page_index<<PAGE_CACHE_SHIFT, 474 (loff_t)index << PAGE_CACHE_SHIFT,
484 (loff_t)(end - page_index + 1) 475 (loff_t)(1 + end - index)
485 << PAGE_CACHE_SHIFT, 476 << PAGE_CACHE_SHIFT,
486 0); 477 0);
487 did_range_unmap = 1; 478 did_range_unmap = 1;
488 } else { 479 } else {
@@ -490,8 +481,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
490 * Just zap this page 481 * Just zap this page
491 */ 482 */
492 unmap_mapping_range(mapping, 483 unmap_mapping_range(mapping,
493 (loff_t)page_index<<PAGE_CACHE_SHIFT, 484 (loff_t)index << PAGE_CACHE_SHIFT,
494 PAGE_CACHE_SIZE, 0); 485 PAGE_CACHE_SIZE, 0);
495 } 486 }
496 } 487 }
497 BUG_ON(page_mapped(page)); 488 BUG_ON(page_mapped(page));
@@ -507,6 +498,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
507 pagevec_release(&pvec); 498 pagevec_release(&pvec);
508 mem_cgroup_uncharge_end(); 499 mem_cgroup_uncharge_end();
509 cond_resched(); 500 cond_resched();
501 index++;
510 } 502 }
511 cleancache_flush_inode(mapping); 503 cleancache_flush_inode(mapping);
512 return ret; 504 return ret;
@@ -531,8 +523,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
531/** 523/**
532 * truncate_pagecache - unmap and remove pagecache that has been truncated 524 * truncate_pagecache - unmap and remove pagecache that has been truncated
533 * @inode: inode 525 * @inode: inode
534 * @old: old file offset 526 * @oldsize: old file size
535 * @new: new file offset 527 * @newsize: new file size
536 * 528 *
537 * inode's new i_size must already be written before truncate_pagecache 529 * inode's new i_size must already be written before truncate_pagecache
538 * is called. 530 * is called.
@@ -544,9 +536,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
544 * situations such as writepage being called for a page that has already 536 * situations such as writepage being called for a page that has already
545 * had its underlying blocks deallocated. 537 * had its underlying blocks deallocated.
546 */ 538 */
547void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 539void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
548{ 540{
549 struct address_space *mapping = inode->i_mapping; 541 struct address_space *mapping = inode->i_mapping;
542 loff_t holebegin = round_up(newsize, PAGE_SIZE);
550 543
551 /* 544 /*
552 * unmap_mapping_range is called twice, first simply for 545 * unmap_mapping_range is called twice, first simply for
@@ -557,9 +550,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
557 * truncate_inode_pages finishes, hence the second 550 * truncate_inode_pages finishes, hence the second
558 * unmap_mapping_range call must be made for correctness. 551 * unmap_mapping_range call must be made for correctness.
559 */ 552 */
560 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 553 unmap_mapping_range(mapping, holebegin, 0, 1);
561 truncate_inode_pages(mapping, new); 554 truncate_inode_pages(mapping, newsize);
562 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 555 unmap_mapping_range(mapping, holebegin, 0, 1);
563} 556}
564EXPORT_SYMBOL(truncate_pagecache); 557EXPORT_SYMBOL(truncate_pagecache);
565 558
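
The holebegin value introduced above rounds the new size up to a page boundary so unmap_mapping_range() starts at the first page lying entirely beyond the new EOF. A quick worked example, assuming 4 KiB pages and a simplified round_up():

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))   /* simplified, positive values only */

int main(void)
{
	unsigned long newsize = 5000;                        /* example new file size */
	unsigned long holebegin = round_up(newsize, PAGE_SIZE);

	/* Offsets 4096..8191 still belong to the partially used last page,
	 * so the unmap starts at 8192 and truncate_inode_pages() deals
	 * with the rest. */
	printf("newsize=%lu -> holebegin=%lu\n", newsize, holebegin);
	return 0;
}
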
@@ -589,29 +582,31 @@ EXPORT_SYMBOL(truncate_setsize);
589/** 582/**
590 * vmtruncate - unmap mappings "freed" by truncate() syscall 583 * vmtruncate - unmap mappings "freed" by truncate() syscall
591 * @inode: inode of the file used 584 * @inode: inode of the file used
592 * @offset: file offset to start truncating 585 * @newsize: file offset to start truncating
593 * 586 *
594 * This function is deprecated and truncate_setsize or truncate_pagecache 587 * This function is deprecated and truncate_setsize or truncate_pagecache
595 * should be used instead, together with filesystem specific block truncation. 588 * should be used instead, together with filesystem specific block truncation.
596 */ 589 */
597int vmtruncate(struct inode *inode, loff_t offset) 590int vmtruncate(struct inode *inode, loff_t newsize)
598{ 591{
599 int error; 592 int error;
600 593
601 error = inode_newsize_ok(inode, offset); 594 error = inode_newsize_ok(inode, newsize);
602 if (error) 595 if (error)
603 return error; 596 return error;
604 597
605 truncate_setsize(inode, offset); 598 truncate_setsize(inode, newsize);
606 if (inode->i_op->truncate) 599 if (inode->i_op->truncate)
607 inode->i_op->truncate(inode); 600 inode->i_op->truncate(inode);
608 return 0; 601 return 0;
609} 602}
610EXPORT_SYMBOL(vmtruncate); 603EXPORT_SYMBOL(vmtruncate);
611 604
612int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 605int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
613{ 606{
614 struct address_space *mapping = inode->i_mapping; 607 struct address_space *mapping = inode->i_mapping;
608 loff_t holebegin = round_up(lstart, PAGE_SIZE);
609 loff_t holelen = 1 + lend - holebegin;
615 610
616 /* 611 /*
617 * If the underlying filesystem is not going to provide 612 * If the underlying filesystem is not going to provide
@@ -622,12 +617,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
622 return -ENOSYS; 617 return -ENOSYS;
623 618
624 mutex_lock(&inode->i_mutex); 619 mutex_lock(&inode->i_mutex);
625 down_write(&inode->i_alloc_sem); 620 inode_dio_wait(inode);
626 unmap_mapping_range(mapping, offset, (end - offset), 1); 621 unmap_mapping_range(mapping, holebegin, holelen, 1);
627 inode->i_op->truncate_range(inode, offset, end); 622 inode->i_op->truncate_range(inode, lstart, lend);
628 /* unmap again to remove racily COWed private pages */ 623 /* unmap again to remove racily COWed private pages */
629 unmap_mapping_range(mapping, offset, (end - offset), 1); 624 unmap_mapping_range(mapping, holebegin, holelen, 1);
630 up_write(&inode->i_alloc_sem);
631 mutex_unlock(&inode->i_mutex); 625 mutex_unlock(&inode->i_mutex);
632 626
633 return 0; 627 return 0;
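
Several of the loops above now call pagevec_lookup() with min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, so a lookup never requests more slots than a pagevec holds and never reaches past the end of the range. The clamp in isolation, assuming a pagevec capacity of 14 (a stand-in value):

#include <stdio.h>

#define PVEC_SIZE 14UL    /* assumed pagevec capacity, for illustration */

static unsigned long nr_to_ask(unsigned long index, unsigned long end)
{
	unsigned long span = end - index;          /* pages left up to 'end' (inclusive) */
	unsigned long cap  = PVEC_SIZE - 1;

	/* min(end - index, PVEC_SIZE - 1) + 1: never more than one
	 * pagevec's worth, never past 'end'. */
	return (span < cap ? span : cap) + 1;
}

int main(void)
{
	printf("%lu\n", nr_to_ask(10, 11));    /* 2 pages remain -> ask for 2  */
	printf("%lu\n", nr_to_ask(10, 500));   /* many remain    -> ask for 14 */
	return 0;
}
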
diff --git a/mm/util.c b/mm/util.c
index 88ea1bd661c0..136ac4f322b8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,7 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/string.h> 3#include <linux/string.h>
4#include <linux/module.h> 4#include <linux/export.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a7..3231bf332878 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -26,7 +26,7 @@
26#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
27#include <linux/pfn.h> 27#include <linux/pfn.h>
28#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
29#include <asm/atomic.h> 29#include <linux/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
@@ -452,13 +452,6 @@ overflow:
452 return ERR_PTR(-EBUSY); 452 return ERR_PTR(-EBUSY);
453} 453}
454 454
455static void rcu_free_va(struct rcu_head *head)
456{
457 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
458
459 kfree(va);
460}
461
462static void __free_vmap_area(struct vmap_area *va) 455static void __free_vmap_area(struct vmap_area *va)
463{ 456{
464 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 457 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
491 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) 484 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
492 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); 485 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
493 486
494 call_rcu(&va->rcu_head, rcu_free_va); 487 kfree_rcu(va, rcu_head);
495} 488}
496 489
497/* 490/*
@@ -732,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
732#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 725#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
733#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 726#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
734#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 727#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
735#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 728#define VMAP_BBMAP_BITS \
736 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 729 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
737 VMALLOC_PAGES / NR_CPUS / 16)) 730 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
731 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
738 732
739#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 733#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
740 734
@@ -837,13 +831,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
837 return vb; 831 return vb;
838} 832}
839 833
840static void rcu_free_vb(struct rcu_head *head)
841{
842 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
843
844 kfree(vb);
845}
846
847static void free_vmap_block(struct vmap_block *vb) 834static void free_vmap_block(struct vmap_block *vb)
848{ 835{
849 struct vmap_block *tmp; 836 struct vmap_block *tmp;
@@ -856,7 +843,7 @@ static void free_vmap_block(struct vmap_block *vb)
856 BUG_ON(tmp != vb); 843 BUG_ON(tmp != vb);
857 844
858 free_vmap_area_noflush(vb->va); 845 free_vmap_area_noflush(vb->va);
859 call_rcu(&vb->rcu_head, rcu_free_vb); 846 kfree_rcu(vb, rcu_head);
860} 847}
861 848
862static void purge_fragmented_blocks(int cpu) 849static void purge_fragmented_blocks(int cpu)
@@ -1266,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1266DEFINE_RWLOCK(vmlist_lock); 1253DEFINE_RWLOCK(vmlist_lock);
1267struct vm_struct *vmlist; 1254struct vm_struct *vmlist;
1268 1255
1269static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1256static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1270 unsigned long flags, void *caller) 1257 unsigned long flags, void *caller)
1271{ 1258{
1272 struct vm_struct *tmp, **p;
1273
1274 vm->flags = flags; 1259 vm->flags = flags;
1275 vm->addr = (void *)va->va_start; 1260 vm->addr = (void *)va->va_start;
1276 vm->size = va->va_end - va->va_start; 1261 vm->size = va->va_end - va->va_start;
1277 vm->caller = caller; 1262 vm->caller = caller;
1278 va->private = vm; 1263 va->private = vm;
1279 va->flags |= VM_VM_AREA; 1264 va->flags |= VM_VM_AREA;
1265}
1266
1267static void insert_vmalloc_vmlist(struct vm_struct *vm)
1268{
1269 struct vm_struct *tmp, **p;
1280 1270
1271 vm->flags &= ~VM_UNLIST;
1281 write_lock(&vmlist_lock); 1272 write_lock(&vmlist_lock);
1282 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1273 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1283 if (tmp->addr >= vm->addr) 1274 if (tmp->addr >= vm->addr)
@@ -1288,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1288 write_unlock(&vmlist_lock); 1279 write_unlock(&vmlist_lock);
1289} 1280}
1290 1281
1282static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1283 unsigned long flags, void *caller)
1284{
1285 setup_vmalloc_vm(vm, va, flags, caller);
1286 insert_vmalloc_vmlist(vm);
1287}
1288
1291static struct vm_struct *__get_vm_area_node(unsigned long size, 1289static struct vm_struct *__get_vm_area_node(unsigned long size,
1292 unsigned long align, unsigned long flags, unsigned long start, 1290 unsigned long align, unsigned long flags, unsigned long start,
1293 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1291 unsigned long end, int node, gfp_t gfp_mask, void *caller)
@@ -1326,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1326 return NULL; 1324 return NULL;
1327 } 1325 }
1328 1326
1329 insert_vmalloc_vm(area, va, flags, caller); 1327 /*
1328 * When this function is called from __vmalloc_node_range,
1329 * we do not add vm_struct to vmlist here to avoid
1330 * accessing uninitialized members of vm_struct such as
1331 * pages and nr_pages fields. They will be set later.
1332 * To distinguish it from others, we use a VM_UNLIST flag.
1333 */
1334 if (flags & VM_UNLIST)
1335 setup_vmalloc_vm(area, va, flags, caller);
1336 else
1337 insert_vmalloc_vm(area, va, flags, caller);
1338
1330 return area; 1339 return area;
1331} 1340}
1332 1341
@@ -1394,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr)
1394 va = find_vmap_area((unsigned long)addr); 1403 va = find_vmap_area((unsigned long)addr);
1395 if (va && va->flags & VM_VM_AREA) { 1404 if (va && va->flags & VM_VM_AREA) {
1396 struct vm_struct *vm = va->private; 1405 struct vm_struct *vm = va->private;
1397 struct vm_struct *tmp, **p; 1406
1398 /* 1407 if (!(vm->flags & VM_UNLIST)) {
1399 * remove from list and disallow access to this vm_struct 1408 struct vm_struct *tmp, **p;
1400 * before unmap. (address range confliction is maintained by 1409 /*
1401 * vmap.) 1410 * remove from list and disallow access to
1402 */ 1411 * this vm_struct before unmap. (address range
1403 write_lock(&vmlist_lock); 1412 * confliction is maintained by vmap.)
1404 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1413 */
1405 ; 1414 write_lock(&vmlist_lock);
1406 *p = tmp->next; 1415 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1407 write_unlock(&vmlist_lock); 1416 ;
1417 *p = tmp->next;
1418 write_unlock(&vmlist_lock);
1419 }
1408 1420
1409 vmap_debug_free_range(va->va_start, va->va_end); 1421 vmap_debug_free_range(va->va_start, va->va_end);
1410 free_unmap_vmap_area(va); 1422 free_unmap_vmap_area(va);
@@ -1581,8 +1593,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1581 return area->addr; 1593 return area->addr;
1582 1594
1583fail: 1595fail:
1584 warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " 1596 warn_alloc_failed(gfp_mask, order,
1585 "allocated %ld of %ld bytes\n", 1597 "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
1586 (area->nr_pages*PAGE_SIZE), area->size); 1598 (area->nr_pages*PAGE_SIZE), area->size);
1587 vfree(area->addr); 1599 vfree(area->addr);
1588 return NULL; 1600 return NULL;
@@ -1613,17 +1625,22 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1613 1625
1614 size = PAGE_ALIGN(size); 1626 size = PAGE_ALIGN(size);
1615 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1627 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1616 return NULL; 1628 goto fail;
1617
1618 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1619 gfp_mask, caller);
1620 1629
1630 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
1631 start, end, node, gfp_mask, caller);
1621 if (!area) 1632 if (!area)
1622 return NULL; 1633 goto fail;
1623 1634
1624 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1635 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1625 1636
1626 /* 1637 /*
1638 * In this function, the newly allocated vm_struct was not added
1639 * to the vmlist in __get_vm_area_node(), so it is added here.
1640 */
1641 insert_vmalloc_vmlist(area);
1642
1643 /*
1627 * A ref_count = 3 is needed because the vm_struct and vmap_area 1644 * A ref_count = 3 is needed because the vm_struct and vmap_area
1628 * structures allocated in the __get_vm_area_node() function contain 1645 * structures allocated in the __get_vm_area_node() function contain
1629 * references to the virtual address of the vmalloc'ed block. 1646 * references to the virtual address of the vmalloc'ed block.
@@ -1631,6 +1648,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1631 kmemleak_alloc(addr, real_size, 3, gfp_mask); 1648 kmemleak_alloc(addr, real_size, 3, gfp_mask);
1632 1649
1633 return addr; 1650 return addr;
1651
1652fail:
1653 warn_alloc_failed(gfp_mask, 0,
1654 "vmalloc: allocation failure: %lu bytes\n",
1655 real_size);
1656 return NULL;
1634} 1657}
1635 1658
1636/** 1659/**
@@ -2118,23 +2141,30 @@ void __attribute__((weak)) vmalloc_sync_all(void)
2118 2141
2119static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) 2142static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
2120{ 2143{
2121 /* apply_to_page_range() does all the hard work. */ 2144 pte_t ***p = data;
2145
2146 if (p) {
2147 *(*p) = pte;
2148 (*p)++;
2149 }
2122 return 0; 2150 return 0;
2123} 2151}
2124 2152
2125/** 2153/**
2126 * alloc_vm_area - allocate a range of kernel address space 2154 * alloc_vm_area - allocate a range of kernel address space
2127 * @size: size of the area 2155 * @size: size of the area
2156 * @ptes: returns the PTEs for the address space
2128 * 2157 *
2129 * Returns: NULL on failure, vm_struct on success 2158 * Returns: NULL on failure, vm_struct on success
2130 * 2159 *
2131 * This function reserves a range of kernel address space, and 2160 * This function reserves a range of kernel address space, and
2132 * allocates pagetables to map that range. No actual mappings 2161 * allocates pagetables to map that range. No actual mappings
2133 * are created. If the kernel address space is not shared 2162 * are created.
2134 * between processes, it syncs the pagetable across all 2163 *
2135 * processes. 2164 * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
2165 * allocated for the VM area are returned.
2136 */ 2166 */
2137struct vm_struct *alloc_vm_area(size_t size) 2167struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
2138{ 2168{
2139 struct vm_struct *area; 2169 struct vm_struct *area;
2140 2170
@@ -2148,7 +2178,7 @@ struct vm_struct *alloc_vm_area(size_t size)
2148 * of kernel virtual address space and mapped into init_mm. 2178 * of kernel virtual address space and mapped into init_mm.
2149 */ 2179 */
2150 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2180 if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2151 area->size, f, NULL)) { 2181 size, f, ptes ? &ptes : NULL)) {
2152 free_vm_area(area); 2182 free_vm_area(area);
2153 return NULL; 2183 return NULL;
2154 } 2184 }
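
Two vmalloc hunks above delete single-purpose RCU callbacks (rcu_free_va(), rcu_free_vb()) in favour of kfree_rcu(), which takes the object and the name of its rcu_head member and frees it after a grace period. The shape of that conversion with a hypothetical struct:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {                          /* hypothetical object */
	struct rcu_head rcu_head;
	/* ... payload ... */
};

/* Before: a callback whose only job is to kfree() the container. */
static void foo_rcu_free(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu_head));
}

static void foo_release_old(struct foo *foo)
{
	call_rcu(&foo->rcu_head, foo_rcu_free);
}

/* After: the trivial callback disappears. */
static void foo_release_new(struct foo *foo)
{
	kfree_rcu(foo, rcu_head);
}
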
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5ed24b94c5e6..a1893c050795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control {
95 /* Can pages be swapped as part of reclaim? */ 95 /* Can pages be swapped as part of reclaim? */
96 int may_swap; 96 int may_swap;
97 97
98 int swappiness;
99
100 int order; 98 int order;
101 99
102 /* 100 /*
@@ -173,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
173 struct scan_control *sc, enum lru_list lru) 171 struct scan_control *sc, enum lru_list lru)
174{ 172{
175 if (!scanning_global_lru(sc)) 173 if (!scanning_global_lru(sc))
176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); 174 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
175 zone_to_nid(zone), zone_idx(zone), BIT(lru));
177 176
178 return zone_page_state(zone, NR_LRU_BASE + lru); 177 return zone_page_state(zone, NR_LRU_BASE + lru);
179} 178}
@@ -250,49 +249,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
250 unsigned long long delta; 249 unsigned long long delta;
251 unsigned long total_scan; 250 unsigned long total_scan;
252 unsigned long max_pass; 251 unsigned long max_pass;
252 int shrink_ret = 0;
253 long nr;
254 long new_nr;
255 long batch_size = shrinker->batch ? shrinker->batch
256 : SHRINK_BATCH;
257
258 /*
259 * copy the current shrinker scan count into a local variable
260 * and zero it so that other concurrent shrinker invocations
261 * don't also do this scanning work.
262 */
263 do {
264 nr = shrinker->nr;
265 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
253 266
267 total_scan = nr;
254 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 268 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
255 delta = (4 * nr_pages_scanned) / shrinker->seeks; 269 delta = (4 * nr_pages_scanned) / shrinker->seeks;
256 delta *= max_pass; 270 delta *= max_pass;
257 do_div(delta, lru_pages + 1); 271 do_div(delta, lru_pages + 1);
258 shrinker->nr += delta; 272 total_scan += delta;
259 if (shrinker->nr < 0) { 273 if (total_scan < 0) {
260 printk(KERN_ERR "shrink_slab: %pF negative objects to " 274 printk(KERN_ERR "shrink_slab: %pF negative objects to "
261 "delete nr=%ld\n", 275 "delete nr=%ld\n",
262 shrinker->shrink, shrinker->nr); 276 shrinker->shrink, total_scan);
263 shrinker->nr = max_pass; 277 total_scan = max_pass;
264 } 278 }
265 279
266 /* 280 /*
281 * We need to avoid excessive windup on filesystem shrinkers
282 * due to large numbers of GFP_NOFS allocations causing the
283 * shrinkers to return -1 all the time. This results in a large
284 * nr being built up so when a shrink that can do some work
285 * comes along it empties the entire cache due to nr >>>
286 * max_pass. This is bad for sustaining a working set in
287 * memory.
288 *
289 * Hence only allow the shrinker to scan the entire cache when
290 * a large delta change is calculated directly.
291 */
292 if (delta < max_pass / 4)
293 total_scan = min(total_scan, max_pass / 2);
294
295 /*
267 * Avoid risking looping forever due to too large nr value: 296 * Avoid risking looping forever due to too large nr value:
268 * never try to free more than twice the estimate number of 297 * never try to free more than twice the estimate number of
269 * freeable entries. 298 * freeable entries.
270 */ 299 */
271 if (shrinker->nr > max_pass * 2) 300 if (total_scan > max_pass * 2)
272 shrinker->nr = max_pass * 2; 301 total_scan = max_pass * 2;
273 302
274 total_scan = shrinker->nr; 303 trace_mm_shrink_slab_start(shrinker, shrink, nr,
275 shrinker->nr = 0; 304 nr_pages_scanned, lru_pages,
305 max_pass, delta, total_scan);
276 306
277 while (total_scan >= SHRINK_BATCH) { 307 while (total_scan >= batch_size) {
278 long this_scan = SHRINK_BATCH;
279 int shrink_ret;
280 int nr_before; 308 int nr_before;
281 309
282 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 310 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
283 shrink_ret = do_shrinker_shrink(shrinker, shrink, 311 shrink_ret = do_shrinker_shrink(shrinker, shrink,
284 this_scan); 312 batch_size);
285 if (shrink_ret == -1) 313 if (shrink_ret == -1)
286 break; 314 break;
287 if (shrink_ret < nr_before) 315 if (shrink_ret < nr_before)
288 ret += nr_before - shrink_ret; 316 ret += nr_before - shrink_ret;
289 count_vm_events(SLABS_SCANNED, this_scan); 317 count_vm_events(SLABS_SCANNED, batch_size);
290 total_scan -= this_scan; 318 total_scan -= batch_size;
291 319
292 cond_resched(); 320 cond_resched();
293 } 321 }
294 322
295 shrinker->nr += total_scan; 323 /*
324 * move the unused scan count back into the shrinker in a
325 * manner that handles concurrent updates. If we exhausted the
326 * scan, there is no need to do an update.
327 */
328 do {
329 nr = shrinker->nr;
330 new_nr = total_scan + nr;
331 if (total_scan <= 0)
332 break;
333 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
334
335 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
296 } 336 }
297 up_read(&shrinker_rwsem); 337 up_read(&shrinker_rwsem);
298out: 338out:
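
The reworked shrink_slab() claims the whole deferred count with a cmpxchg() loop, does its scanning from a local copy, and returns any unused remainder the same way, so concurrent callers never double-count the backlog. A userspace model of that claim/work/return pattern using GCC's __sync_val_compare_and_swap(); the names are hypothetical:

static long deferred_work;   /* plays the role of shrinker->nr */

static long claim_all(void)
{
	long nr;

	/* Take the whole current count and zero it atomically, so two
	 * concurrent callers never both scan the same backlog. */
	do {
		nr = deferred_work;
	} while (__sync_val_compare_and_swap(&deferred_work, nr, 0) != nr);
	return nr;
}

static void give_back(long leftover)
{
	long nr, new_nr;

	if (leftover <= 0)
		return;              /* backlog exhausted: nothing to return */
	do {
		nr = deferred_work;
		new_nr = nr + leftover;
	} while (__sync_val_compare_and_swap(&deferred_work, nr, new_nr) != nr);
}
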
@@ -455,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
455 return PAGE_ACTIVATE; 495 return PAGE_ACTIVATE;
456 } 496 }
457 497
458 /*
459 * Wait on writeback if requested to. This happens when
460 * direct reclaiming a large contiguous area and the
461 * first attempt to free a range of pages fails.
462 */
463 if (PageWriteback(page) &&
464 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
465 wait_on_page_writeback(page);
466
467 if (!PageWriteback(page)) { 498 if (!PageWriteback(page)) {
468 /* synchronous write or broken a_ops? */ 499 /* synchronous write or broken a_ops? */
469 ClearPageReclaim(page); 500 ClearPageReclaim(page);
@@ -602,13 +633,14 @@ redo:
602 lru = LRU_UNEVICTABLE; 633 lru = LRU_UNEVICTABLE;
603 add_page_to_unevictable_list(page); 634 add_page_to_unevictable_list(page);
604 /* 635 /*
605 * When racing with an mlock clearing (page is 636 * When racing with an mlock or AS_UNEVICTABLE clearing
606 * unlocked), make sure that if the other thread does 637 * (page is unlocked) make sure that if the other thread
607 * not observe our setting of PG_lru and fails 638 * does not observe our setting of PG_lru and fails
608 * isolation, we see PG_mlocked cleared below and move 639 * isolation/check_move_unevictable_page,
640 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
609 * the page back to the evictable list. 641 * the page back to the evictable list.
610 * 642 *
611 * The other side is TestClearPageMlocked(). 643 * The other side is TestClearPageMlocked() or shmem_lock().
612 */ 644 */
613 smp_mb(); 645 smp_mb();
614 } 646 }
@@ -719,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
719 */ 751 */
720static unsigned long shrink_page_list(struct list_head *page_list, 752static unsigned long shrink_page_list(struct list_head *page_list,
721 struct zone *zone, 753 struct zone *zone,
722 struct scan_control *sc) 754 struct scan_control *sc,
755 int priority,
756 unsigned long *ret_nr_dirty,
757 unsigned long *ret_nr_writeback)
723{ 758{
724 LIST_HEAD(ret_pages); 759 LIST_HEAD(ret_pages);
725 LIST_HEAD(free_pages); 760 LIST_HEAD(free_pages);
@@ -727,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
727 unsigned long nr_dirty = 0; 762 unsigned long nr_dirty = 0;
728 unsigned long nr_congested = 0; 763 unsigned long nr_congested = 0;
729 unsigned long nr_reclaimed = 0; 764 unsigned long nr_reclaimed = 0;
765 unsigned long nr_writeback = 0;
730 766
731 cond_resched(); 767 cond_resched();
732 768
@@ -763,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
763 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 799 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
764 800
765 if (PageWriteback(page)) { 801 if (PageWriteback(page)) {
802 nr_writeback++;
766 /* 803 /*
767 * Synchronous reclaim is performed in two passes, 804 * Synchronous reclaim cannot queue pages for
768 * first an asynchronous pass over the list to 805 * writeback due to the possibility of stack overflow
769 * start parallel writeback, and a second synchronous 806 * but if it encounters a page under writeback, wait
770 * pass to wait for the IO to complete. Wait here 807 * for the IO to complete.
771 * for any page for which writeback has already
772 * started.
773 */ 808 */
774 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 809 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
775 may_enter_fs) 810 may_enter_fs)
@@ -825,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
825 if (PageDirty(page)) { 860 if (PageDirty(page)) {
826 nr_dirty++; 861 nr_dirty++;
827 862
863 /*
864 * Only kswapd can writeback filesystem pages to
865 * avoid risk of stack overflow but do not writeback
866 * unless under significant pressure.
867 */
868 if (page_is_file_cache(page) &&
869 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
870 /*
871 * Immediately reclaim when written back.
872 * Similar in principle to deactivate_page()
873 * except we already have the page isolated
874 * and know it's dirty
875 */
876 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
877 SetPageReclaim(page);
878
879 goto keep_locked;
880 }
881
828 if (references == PAGEREF_RECLAIM_CLEAN) 882 if (references == PAGEREF_RECLAIM_CLEAN)
829 goto keep_locked; 883 goto keep_locked;
830 if (!may_enter_fs) 884 if (!may_enter_fs)
@@ -959,6 +1013,8 @@ keep_lumpy:
959 1013
960 list_splice(&ret_pages, page_list); 1014 list_splice(&ret_pages, page_list);
961 count_vm_events(PGACTIVATE, pgactivate); 1015 count_vm_events(PGACTIVATE, pgactivate);
1016 *ret_nr_dirty += nr_dirty;
1017 *ret_nr_writeback += nr_writeback;
962 return nr_reclaimed; 1018 return nr_reclaimed;
963} 1019}
964 1020
@@ -972,23 +1028,27 @@ keep_lumpy:
972 * 1028 *
973 * returns 0 on success, -ve errno on failure. 1029 * returns 0 on success, -ve errno on failure.
974 */ 1030 */
975int __isolate_lru_page(struct page *page, int mode, int file) 1031int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
976{ 1032{
1033 bool all_lru_mode;
977 int ret = -EINVAL; 1034 int ret = -EINVAL;
978 1035
979 /* Only take pages on the LRU. */ 1036 /* Only take pages on the LRU. */
980 if (!PageLRU(page)) 1037 if (!PageLRU(page))
981 return ret; 1038 return ret;
982 1039
1040 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1041 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1042
983 /* 1043 /*
984 * When checking the active state, we need to be sure we are 1044 * When checking the active state, we need to be sure we are
985 * dealing with comparable boolean values. Take the logical not 1045 * dealing with comparable boolean values. Take the logical not
986 * of each. 1046 * of each.
987 */ 1047 */
988 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 1048 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
989 return ret; 1049 return ret;
990 1050
991 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) 1051 if (!all_lru_mode && !!page_is_file_cache(page) != file)
992 return ret; 1052 return ret;
993 1053
994 /* 1054 /*
@@ -1001,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1001 1061
1002 ret = -EBUSY; 1062 ret = -EBUSY;
1003 1063
1064 if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
1065 return ret;
1066
1067 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1068 return ret;
1069
1004 if (likely(get_page_unless_zero(page))) { 1070 if (likely(get_page_unless_zero(page))) {
1005 /* 1071 /*
1006 * Be careful not to clear PageLRU until after we're 1072 * Be careful not to clear PageLRU until after we're
@@ -1036,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1036 */ 1102 */
1037static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1103static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1038 struct list_head *src, struct list_head *dst, 1104 struct list_head *src, struct list_head *dst,
1039 unsigned long *scanned, int order, int mode, int file) 1105 unsigned long *scanned, int order, isolate_mode_t mode,
1106 int file)
1040{ 1107{
1041 unsigned long nr_taken = 0; 1108 unsigned long nr_taken = 0;
1042 unsigned long nr_lumpy_taken = 0; 1109 unsigned long nr_lumpy_taken = 0;
@@ -1161,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1161static unsigned long isolate_pages_global(unsigned long nr, 1228static unsigned long isolate_pages_global(unsigned long nr,
1162 struct list_head *dst, 1229 struct list_head *dst,
1163 unsigned long *scanned, int order, 1230 unsigned long *scanned, int order,
1164 int mode, struct zone *z, 1231 isolate_mode_t mode,
1165 int active, int file) 1232 struct zone *z, int active, int file)
1166{ 1233{
1167 int lru = LRU_BASE; 1234 int lru = LRU_BASE;
1168 if (active) 1235 if (active)
@@ -1354,7 +1421,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
1354} 1421}
1355 1422
1356/* 1423/*
1357 * Returns true if the caller should wait to clean dirty/writeback pages. 1424 * Returns true if a direct reclaim should wait on pages under writeback.
1358 * 1425 *
1359 * If we are direct reclaiming for contiguous pages and we do not reclaim 1426 * If we are direct reclaiming for contiguous pages and we do not reclaim
1360 * everything in the list, try again and wait for writeback IO to complete. 1427 * everything in the list, try again and wait for writeback IO to complete.
@@ -1376,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1376 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) 1443 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1377 return false; 1444 return false;
1378 1445
1379 /* If we have relaimed everything on the isolated list, no stall */ 1446 /* If we have reclaimed everything on the isolated list, no stall */
1380 if (nr_freed == nr_taken) 1447 if (nr_freed == nr_taken)
1381 return false; 1448 return false;
1382 1449
@@ -1408,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1408 unsigned long nr_taken; 1475 unsigned long nr_taken;
1409 unsigned long nr_anon; 1476 unsigned long nr_anon;
1410 unsigned long nr_file; 1477 unsigned long nr_file;
1478 unsigned long nr_dirty = 0;
1479 unsigned long nr_writeback = 0;
1480 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1411 1481
1412 while (unlikely(too_many_isolated(zone, file, sc))) { 1482 while (unlikely(too_many_isolated(zone, file, sc))) {
1413 congestion_wait(BLK_RW_ASYNC, HZ/10); 1483 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1418,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1418 } 1488 }
1419 1489
1420 set_reclaim_mode(priority, sc, false); 1490 set_reclaim_mode(priority, sc, false);
1491 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1492 reclaim_mode |= ISOLATE_ACTIVE;
1493
1421 lru_add_drain(); 1494 lru_add_drain();
1495
1496 if (!sc->may_unmap)
1497 reclaim_mode |= ISOLATE_UNMAPPED;
1498 if (!sc->may_writepage)
1499 reclaim_mode |= ISOLATE_CLEAN;
1500
1422 spin_lock_irq(&zone->lru_lock); 1501 spin_lock_irq(&zone->lru_lock);
1423 1502
1424 if (scanning_global_lru(sc)) { 1503 if (scanning_global_lru(sc)) {
1425 nr_taken = isolate_pages_global(nr_to_scan, 1504 nr_taken = isolate_pages_global(nr_to_scan, &page_list,
1426 &page_list, &nr_scanned, sc->order, 1505 &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
1427 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1428 ISOLATE_BOTH : ISOLATE_INACTIVE,
1429 zone, 0, file);
1430 zone->pages_scanned += nr_scanned; 1506 zone->pages_scanned += nr_scanned;
1431 if (current_is_kswapd()) 1507 if (current_is_kswapd())
1432 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1508 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1435,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1435 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1511 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1436 nr_scanned); 1512 nr_scanned);
1437 } else { 1513 } else {
1438 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1514 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1439 &page_list, &nr_scanned, sc->order, 1515 &nr_scanned, sc->order, reclaim_mode, zone,
1440 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1516 sc->mem_cgroup, 0, file);
1441 ISOLATE_BOTH : ISOLATE_INACTIVE,
1442 zone, sc->mem_cgroup,
1443 0, file);
1444 /* 1517 /*
1445 * mem_cgroup_isolate_pages() keeps track of 1518 * mem_cgroup_isolate_pages() keeps track of
1446 * scanned pages on its own. 1519 * scanned pages on its own.
@@ -1456,12 +1529,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1456 1529
1457 spin_unlock_irq(&zone->lru_lock); 1530 spin_unlock_irq(&zone->lru_lock);
1458 1531
1459 nr_reclaimed = shrink_page_list(&page_list, zone, sc); 1532 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
1533 &nr_dirty, &nr_writeback);
1460 1534
1461 /* Check if we should synchronously wait for writeback */ 1535 /* Check if we should synchronously wait for writeback */
1462 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1536 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1463 set_reclaim_mode(priority, sc, true); 1537 set_reclaim_mode(priority, sc, true);
1464 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1538 nr_reclaimed += shrink_page_list(&page_list, zone, sc,
1539 priority, &nr_dirty, &nr_writeback);
1465 } 1540 }
1466 1541
1467 local_irq_disable(); 1542 local_irq_disable();
@@ -1471,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1471 1546
1472 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1547 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1473 1548
1549 /*
1550 * If reclaim is isolating dirty pages under writeback, it implies
1551 * that the long-lived page allocation rate is exceeding the page
1552 * laundering rate. Either the global limits are not being effective
1553 * at throttling processes due to the page distribution throughout
1554 * zones or there is heavy usage of a slow backing device. The
1555 * only option is to throttle from reclaim context which is not ideal
1556 * as there is no guarantee the dirtying process is throttled in the
1557 * same way balance_dirty_pages() manages.
1558 *
1559 * This scales the number of dirty pages that must be under writeback
1560 * before throttling depending on priority. It is a simple backoff
1561 * function that has the most effect in the range DEF_PRIORITY to
 1562 * DEF_PRIORITY-2, the priority range in which reclaim is
 1563 * considered to be in trouble.
1564 *
1565 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
1566 * DEF_PRIORITY-1 50% must be PageWriteback
1567 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
1568 * ...
1569 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1570 * isolated page is PageWriteback
1571 */
1572 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
1573 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1574
1474 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1575 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1475 zone_idx(zone), 1576 zone_idx(zone),
1476 nr_scanned, nr_reclaimed, 1577 nr_scanned, nr_reclaimed,
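The wait_iff_congested() check added above throttles reclaim when too many of the isolated pages are still under writeback. A small standalone sketch of the backoff threshold, assuming the usual DEF_PRIORITY of 12 (the constant is an assumption here, not stated in this hunk):

#include <stdbool.h>

/*
 * Sketch of the backoff used above: at priority == DEF_PRIORITY every
 * isolated page must be under writeback before throttling, and each
 * priority drop halves that requirement.  DEF_PRIORITY is assumed.
 */
#define DEF_PRIORITY 12

static bool should_throttle_writeback(unsigned long nr_taken,
				      unsigned long nr_writeback,
				      int priority)
{
	/* priority 12 -> nr_taken, 11 -> nr_taken/2, 10 -> nr_taken/4, ... */
	unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);

	return nr_writeback && nr_writeback >= threshold;
}

For example, with SWAP_CLUSTER_MAX (32) isolated pages at priority DEF_PRIORITY-6 the threshold shifts down to zero, so a single PageWriteback page is enough to throttle, matching the comment in the hunk.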
@@ -1542,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1542 struct page *page; 1643 struct page *page;
1543 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1644 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1544 unsigned long nr_rotated = 0; 1645 unsigned long nr_rotated = 0;
1646 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1545 1647
1546 lru_add_drain(); 1648 lru_add_drain();
1649
1650 if (!sc->may_unmap)
1651 reclaim_mode |= ISOLATE_UNMAPPED;
1652 if (!sc->may_writepage)
1653 reclaim_mode |= ISOLATE_CLEAN;
1654
1547 spin_lock_irq(&zone->lru_lock); 1655 spin_lock_irq(&zone->lru_lock);
1548 if (scanning_global_lru(sc)) { 1656 if (scanning_global_lru(sc)) {
1549 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1657 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1550 &pgscanned, sc->order, 1658 &pgscanned, sc->order,
1551 ISOLATE_ACTIVE, zone, 1659 reclaim_mode, zone,
1552 1, file); 1660 1, file);
1553 zone->pages_scanned += pgscanned; 1661 zone->pages_scanned += pgscanned;
1554 } else { 1662 } else {
1555 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1663 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1556 &pgscanned, sc->order, 1664 &pgscanned, sc->order,
1557 ISOLATE_ACTIVE, zone, 1665 reclaim_mode, zone,
1558 sc->mem_cgroup, 1, file); 1666 sc->mem_cgroup, 1, file);
1559 /* 1667 /*
1560 * mem_cgroup_isolate_pages() keeps track of 1668 * mem_cgroup_isolate_pages() keeps track of
@@ -1659,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1659 if (scanning_global_lru(sc)) 1767 if (scanning_global_lru(sc))
1660 low = inactive_anon_is_low_global(zone); 1768 low = inactive_anon_is_low_global(zone);
1661 else 1769 else
1662 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1770 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
1663 return low; 1771 return low;
1664} 1772}
1665#else 1773#else
@@ -1702,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1702 if (scanning_global_lru(sc)) 1810 if (scanning_global_lru(sc))
1703 low = inactive_file_is_low_global(zone); 1811 low = inactive_file_is_low_global(zone);
1704 else 1812 else
1705 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); 1813 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
1706 return low; 1814 return low;
1707} 1815}
1708 1816
@@ -1729,6 +1837,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1729 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1837 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1730} 1838}
1731 1839
1840static int vmscan_swappiness(struct scan_control *sc)
1841{
1842 if (scanning_global_lru(sc))
1843 return vm_swappiness;
1844 return mem_cgroup_swappiness(sc->mem_cgroup);
1845}
1846
1732/* 1847/*
1733 * Determine how aggressively the anon and file LRU lists should be 1848 * Determine how aggressively the anon and file LRU lists should be
1734 * scanned. The relative value of each set of LRU lists is determined 1849 * scanned. The relative value of each set of LRU lists is determined
@@ -1747,22 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1747 u64 fraction[2], denominator; 1862 u64 fraction[2], denominator;
1748 enum lru_list l; 1863 enum lru_list l;
1749 int noswap = 0; 1864 int noswap = 0;
1750 int force_scan = 0; 1865 bool force_scan = false;
1751
1752
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1754 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1755 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1756 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1757 1866
1758 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1867 /*
1759 /* kswapd does zone balancing and need to scan this zone */ 1868 * If the zone or memcg is small, nr[l] can be 0. This
1760 if (scanning_global_lru(sc) && current_is_kswapd()) 1869 * results in no scanning on this priority and a potential
1761 force_scan = 1; 1870 * priority drop. Global direct reclaim can go to the next
1762 /* memcg may have small limit and need to avoid priority drop */ 1871 * zone and tends to have no problems. Global kswapd is for
1763 if (!scanning_global_lru(sc)) 1872 * zone balancing and it needs to scan a minimum amount. When
1764 force_scan = 1; 1873 * reclaiming for a memcg, a priority drop can cause high
1765 } 1874 * latencies, so it's better to scan a minimum amount there as
1875 * well.
1876 */
1877 if (scanning_global_lru(sc) && current_is_kswapd())
1878 force_scan = true;
1879 if (!scanning_global_lru(sc))
1880 force_scan = true;
1766 1881
1767 /* If we have no swap space, do not bother scanning anon pages. */ 1882 /* If we have no swap space, do not bother scanning anon pages. */
1768 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1883 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1773,6 +1888,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1773 goto out; 1888 goto out;
1774 } 1889 }
1775 1890
1891 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1892 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1893 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1894 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1895
1776 if (scanning_global_lru(sc)) { 1896 if (scanning_global_lru(sc)) {
1777 free = zone_page_state(zone, NR_FREE_PAGES); 1897 free = zone_page_state(zone, NR_FREE_PAGES);
1778 /* If we have very few page cache pages, 1898 /* If we have very few page cache pages,
@@ -1789,8 +1909,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1789 * With swappiness at 100, anonymous and file have the same priority. 1909 * With swappiness at 100, anonymous and file have the same priority.
1790 * This scanning priority is essentially the inverse of IO cost. 1910 * This scanning priority is essentially the inverse of IO cost.
1791 */ 1911 */
1792 anon_prio = sc->swappiness; 1912 anon_prio = vmscan_swappiness(sc);
1793 file_prio = 200 - sc->swappiness; 1913 file_prio = 200 - vmscan_swappiness(sc);
1794 1914
1795 /* 1915 /*
1796 * OK, so we have swap space and a fair amount of page cache 1916 * OK, so we have swap space and a fair amount of page cache
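With the per-reclaim swappiness field gone, get_scan_count() derives its weights from vmscan_swappiness(), which picks vm_swappiness for global reclaim and the per-memcg value otherwise. A sketch of how those weights fall out; the 0..100 range is the usual vm.swappiness sysctl range, assumed rather than stated in this hunk:

/*
 * Sketch: the anon/file scan weights used by get_scan_count().  With
 * swappiness == 100 both LRU sets get equal pressure; with 0, anon
 * pages are effectively not scanned.
 */
static void scan_weights(int swappiness, int *anon_prio, int *file_prio)
{
	*anon_prio = swappiness;	/* 0..100 */
	*file_prio = 200 - swappiness;	/* 100..200 */
}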
@@ -1837,23 +1957,9 @@ out:
1837 scan = zone_nr_lru_pages(zone, sc, l); 1957 scan = zone_nr_lru_pages(zone, sc, l);
1838 if (priority || noswap) { 1958 if (priority || noswap) {
1839 scan >>= priority; 1959 scan >>= priority;
1840 scan = div64_u64(scan * fraction[file], denominator); 1960 if (!scan && force_scan)
1841 }
1842
1843 /*
1844 * If zone is small or memcg is small, nr[l] can be 0.
1845 * This results no-scan on this priority and priority drop down.
1846 * For global direct reclaim, it can visit next zone and tend
1847 * not to have problems. For global kswapd, it's for zone
1848 * balancing and it need to scan a small amounts. When using
1849 * memcg, priority drop can cause big latency. So, it's better
1850 * to scan small amount. See may_noscan above.
1851 */
1852 if (!scan && force_scan) {
1853 if (file)
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX; 1961 scan = SWAP_CLUSTER_MAX;
1962 scan = div64_u64(scan * fraction[file], denominator);
1857 } 1963 }
1858 nr[l] = scan; 1964 nr[l] = scan;
1859 } 1965 }
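The rework above moves the SWAP_CLUSTER_MAX floor ahead of the fractional split, so a small zone or memcg still gets a non-zero scan target at every priority instead of silently dropping priority. A standalone restatement of that arithmetic for the branch taken when priority or noswap is non-zero, with kernel helpers swapped for plain C (div64_u64 becomes 64-bit division, and SWAP_CLUSTER_MAX is assumed to be 32):

#define SWAP_CLUSTER_MAX 32UL	/* assumed value, for illustration */

static unsigned long scan_target(unsigned long lru_size, int priority,
				 int force_scan,
				 unsigned long long fraction,
				 unsigned long long denominator)
{
	unsigned long scan = lru_size >> priority;

	/* Apply the minimum before the anon/file split, not after. */
	if (!scan && force_scan)
		scan = SWAP_CLUSTER_MAX;

	return (unsigned long)(scan * fraction / denominator);
}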
@@ -1933,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone,
1933 enum lru_list l; 2039 enum lru_list l;
1934 unsigned long nr_reclaimed, nr_scanned; 2040 unsigned long nr_reclaimed, nr_scanned;
1935 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2041 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2042 struct blk_plug plug;
1936 2043
1937restart: 2044restart:
1938 nr_reclaimed = 0; 2045 nr_reclaimed = 0;
1939 nr_scanned = sc->nr_scanned; 2046 nr_scanned = sc->nr_scanned;
1940 get_scan_count(zone, sc, nr, priority); 2047 get_scan_count(zone, sc, nr, priority);
1941 2048
2049 blk_start_plug(&plug);
1942 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2050 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1943 nr[LRU_INACTIVE_FILE]) { 2051 nr[LRU_INACTIVE_FILE]) {
1944 for_each_evictable_lru(l) { 2052 for_each_evictable_lru(l) {
@@ -1962,6 +2070,7 @@ restart:
1962 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2070 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1963 break; 2071 break;
1964 } 2072 }
2073 blk_finish_plug(&plug);
1965 sc->nr_reclaimed += nr_reclaimed; 2074 sc->nr_reclaimed += nr_reclaimed;
1966 2075
1967 /* 2076 /*
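shrink_zone() now brackets the LRU walk with blk_start_plug()/blk_finish_plug(), so writeback issued while shrinking the lists is queued per task and submitted as one batch when the plug is released. A rough userspace analogy of that batching, purely illustrative and not the block-layer API:

#include <stddef.h>

/*
 * Analogy only: work generated inside the plugged window is collected
 * locally and dispatched as one batch at the end, instead of being
 * submitted item by item.  BATCH_MAX and the struct are made up here.
 */
#define BATCH_MAX 32

struct plug_window {
	unsigned long pending[BATCH_MAX];
	size_t nr;
};

static void window_start(struct plug_window *w)
{
	w->nr = 0;				/* open the window */
}

static void window_queue(struct plug_window *w, unsigned long io)
{
	if (w->nr < BATCH_MAX)
		w->pending[w->nr++] = io;	/* queue, do not dispatch yet */
}

static void window_finish(struct plug_window *w)
{
	/* dispatch w->pending[0 .. w->nr) in a single submission */
	w->nr = 0;
}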
@@ -1994,14 +2103,19 @@ restart:
1994 * 2103 *
1995 * If a zone is deemed to be full of pinned pages then just give it a light 2104 * If a zone is deemed to be full of pinned pages then just give it a light
1996 * scan then give up on it. 2105 * scan then give up on it.
2106 *
2107 * This function returns true if a zone is being reclaimed for a costly
2108 * high-order allocation and compaction is either ready to begin or deferred.
2109 * This indicates to the caller that it should retry the allocation or fail.
1997 */ 2110 */
1998static void shrink_zones(int priority, struct zonelist *zonelist, 2111static bool shrink_zones(int priority, struct zonelist *zonelist,
1999 struct scan_control *sc) 2112 struct scan_control *sc)
2000{ 2113{
2001 struct zoneref *z; 2114 struct zoneref *z;
2002 struct zone *zone; 2115 struct zone *zone;
2003 unsigned long nr_soft_reclaimed; 2116 unsigned long nr_soft_reclaimed;
2004 unsigned long nr_soft_scanned; 2117 unsigned long nr_soft_scanned;
2118 bool should_abort_reclaim = false;
2005 2119
2006 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2120 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2007 gfp_zone(sc->gfp_mask), sc->nodemask) { 2121 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2016,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2016 continue; 2130 continue;
2017 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2131 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2018 continue; /* Let kswapd poll it */ 2132 continue; /* Let kswapd poll it */
2133 if (COMPACTION_BUILD) {
2134 /*
2135 * If we already have plenty of memory free for
2136 * compaction in this zone, don't free any more.
2137 * Even though compaction is invoked for any
2138 * non-zero order, only frequent costly order
2139 * reclamation is disruptive enough to become a
 2140 * noticeable problem, like transparent huge page
2141 * allocations.
2142 */
2143 if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2144 (compaction_suitable(zone, sc->order) ||
2145 compaction_deferred(zone))) {
2146 should_abort_reclaim = true;
2147 continue;
2148 }
2149 }
2019 /* 2150 /*
2020 * This steals pages from memory cgroups over softlimit 2151 * This steals pages from memory cgroups over softlimit
2021 * and returns the number of reclaimed pages and 2152 * and returns the number of reclaimed pages and
@@ -2033,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2033 2164
2034 shrink_zone(priority, zone, sc); 2165 shrink_zone(priority, zone, sc);
2035 } 2166 }
2167
2168 return should_abort_reclaim;
2036} 2169}
2037 2170
2038static bool zone_reclaimable(struct zone *zone) 2171static bool zone_reclaimable(struct zone *zone)
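shrink_zones() now returns true when reclaim should stop early because a costly high-order allocation can already be satisfied by compaction. A sketch of that decision with the compaction predicates stubbed out as booleans; PAGE_ALLOC_COSTLY_ORDER is assumed to be 3 here, i.e. anything above 8 pages counts as costly:

#include <stdbool.h>

/*
 * Sketch of the abort test: only costly orders (e.g. THP-sized
 * allocations) justify cutting reclaim short, and only when compaction
 * is either ready to run or was recently deferred.  The order constant
 * is an assumption, and the two booleans stand in for
 * compaction_suitable()/compaction_deferred().
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

static bool should_abort_reclaim_for_compaction(int order,
						bool compaction_suitable,
						bool compaction_deferred)
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return false;
	return compaction_suitable || compaction_deferred;
}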
@@ -2097,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2097 sc->nr_scanned = 0; 2230 sc->nr_scanned = 0;
2098 if (!priority) 2231 if (!priority)
2099 disable_swap_token(sc->mem_cgroup); 2232 disable_swap_token(sc->mem_cgroup);
2100 shrink_zones(priority, zonelist, sc); 2233 if (shrink_zones(priority, zonelist, sc))
2234 break;
2235
2101 /* 2236 /*
2102 * Don't shrink slabs when reclaiming memory from 2237 * Don't shrink slabs when reclaiming memory from
2103 * over limit cgroups 2238 * over limit cgroups
@@ -2131,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2131 */ 2266 */
2132 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2267 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2133 if (total_scanned > writeback_threshold) { 2268 if (total_scanned > writeback_threshold) {
2134 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 2269 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2270 WB_REASON_TRY_TO_FREE_PAGES);
2135 sc->may_writepage = 1; 2271 sc->may_writepage = 1;
2136 } 2272 }
2137 2273
@@ -2179,7 +2315,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2179 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2315 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2180 .may_unmap = 1, 2316 .may_unmap = 1,
2181 .may_swap = 1, 2317 .may_swap = 1,
2182 .swappiness = vm_swappiness,
2183 .order = order, 2318 .order = order,
2184 .mem_cgroup = NULL, 2319 .mem_cgroup = NULL,
2185 .nodemask = nodemask, 2320 .nodemask = nodemask,
@@ -2203,7 +2338,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2203 2338
2204unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2339unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2205 gfp_t gfp_mask, bool noswap, 2340 gfp_t gfp_mask, bool noswap,
2206 unsigned int swappiness,
2207 struct zone *zone, 2341 struct zone *zone,
2208 unsigned long *nr_scanned) 2342 unsigned long *nr_scanned)
2209{ 2343{
@@ -2213,7 +2347,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2213 .may_writepage = !laptop_mode, 2347 .may_writepage = !laptop_mode,
2214 .may_unmap = 1, 2348 .may_unmap = 1,
2215 .may_swap = !noswap, 2349 .may_swap = !noswap,
2216 .swappiness = swappiness,
2217 .order = 0, 2350 .order = 0,
2218 .mem_cgroup = mem, 2351 .mem_cgroup = mem,
2219 }; 2352 };
@@ -2242,8 +2375,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2242 2375
2243unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2376unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2244 gfp_t gfp_mask, 2377 gfp_t gfp_mask,
2245 bool noswap, 2378 bool noswap)
2246 unsigned int swappiness)
2247{ 2379{
2248 struct zonelist *zonelist; 2380 struct zonelist *zonelist;
2249 unsigned long nr_reclaimed; 2381 unsigned long nr_reclaimed;
@@ -2253,7 +2385,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2253 .may_unmap = 1, 2385 .may_unmap = 1,
2254 .may_swap = !noswap, 2386 .may_swap = !noswap,
2255 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2387 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2256 .swappiness = swappiness,
2257 .order = 0, 2388 .order = 0,
2258 .mem_cgroup = mem_cont, 2389 .mem_cgroup = mem_cont,
 2259 .nodemask = NULL, /* we don't care about the placement */ 2390 .nodemask = NULL, /* we don't care about the placement */
@@ -2310,7 +2441,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2310 for (i = 0; i <= classzone_idx; i++) 2441 for (i = 0; i <= classzone_idx; i++)
2311 present_pages += pgdat->node_zones[i].present_pages; 2442 present_pages += pgdat->node_zones[i].present_pages;
2312 2443
2313 return balanced_pages > (present_pages >> 2); 2444 /* A special case here: if zone has no page, we think it's balanced */
2445 return balanced_pages >= (present_pages >> 2);
2314} 2446}
2315 2447
2316/* is kswapd sleeping prematurely? */ 2448/* is kswapd sleeping prematurely? */
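pgdat_balanced() keeps its 25% rule but switches from '>' to '>=', which also covers a node whose zones up to the classzone hold no pages at all. A short sketch of the test, with a worked example in the comment:

#include <stdbool.h>

/*
 * Sketch of the balance test: e.g. a node with 1,000,000 present pages
 * is treated as balanced once at least 250,000 of them sit in balanced
 * zones; with zero present pages the >= makes it trivially balanced.
 */
static bool pgdat_balanced_sketch(unsigned long balanced_pages,
				  unsigned long present_pages)
{
	return balanced_pages >= (present_pages >> 2);
}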
@@ -2403,7 +2535,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2403 * we want to put equal scanning pressure on each zone. 2535 * we want to put equal scanning pressure on each zone.
2404 */ 2536 */
2405 .nr_to_reclaim = ULONG_MAX, 2537 .nr_to_reclaim = ULONG_MAX,
2406 .swappiness = vm_swappiness,
2407 .order = order, 2538 .order = order,
2408 .mem_cgroup = NULL, 2539 .mem_cgroup = NULL,
2409 }; 2540 };
@@ -2452,6 +2583,9 @@ loop_again:
2452 high_wmark_pages(zone), 0, 0)) { 2583 high_wmark_pages(zone), 0, 0)) {
2453 end_zone = i; 2584 end_zone = i;
2454 break; 2585 break;
2586 } else {
2587 /* If balanced, clear the congested flag */
2588 zone_clear_flag(zone, ZONE_CONGESTED);
2455 } 2589 }
2456 } 2590 }
2457 if (i < 0) 2591 if (i < 0)
@@ -2642,6 +2776,8 @@ out:
2642 2776
2643 /* If balanced, clear the congested flag */ 2777 /* If balanced, clear the congested flag */
2644 zone_clear_flag(zone, ZONE_CONGESTED); 2778 zone_clear_flag(zone, ZONE_CONGESTED);
2779 if (i <= *classzone_idx)
2780 balanced += zone->present_pages;
2645 } 2781 }
2646 } 2782 }
2647 2783
@@ -2715,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2715static int kswapd(void *p) 2851static int kswapd(void *p)
2716{ 2852{
2717 unsigned long order, new_order; 2853 unsigned long order, new_order;
2854 unsigned balanced_order;
2718 int classzone_idx, new_classzone_idx; 2855 int classzone_idx, new_classzone_idx;
2856 int balanced_classzone_idx;
2719 pg_data_t *pgdat = (pg_data_t*)p; 2857 pg_data_t *pgdat = (pg_data_t*)p;
2720 struct task_struct *tsk = current; 2858 struct task_struct *tsk = current;
2721 2859
@@ -2746,7 +2884,9 @@ static int kswapd(void *p)
2746 set_freezable(); 2884 set_freezable();
2747 2885
2748 order = new_order = 0; 2886 order = new_order = 0;
2887 balanced_order = 0;
2749 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2888 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2889 balanced_classzone_idx = classzone_idx;
2750 for ( ; ; ) { 2890 for ( ; ; ) {
2751 int ret; 2891 int ret;
2752 2892
@@ -2755,7 +2895,8 @@ static int kswapd(void *p)
2755 * new request of a similar or harder type will succeed soon 2895 * new request of a similar or harder type will succeed soon
2756 * so consider going to sleep on the basis we reclaimed at 2896 * so consider going to sleep on the basis we reclaimed at
2757 */ 2897 */
2758 if (classzone_idx >= new_classzone_idx && order == new_order) { 2898 if (balanced_classzone_idx >= new_classzone_idx &&
2899 balanced_order == new_order) {
2759 new_order = pgdat->kswapd_max_order; 2900 new_order = pgdat->kswapd_max_order;
2760 new_classzone_idx = pgdat->classzone_idx; 2901 new_classzone_idx = pgdat->classzone_idx;
2761 pgdat->kswapd_max_order = 0; 2902 pgdat->kswapd_max_order = 0;
@@ -2770,9 +2911,12 @@ static int kswapd(void *p)
2770 order = new_order; 2911 order = new_order;
2771 classzone_idx = new_classzone_idx; 2912 classzone_idx = new_classzone_idx;
2772 } else { 2913 } else {
2773 kswapd_try_to_sleep(pgdat, order, classzone_idx); 2914 kswapd_try_to_sleep(pgdat, balanced_order,
2915 balanced_classzone_idx);
2774 order = pgdat->kswapd_max_order; 2916 order = pgdat->kswapd_max_order;
2775 classzone_idx = pgdat->classzone_idx; 2917 classzone_idx = pgdat->classzone_idx;
2918 new_order = order;
2919 new_classzone_idx = classzone_idx;
2776 pgdat->kswapd_max_order = 0; 2920 pgdat->kswapd_max_order = 0;
2777 pgdat->classzone_idx = pgdat->nr_zones - 1; 2921 pgdat->classzone_idx = pgdat->nr_zones - 1;
2778 } 2922 }
@@ -2787,7 +2931,9 @@ static int kswapd(void *p)
2787 */ 2931 */
2788 if (!ret) { 2932 if (!ret) {
2789 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2933 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2790 order = balance_pgdat(pgdat, order, &classzone_idx); 2934 balanced_classzone_idx = classzone_idx;
2935 balanced_order = balance_pgdat(pgdat, order,
2936 &balanced_classzone_idx);
2791 } 2937 }
2792 } 2938 }
2793 return 0; 2939 return 0;
@@ -2873,7 +3019,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2873 .may_writepage = 1, 3019 .may_writepage = 1,
2874 .nr_to_reclaim = nr_to_reclaim, 3020 .nr_to_reclaim = nr_to_reclaim,
2875 .hibernation_mode = 1, 3021 .hibernation_mode = 1,
2876 .swappiness = vm_swappiness,
2877 .order = 0, 3022 .order = 0,
2878 }; 3023 };
2879 struct shrink_control shrink = { 3024 struct shrink_control shrink = {
@@ -3060,7 +3205,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3060 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3205 .nr_to_reclaim = max_t(unsigned long, nr_pages,
3061 SWAP_CLUSTER_MAX), 3206 SWAP_CLUSTER_MAX),
3062 .gfp_mask = gfp_mask, 3207 .gfp_mask = gfp_mask,
3063 .swappiness = vm_swappiness,
3064 .order = order, 3208 .order = order,
3065 }; 3209 };
3066 struct shrink_control shrink = { 3210 struct shrink_control shrink = {
@@ -3301,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
3301 3445
3302} 3446}
3303 3447
3304/** 3448static void warn_scan_unevictable_pages(void)
3305 * scan_zone_unevictable_pages - check unevictable list for evictable pages
3306 * @zone - zone of which to scan the unevictable list
3307 *
3308 * Scan @zone's unevictable LRU lists to check for pages that have become
3309 * evictable. Move those that have to @zone's inactive list where they
3310 * become candidates for reclaim, unless shrink_inactive_zone() decides
3311 * to reactivate them. Pages that are still unevictable are rotated
3312 * back onto @zone's unevictable list.
3313 */
3314#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
3315static void scan_zone_unevictable_pages(struct zone *zone)
3316{ 3449{
3317 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 3450 printk_once(KERN_WARNING
3318 unsigned long scan; 3451 "The scan_unevictable_pages sysctl/node-interface has been "
3319 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); 3452 "disabled for lack of a legitimate use case. If you have "
3320 3453 "one, please send an email to linux-mm@kvack.org.\n");
3321 while (nr_to_scan > 0) {
3322 unsigned long batch_size = min(nr_to_scan,
3323 SCAN_UNEVICTABLE_BATCH_SIZE);
3324
3325 spin_lock_irq(&zone->lru_lock);
3326 for (scan = 0; scan < batch_size; scan++) {
3327 struct page *page = lru_to_page(l_unevictable);
3328
3329 if (!trylock_page(page))
3330 continue;
3331
3332 prefetchw_prev_lru_page(page, l_unevictable, flags);
3333
3334 if (likely(PageLRU(page) && PageUnevictable(page)))
3335 check_move_unevictable_page(page, zone);
3336
3337 unlock_page(page);
3338 }
3339 spin_unlock_irq(&zone->lru_lock);
3340
3341 nr_to_scan -= batch_size;
3342 }
3343}
3344
3345
3346/**
3347 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
3348 *
3349 * A really big hammer: scan all zones' unevictable LRU lists to check for
3350 * pages that have become evictable. Move those back to the zones'
3351 * inactive list where they become candidates for reclaim.
3352 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
3353 * and we add swap to the system. As such, it runs in the context of a task
3354 * that has possibly/probably made some previously unevictable pages
3355 * evictable.
3356 */
3357static void scan_all_zones_unevictable_pages(void)
3358{
3359 struct zone *zone;
3360
3361 for_each_zone(zone) {
3362 scan_zone_unevictable_pages(zone);
3363 }
3364} 3454}
3365 3455
3366/* 3456/*
@@ -3373,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
3373 void __user *buffer, 3463 void __user *buffer,
3374 size_t *length, loff_t *ppos) 3464 size_t *length, loff_t *ppos)
3375{ 3465{
3466 warn_scan_unevictable_pages();
3376 proc_doulongvec_minmax(table, write, buffer, length, ppos); 3467 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3377
3378 if (write && *(unsigned long *)table->data)
3379 scan_all_zones_unevictable_pages();
3380
3381 scan_unevictable_pages = 0; 3468 scan_unevictable_pages = 0;
3382 return 0; 3469 return 0;
3383} 3470}
@@ -3392,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
3392 struct sysdev_attribute *attr, 3479 struct sysdev_attribute *attr,
3393 char *buf) 3480 char *buf)
3394{ 3481{
3482 warn_scan_unevictable_pages();
3395 return sprintf(buf, "0\n"); /* always zero; should fit... */ 3483 return sprintf(buf, "0\n"); /* always zero; should fit... */
3396} 3484}
3397 3485
@@ -3399,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
3399 struct sysdev_attribute *attr, 3487 struct sysdev_attribute *attr,
3400 const char *buf, size_t count) 3488 const char *buf, size_t count)
3401{ 3489{
3402 struct zone *node_zones = NODE_DATA(dev->id)->node_zones; 3490 warn_scan_unevictable_pages();
3403 struct zone *zone;
3404 unsigned long res;
3405 unsigned long req = strict_strtoul(buf, 10, &res);
3406
3407 if (!req)
3408 return 1; /* zero is no-op */
3409
3410 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
3411 if (!populated_zone(zone))
3412 continue;
3413 scan_zone_unevictable_pages(zone);
3414 }
3415 return 1; 3491 return 1;
3416} 3492}
3417 3493
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b2..8fd603b1665e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
78 * 78 *
79 * vm_stat contains the global counters 79 * vm_stat contains the global counters
80 */ 80 */
81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
82EXPORT_SYMBOL(vm_stat); 82EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
659} 659}
660#endif 660#endif
661 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) 662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
663#ifdef CONFIG_ZONE_DMA 663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma", 664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else 665#else
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = {
702 "nr_unstable", 702 "nr_unstable",
703 "nr_bounce", 703 "nr_bounce",
704 "nr_vmscan_write", 704 "nr_vmscan_write",
705 "nr_vmscan_immediate_reclaim",
705 "nr_writeback_temp", 706 "nr_writeback_temp",
706 "nr_isolated_anon", 707 "nr_isolated_anon",
707 "nr_isolated_file", 708 "nr_isolated_file",
@@ -788,7 +789,7 @@ const char * const vmstat_text[] = {
788 789
789#endif /* CONFIG_VM_EVENTS_COUNTERS */ 790#endif /* CONFIG_VM_EVENTS_COUNTERS */
790}; 791};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ 792#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
792 793
793 794
794#ifdef CONFIG_PROC_FS 795#ifdef CONFIG_PROC_FS