aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/bootmem.c182
-rw-r--r--mm/fadvise.c10
-rw-r--r--mm/failslab.c18
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/filemap_xip.c2
-rw-r--r--mm/fremap.c2
-rw-r--r--mm/highmem.c2
-rw-r--r--mm/ksm.c14
-rw-r--r--mm/memcontrol.c1398
-rw-r--r--mm/memory-failure.c5
-rw-r--r--mm/memory.c167
-rw-r--r--mm/memory_hotplug.c10
-rw-r--r--mm/mempolicy.c164
-rw-r--r--mm/migrate.c4
-rw-r--r--mm/mlock.c12
-rw-r--r--mm/mmap.c175
-rw-r--r--mm/mmu_context.c1
-rw-r--r--mm/mremap.c9
-rw-r--r--mm/nommu.c35
-rw-r--r--mm/oom_kill.c14
-rw-r--r--mm/page_alloc.c401
-rw-r--r--mm/page_cgroup.c42
-rw-r--r--mm/percpu.c36
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c185
-rw-r--r--mm/slab.c13
-rw-r--r--mm/slub.c343
-rw-r--r--mm/sparse-vmemmap.c76
-rw-r--r--mm/sparse.c196
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swapfile.c71
-rw-r--r--mm/vmscan.c177
-rw-r--r--mm/vmstat.c17
34 files changed, 2806 insertions, 991 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d34c2b971032..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
115config SPARSEMEM_VMEMMAP_ENABLE 115config SPARSEMEM_VMEMMAP_ENABLE
116 bool 116 bool
117 117
118config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
119 def_bool y
120 depends on SPARSEMEM && X86_64
121
118config SPARSEMEM_VMEMMAP 122config SPARSEMEM_VMEMMAP
119 bool "Sparse Memory virtual memmap" 123 bool "Sparse Memory virtual memmap"
120 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE 124 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..9b134460b016 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -13,6 +13,7 @@
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kmemleak.h> 15#include <linux/kmemleak.h>
16#include <linux/range.h>
16 17
17#include <asm/bug.h> 18#include <asm/bug.h>
18#include <asm/io.h> 19#include <asm/io.h>
@@ -32,6 +33,7 @@ unsigned long max_pfn;
32unsigned long saved_max_pfn; 33unsigned long saved_max_pfn;
33#endif 34#endif
34 35
36#ifndef CONFIG_NO_BOOTMEM
35bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 37bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
36 38
37static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 39static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
142 min_low_pfn = start; 144 min_low_pfn = start;
143 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 145 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
144} 146}
145 147#endif
146/* 148/*
147 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
148 * @addr: starting address of the range 150 * @addr: starting address of the range
@@ -167,6 +169,53 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
167 } 169 }
168} 170}
169 171
172#ifdef CONFIG_NO_BOOTMEM
173static void __init __free_pages_memory(unsigned long start, unsigned long end)
174{
175 int i;
176 unsigned long start_aligned, end_aligned;
177 int order = ilog2(BITS_PER_LONG);
178
179 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
180 end_aligned = end & ~(BITS_PER_LONG - 1);
181
182 if (end_aligned <= start_aligned) {
183 for (i = start; i < end; i++)
184 __free_pages_bootmem(pfn_to_page(i), 0);
185
186 return;
187 }
188
189 for (i = start; i < start_aligned; i++)
190 __free_pages_bootmem(pfn_to_page(i), 0);
191
192 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
193 __free_pages_bootmem(pfn_to_page(i), order);
194
195 for (i = end_aligned; i < end; i++)
196 __free_pages_bootmem(pfn_to_page(i), 0);
197}
198
199unsigned long __init free_all_memory_core_early(int nodeid)
200{
201 int i;
202 u64 start, end;
203 unsigned long count = 0;
204 struct range *range = NULL;
205 int nr_range;
206
207 nr_range = get_free_all_memory_range(&range, nodeid);
208
209 for (i = 0; i < nr_range; i++) {
210 start = range[i].start;
211 end = range[i].end;
212 count += end - start;
213 __free_pages_memory(start, end);
214 }
215
216 return count;
217}
218#else
170static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 219static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
171{ 220{
172 int aligned; 221 int aligned;
@@ -227,6 +276,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
227 276
228 return count; 277 return count;
229} 278}
279#endif
230 280
231/** 281/**
232 * free_all_bootmem_node - release a node's free pages to the buddy allocator 282 * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -237,7 +287,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
237unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 287unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
238{ 288{
239 register_page_bootmem_info_node(pgdat); 289 register_page_bootmem_info_node(pgdat);
290#ifdef CONFIG_NO_BOOTMEM
291 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
292 return 0;
293#else
240 return free_all_bootmem_core(pgdat->bdata); 294 return free_all_bootmem_core(pgdat->bdata);
295#endif
241} 296}
242 297
243/** 298/**
@@ -247,9 +302,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
247 */ 302 */
248unsigned long __init free_all_bootmem(void) 303unsigned long __init free_all_bootmem(void)
249{ 304{
305#ifdef CONFIG_NO_BOOTMEM
306 return free_all_memory_core_early(NODE_DATA(0)->node_id);
307#else
250 return free_all_bootmem_core(NODE_DATA(0)->bdata); 308 return free_all_bootmem_core(NODE_DATA(0)->bdata);
309#endif
251} 310}
252 311
312#ifndef CONFIG_NO_BOOTMEM
253static void __init __free(bootmem_data_t *bdata, 313static void __init __free(bootmem_data_t *bdata,
254 unsigned long sidx, unsigned long eidx) 314 unsigned long sidx, unsigned long eidx)
255{ 315{
@@ -344,6 +404,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
344 } 404 }
345 BUG(); 405 BUG();
346} 406}
407#endif
347 408
348/** 409/**
349 * free_bootmem_node - mark a page range as usable 410 * free_bootmem_node - mark a page range as usable
@@ -358,6 +419,9 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
358void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 419void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
359 unsigned long size) 420 unsigned long size)
360{ 421{
422#ifdef CONFIG_NO_BOOTMEM
423 free_early(physaddr, physaddr + size);
424#else
361 unsigned long start, end; 425 unsigned long start, end;
362 426
363 kmemleak_free_part(__va(physaddr), size); 427 kmemleak_free_part(__va(physaddr), size);
@@ -366,6 +430,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
366 end = PFN_DOWN(physaddr + size); 430 end = PFN_DOWN(physaddr + size);
367 431
368 mark_bootmem_node(pgdat->bdata, start, end, 0, 0); 432 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
433#endif
369} 434}
370 435
371/** 436/**
@@ -379,6 +444,9 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
379 */ 444 */
380void __init free_bootmem(unsigned long addr, unsigned long size) 445void __init free_bootmem(unsigned long addr, unsigned long size)
381{ 446{
447#ifdef CONFIG_NO_BOOTMEM
448 free_early(addr, addr + size);
449#else
382 unsigned long start, end; 450 unsigned long start, end;
383 451
384 kmemleak_free_part(__va(addr), size); 452 kmemleak_free_part(__va(addr), size);
@@ -387,6 +455,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
387 end = PFN_DOWN(addr + size); 455 end = PFN_DOWN(addr + size);
388 456
389 mark_bootmem(start, end, 0, 0); 457 mark_bootmem(start, end, 0, 0);
458#endif
390} 459}
391 460
392/** 461/**
@@ -403,12 +472,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
403int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 472int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
404 unsigned long size, int flags) 473 unsigned long size, int flags)
405{ 474{
475#ifdef CONFIG_NO_BOOTMEM
476 panic("no bootmem");
477 return 0;
478#else
406 unsigned long start, end; 479 unsigned long start, end;
407 480
408 start = PFN_DOWN(physaddr); 481 start = PFN_DOWN(physaddr);
409 end = PFN_UP(physaddr + size); 482 end = PFN_UP(physaddr + size);
410 483
411 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 484 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
485#endif
412} 486}
413 487
414/** 488/**
@@ -424,14 +498,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
424int __init reserve_bootmem(unsigned long addr, unsigned long size, 498int __init reserve_bootmem(unsigned long addr, unsigned long size,
425 int flags) 499 int flags)
426{ 500{
501#ifdef CONFIG_NO_BOOTMEM
502 panic("no bootmem");
503 return 0;
504#else
427 unsigned long start, end; 505 unsigned long start, end;
428 506
429 start = PFN_DOWN(addr); 507 start = PFN_DOWN(addr);
430 end = PFN_UP(addr + size); 508 end = PFN_UP(addr + size);
431 509
432 return mark_bootmem(start, end, 1, flags); 510 return mark_bootmem(start, end, 1, flags);
511#endif
433} 512}
434 513
514#ifndef CONFIG_NO_BOOTMEM
435static unsigned long __init align_idx(struct bootmem_data *bdata, 515static unsigned long __init align_idx(struct bootmem_data *bdata,
436 unsigned long idx, unsigned long step) 516 unsigned long idx, unsigned long step)
437{ 517{
@@ -582,12 +662,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
582#endif 662#endif
583 return NULL; 663 return NULL;
584} 664}
665#endif
585 666
586static void * __init ___alloc_bootmem_nopanic(unsigned long size, 667static void * __init ___alloc_bootmem_nopanic(unsigned long size,
587 unsigned long align, 668 unsigned long align,
588 unsigned long goal, 669 unsigned long goal,
589 unsigned long limit) 670 unsigned long limit)
590{ 671{
672#ifdef CONFIG_NO_BOOTMEM
673 void *ptr;
674
675 if (WARN_ON_ONCE(slab_is_available()))
676 return kzalloc(size, GFP_NOWAIT);
677
678restart:
679
680 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
681
682 if (ptr)
683 return ptr;
684
685 if (goal != 0) {
686 goal = 0;
687 goto restart;
688 }
689
690 return NULL;
691#else
591 bootmem_data_t *bdata; 692 bootmem_data_t *bdata;
592 void *region; 693 void *region;
593 694
@@ -613,6 +714,7 @@ restart:
613 } 714 }
614 715
615 return NULL; 716 return NULL;
717#endif
616} 718}
617 719
618/** 720/**
@@ -631,7 +733,13 @@ restart:
631void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 733void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
632 unsigned long goal) 734 unsigned long goal)
633{ 735{
634 return ___alloc_bootmem_nopanic(size, align, goal, 0); 736 unsigned long limit = 0;
737
738#ifdef CONFIG_NO_BOOTMEM
739 limit = -1UL;
740#endif
741
742 return ___alloc_bootmem_nopanic(size, align, goal, limit);
635} 743}
636 744
637static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, 745static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -665,9 +773,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
665void * __init __alloc_bootmem(unsigned long size, unsigned long align, 773void * __init __alloc_bootmem(unsigned long size, unsigned long align,
666 unsigned long goal) 774 unsigned long goal)
667{ 775{
668 return ___alloc_bootmem(size, align, goal, 0); 776 unsigned long limit = 0;
777
778#ifdef CONFIG_NO_BOOTMEM
779 limit = -1UL;
780#endif
781
782 return ___alloc_bootmem(size, align, goal, limit);
669} 783}
670 784
785#ifndef CONFIG_NO_BOOTMEM
671static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 786static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
672 unsigned long size, unsigned long align, 787 unsigned long size, unsigned long align,
673 unsigned long goal, unsigned long limit) 788 unsigned long goal, unsigned long limit)
@@ -684,6 +799,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
684 799
685 return ___alloc_bootmem(size, align, goal, limit); 800 return ___alloc_bootmem(size, align, goal, limit);
686} 801}
802#endif
687 803
688/** 804/**
689 * __alloc_bootmem_node - allocate boot memory from a specific node 805 * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -706,7 +822,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
706 if (WARN_ON_ONCE(slab_is_available())) 822 if (WARN_ON_ONCE(slab_is_available()))
707 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 823 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
708 824
825#ifdef CONFIG_NO_BOOTMEM
826 return __alloc_memory_core_early(pgdat->node_id, size, align,
827 goal, -1ULL);
828#else
709 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 829 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
830#endif
831}
832
833void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
834 unsigned long align, unsigned long goal)
835{
836#ifdef MAX_DMA32_PFN
837 unsigned long end_pfn;
838
839 if (WARN_ON_ONCE(slab_is_available()))
840 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
841
842 /* update goal according ...MAX_DMA32_PFN */
843 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
844
845 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
846 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
847 void *ptr;
848 unsigned long new_goal;
849
850 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
851#ifdef CONFIG_NO_BOOTMEM
852 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
853 new_goal, -1ULL);
854#else
855 ptr = alloc_bootmem_core(pgdat->bdata, size, align,
856 new_goal, 0);
857#endif
858 if (ptr)
859 return ptr;
860 }
861#endif
862
863 return __alloc_bootmem_node(pgdat, size, align, goal);
864
710} 865}
711 866
712#ifdef CONFIG_SPARSEMEM 867#ifdef CONFIG_SPARSEMEM
@@ -720,6 +875,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
720void * __init alloc_bootmem_section(unsigned long size, 875void * __init alloc_bootmem_section(unsigned long size,
721 unsigned long section_nr) 876 unsigned long section_nr)
722{ 877{
878#ifdef CONFIG_NO_BOOTMEM
879 unsigned long pfn, goal, limit;
880
881 pfn = section_nr_to_pfn(section_nr);
882 goal = pfn << PAGE_SHIFT;
883 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
884
885 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
886 SMP_CACHE_BYTES, goal, limit);
887#else
723 bootmem_data_t *bdata; 888 bootmem_data_t *bdata;
724 unsigned long pfn, goal, limit; 889 unsigned long pfn, goal, limit;
725 890
@@ -729,6 +894,7 @@ void * __init alloc_bootmem_section(unsigned long size,
729 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 894 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
730 895
731 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 896 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
897#endif
732} 898}
733#endif 899#endif
734 900
@@ -740,11 +906,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
740 if (WARN_ON_ONCE(slab_is_available())) 906 if (WARN_ON_ONCE(slab_is_available()))
741 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 907 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
742 908
909#ifdef CONFIG_NO_BOOTMEM
910 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
911 goal, -1ULL);
912#else
743 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 913 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
744 if (ptr) 914 if (ptr)
745 return ptr; 915 return ptr;
746 916
747 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 917 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
918#endif
748 if (ptr) 919 if (ptr)
749 return ptr; 920 return ptr;
750 921
@@ -795,6 +966,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
795 if (WARN_ON_ONCE(slab_is_available())) 966 if (WARN_ON_ONCE(slab_is_available()))
796 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 967 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
797 968
969#ifdef CONFIG_NO_BOOTMEM
970 return __alloc_memory_core_early(pgdat->node_id, size, align,
971 goal, ARCH_LOW_ADDRESS_LIMIT);
972#else
798 return ___alloc_bootmem_node(pgdat->bdata, size, align, 973 return ___alloc_bootmem_node(pgdat->bdata, size, align,
799 goal, ARCH_LOW_ADDRESS_LIMIT); 974 goal, ARCH_LOW_ADDRESS_LIMIT);
975#endif
800} 976}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
77 switch (advice) { 77 switch (advice) {
78 case POSIX_FADV_NORMAL: 78 case POSIX_FADV_NORMAL:
79 file->f_ra.ra_pages = bdi->ra_pages; 79 file->f_ra.ra_pages = bdi->ra_pages;
80 spin_lock(&file->f_lock);
81 file->f_mode &= ~FMODE_RANDOM;
82 spin_unlock(&file->f_lock);
80 break; 83 break;
81 case POSIX_FADV_RANDOM: 84 case POSIX_FADV_RANDOM:
82 file->f_ra.ra_pages = 0; 85 spin_lock(&file->f_lock);
86 file->f_mode |= FMODE_RANDOM;
87 spin_unlock(&file->f_lock);
83 break; 88 break;
84 case POSIX_FADV_SEQUENTIAL: 89 case POSIX_FADV_SEQUENTIAL:
85 file->f_ra.ra_pages = bdi->ra_pages * 2; 90 file->f_ra.ra_pages = bdi->ra_pages * 2;
91 spin_lock(&file->f_lock);
92 file->f_mode &= ~FMODE_RANDOM;
93 spin_unlock(&file->f_lock);
86 break; 94 break;
87 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
88 if (!mapping->a_ops->readpage) { 96 if (!mapping->a_ops->readpage) {
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..bb41f98dd8b7 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,22 @@
1#include <linux/fault-inject.h> 1#include <linux/fault-inject.h>
2#include <linux/gfp.h> 2#include <linux/gfp.h>
3#include <linux/slab.h>
3 4
4static struct { 5static struct {
5 struct fault_attr attr; 6 struct fault_attr attr;
6 u32 ignore_gfp_wait; 7 u32 ignore_gfp_wait;
8 int cache_filter;
7#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 9#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
8 struct dentry *ignore_gfp_wait_file; 10 struct dentry *ignore_gfp_wait_file;
11 struct dentry *cache_filter_file;
9#endif 12#endif
10} failslab = { 13} failslab = {
11 .attr = FAULT_ATTR_INITIALIZER, 14 .attr = FAULT_ATTR_INITIALIZER,
12 .ignore_gfp_wait = 1, 15 .ignore_gfp_wait = 1,
16 .cache_filter = 0,
13}; 17};
14 18
15bool should_failslab(size_t size, gfp_t gfpflags) 19bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
16{ 20{
17 if (gfpflags & __GFP_NOFAIL) 21 if (gfpflags & __GFP_NOFAIL)
18 return false; 22 return false;
@@ -20,6 +24,9 @@ bool should_failslab(size_t size, gfp_t gfpflags)
20 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) 24 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
21 return false; 25 return false;
22 26
27 if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
28 return false;
29
23 return should_fail(&failslab.attr, size); 30 return should_fail(&failslab.attr, size);
24} 31}
25 32
@@ -30,7 +37,6 @@ static int __init setup_failslab(char *str)
30__setup("failslab=", setup_failslab); 37__setup("failslab=", setup_failslab);
31 38
32#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 39#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
33
34static int __init failslab_debugfs_init(void) 40static int __init failslab_debugfs_init(void)
35{ 41{
36 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 42 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
@@ -46,8 +52,14 @@ static int __init failslab_debugfs_init(void)
46 debugfs_create_bool("ignore-gfp-wait", mode, dir, 52 debugfs_create_bool("ignore-gfp-wait", mode, dir,
47 &failslab.ignore_gfp_wait); 53 &failslab.ignore_gfp_wait);
48 54
49 if (!failslab.ignore_gfp_wait_file) { 55 failslab.cache_filter_file =
56 debugfs_create_bool("cache-filter", mode, dir,
57 &failslab.cache_filter);
58
59 if (!failslab.ignore_gfp_wait_file ||
60 !failslab.cache_filter_file) {
50 err = -ENOMEM; 61 err = -ENOMEM;
62 debugfs_remove(failslab.cache_filter_file);
51 debugfs_remove(failslab.ignore_gfp_wait_file); 63 debugfs_remove(failslab.ignore_gfp_wait_file);
52 cleanup_fault_attr_dentries(&failslab.attr); 64 cleanup_fault_attr_dentries(&failslab.attr);
53 } 65 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 698ea80f2102..045b31c37653 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1117,7 +1117,7 @@ readpage:
1117 if (!PageUptodate(page)) { 1117 if (!PageUptodate(page)) {
1118 if (page->mapping == NULL) { 1118 if (page->mapping == NULL) {
1119 /* 1119 /*
1120 * invalidate_inode_pages got it 1120 * invalidate_mapping_pages got it
1121 */ 1121 */
1122 unlock_page(page); 1122 unlock_page(page);
1123 page_cache_release(page); 1123 page_cache_release(page);
@@ -1986,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count);
1986inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) 1986inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1987{ 1987{
1988 struct inode *inode = file->f_mapping->host; 1988 struct inode *inode = file->f_mapping->host;
1989 unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 1989 unsigned long limit = rlimit(RLIMIT_FSIZE);
1990 1990
1991 if (unlikely(*pos < 0)) 1991 if (unlikely(*pos < 0))
1992 return -EINVAL; 1992 return -EINVAL;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d71bb8..78b94f0b6d5d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,7 +194,7 @@ retry:
194 flush_cache_page(vma, address, pte_pfn(*pte)); 194 flush_cache_page(vma, address, pte_pfn(*pte));
195 pteval = ptep_clear_flush_notify(vma, address, pte); 195 pteval = ptep_clear_flush_notify(vma, address, pte);
196 page_remove_rmap(page); 196 page_remove_rmap(page);
197 dec_mm_counter(mm, file_rss); 197 dec_mm_counter(mm, MM_FILEPAGES);
198 BUG_ON(pte_dirty(pteval)); 198 BUG_ON(pte_dirty(pteval));
199 pte_unmap_unlock(pte, ptl); 199 pte_unmap_unlock(pte, ptl);
200 page_cache_release(page); 200 page_cache_release(page);
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85abbb39..46f5dacf90a2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
40 page_remove_rmap(page); 40 page_remove_rmap(page);
41 page_cache_release(page); 41 page_cache_release(page);
42 update_hiwater_rss(mm); 42 update_hiwater_rss(mm);
43 dec_mm_counter(mm, file_rss); 43 dec_mm_counter(mm, MM_FILEPAGES);
44 } 44 }
45 } else { 45 } else {
46 if (!pte_file(pte)) 46 if (!pte_file(pte))
diff --git a/mm/highmem.c b/mm/highmem.c
index 9c1e627f282e..bed8a8bfd01f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -220,7 +220,7 @@ EXPORT_SYMBOL(kmap_high);
220 * @page: &struct page to pin 220 * @page: &struct page to pin
221 * 221 *
222 * Returns the page's current virtual memory address, or NULL if no mapping 222 * Returns the page's current virtual memory address, or NULL if no mapping
223 * exists. When and only when a non null address is returned then a 223 * exists. If and only if a non null address is returned then a
224 * matching call to kunmap_high() is necessary. 224 * matching call to kunmap_high() is necessary.
225 * 225 *
226 * This can be called from any context. 226 * This can be called from any context.
diff --git a/mm/ksm.c b/mm/ksm.c
index 56a0da1f9979..8cdfc2a1e8bf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -751,7 +751,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
751 * page 751 * page
752 */ 752 */
753 if (page_mapcount(page) + 1 + swapped != page_count(page)) { 753 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
754 set_pte_at_notify(mm, addr, ptep, entry); 754 set_pte_at(mm, addr, ptep, entry);
755 goto out_unlock; 755 goto out_unlock;
756 } 756 }
757 entry = pte_wrprotect(entry); 757 entry = pte_wrprotect(entry);
@@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1563again: 1563again:
1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1565 struct anon_vma *anon_vma = rmap_item->anon_vma; 1565 struct anon_vma *anon_vma = rmap_item->anon_vma;
1566 struct anon_vma_chain *vmac;
1566 struct vm_area_struct *vma; 1567 struct vm_area_struct *vma;
1567 1568
1568 spin_lock(&anon_vma->lock); 1569 spin_lock(&anon_vma->lock);
1569 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1570 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1571 vma = vmac->vma;
1570 if (rmap_item->address < vma->vm_start || 1572 if (rmap_item->address < vma->vm_start ||
1571 rmap_item->address >= vma->vm_end) 1573 rmap_item->address >= vma->vm_end)
1572 continue; 1574 continue;
@@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1614again: 1616again:
1615 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1617 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1616 struct anon_vma *anon_vma = rmap_item->anon_vma; 1618 struct anon_vma *anon_vma = rmap_item->anon_vma;
1619 struct anon_vma_chain *vmac;
1617 struct vm_area_struct *vma; 1620 struct vm_area_struct *vma;
1618 1621
1619 spin_lock(&anon_vma->lock); 1622 spin_lock(&anon_vma->lock);
1620 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1623 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1624 vma = vmac->vma;
1621 if (rmap_item->address < vma->vm_start || 1625 if (rmap_item->address < vma->vm_start ||
1622 rmap_item->address >= vma->vm_end) 1626 rmap_item->address >= vma->vm_end)
1623 continue; 1627 continue;
@@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1664again: 1668again:
1665 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1669 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1666 struct anon_vma *anon_vma = rmap_item->anon_vma; 1670 struct anon_vma *anon_vma = rmap_item->anon_vma;
1671 struct anon_vma_chain *vmac;
1667 struct vm_area_struct *vma; 1672 struct vm_area_struct *vma;
1668 1673
1669 spin_lock(&anon_vma->lock); 1674 spin_lock(&anon_vma->lock);
1670 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1675 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1676 vma = vmac->vma;
1671 if (rmap_item->address < vma->vm_start || 1677 if (rmap_item->address < vma->vm_start ||
1672 rmap_item->address >= vma->vm_end) 1678 rmap_item->address >= vma->vm_end)
1673 continue; 1679 continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 954032b80bed..9ed760dc7448 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov
12 *
9 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by 14 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or 15 * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +25,7 @@
21#include <linux/memcontrol.h> 25#include <linux/memcontrol.h>
22#include <linux/cgroup.h> 26#include <linux/cgroup.h>
23#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/hugetlb.h>
24#include <linux/pagemap.h> 29#include <linux/pagemap.h>
25#include <linux/smp.h> 30#include <linux/smp.h>
26#include <linux/page-flags.h> 31#include <linux/page-flags.h>
@@ -32,7 +37,10 @@
32#include <linux/rbtree.h> 37#include <linux/rbtree.h>
33#include <linux/slab.h> 38#include <linux/slab.h>
34#include <linux/swap.h> 39#include <linux/swap.h>
40#include <linux/swapops.h>
35#include <linux/spinlock.h> 41#include <linux/spinlock.h>
42#include <linux/eventfd.h>
43#include <linux/sort.h>
36#include <linux/fs.h> 44#include <linux/fs.h>
37#include <linux/seq_file.h> 45#include <linux/seq_file.h>
38#include <linux/vmalloc.h> 46#include <linux/vmalloc.h>
@@ -55,7 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
55#define do_swap_account (0) 63#define do_swap_account (0)
56#endif 64#endif
57 65
58#define SOFTLIMIT_EVENTS_THRESH (1000) 66/*
67 * Per memcg event counter is incremented at every pagein/pageout. This counter
68 * is used for trigger some periodic events. This is straightforward and better
69 * than using jiffies etc. to handle periodic memcg event.
70 *
71 * These values will be used as !((event) & ((1 <<(thresh)) - 1))
72 */
73#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
74#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
59 75
60/* 76/*
61 * Statistics for memory cgroup. 77 * Statistics for memory cgroup.
@@ -69,62 +85,16 @@ enum mem_cgroup_stat_index {
69 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 85 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 86 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 87 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 88 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
89 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
74 90
75 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
76}; 92};
77 93
78struct mem_cgroup_stat_cpu { 94struct mem_cgroup_stat_cpu {
79 s64 count[MEM_CGROUP_STAT_NSTATS]; 95 s64 count[MEM_CGROUP_STAT_NSTATS];
80} ____cacheline_aligned_in_smp;
81
82struct mem_cgroup_stat {
83 struct mem_cgroup_stat_cpu cpustat[0];
84}; 96};
85 97
86static inline void
87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 stat->count[idx] = 0;
91}
92
93static inline s64
94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 enum mem_cgroup_stat_index idx)
96{
97 return stat->count[idx];
98}
99
100/*
101 * For accounting under irq disable, no need for increment preempt count.
102 */
103static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
104 enum mem_cgroup_stat_index idx, int val)
105{
106 stat->count[idx] += val;
107}
108
109static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
110 enum mem_cgroup_stat_index idx)
111{
112 int cpu;
113 s64 ret = 0;
114 for_each_possible_cpu(cpu)
115 ret += stat->cpustat[cpu].count[idx];
116 return ret;
117}
118
119static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
120{
121 s64 ret;
122
123 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
124 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
125 return ret;
126}
127
128/* 98/*
129 * per-zone information in memory controller. 99 * per-zone information in memory controller.
130 */ 100 */
@@ -174,6 +144,22 @@ struct mem_cgroup_tree {
174 144
175static struct mem_cgroup_tree soft_limit_tree __read_mostly; 145static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176 146
147struct mem_cgroup_threshold {
148 struct eventfd_ctx *eventfd;
149 u64 threshold;
150};
151
152struct mem_cgroup_threshold_ary {
153 /* An array index points to threshold just below usage. */
154 atomic_t current_threshold;
155 /* Size of entries[] */
156 unsigned int size;
157 /* Array of thresholds */
158 struct mem_cgroup_threshold entries[0];
159};
160
161static void mem_cgroup_threshold(struct mem_cgroup *mem);
162
177/* 163/*
178 * The memory controller data structure. The memory controller controls both 164 * The memory controller data structure. The memory controller controls both
179 * page cache and RSS per cgroup. We would eventually like to provide 165 * page cache and RSS per cgroup. We would eventually like to provide
@@ -217,7 +203,7 @@ struct mem_cgroup {
217 * Should the accounting and control be hierarchical, per subtree? 203 * Should the accounting and control be hierarchical, per subtree?
218 */ 204 */
219 bool use_hierarchy; 205 bool use_hierarchy;
220 unsigned long last_oom_jiffies; 206 atomic_t oom_lock;
221 atomic_t refcnt; 207 atomic_t refcnt;
222 208
223 unsigned int swappiness; 209 unsigned int swappiness;
@@ -225,10 +211,48 @@ struct mem_cgroup {
225 /* set when res.limit == memsw.limit */ 211 /* set when res.limit == memsw.limit */
226 bool memsw_is_minimum; 212 bool memsw_is_minimum;
227 213
214 /* protect arrays of thresholds */
215 struct mutex thresholds_lock;
216
217 /* thresholds for memory usage. RCU-protected */
218 struct mem_cgroup_threshold_ary *thresholds;
219
220 /* thresholds for mem+swap usage. RCU-protected */
221 struct mem_cgroup_threshold_ary *memsw_thresholds;
222
228 /* 223 /*
229 * statistics. This must be placed at the end of memcg. 224 * Should we move charges of a task when a task is moved into this
225 * mem_cgroup ? And what type of charges should we move ?
230 */ 226 */
231 struct mem_cgroup_stat stat; 227 unsigned long move_charge_at_immigrate;
228
229 /*
230 * percpu counter.
231 */
232 struct mem_cgroup_stat_cpu *stat;
233};
234
235/* Stuffs for move charges at task migration. */
236/*
237 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
238 * left-shifted bitmap of these types.
239 */
240enum move_type {
241 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
242 NR_MOVE_TYPE,
243};
244
245/* "mc" and its members are protected by cgroup_mutex */
246static struct move_charge_struct {
247 struct mem_cgroup *from;
248 struct mem_cgroup *to;
249 unsigned long precharge;
250 unsigned long moved_charge;
251 unsigned long moved_swap;
252 struct task_struct *moving_task; /* a task moving charges */
253 wait_queue_head_t waitq; /* a waitq for other context */
254} mc = {
255 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
232}; 256};
233 257
234/* 258/*
@@ -371,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
371 spin_unlock(&mctz->lock); 395 spin_unlock(&mctz->lock);
372} 396}
373 397
374static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
375{
376 bool ret = false;
377 int cpu;
378 s64 val;
379 struct mem_cgroup_stat_cpu *cpustat;
380
381 cpu = get_cpu();
382 cpustat = &mem->stat.cpustat[cpu];
383 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
384 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
385 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
386 ret = true;
387 }
388 put_cpu();
389 return ret;
390}
391 398
392static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 399static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
393{ 400{
@@ -481,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
481 return mz; 488 return mz;
482} 489}
483 490
491static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
492 enum mem_cgroup_stat_index idx)
493{
494 int cpu;
495 s64 val = 0;
496
497 for_each_possible_cpu(cpu)
498 val += per_cpu(mem->stat->count[idx], cpu);
499 return val;
500}
501
502static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
503{
504 s64 ret;
505
506 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
507 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
508 return ret;
509}
510
484static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 511static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
485 bool charge) 512 bool charge)
486{ 513{
487 int val = (charge) ? 1 : -1; 514 int val = (charge) ? 1 : -1;
488 struct mem_cgroup_stat *stat = &mem->stat; 515 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
489 struct mem_cgroup_stat_cpu *cpustat;
490 int cpu = get_cpu();
491
492 cpustat = &stat->cpustat[cpu];
493 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
494 put_cpu();
495} 516}
496 517
497static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 518static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -499,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
499 bool charge) 520 bool charge)
500{ 521{
501 int val = (charge) ? 1 : -1; 522 int val = (charge) ? 1 : -1;
502 struct mem_cgroup_stat *stat = &mem->stat;
503 struct mem_cgroup_stat_cpu *cpustat;
504 int cpu = get_cpu();
505 523
506 cpustat = &stat->cpustat[cpu]; 524 preempt_disable();
525
507 if (PageCgroupCache(pc)) 526 if (PageCgroupCache(pc))
508 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 527 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
509 else 528 else
510 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); 529 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
511 530
512 if (charge) 531 if (charge)
513 __mem_cgroup_stat_add_safe(cpustat, 532 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
514 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
515 else 533 else
516 __mem_cgroup_stat_add_safe(cpustat, 534 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
517 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 535 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
518 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); 536
519 put_cpu(); 537 preempt_enable();
520} 538}
521 539
522static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 540static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -534,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
534 return total; 552 return total;
535} 553}
536 554
555static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
556{
557 s64 val;
558
559 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
560
561 return !(val & ((1 << event_mask_shift) - 1));
562}
563
564/*
565 * Check events in order.
566 *
567 */
568static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
569{
570 /* threshold event is triggered in finer grain than soft limit */
571 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
572 mem_cgroup_threshold(mem);
573 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
574 mem_cgroup_update_tree(mem, page);
575 }
576}
577
537static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 578static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
538{ 579{
539 return container_of(cgroup_subsys_state(cont, 580 return container_of(cgroup_subsys_state(cont,
@@ -1000,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1000} 1041}
1001 1042
1002/** 1043/**
1003 * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. 1044 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1004 * @memcg: The memory cgroup that went over limit 1045 * @memcg: The memory cgroup that went over limit
1005 * @p: Task that is going to be killed 1046 * @p: Task that is going to be killed
1006 * 1047 *
@@ -1174,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1174 } 1215 }
1175 } 1216 }
1176 } 1217 }
1177 if (!mem_cgroup_local_usage(&victim->stat)) { 1218 if (!mem_cgroup_local_usage(victim)) {
1178 /* this cgroup's local usage == 0 */ 1219 /* this cgroup's local usage == 0 */
1179 css_put(&victim->css); 1220 css_put(&victim->css);
1180 continue; 1221 continue;
@@ -1205,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1205 return total; 1246 return total;
1206} 1247}
1207 1248
1208bool mem_cgroup_oom_called(struct task_struct *task) 1249static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1209{ 1250{
1210 bool ret = false; 1251 int *val = (int *)data;
1211 struct mem_cgroup *mem; 1252 int x;
1212 struct mm_struct *mm; 1253 /*
1254 * Logically, we can stop scanning immediately when we find
1255 * a memcg is already locked. But condidering unlock ops and
1256 * creation/removal of memcg, scan-all is simple operation.
1257 */
1258 x = atomic_inc_return(&mem->oom_lock);
1259 *val = max(x, *val);
1260 return 0;
1261}
1262/*
1263 * Check OOM-Killer is already running under our hierarchy.
1264 * If someone is running, return false.
1265 */
1266static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1267{
1268 int lock_count = 0;
1213 1269
1214 rcu_read_lock(); 1270 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
1215 mm = task->mm; 1271
1216 if (!mm) 1272 if (lock_count == 1)
1217 mm = &init_mm; 1273 return true;
1218 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1274 return false;
1219 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
1220 ret = true;
1221 rcu_read_unlock();
1222 return ret;
1223} 1275}
1224 1276
1225static int record_last_oom_cb(struct mem_cgroup *mem, void *data) 1277static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1226{ 1278{
1227 mem->last_oom_jiffies = jiffies; 1279 /*
1280 * When a new child is created while the hierarchy is under oom,
1281 * mem_cgroup_oom_lock() may not be called. We have to use
1282 * atomic_add_unless() here.
1283 */
1284 atomic_add_unless(&mem->oom_lock, -1, 0);
1228 return 0; 1285 return 0;
1229} 1286}
1230 1287
1231static void record_last_oom(struct mem_cgroup *mem) 1288static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1232{ 1289{
1233 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); 1290 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1291}
1292
1293static DEFINE_MUTEX(memcg_oom_mutex);
1294static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1295
1296/*
1297 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1298 */
1299bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1300{
1301 DEFINE_WAIT(wait);
1302 bool locked;
1303
1304 /* At first, try to OOM lock hierarchy under mem.*/
1305 mutex_lock(&memcg_oom_mutex);
1306 locked = mem_cgroup_oom_lock(mem);
1307 /*
1308 * Even if signal_pending(), we can't quit charge() loop without
1309 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1310 * under OOM is always welcomed, use TASK_KILLABLE here.
1311 */
1312 if (!locked)
1313 prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
1314 mutex_unlock(&memcg_oom_mutex);
1315
1316 if (locked)
1317 mem_cgroup_out_of_memory(mem, mask);
1318 else {
1319 schedule();
1320 finish_wait(&memcg_oom_waitq, &wait);
1321 }
1322 mutex_lock(&memcg_oom_mutex);
1323 mem_cgroup_oom_unlock(mem);
1324 /*
1325 * Here, we use global waitq .....more fine grained waitq ?
1326 * Assume following hierarchy.
1327 * A/
1328 * 01
1329 * 02
1330 * assume OOM happens both in A and 01 at the same time. Tthey are
1331 * mutually exclusive by lock. (kill in 01 helps A.)
1332 * When we use per memcg waitq, we have to wake up waiters on A and 02
1333 * in addtion to waiters on 01. We use global waitq for avoiding mess.
1334 * It will not be a big problem.
1335 * (And a task may be moved to other groups while it's waiting for OOM.)
1336 */
1337 wake_up_all(&memcg_oom_waitq);
1338 mutex_unlock(&memcg_oom_mutex);
1339
1340 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1341 return false;
1342 /* Give chance to dying process */
1343 schedule_timeout(1);
1344 return true;
1234} 1345}
1235 1346
1236/* 1347/*
@@ -1240,9 +1351,6 @@ static void record_last_oom(struct mem_cgroup *mem)
1240void mem_cgroup_update_file_mapped(struct page *page, int val) 1351void mem_cgroup_update_file_mapped(struct page *page, int val)
1241{ 1352{
1242 struct mem_cgroup *mem; 1353 struct mem_cgroup *mem;
1243 struct mem_cgroup_stat *stat;
1244 struct mem_cgroup_stat_cpu *cpustat;
1245 int cpu;
1246 struct page_cgroup *pc; 1354 struct page_cgroup *pc;
1247 1355
1248 pc = lookup_page_cgroup(page); 1356 pc = lookup_page_cgroup(page);
@@ -1258,13 +1366,10 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
1258 goto done; 1366 goto done;
1259 1367
1260 /* 1368 /*
1261 * Preemption is already disabled, we don't need get_cpu() 1369 * Preemption is already disabled. We can use __this_cpu_xxx
1262 */ 1370 */
1263 cpu = smp_processor_id(); 1371 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
1264 stat = &mem->stat;
1265 cpustat = &stat->cpustat[cpu];
1266 1372
1267 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1268done: 1373done:
1269 unlock_page_cgroup(pc); 1374 unlock_page_cgroup(pc);
1270} 1375}
@@ -1401,19 +1506,21 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1401 * oom-killer can be invoked. 1506 * oom-killer can be invoked.
1402 */ 1507 */
1403static int __mem_cgroup_try_charge(struct mm_struct *mm, 1508static int __mem_cgroup_try_charge(struct mm_struct *mm,
1404 gfp_t gfp_mask, struct mem_cgroup **memcg, 1509 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1405 bool oom, struct page *page)
1406{ 1510{
1407 struct mem_cgroup *mem, *mem_over_limit; 1511 struct mem_cgroup *mem, *mem_over_limit;
1408 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1512 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1409 struct res_counter *fail_res; 1513 struct res_counter *fail_res;
1410 int csize = CHARGE_SIZE; 1514 int csize = CHARGE_SIZE;
1411 1515
1412 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1516 /*
1413 /* Don't account this! */ 1517 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
1414 *memcg = NULL; 1518 * in system level. So, allow to go ahead dying process in addition to
1415 return 0; 1519 * MEMDIE process.
1416 } 1520 */
1521 if (unlikely(test_thread_flag(TIF_MEMDIE)
1522 || fatal_signal_pending(current)))
1523 goto bypass;
1417 1524
1418 /* 1525 /*
1419 * We always charge the cgroup the mm_struct belongs to. 1526 * We always charge the cgroup the mm_struct belongs to.
@@ -1440,7 +1547,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1440 unsigned long flags = 0; 1547 unsigned long flags = 0;
1441 1548
1442 if (consume_stock(mem)) 1549 if (consume_stock(mem))
1443 goto charged; 1550 goto done;
1444 1551
1445 ret = res_counter_charge(&mem->res, csize, &fail_res); 1552 ret = res_counter_charge(&mem->res, csize, &fail_res);
1446 if (likely(!ret)) { 1553 if (likely(!ret)) {
@@ -1483,28 +1590,70 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1483 if (mem_cgroup_check_under_limit(mem_over_limit)) 1590 if (mem_cgroup_check_under_limit(mem_over_limit))
1484 continue; 1591 continue;
1485 1592
1593 /* try to avoid oom while someone is moving charge */
1594 if (mc.moving_task && current != mc.moving_task) {
1595 struct mem_cgroup *from, *to;
1596 bool do_continue = false;
1597 /*
1598 * There is a small race that "from" or "to" can be
1599 * freed by rmdir, so we use css_tryget().
1600 */
1601 rcu_read_lock();
1602 from = mc.from;
1603 to = mc.to;
1604 if (from && css_tryget(&from->css)) {
1605 if (mem_over_limit->use_hierarchy)
1606 do_continue = css_is_ancestor(
1607 &from->css,
1608 &mem_over_limit->css);
1609 else
1610 do_continue = (from == mem_over_limit);
1611 css_put(&from->css);
1612 }
1613 if (!do_continue && to && css_tryget(&to->css)) {
1614 if (mem_over_limit->use_hierarchy)
1615 do_continue = css_is_ancestor(
1616 &to->css,
1617 &mem_over_limit->css);
1618 else
1619 do_continue = (to == mem_over_limit);
1620 css_put(&to->css);
1621 }
1622 rcu_read_unlock();
1623 if (do_continue) {
1624 DEFINE_WAIT(wait);
1625 prepare_to_wait(&mc.waitq, &wait,
1626 TASK_INTERRUPTIBLE);
1627 /* moving charge context might have finished. */
1628 if (mc.moving_task)
1629 schedule();
1630 finish_wait(&mc.waitq, &wait);
1631 continue;
1632 }
1633 }
1634
1486 if (!nr_retries--) { 1635 if (!nr_retries--) {
1487 if (oom) { 1636 if (!oom)
1488 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1637 goto nomem;
1489 record_last_oom(mem_over_limit); 1638 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1639 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1640 continue;
1490 } 1641 }
1491 goto nomem; 1642 /* When we reach here, current task is dying .*/
1643 css_put(&mem->css);
1644 goto bypass;
1492 } 1645 }
1493 } 1646 }
1494 if (csize > PAGE_SIZE) 1647 if (csize > PAGE_SIZE)
1495 refill_stock(mem, csize - PAGE_SIZE); 1648 refill_stock(mem, csize - PAGE_SIZE);
1496charged:
1497 /*
1498 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1499 * if they exceeds softlimit.
1500 */
1501 if (mem_cgroup_soft_limit_check(mem))
1502 mem_cgroup_update_tree(mem, page);
1503done: 1649done:
1504 return 0; 1650 return 0;
1505nomem: 1651nomem:
1506 css_put(&mem->css); 1652 css_put(&mem->css);
1507 return -ENOMEM; 1653 return -ENOMEM;
1654bypass:
1655 *memcg = NULL;
1656 return 0;
1508} 1657}
1509 1658
1510/* 1659/*
@@ -1512,14 +1661,23 @@ nomem:
1512 * This function is for that and do uncharge, put css's refcnt. 1661 * This function is for that and do uncharge, put css's refcnt.
1513 * gotten by try_charge(). 1662 * gotten by try_charge().
1514 */ 1663 */
1515static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 1664static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1665 unsigned long count)
1516{ 1666{
1517 if (!mem_cgroup_is_root(mem)) { 1667 if (!mem_cgroup_is_root(mem)) {
1518 res_counter_uncharge(&mem->res, PAGE_SIZE); 1668 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1519 if (do_swap_account) 1669 if (do_swap_account)
1520 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1670 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1671 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1672 WARN_ON_ONCE(count > INT_MAX);
1673 __css_put(&mem->css, (int)count);
1521 } 1674 }
1522 css_put(&mem->css); 1675 /* we don't need css_put for root */
1676}
1677
1678static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1679{
1680 __mem_cgroup_cancel_charge(mem, 1);
1523} 1681}
1524 1682
1525/* 1683/*
@@ -1615,6 +1773,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1615 mem_cgroup_charge_statistics(mem, pc, true); 1773 mem_cgroup_charge_statistics(mem, pc, true);
1616 1774
1617 unlock_page_cgroup(pc); 1775 unlock_page_cgroup(pc);
1776 /*
1777 * "charge_statistics" updated event counter. Then, check it.
1778 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1779 * if they exceeds softlimit.
1780 */
1781 memcg_check_events(mem, pc->page);
1618} 1782}
1619 1783
1620/** 1784/**
@@ -1622,22 +1786,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1622 * @pc: page_cgroup of the page. 1786 * @pc: page_cgroup of the page.
1623 * @from: mem_cgroup which the page is moved from. 1787 * @from: mem_cgroup which the page is moved from.
1624 * @to: mem_cgroup which the page is moved to. @from != @to. 1788 * @to: mem_cgroup which the page is moved to. @from != @to.
1789 * @uncharge: whether we should call uncharge and css_put against @from.
1625 * 1790 *
1626 * The caller must confirm following. 1791 * The caller must confirm following.
1627 * - page is not on LRU (isolate_page() is useful.) 1792 * - page is not on LRU (isolate_page() is useful.)
1628 * - the pc is locked, used, and ->mem_cgroup points to @from. 1793 * - the pc is locked, used, and ->mem_cgroup points to @from.
1629 * 1794 *
1630 * This function does "uncharge" from old cgroup but doesn't do "charge" to 1795 * This function doesn't do "charge" nor css_get to new cgroup. It should be
1631 * new cgroup. It should be done by a caller. 1796 * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is
1797 * true, this function does "uncharge" from old cgroup, but it doesn't if
1798 * @uncharge is false, so a caller should do "uncharge".
1632 */ 1799 */
1633 1800
1634static void __mem_cgroup_move_account(struct page_cgroup *pc, 1801static void __mem_cgroup_move_account(struct page_cgroup *pc,
1635 struct mem_cgroup *from, struct mem_cgroup *to) 1802 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1636{ 1803{
1637 struct page *page; 1804 struct page *page;
1638 int cpu;
1639 struct mem_cgroup_stat *stat;
1640 struct mem_cgroup_stat_cpu *cpustat;
1641 1805
1642 VM_BUG_ON(from == to); 1806 VM_BUG_ON(from == to);
1643 VM_BUG_ON(PageLRU(pc->page)); 1807 VM_BUG_ON(PageLRU(pc->page));
@@ -1645,38 +1809,28 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1645 VM_BUG_ON(!PageCgroupUsed(pc)); 1809 VM_BUG_ON(!PageCgroupUsed(pc));
1646 VM_BUG_ON(pc->mem_cgroup != from); 1810 VM_BUG_ON(pc->mem_cgroup != from);
1647 1811
1648 if (!mem_cgroup_is_root(from))
1649 res_counter_uncharge(&from->res, PAGE_SIZE);
1650 mem_cgroup_charge_statistics(from, pc, false);
1651
1652 page = pc->page; 1812 page = pc->page;
1653 if (page_mapped(page) && !PageAnon(page)) { 1813 if (page_mapped(page) && !PageAnon(page)) {
1654 cpu = smp_processor_id(); 1814 /* Update mapped_file data for mem_cgroup */
1655 /* Update mapped_file data for mem_cgroup "from" */ 1815 preempt_disable();
1656 stat = &from->stat; 1816 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1657 cpustat = &stat->cpustat[cpu]; 1817 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1658 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, 1818 preempt_enable();
1659 -1);
1660
1661 /* Update mapped_file data for mem_cgroup "to" */
1662 stat = &to->stat;
1663 cpustat = &stat->cpustat[cpu];
1664 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1665 1);
1666 } 1819 }
1820 mem_cgroup_charge_statistics(from, pc, false);
1821 if (uncharge)
1822 /* This is not "cancel", but cancel_charge does all we need. */
1823 mem_cgroup_cancel_charge(from);
1667 1824
1668 if (do_swap_account && !mem_cgroup_is_root(from)) 1825 /* caller should have done css_get */
1669 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1670 css_put(&from->css);
1671
1672 css_get(&to->css);
1673 pc->mem_cgroup = to; 1826 pc->mem_cgroup = to;
1674 mem_cgroup_charge_statistics(to, pc, true); 1827 mem_cgroup_charge_statistics(to, pc, true);
1675 /* 1828 /*
1676 * We charges against "to" which may not have any tasks. Then, "to" 1829 * We charges against "to" which may not have any tasks. Then, "to"
1677 * can be under rmdir(). But in current implementation, caller of 1830 * can be under rmdir(). But in current implementation, caller of
1678 * this function is just force_empty() and it's garanteed that 1831 * this function is just force_empty() and move charge, so it's
1679 * "to" is never removed. So, we don't check rmdir status here. 1832 * garanteed that "to" is never removed. So, we don't check rmdir
1833 * status here.
1680 */ 1834 */
1681} 1835}
1682 1836
@@ -1685,15 +1839,20 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1685 * __mem_cgroup_move_account() 1839 * __mem_cgroup_move_account()
1686 */ 1840 */
1687static int mem_cgroup_move_account(struct page_cgroup *pc, 1841static int mem_cgroup_move_account(struct page_cgroup *pc,
1688 struct mem_cgroup *from, struct mem_cgroup *to) 1842 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1689{ 1843{
1690 int ret = -EINVAL; 1844 int ret = -EINVAL;
1691 lock_page_cgroup(pc); 1845 lock_page_cgroup(pc);
1692 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 1846 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1693 __mem_cgroup_move_account(pc, from, to); 1847 __mem_cgroup_move_account(pc, from, to, uncharge);
1694 ret = 0; 1848 ret = 0;
1695 } 1849 }
1696 unlock_page_cgroup(pc); 1850 unlock_page_cgroup(pc);
1851 /*
1852 * check events
1853 */
1854 memcg_check_events(to, pc->page);
1855 memcg_check_events(from, pc->page);
1697 return ret; 1856 return ret;
1698} 1857}
1699 1858
@@ -1722,15 +1881,13 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1722 goto put; 1881 goto put;
1723 1882
1724 parent = mem_cgroup_from_cont(pcg); 1883 parent = mem_cgroup_from_cont(pcg);
1725 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); 1884 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1726 if (ret || !parent) 1885 if (ret || !parent)
1727 goto put_back; 1886 goto put_back;
1728 1887
1729 ret = mem_cgroup_move_account(pc, child, parent); 1888 ret = mem_cgroup_move_account(pc, child, parent, true);
1730 if (!ret) 1889 if (ret)
1731 css_put(&parent->css); /* drop extra refcnt by try_charge() */ 1890 mem_cgroup_cancel_charge(parent);
1732 else
1733 mem_cgroup_cancel_charge(parent); /* does css_put */
1734put_back: 1891put_back:
1735 putback_lru_page(page); 1892 putback_lru_page(page);
1736put: 1893put:
@@ -1760,7 +1917,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1760 prefetchw(pc); 1917 prefetchw(pc);
1761 1918
1762 mem = memcg; 1919 mem = memcg;
1763 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); 1920 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1764 if (ret || !mem) 1921 if (ret || !mem)
1765 return ret; 1922 return ret;
1766 1923
@@ -1880,14 +2037,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1880 if (!mem) 2037 if (!mem)
1881 goto charge_cur_mm; 2038 goto charge_cur_mm;
1882 *ptr = mem; 2039 *ptr = mem;
1883 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); 2040 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1884 /* drop extra refcnt from tryget */ 2041 /* drop extra refcnt from tryget */
1885 css_put(&mem->css); 2042 css_put(&mem->css);
1886 return ret; 2043 return ret;
1887charge_cur_mm: 2044charge_cur_mm:
1888 if (unlikely(!mm)) 2045 if (unlikely(!mm))
1889 mm = &init_mm; 2046 mm = &init_mm;
1890 return __mem_cgroup_try_charge(mm, mask, ptr, true, page); 2047 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1891} 2048}
1892 2049
1893static void 2050static void
@@ -2064,8 +2221,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2064 mz = page_cgroup_zoneinfo(pc); 2221 mz = page_cgroup_zoneinfo(pc);
2065 unlock_page_cgroup(pc); 2222 unlock_page_cgroup(pc);
2066 2223
2067 if (mem_cgroup_soft_limit_check(mem)) 2224 memcg_check_events(mem, page);
2068 mem_cgroup_update_tree(mem, page);
2069 /* at swapout, this memcg will be accessed to record to swap */ 2225 /* at swapout, this memcg will be accessed to record to swap */
2070 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2226 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2071 css_put(&mem->css); 2227 css_put(&mem->css);
@@ -2192,6 +2348,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
2192 } 2348 }
2193 rcu_read_unlock(); 2349 rcu_read_unlock();
2194} 2350}
2351
2352/**
2353 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2354 * @entry: swap entry to be moved
2355 * @from: mem_cgroup which the entry is moved from
2356 * @to: mem_cgroup which the entry is moved to
2357 * @need_fixup: whether we should fixup res_counters and refcounts.
2358 *
2359 * It succeeds only when the swap_cgroup's record for this entry is the same
2360 * as the mem_cgroup's id of @from.
2361 *
2362 * Returns 0 on success, -EINVAL on failure.
2363 *
2364 * The caller must have charged to @to, IOW, called res_counter_charge() about
2365 * both res and memsw, and called css_get().
2366 */
2367static int mem_cgroup_move_swap_account(swp_entry_t entry,
2368 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2369{
2370 unsigned short old_id, new_id;
2371
2372 old_id = css_id(&from->css);
2373 new_id = css_id(&to->css);
2374
2375 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2376 mem_cgroup_swap_statistics(from, false);
2377 mem_cgroup_swap_statistics(to, true);
2378 /*
2379 * This function is only called from task migration context now.
2380 * It postpones res_counter and refcount handling till the end
2381 * of task migration(mem_cgroup_clear_mc()) for performance
2382 * improvement. But we cannot postpone mem_cgroup_get(to)
2383 * because if the process that has been moved to @to does
2384 * swap-in, the refcount of @to might be decreased to 0.
2385 */
2386 mem_cgroup_get(to);
2387 if (need_fixup) {
2388 if (!mem_cgroup_is_root(from))
2389 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2390 mem_cgroup_put(from);
2391 /*
2392 * we charged both to->res and to->memsw, so we should
2393 * uncharge to->res.
2394 */
2395 if (!mem_cgroup_is_root(to))
2396 res_counter_uncharge(&to->res, PAGE_SIZE);
2397 css_put(&to->css);
2398 }
2399 return 0;
2400 }
2401 return -EINVAL;
2402}
2403#else
2404static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2405 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2406{
2407 return -EINVAL;
2408}
2195#endif 2409#endif
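
A minimal userspace analogue of the hand-off that mem_cgroup_move_swap_account() performs with swap_cgroup_cmpxchg(): the record is re-assigned only if it still names the source id. All names below are invented for illustration; this is a sketch of the idea, not the kernel code.

/* Sketch: hand over a swap record only if 'from' still owns it. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned short swap_record_owner = 1;    /* css id of "from" */

static int move_swap_record(unsigned short from_id, unsigned short to_id)
{
        unsigned short expected = from_id;

        /* succeeds only when the record still names from_id as the owner */
        if (atomic_compare_exchange_strong(&swap_record_owner,
                                           &expected, to_id))
                return 0;
        return -1;                              /* someone else owns it now */
}

int main(void)
{
        printf("first move:  %d\n", move_swap_record(1, 2));    /* 0  */
        printf("second move: %d\n", move_swap_record(1, 3));    /* -1 */
        return 0;
}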
2196 2410
2197/* 2411/*
@@ -2216,8 +2430,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2216 unlock_page_cgroup(pc); 2430 unlock_page_cgroup(pc);
2217 2431
2218 if (mem) { 2432 if (mem) {
2219 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 2433 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
2220 page);
2221 css_put(&mem->css); 2434 css_put(&mem->css);
2222 } 2435 }
2223 *ptr = mem; 2436 *ptr = mem;
@@ -2545,7 +2758,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2545 pc = list_entry(list->prev, struct page_cgroup, lru); 2758 pc = list_entry(list->prev, struct page_cgroup, lru);
2546 if (busy == pc) { 2759 if (busy == pc) {
2547 list_move(&pc->lru, list); 2760 list_move(&pc->lru, list);
2548 busy = 0; 2761 busy = NULL;
2549 spin_unlock_irqrestore(&zone->lru_lock, flags); 2762 spin_unlock_irqrestore(&zone->lru_lock, flags);
2550 continue; 2763 continue;
2551 } 2764 }
@@ -2704,7 +2917,7 @@ static int
2704mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2917mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2705{ 2918{
2706 struct mem_cgroup_idx_data *d = data; 2919 struct mem_cgroup_idx_data *d = data;
2707 d->val += mem_cgroup_read_stat(&mem->stat, d->idx); 2920 d->val += mem_cgroup_read_stat(mem, d->idx);
2708 return 0; 2921 return 0;
2709} 2922}
2710 2923
@@ -2719,40 +2932,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2719 *val = d.val; 2932 *val = d.val;
2720} 2933}
2721 2934
2935static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
2936{
2937 u64 idx_val, val;
2938
2939 if (!mem_cgroup_is_root(mem)) {
2940 if (!swap)
2941 return res_counter_read_u64(&mem->res, RES_USAGE);
2942 else
2943 return res_counter_read_u64(&mem->memsw, RES_USAGE);
2944 }
2945
2946 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
2947 val = idx_val;
2948 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
2949 val += idx_val;
2950
2951 if (swap) {
2952 mem_cgroup_get_recursive_idx_stat(mem,
2953 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2954 val += idx_val;
2955 }
2956
2957 return val << PAGE_SHIFT;
2958}
2959
2722static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2960static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2723{ 2961{
2724 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2962 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2725 u64 idx_val, val; 2963 u64 val;
2726 int type, name; 2964 int type, name;
2727 2965
2728 type = MEMFILE_TYPE(cft->private); 2966 type = MEMFILE_TYPE(cft->private);
2729 name = MEMFILE_ATTR(cft->private); 2967 name = MEMFILE_ATTR(cft->private);
2730 switch (type) { 2968 switch (type) {
2731 case _MEM: 2969 case _MEM:
2732 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2970 if (name == RES_USAGE)
2733 mem_cgroup_get_recursive_idx_stat(mem, 2971 val = mem_cgroup_usage(mem, false);
2734 MEM_CGROUP_STAT_CACHE, &idx_val); 2972 else
2735 val = idx_val;
2736 mem_cgroup_get_recursive_idx_stat(mem,
2737 MEM_CGROUP_STAT_RSS, &idx_val);
2738 val += idx_val;
2739 val <<= PAGE_SHIFT;
2740 } else
2741 val = res_counter_read_u64(&mem->res, name); 2973 val = res_counter_read_u64(&mem->res, name);
2742 break; 2974 break;
2743 case _MEMSWAP: 2975 case _MEMSWAP:
2744 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2976 if (name == RES_USAGE)
2745 mem_cgroup_get_recursive_idx_stat(mem, 2977 val = mem_cgroup_usage(mem, true);
2746 MEM_CGROUP_STAT_CACHE, &idx_val); 2978 else
2747 val = idx_val;
2748 mem_cgroup_get_recursive_idx_stat(mem,
2749 MEM_CGROUP_STAT_RSS, &idx_val);
2750 val += idx_val;
2751 mem_cgroup_get_recursive_idx_stat(mem,
2752 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2753 val += idx_val;
2754 val <<= PAGE_SHIFT;
2755 } else
2756 val = res_counter_read_u64(&mem->memsw, name); 2979 val = res_counter_read_u64(&mem->memsw, name);
2757 break; 2980 break;
2758 default: 2981 default:
@@ -2865,6 +3088,39 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2865 return 0; 3088 return 0;
2866} 3089}
2867 3090
3091static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3092 struct cftype *cft)
3093{
3094 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3095}
3096
3097#ifdef CONFIG_MMU
3098static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3099 struct cftype *cft, u64 val)
3100{
3101 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3102
3103 if (val >= (1 << NR_MOVE_TYPE))
3104 return -EINVAL;
3105 /*
3106 * We check this value several times in both in can_attach() and
3107 * attach(), so we need cgroup lock to prevent this value from being
3108 * inconsistent.
3109 */
3110 cgroup_lock();
3111 mem->move_charge_at_immigrate = val;
3112 cgroup_unlock();
3113
3114 return 0;
3115}
3116#else
3117static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3118 struct cftype *cft, u64 val)
3119{
3120 return -ENOSYS;
3121}
3122#endif
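
As a usage sketch, the knob added above can be driven from userspace roughly as below, assuming the cgroup v1 memory controller is mounted at /sys/fs/cgroup/memory and a group named "demo" already exists (both paths are assumptions); writing "1" sets bit 0, which the patch checks as MOVE_CHARGE_TYPE_ANON when a task migrates into the group.

/* Sketch: enable move_charge_at_immigrate, then migrate this task. */
#include <stdio.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        /* mount point and group name are assumptions for illustration */
        const char *grp = "/sys/fs/cgroup/memory/demo";
        char path[256], pid[32];

        snprintf(path, sizeof(path), "%s/memory.move_charge_at_immigrate", grp);
        if (write_str(path, "1"))               /* move anon pages' charges */
                return 1;

        snprintf(path, sizeof(path), "%s/tasks", grp);
        snprintf(pid, sizeof(pid), "%d", getpid());
        return write_str(path, pid) ? 1 : 0;    /* triggers can_attach/attach */
}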
3123
2868 3124
2869/* For read statistics */ 3125/* For read statistics */
2870enum { 3126enum {
@@ -2910,18 +3166,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2910 s64 val; 3166 s64 val;
2911 3167
2912 /* per cpu stat */ 3168 /* per cpu stat */
2913 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); 3169 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
2914 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3170 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2915 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 3171 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
2916 s->stat[MCS_RSS] += val * PAGE_SIZE; 3172 s->stat[MCS_RSS] += val * PAGE_SIZE;
2917 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); 3173 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
2918 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3174 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
2919 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 3175 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
2920 s->stat[MCS_PGPGIN] += val; 3176 s->stat[MCS_PGPGIN] += val;
2921 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3177 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2922 s->stat[MCS_PGPGOUT] += val; 3178 s->stat[MCS_PGPGOUT] += val;
2923 if (do_swap_account) { 3179 if (do_swap_account) {
2924 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); 3180 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
2925 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3181 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2926 } 3182 }
2927 3183
@@ -3049,12 +3305,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3049 return 0; 3305 return 0;
3050} 3306}
3051 3307
3308static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3309{
3310 struct mem_cgroup_threshold_ary *t;
3311 u64 usage;
3312 int i;
3313
3314 rcu_read_lock();
3315 if (!swap)
3316 t = rcu_dereference(memcg->thresholds);
3317 else
3318 t = rcu_dereference(memcg->memsw_thresholds);
3319
3320 if (!t)
3321 goto unlock;
3322
3323 usage = mem_cgroup_usage(memcg, swap);
3324
3325 /*
 3326 * current_threshold points to the threshold just below usage.
 3327 * If that is not true, a threshold was crossed after the last
3328 * call of __mem_cgroup_threshold().
3329 */
3330 i = atomic_read(&t->current_threshold);
3331
3332 /*
 3333 * Iterate backward over the array of thresholds starting from
 3334 * current_threshold and check if a threshold is crossed.
 3335 * If none of the thresholds below usage is crossed, we read
3336 * only one element of the array here.
3337 */
3338 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3339 eventfd_signal(t->entries[i].eventfd, 1);
3340
3341 /* i = current_threshold + 1 */
3342 i++;
3343
3344 /*
 3345 * Iterate forward over the array of thresholds starting from
 3346 * current_threshold+1 and check if a threshold is crossed.
 3347 * If none of the thresholds above usage is crossed, we read
3348 * only one element of the array here.
3349 */
3350 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3351 eventfd_signal(t->entries[i].eventfd, 1);
3352
3353 /* Update current_threshold */
3354 atomic_set(&t->current_threshold, i - 1);
3355unlock:
3356 rcu_read_unlock();
3357}
3358
3359static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3360{
3361 __mem_cgroup_threshold(memcg, false);
3362 if (do_swap_account)
3363 __mem_cgroup_threshold(memcg, true);
3364}
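
A simplified userspace model of the bidirectional scan in __mem_cgroup_threshold() above: thresholds are kept sorted, a cached index points at the last threshold at or below the previous usage, and every threshold crossed in either direction gets its event signalled. The struct and main() below are illustrative only, not the kernel's data structures.

/* Sketch: bidirectional scan of a sorted threshold array. */
#include <stdio.h>

struct threshold { unsigned long long value; int id; };

static void check_thresholds(struct threshold *t, int size,
                             int *current_idx, unsigned long long usage)
{
        int i = *current_idx;

        /* walk down over thresholds that usage has fallen below */
        for (; i >= 0 && t[i].value > usage; i--)
                printf("signal event for threshold %d (fell below)\n", t[i].id);

        /* walk up over thresholds that usage has now crossed */
        for (i++; i < size && t[i].value <= usage; i++)
                printf("signal event for threshold %d (crossed)\n", t[i].id);

        *current_idx = i - 1;   /* last threshold at or below usage */
}

int main(void)
{
        struct threshold t[] = { {100, 0}, {200, 1}, {400, 2} };
        int cur = -1;

        check_thresholds(t, 3, &cur, 250);  /* fires thresholds 0 and 1 */
        check_thresholds(t, 3, &cur, 50);   /* fires them again going down */
        return 0;
}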
3365
3366static int compare_thresholds(const void *a, const void *b)
3367{
3368 const struct mem_cgroup_threshold *_a = a;
3369 const struct mem_cgroup_threshold *_b = b;
3370
3371 return _a->threshold - _b->threshold;
3372}
3373
3374static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
3375 struct eventfd_ctx *eventfd, const char *args)
3376{
3377 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3378 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3379 int type = MEMFILE_TYPE(cft->private);
3380 u64 threshold, usage;
3381 int size;
3382 int i, ret;
3383
3384 ret = res_counter_memparse_write_strategy(args, &threshold);
3385 if (ret)
3386 return ret;
3387
3388 mutex_lock(&memcg->thresholds_lock);
3389 if (type == _MEM)
3390 thresholds = memcg->thresholds;
3391 else if (type == _MEMSWAP)
3392 thresholds = memcg->memsw_thresholds;
3393 else
3394 BUG();
3395
3396 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3397
3398 /* Check if a threshold crossed before adding a new one */
3399 if (thresholds)
3400 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3401
3402 if (thresholds)
3403 size = thresholds->size + 1;
3404 else
3405 size = 1;
3406
3407 /* Allocate memory for new array of thresholds */
3408 thresholds_new = kmalloc(sizeof(*thresholds_new) +
3409 size * sizeof(struct mem_cgroup_threshold),
3410 GFP_KERNEL);
3411 if (!thresholds_new) {
3412 ret = -ENOMEM;
3413 goto unlock;
3414 }
3415 thresholds_new->size = size;
3416
3417 /* Copy thresholds (if any) to new array */
3418 if (thresholds)
3419 memcpy(thresholds_new->entries, thresholds->entries,
3420 thresholds->size *
3421 sizeof(struct mem_cgroup_threshold));
3422 /* Add new threshold */
3423 thresholds_new->entries[size - 1].eventfd = eventfd;
3424 thresholds_new->entries[size - 1].threshold = threshold;
3425
3426 /* Sort thresholds. Registering of new threshold isn't time-critical */
3427 sort(thresholds_new->entries, size,
3428 sizeof(struct mem_cgroup_threshold),
3429 compare_thresholds, NULL);
3430
3431 /* Find current threshold */
3432 atomic_set(&thresholds_new->current_threshold, -1);
3433 for (i = 0; i < size; i++) {
3434 if (thresholds_new->entries[i].threshold < usage) {
3435 /*
3436 * thresholds_new->current_threshold will not be used
3437 * until rcu_assign_pointer(), so it's safe to increment
3438 * it here.
3439 */
3440 atomic_inc(&thresholds_new->current_threshold);
3441 }
3442 }
3443
3444 if (type == _MEM)
3445 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3446 else
3447 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3448
3449 /* To be sure that nobody uses thresholds before freeing it */
3450 synchronize_rcu();
3451
3452 kfree(thresholds);
3453unlock:
3454 mutex_unlock(&memcg->thresholds_lock);
3455
3456 return ret;
3457}
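
From userspace, a threshold registered through the hook above is armed by pairing an eventfd with memory.usage_in_bytes via cgroup.event_control; the sketch below assumes the cgroup v1 interface and an existing group at the path shown, and checks errors only coarsely.

/* Sketch: arm a memory-usage threshold via eventfd (cgroup v1). */
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        /* the mount point below is an assumption */
        const char *cg = "/sys/fs/cgroup/memory/demo";
        char buf[256];
        int efd = eventfd(0, 0);
        int ufd, cfd;
        uint64_t ticks;

        snprintf(buf, sizeof(buf), "%s/memory.usage_in_bytes", cg);
        ufd = open(buf, O_RDONLY);
        snprintf(buf, sizeof(buf), "%s/cgroup.event_control", cg);
        cfd = open(buf, O_WRONLY);
        if (efd < 0 || ufd < 0 || cfd < 0)
                return 1;

        /* "<event_fd> <fd of memory.usage_in_bytes> <threshold in bytes>" */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 64ULL << 20);
        if (write(cfd, buf, strlen(buf)) < 0)
                return 1;

        if (read(efd, &ticks, sizeof(ticks)) != sizeof(ticks))  /* blocks */
                return 1;
        printf("threshold crossed %llu time(s)\n", (unsigned long long)ticks);
        return 0;
}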
3458
3459static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
3460 struct eventfd_ctx *eventfd)
3461{
3462 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3463 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3464 int type = MEMFILE_TYPE(cft->private);
3465 u64 usage;
3466 int size = 0;
3467 int i, j, ret;
3468
3469 mutex_lock(&memcg->thresholds_lock);
3470 if (type == _MEM)
3471 thresholds = memcg->thresholds;
3472 else if (type == _MEMSWAP)
3473 thresholds = memcg->memsw_thresholds;
3474 else
3475 BUG();
3476
3477 /*
 3478 * Something went wrong if we are trying to unregister a threshold
 3479 * when we don't have any thresholds.
3480 */
3481 BUG_ON(!thresholds);
3482
3483 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3484
3485 /* Check if a threshold crossed before removing */
3486 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3487
 3488 /* Calculate the new number of thresholds */
3489 for (i = 0; i < thresholds->size; i++) {
3490 if (thresholds->entries[i].eventfd != eventfd)
3491 size++;
3492 }
3493
3494 /* Set thresholds array to NULL if we don't have thresholds */
3495 if (!size) {
3496 thresholds_new = NULL;
3497 goto assign;
3498 }
3499
3500 /* Allocate memory for new array of thresholds */
3501 thresholds_new = kmalloc(sizeof(*thresholds_new) +
3502 size * sizeof(struct mem_cgroup_threshold),
3503 GFP_KERNEL);
3504 if (!thresholds_new) {
3505 ret = -ENOMEM;
3506 goto unlock;
3507 }
3508 thresholds_new->size = size;
3509
3510 /* Copy thresholds and find current threshold */
3511 atomic_set(&thresholds_new->current_threshold, -1);
3512 for (i = 0, j = 0; i < thresholds->size; i++) {
3513 if (thresholds->entries[i].eventfd == eventfd)
3514 continue;
3515
3516 thresholds_new->entries[j] = thresholds->entries[i];
3517 if (thresholds_new->entries[j].threshold < usage) {
3518 /*
3519 * thresholds_new->current_threshold will not be used
3520 * until rcu_assign_pointer(), so it's safe to increment
3521 * it here.
3522 */
3523 atomic_inc(&thresholds_new->current_threshold);
3524 }
3525 j++;
3526 }
3527
3528assign:
3529 if (type == _MEM)
3530 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3531 else
3532 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3533
3534 /* To be sure that nobody uses thresholds before freeing it */
3535 synchronize_rcu();
3536
3537 kfree(thresholds);
3538unlock:
3539 mutex_unlock(&memcg->thresholds_lock);
3540
3541 return ret;
3542}
3052 3543
3053static struct cftype mem_cgroup_files[] = { 3544static struct cftype mem_cgroup_files[] = {
3054 { 3545 {
3055 .name = "usage_in_bytes", 3546 .name = "usage_in_bytes",
3056 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3547 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3057 .read_u64 = mem_cgroup_read, 3548 .read_u64 = mem_cgroup_read,
3549 .register_event = mem_cgroup_register_event,
3550 .unregister_event = mem_cgroup_unregister_event,
3058 }, 3551 },
3059 { 3552 {
3060 .name = "max_usage_in_bytes", 3553 .name = "max_usage_in_bytes",
@@ -3098,6 +3591,11 @@ static struct cftype mem_cgroup_files[] = {
3098 .read_u64 = mem_cgroup_swappiness_read, 3591 .read_u64 = mem_cgroup_swappiness_read,
3099 .write_u64 = mem_cgroup_swappiness_write, 3592 .write_u64 = mem_cgroup_swappiness_write,
3100 }, 3593 },
3594 {
3595 .name = "move_charge_at_immigrate",
3596 .read_u64 = mem_cgroup_move_charge_read,
3597 .write_u64 = mem_cgroup_move_charge_write,
3598 },
3101}; 3599};
3102 3600
3103#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3601#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3106,6 +3604,8 @@ static struct cftype memsw_cgroup_files[] = {
3106 .name = "memsw.usage_in_bytes", 3604 .name = "memsw.usage_in_bytes",
3107 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3605 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3108 .read_u64 = mem_cgroup_read, 3606 .read_u64 = mem_cgroup_read,
3607 .register_event = mem_cgroup_register_event,
3608 .unregister_event = mem_cgroup_unregister_event,
3109 }, 3609 },
3110 { 3610 {
3111 .name = "memsw.max_usage_in_bytes", 3611 .name = "memsw.max_usage_in_bytes",
@@ -3180,24 +3680,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3180 kfree(mem->info.nodeinfo[node]); 3680 kfree(mem->info.nodeinfo[node]);
3181} 3681}
3182 3682
3183static int mem_cgroup_size(void)
3184{
3185 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
3186 return sizeof(struct mem_cgroup) + cpustat_size;
3187}
3188
3189static struct mem_cgroup *mem_cgroup_alloc(void) 3683static struct mem_cgroup *mem_cgroup_alloc(void)
3190{ 3684{
3191 struct mem_cgroup *mem; 3685 struct mem_cgroup *mem;
3192 int size = mem_cgroup_size(); 3686 int size = sizeof(struct mem_cgroup);
3193 3687
3688 /* Can be very big if MAX_NUMNODES is very big */
3194 if (size < PAGE_SIZE) 3689 if (size < PAGE_SIZE)
3195 mem = kmalloc(size, GFP_KERNEL); 3690 mem = kmalloc(size, GFP_KERNEL);
3196 else 3691 else
3197 mem = vmalloc(size); 3692 mem = vmalloc(size);
3198 3693
3199 if (mem) 3694 if (!mem)
3200 memset(mem, 0, size); 3695 return NULL;
3696
3697 memset(mem, 0, size);
3698 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3699 if (!mem->stat) {
3700 if (size < PAGE_SIZE)
3701 kfree(mem);
3702 else
3703 vfree(mem);
3704 mem = NULL;
3705 }
3201 return mem; 3706 return mem;
3202} 3707}
3203 3708
@@ -3222,7 +3727,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
3222 for_each_node_state(node, N_POSSIBLE) 3727 for_each_node_state(node, N_POSSIBLE)
3223 free_mem_cgroup_per_zone_info(mem, node); 3728 free_mem_cgroup_per_zone_info(mem, node);
3224 3729
3225 if (mem_cgroup_size() < PAGE_SIZE) 3730 free_percpu(mem->stat);
3731 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3226 kfree(mem); 3732 kfree(mem);
3227 else 3733 else
3228 vfree(mem); 3734 vfree(mem);
@@ -3233,9 +3739,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem)
3233 atomic_inc(&mem->refcnt); 3739 atomic_inc(&mem->refcnt);
3234} 3740}
3235 3741
3236static void mem_cgroup_put(struct mem_cgroup *mem) 3742static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
3237{ 3743{
3238 if (atomic_dec_and_test(&mem->refcnt)) { 3744 if (atomic_sub_and_test(count, &mem->refcnt)) {
3239 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3745 struct mem_cgroup *parent = parent_mem_cgroup(mem);
3240 __mem_cgroup_free(mem); 3746 __mem_cgroup_free(mem);
3241 if (parent) 3747 if (parent)
@@ -3243,6 +3749,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem)
3243 } 3749 }
3244} 3750}
3245 3751
3752static void mem_cgroup_put(struct mem_cgroup *mem)
3753{
3754 __mem_cgroup_put(mem, 1);
3755}
3756
3246/* 3757/*
3247 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 3758 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
3248 */ 3759 */
@@ -3319,7 +3830,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3319 INIT_WORK(&stock->work, drain_local_stock); 3830 INIT_WORK(&stock->work, drain_local_stock);
3320 } 3831 }
3321 hotcpu_notifier(memcg_stock_cpu_callback, 0); 3832 hotcpu_notifier(memcg_stock_cpu_callback, 0);
3322
3323 } else { 3833 } else {
3324 parent = mem_cgroup_from_cont(cont->parent); 3834 parent = mem_cgroup_from_cont(cont->parent);
3325 mem->use_hierarchy = parent->use_hierarchy; 3835 mem->use_hierarchy = parent->use_hierarchy;
@@ -3345,6 +3855,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3345 if (parent) 3855 if (parent)
3346 mem->swappiness = get_swappiness(parent); 3856 mem->swappiness = get_swappiness(parent);
3347 atomic_set(&mem->refcnt, 1); 3857 atomic_set(&mem->refcnt, 1);
3858 mem->move_charge_at_immigrate = 0;
3859 mutex_init(&mem->thresholds_lock);
3348 return &mem->css; 3860 return &mem->css;
3349free_out: 3861free_out:
3350 __mem_cgroup_free(mem); 3862 __mem_cgroup_free(mem);
@@ -3381,17 +3893,445 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
3381 return ret; 3893 return ret;
3382} 3894}
3383 3895
3896#ifdef CONFIG_MMU
3897/* Handlers for move charge at task migration. */
3898#define PRECHARGE_COUNT_AT_ONCE 256
3899static int mem_cgroup_do_precharge(unsigned long count)
3900{
3901 int ret = 0;
3902 int batch_count = PRECHARGE_COUNT_AT_ONCE;
3903 struct mem_cgroup *mem = mc.to;
3904
3905 if (mem_cgroup_is_root(mem)) {
3906 mc.precharge += count;
3907 /* we don't need css_get for root */
3908 return ret;
3909 }
3910 /* try to charge at once */
3911 if (count > 1) {
3912 struct res_counter *dummy;
3913 /*
3914 * "mem" cannot be under rmdir() because we've already checked
3915 * by cgroup_lock_live_cgroup() that it is not removed and we
3916 * are still under the same cgroup_mutex. So we can postpone
3917 * css_get().
3918 */
3919 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
3920 goto one_by_one;
3921 if (do_swap_account && res_counter_charge(&mem->memsw,
3922 PAGE_SIZE * count, &dummy)) {
3923 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
3924 goto one_by_one;
3925 }
3926 mc.precharge += count;
3927 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
3928 WARN_ON_ONCE(count > INT_MAX);
3929 __css_get(&mem->css, (int)count);
3930 return ret;
3931 }
3932one_by_one:
3933 /* fall back to one by one charge */
3934 while (count--) {
3935 if (signal_pending(current)) {
3936 ret = -EINTR;
3937 break;
3938 }
3939 if (!batch_count--) {
3940 batch_count = PRECHARGE_COUNT_AT_ONCE;
3941 cond_resched();
3942 }
3943 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
3944 if (ret || !mem)
3945 /* mem_cgroup_clear_mc() will do uncharge later */
3946 return -ENOMEM;
3947 mc.precharge++;
3948 }
3949 return ret;
3950}
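
The precharge path above tries one bulk reservation and only then falls back to page-by-page charging; a toy userspace model of that strategy follows, where the limit and the reserve() helper are invented stand-ins for the res_counter calls.

/* Sketch: batch reservation with a one-by-one fallback. */
#include <stdio.h>

#define PAGE_SZ 4096UL

static unsigned long limit = 1UL << 20;        /* pretend res_counter limit */
static unsigned long used;

static int reserve(unsigned long bytes)
{
        if (used + bytes > limit)
                return -1;
        used += bytes;
        return 0;
}

static unsigned long precharge(unsigned long count)
{
        unsigned long got = 0;

        if (count > 1 && !reserve(PAGE_SZ * count))
                return count;                   /* charged the whole batch */

        while (count--) {                       /* fall back to one by one */
                if (reserve(PAGE_SZ))
                        break;
                got++;
        }
        return got;
}

int main(void)
{
        printf("precharged %lu pages\n", precharge(300));
        return 0;
}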
3951
3952/**
 3953 * is_target_pte_for_mc - check whether a pte is a valid target for move charge
 3954 * @vma: the vma to which the pte to be checked belongs
 3955 * @addr: the address corresponding to the pte to be checked
 3956 * @ptent: the pte to be checked
 3957 * @target: the pointer where the target page or swap entry will be stored (can be NULL)
3958 *
3959 * Returns
3960 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
3961 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 3962 * move charge. If @target is not NULL, the page is stored in target->page
 3963 * with an extra refcount taken (callers should handle it).
 3964 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 3965 * target for charge migration. If @target is not NULL, the entry is stored
3966 * in target->ent.
3967 *
3968 * Called with pte lock held.
3969 */
3970union mc_target {
3971 struct page *page;
3972 swp_entry_t ent;
3973};
3974
3975enum mc_target_type {
3976 MC_TARGET_NONE, /* not used */
3977 MC_TARGET_PAGE,
3978 MC_TARGET_SWAP,
3979};
3980
3981static int is_target_pte_for_mc(struct vm_area_struct *vma,
3982 unsigned long addr, pte_t ptent, union mc_target *target)
3983{
3984 struct page *page = NULL;
3985 struct page_cgroup *pc;
3986 int ret = 0;
3987 swp_entry_t ent = { .val = 0 };
3988 int usage_count = 0;
3989 bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
3990 &mc.to->move_charge_at_immigrate);
3991
3992 if (!pte_present(ptent)) {
 3993 /* TODO: handle swap of shmem/tmpfs */
3994 if (pte_none(ptent) || pte_file(ptent))
3995 return 0;
3996 else if (is_swap_pte(ptent)) {
3997 ent = pte_to_swp_entry(ptent);
3998 if (!move_anon || non_swap_entry(ent))
3999 return 0;
4000 usage_count = mem_cgroup_count_swap_user(ent, &page);
4001 }
4002 } else {
4003 page = vm_normal_page(vma, addr, ptent);
4004 if (!page || !page_mapped(page))
4005 return 0;
4006 /*
4007 * TODO: We don't move charges of file(including shmem/tmpfs)
4008 * pages for now.
4009 */
4010 if (!move_anon || !PageAnon(page))
4011 return 0;
4012 if (!get_page_unless_zero(page))
4013 return 0;
4014 usage_count = page_mapcount(page);
4015 }
4016 if (usage_count > 1) {
4017 /*
4018 * TODO: We don't move charges of shared(used by multiple
4019 * processes) pages for now.
4020 */
4021 if (page)
4022 put_page(page);
4023 return 0;
4024 }
4025 if (page) {
4026 pc = lookup_page_cgroup(page);
4027 /*
4028 * Do only loose check w/o page_cgroup lock.
4029 * mem_cgroup_move_account() checks the pc is valid or not under
4030 * the lock.
4031 */
4032 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4033 ret = MC_TARGET_PAGE;
4034 if (target)
4035 target->page = page;
4036 }
4037 if (!ret || !target)
4038 put_page(page);
4039 }
 4040 /* fall through */
4041 if (ent.val && do_swap_account && !ret &&
4042 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4043 ret = MC_TARGET_SWAP;
4044 if (target)
4045 target->ent = ent;
4046 }
4047 return ret;
4048}
4049
4050static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4051 unsigned long addr, unsigned long end,
4052 struct mm_walk *walk)
4053{
4054 struct vm_area_struct *vma = walk->private;
4055 pte_t *pte;
4056 spinlock_t *ptl;
4057
4058 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4059 for (; addr != end; pte++, addr += PAGE_SIZE)
4060 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4061 mc.precharge++; /* increment precharge temporarily */
4062 pte_unmap_unlock(pte - 1, ptl);
4063 cond_resched();
4064
4065 return 0;
4066}
4067
4068static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4069{
4070 unsigned long precharge;
4071 struct vm_area_struct *vma;
4072
4073 down_read(&mm->mmap_sem);
4074 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4075 struct mm_walk mem_cgroup_count_precharge_walk = {
4076 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4077 .mm = mm,
4078 .private = vma,
4079 };
4080 if (is_vm_hugetlb_page(vma))
4081 continue;
4082 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4083 if (vma->vm_flags & VM_SHARED)
4084 continue;
4085 walk_page_range(vma->vm_start, vma->vm_end,
4086 &mem_cgroup_count_precharge_walk);
4087 }
4088 up_read(&mm->mmap_sem);
4089
4090 precharge = mc.precharge;
4091 mc.precharge = 0;
4092
4093 return precharge;
4094}
4095
4096static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4097{
4098 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
4099}
4100
4101static void mem_cgroup_clear_mc(void)
4102{
4103 /* we must uncharge all the leftover precharges from mc.to */
4104 if (mc.precharge) {
4105 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4106 mc.precharge = 0;
4107 }
4108 /*
4109 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4110 * we must uncharge here.
4111 */
4112 if (mc.moved_charge) {
4113 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4114 mc.moved_charge = 0;
4115 }
4116 /* we must fixup refcnts and charges */
4117 if (mc.moved_swap) {
4118 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4119 /* uncharge swap account from the old cgroup */
4120 if (!mem_cgroup_is_root(mc.from))
4121 res_counter_uncharge(&mc.from->memsw,
4122 PAGE_SIZE * mc.moved_swap);
4123 __mem_cgroup_put(mc.from, mc.moved_swap);
4124
4125 if (!mem_cgroup_is_root(mc.to)) {
4126 /*
4127 * we charged both to->res and to->memsw, so we should
4128 * uncharge to->res.
4129 */
4130 res_counter_uncharge(&mc.to->res,
4131 PAGE_SIZE * mc.moved_swap);
4132 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4133 __css_put(&mc.to->css, mc.moved_swap);
4134 }
4135 /* we've already done mem_cgroup_get(mc.to) */
4136
4137 mc.moved_swap = 0;
4138 }
4139 mc.from = NULL;
4140 mc.to = NULL;
4141 mc.moving_task = NULL;
4142 wake_up_all(&mc.waitq);
4143}
4144
4145static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4146 struct cgroup *cgroup,
4147 struct task_struct *p,
4148 bool threadgroup)
4149{
4150 int ret = 0;
4151 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4152
4153 if (mem->move_charge_at_immigrate) {
4154 struct mm_struct *mm;
4155 struct mem_cgroup *from = mem_cgroup_from_task(p);
4156
4157 VM_BUG_ON(from == mem);
4158
4159 mm = get_task_mm(p);
4160 if (!mm)
4161 return 0;
 4162 /* We move charges only when we move an owner of the mm */
4163 if (mm->owner == p) {
4164 VM_BUG_ON(mc.from);
4165 VM_BUG_ON(mc.to);
4166 VM_BUG_ON(mc.precharge);
4167 VM_BUG_ON(mc.moved_charge);
4168 VM_BUG_ON(mc.moved_swap);
4169 VM_BUG_ON(mc.moving_task);
4170 mc.from = from;
4171 mc.to = mem;
4172 mc.precharge = 0;
4173 mc.moved_charge = 0;
4174 mc.moved_swap = 0;
4175 mc.moving_task = current;
4176
4177 ret = mem_cgroup_precharge_mc(mm);
4178 if (ret)
4179 mem_cgroup_clear_mc();
4180 }
4181 mmput(mm);
4182 }
4183 return ret;
4184}
4185
4186static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4187 struct cgroup *cgroup,
4188 struct task_struct *p,
4189 bool threadgroup)
4190{
4191 mem_cgroup_clear_mc();
4192}
4193
4194static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4195 unsigned long addr, unsigned long end,
4196 struct mm_walk *walk)
4197{
4198 int ret = 0;
4199 struct vm_area_struct *vma = walk->private;
4200 pte_t *pte;
4201 spinlock_t *ptl;
4202
4203retry:
4204 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4205 for (; addr != end; addr += PAGE_SIZE) {
4206 pte_t ptent = *(pte++);
4207 union mc_target target;
4208 int type;
4209 struct page *page;
4210 struct page_cgroup *pc;
4211 swp_entry_t ent;
4212
4213 if (!mc.precharge)
4214 break;
4215
4216 type = is_target_pte_for_mc(vma, addr, ptent, &target);
4217 switch (type) {
4218 case MC_TARGET_PAGE:
4219 page = target.page;
4220 if (isolate_lru_page(page))
4221 goto put;
4222 pc = lookup_page_cgroup(page);
4223 if (!mem_cgroup_move_account(pc,
4224 mc.from, mc.to, false)) {
4225 mc.precharge--;
4226 /* we uncharge from mc.from later. */
4227 mc.moved_charge++;
4228 }
4229 putback_lru_page(page);
4230put: /* is_target_pte_for_mc() gets the page */
4231 put_page(page);
4232 break;
4233 case MC_TARGET_SWAP:
4234 ent = target.ent;
4235 if (!mem_cgroup_move_swap_account(ent,
4236 mc.from, mc.to, false)) {
4237 mc.precharge--;
4238 /* we fixup refcnts and charges later. */
4239 mc.moved_swap++;
4240 }
4241 break;
4242 default:
4243 break;
4244 }
4245 }
4246 pte_unmap_unlock(pte - 1, ptl);
4247 cond_resched();
4248
4249 if (addr != end) {
4250 /*
4251 * We have consumed all precharges we got in can_attach().
 4252 * We try to charge one by one, but don't do any additional
 4253 * charges to mc.to if we have failed to charge once in the attach()
4254 * phase.
4255 */
4256 ret = mem_cgroup_do_precharge(1);
4257 if (!ret)
4258 goto retry;
4259 }
4260
4261 return ret;
4262}
4263
4264static void mem_cgroup_move_charge(struct mm_struct *mm)
4265{
4266 struct vm_area_struct *vma;
4267
4268 lru_add_drain_all();
4269 down_read(&mm->mmap_sem);
4270 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4271 int ret;
4272 struct mm_walk mem_cgroup_move_charge_walk = {
4273 .pmd_entry = mem_cgroup_move_charge_pte_range,
4274 .mm = mm,
4275 .private = vma,
4276 };
4277 if (is_vm_hugetlb_page(vma))
4278 continue;
4279 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4280 if (vma->vm_flags & VM_SHARED)
4281 continue;
4282 ret = walk_page_range(vma->vm_start, vma->vm_end,
4283 &mem_cgroup_move_charge_walk);
4284 if (ret)
4285 /*
 4286 * This means we have consumed all precharges and failed to
 4287 * do an additional charge. Just abandon here.
4288 */
4289 break;
4290 }
4291 up_read(&mm->mmap_sem);
4292}
4293
3384static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4294static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3385 struct cgroup *cont, 4295 struct cgroup *cont,
3386 struct cgroup *old_cont, 4296 struct cgroup *old_cont,
3387 struct task_struct *p, 4297 struct task_struct *p,
3388 bool threadgroup) 4298 bool threadgroup)
3389{ 4299{
3390 /* 4300 struct mm_struct *mm;
3391 * FIXME: It's better to move charges of this process from old 4301
3392 * memcg to new memcg. But it's just on TODO-List now. 4302 if (!mc.to)
3393 */ 4303 /* no need to move charge */
4304 return;
4305
4306 mm = get_task_mm(p);
4307 if (mm) {
4308 mem_cgroup_move_charge(mm);
4309 mmput(mm);
4310 }
4311 mem_cgroup_clear_mc();
3394} 4312}
4313#else /* !CONFIG_MMU */
4314static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4315 struct cgroup *cgroup,
4316 struct task_struct *p,
4317 bool threadgroup)
4318{
4319 return 0;
4320}
4321static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4322 struct cgroup *cgroup,
4323 struct task_struct *p,
4324 bool threadgroup)
4325{
4326}
4327static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4328 struct cgroup *cont,
4329 struct cgroup *old_cont,
4330 struct task_struct *p,
4331 bool threadgroup)
4332{
4333}
4334#endif
3395 4335
3396struct cgroup_subsys mem_cgroup_subsys = { 4336struct cgroup_subsys mem_cgroup_subsys = {
3397 .name = "memory", 4337 .name = "memory",
@@ -3400,6 +4340,8 @@ struct cgroup_subsys mem_cgroup_subsys = {
3400 .pre_destroy = mem_cgroup_pre_destroy, 4340 .pre_destroy = mem_cgroup_pre_destroy,
3401 .destroy = mem_cgroup_destroy, 4341 .destroy = mem_cgroup_destroy,
3402 .populate = mem_cgroup_populate, 4342 .populate = mem_cgroup_populate,
4343 .can_attach = mem_cgroup_can_attach,
4344 .cancel_attach = mem_cgroup_cancel_attach,
3403 .attach = mem_cgroup_move_task, 4345 .attach = mem_cgroup_move_task,
3404 .early_init = 0, 4346 .early_init = 0,
3405 .use_id = 1, 4347 .use_id = 1,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 17299fd4577c..d1f335162976 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -383,9 +383,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
383 if (av == NULL) /* Not actually mapped anymore */ 383 if (av == NULL) /* Not actually mapped anymore */
384 goto out; 384 goto out;
385 for_each_process (tsk) { 385 for_each_process (tsk) {
386 struct anon_vma_chain *vmac;
387
386 if (!task_early_kill(tsk)) 388 if (!task_early_kill(tsk))
387 continue; 389 continue;
388 list_for_each_entry (vma, &av->head, anon_vma_node) { 390 list_for_each_entry(vmac, &av->head, same_anon_vma) {
391 vma = vmac->vma;
389 if (!page_mapped_in_vma(page, vma)) 392 if (!page_mapped_in_vma(page, vma))
390 continue; 393 continue;
391 if (vma->vm_mm == tsk->mm) 394 if (vma->vm_mm == tsk->mm)
diff --git a/mm/memory.c b/mm/memory.c
index 72fb5f39bccc..bc9ba5a1f5b9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -121,6 +121,78 @@ static int __init init_zero_pfn(void)
121} 121}
122core_initcall(init_zero_pfn); 122core_initcall(init_zero_pfn);
123 123
124
125#if defined(SPLIT_RSS_COUNTING)
126
127void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
128{
129 int i;
130
131 for (i = 0; i < NR_MM_COUNTERS; i++) {
132 if (task->rss_stat.count[i]) {
133 BUG_ON(!mm);
134 add_mm_counter(mm, i, task->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0;
136 }
137 }
138 task->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153/* sync counter once per 64 page faults */
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167 /*
168 * Don't use task->mm here...for avoiding to use task_get_mm()..
169 * The caller must guarantee task->mm is not invalid.
170 */
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172 /*
173 * counter is updated in asynchronous manner and may go to minus.
174 * But it's never be expected number for users.
175 */
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184}
185#else
186
187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
189
190static void check_sync_rss_stat(struct task_struct *task)
191{
192}
193
194#endif
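
A rough userspace model of the SPLIT_RSS_COUNTING scheme introduced above: each thread accumulates deltas in a private counter and folds them into the shared one only after a number of events, mirroring TASK_RSS_EVENTS_THRESH; the types and thread setup are illustrative, not the kernel's.

/* Sketch: per-thread cached counters, synced every 64 events. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define RSS_EVENTS_THRESH 64

static atomic_long shared_rss;                  /* stands in for mm->rss_stat */
static __thread long local_rss;                 /* stands in for task->rss_stat */
static __thread int local_events;

static void account_page(long delta)
{
        local_rss += delta;                     /* cheap, no atomics */
        if (++local_events > RSS_EVENTS_THRESH) {
                atomic_fetch_add(&shared_rss, local_rss);
                local_rss = 0;
                local_events = 0;
        }
}

static void *worker(void *arg)
{
        (void)arg;
        for (int i = 0; i < 1000; i++)
                account_page(+1);
        atomic_fetch_add(&shared_rss, local_rss);       /* final sync */
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        printf("shared rss = %ld\n", atomic_load(&shared_rss));
        return 0;
}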
195
124/* 196/*
125 * If a p?d_bad entry is found while walking page tables, report 197 * If a p?d_bad entry is found while walking page tables, report
126 * the error, before resetting entry to p?d_none. Usually (but 198 * the error, before resetting entry to p?d_none. Usually (but
@@ -300,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
300 * Hide vma from rmap and truncate_pagecache before freeing 372 * Hide vma from rmap and truncate_pagecache before freeing
301 * pgtables 373 * pgtables
302 */ 374 */
303 anon_vma_unlink(vma); 375 unlink_anon_vmas(vma);
304 unlink_file_vma(vma); 376 unlink_file_vma(vma);
305 377
306 if (is_vm_hugetlb_page(vma)) { 378 if (is_vm_hugetlb_page(vma)) {
@@ -314,7 +386,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
314 && !is_vm_hugetlb_page(next)) { 386 && !is_vm_hugetlb_page(next)) {
315 vma = next; 387 vma = next;
316 next = vma->vm_next; 388 next = vma->vm_next;
317 anon_vma_unlink(vma); 389 unlink_anon_vmas(vma);
318 unlink_file_vma(vma); 390 unlink_file_vma(vma);
319 } 391 }
320 free_pgd_range(tlb, addr, vma->vm_end, 392 free_pgd_range(tlb, addr, vma->vm_end,
@@ -376,12 +448,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
376 return 0; 448 return 0;
377} 449}
378 450
379static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) 451static inline void init_rss_vec(int *rss)
380{ 452{
381 if (file_rss) 453 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
382 add_mm_counter(mm, file_rss, file_rss); 454}
383 if (anon_rss) 455
384 add_mm_counter(mm, anon_rss, anon_rss); 456static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
457{
458 int i;
459
460 if (current->mm == mm)
461 sync_mm_rss(current, mm);
462 for (i = 0; i < NR_MM_COUNTERS; i++)
463 if (rss[i])
464 add_mm_counter(mm, i, rss[i]);
385} 465}
386 466
387/* 467/*
@@ -430,12 +510,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
430 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 510 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
431 current->comm, 511 current->comm,
432 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 512 (long long)pte_val(pte), (long long)pmd_val(*pmd));
433 if (page) { 513 if (page)
434 printk(KERN_ALERT 514 dump_page(page);
435 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
436 page, (void *)page->flags, page_count(page),
437 page_mapcount(page), page->mapping, page->index);
438 }
439 printk(KERN_ALERT 515 printk(KERN_ALERT
440 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 516 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
441 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 517 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -597,7 +673,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
597 &src_mm->mmlist); 673 &src_mm->mmlist);
598 spin_unlock(&mmlist_lock); 674 spin_unlock(&mmlist_lock);
599 } 675 }
600 if (is_write_migration_entry(entry) && 676 if (likely(!non_swap_entry(entry)))
677 rss[MM_SWAPENTS]++;
678 else if (is_write_migration_entry(entry) &&
601 is_cow_mapping(vm_flags)) { 679 is_cow_mapping(vm_flags)) {
602 /* 680 /*
603 * COW mappings require pages in both parent 681 * COW mappings require pages in both parent
@@ -632,7 +710,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
632 if (page) { 710 if (page) {
633 get_page(page); 711 get_page(page);
634 page_dup_rmap(page); 712 page_dup_rmap(page);
635 rss[PageAnon(page)]++; 713 if (PageAnon(page))
714 rss[MM_ANONPAGES]++;
715 else
716 rss[MM_FILEPAGES]++;
636 } 717 }
637 718
638out_set_pte: 719out_set_pte:
@@ -648,11 +729,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
648 pte_t *src_pte, *dst_pte; 729 pte_t *src_pte, *dst_pte;
649 spinlock_t *src_ptl, *dst_ptl; 730 spinlock_t *src_ptl, *dst_ptl;
650 int progress = 0; 731 int progress = 0;
651 int rss[2]; 732 int rss[NR_MM_COUNTERS];
652 swp_entry_t entry = (swp_entry_t){0}; 733 swp_entry_t entry = (swp_entry_t){0};
653 734
654again: 735again:
655 rss[1] = rss[0] = 0; 736 init_rss_vec(rss);
737
656 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 738 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
657 if (!dst_pte) 739 if (!dst_pte)
658 return -ENOMEM; 740 return -ENOMEM;
@@ -688,7 +770,7 @@ again:
688 arch_leave_lazy_mmu_mode(); 770 arch_leave_lazy_mmu_mode();
689 spin_unlock(src_ptl); 771 spin_unlock(src_ptl);
690 pte_unmap_nested(orig_src_pte); 772 pte_unmap_nested(orig_src_pte);
691 add_mm_rss(dst_mm, rss[0], rss[1]); 773 add_mm_rss_vec(dst_mm, rss);
692 pte_unmap_unlock(orig_dst_pte, dst_ptl); 774 pte_unmap_unlock(orig_dst_pte, dst_ptl);
693 cond_resched(); 775 cond_resched();
694 776
@@ -816,8 +898,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
816 struct mm_struct *mm = tlb->mm; 898 struct mm_struct *mm = tlb->mm;
817 pte_t *pte; 899 pte_t *pte;
818 spinlock_t *ptl; 900 spinlock_t *ptl;
819 int file_rss = 0; 901 int rss[NR_MM_COUNTERS];
820 int anon_rss = 0; 902
903 init_rss_vec(rss);
821 904
822 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 905 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
823 arch_enter_lazy_mmu_mode(); 906 arch_enter_lazy_mmu_mode();
@@ -863,14 +946,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
863 set_pte_at(mm, addr, pte, 946 set_pte_at(mm, addr, pte,
864 pgoff_to_pte(page->index)); 947 pgoff_to_pte(page->index));
865 if (PageAnon(page)) 948 if (PageAnon(page))
866 anon_rss--; 949 rss[MM_ANONPAGES]--;
867 else { 950 else {
868 if (pte_dirty(ptent)) 951 if (pte_dirty(ptent))
869 set_page_dirty(page); 952 set_page_dirty(page);
870 if (pte_young(ptent) && 953 if (pte_young(ptent) &&
871 likely(!VM_SequentialReadHint(vma))) 954 likely(!VM_SequentialReadHint(vma)))
872 mark_page_accessed(page); 955 mark_page_accessed(page);
873 file_rss--; 956 rss[MM_FILEPAGES]--;
874 } 957 }
875 page_remove_rmap(page); 958 page_remove_rmap(page);
876 if (unlikely(page_mapcount(page) < 0)) 959 if (unlikely(page_mapcount(page) < 0))
@@ -887,13 +970,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
887 if (pte_file(ptent)) { 970 if (pte_file(ptent)) {
888 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 971 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
889 print_bad_pte(vma, addr, ptent, NULL); 972 print_bad_pte(vma, addr, ptent, NULL);
890 } else if 973 } else {
891 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) 974 swp_entry_t entry = pte_to_swp_entry(ptent);
892 print_bad_pte(vma, addr, ptent, NULL); 975
976 if (!non_swap_entry(entry))
977 rss[MM_SWAPENTS]--;
978 if (unlikely(!free_swap_and_cache(entry)))
979 print_bad_pte(vma, addr, ptent, NULL);
980 }
893 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 981 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
894 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 982 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
895 983
896 add_mm_rss(mm, file_rss, anon_rss); 984 add_mm_rss_vec(mm, rss);
897 arch_leave_lazy_mmu_mode(); 985 arch_leave_lazy_mmu_mode();
898 pte_unmap_unlock(pte - 1, ptl); 986 pte_unmap_unlock(pte - 1, ptl);
899 987
@@ -1527,7 +1615,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1527 1615
1528 /* Ok, finally just insert the thing.. */ 1616 /* Ok, finally just insert the thing.. */
1529 get_page(page); 1617 get_page(page);
1530 inc_mm_counter(mm, file_rss); 1618 inc_mm_counter_fast(mm, MM_FILEPAGES);
1531 page_add_file_rmap(page); 1619 page_add_file_rmap(page);
1532 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1620 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1533 1621
@@ -2044,6 +2132,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2044 page_cache_release(old_page); 2132 page_cache_release(old_page);
2045 } 2133 }
2046 reuse = reuse_swap_page(old_page); 2134 reuse = reuse_swap_page(old_page);
2135 if (reuse)
2136 /*
2137 * The page is all ours. Move it to our anon_vma so
2138 * the rmap code will not search our parent or siblings.
2139 * Protected against the rmap code by the page lock.
2140 */
2141 page_move_anon_rmap(old_page, vma, address);
2047 unlock_page(old_page); 2142 unlock_page(old_page);
2048 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2143 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2049 (VM_WRITE|VM_SHARED))) { 2144 (VM_WRITE|VM_SHARED))) {
@@ -2163,11 +2258,11 @@ gotten:
2163 if (likely(pte_same(*page_table, orig_pte))) { 2258 if (likely(pte_same(*page_table, orig_pte))) {
2164 if (old_page) { 2259 if (old_page) {
2165 if (!PageAnon(old_page)) { 2260 if (!PageAnon(old_page)) {
2166 dec_mm_counter(mm, file_rss); 2261 dec_mm_counter_fast(mm, MM_FILEPAGES);
2167 inc_mm_counter(mm, anon_rss); 2262 inc_mm_counter_fast(mm, MM_ANONPAGES);
2168 } 2263 }
2169 } else 2264 } else
2170 inc_mm_counter(mm, anon_rss); 2265 inc_mm_counter_fast(mm, MM_ANONPAGES);
2171 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2266 flush_cache_page(vma, address, pte_pfn(orig_pte));
2172 entry = mk_pte(new_page, vma->vm_page_prot); 2267 entry = mk_pte(new_page, vma->vm_page_prot);
2173 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2268 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2604,7 +2699,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2604 * discarded at swap_free(). 2699 * discarded at swap_free().
2605 */ 2700 */
2606 2701
2607 inc_mm_counter(mm, anon_rss); 2702 inc_mm_counter_fast(mm, MM_ANONPAGES);
2703 dec_mm_counter_fast(mm, MM_SWAPENTS);
2608 pte = mk_pte(page, vma->vm_page_prot); 2704 pte = mk_pte(page, vma->vm_page_prot);
2609 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2705 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2610 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2706 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2688,7 +2784,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2688 if (!pte_none(*page_table)) 2784 if (!pte_none(*page_table))
2689 goto release; 2785 goto release;
2690 2786
2691 inc_mm_counter(mm, anon_rss); 2787 inc_mm_counter_fast(mm, MM_ANONPAGES);
2692 page_add_new_anon_rmap(page, vma, address); 2788 page_add_new_anon_rmap(page, vma, address);
2693setpte: 2789setpte:
2694 set_pte_at(mm, address, page_table, entry); 2790 set_pte_at(mm, address, page_table, entry);
@@ -2842,10 +2938,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2842 if (flags & FAULT_FLAG_WRITE) 2938 if (flags & FAULT_FLAG_WRITE)
2843 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2939 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2844 if (anon) { 2940 if (anon) {
2845 inc_mm_counter(mm, anon_rss); 2941 inc_mm_counter_fast(mm, MM_ANONPAGES);
2846 page_add_new_anon_rmap(page, vma, address); 2942 page_add_new_anon_rmap(page, vma, address);
2847 } else { 2943 } else {
2848 inc_mm_counter(mm, file_rss); 2944 inc_mm_counter_fast(mm, MM_FILEPAGES);
2849 page_add_file_rmap(page); 2945 page_add_file_rmap(page);
2850 if (flags & FAULT_FLAG_WRITE) { 2946 if (flags & FAULT_FLAG_WRITE) {
2851 dirty_page = page; 2947 dirty_page = page;
@@ -3023,6 +3119,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3023 3119
3024 count_vm_event(PGFAULT); 3120 count_vm_event(PGFAULT);
3025 3121
3122 /* do counter updates before entering really critical section. */
3123 check_sync_rss_stat(current);
3124
3026 if (unlikely(is_vm_hugetlb_page(vma))) 3125 if (unlikely(is_vm_hugetlb_page(vma)))
3027 return hugetlb_fault(mm, vma, address, flags); 3126 return hugetlb_fault(mm, vma, address, flags);
3028 3127
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 030ce8a5bb0e..be211a582930 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -28,6 +28,7 @@
28#include <linux/pfn.h> 28#include <linux/pfn.h>
29#include <linux/suspend.h> 29#include <linux/suspend.h>
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h>
31 32
32#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
33 34
@@ -523,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size)
523 BUG_ON(ret); 524 BUG_ON(ret);
524 } 525 }
525 526
527 /* create new memmap entry */
528 firmware_map_add_hotplug(start, start + size, "System RAM");
529
526 goto out; 530 goto out;
527 531
528error: 532error:
@@ -684,9 +688,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
684 if (page_count(page)) 688 if (page_count(page))
685 not_managed++; 689 not_managed++;
686#ifdef CONFIG_DEBUG_VM 690#ifdef CONFIG_DEBUG_VM
687 printk(KERN_INFO "removing from LRU failed" 691 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
688 " %lx/%d/%lx\n", 692 pfn);
689 pfn, page_count(page), page->flags); 693 dump_page(page);
690#endif 694#endif
691 } 695 }
692 } 696 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 290fb5bf0440..8034abd3a135 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -563,24 +563,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
563} 563}
564 564
565/* Step 2: apply policy to a range and do splits. */ 565/* Step 2: apply policy to a range and do splits. */
566static int mbind_range(struct vm_area_struct *vma, unsigned long start, 566static int mbind_range(struct mm_struct *mm, unsigned long start,
567 unsigned long end, struct mempolicy *new) 567 unsigned long end, struct mempolicy *new_pol)
568{ 568{
569 struct vm_area_struct *next; 569 struct vm_area_struct *next;
570 int err; 570 struct vm_area_struct *prev;
571 struct vm_area_struct *vma;
572 int err = 0;
573 pgoff_t pgoff;
574 unsigned long vmstart;
575 unsigned long vmend;
571 576
572 err = 0; 577 vma = find_vma_prev(mm, start, &prev);
573 for (; vma && vma->vm_start < end; vma = next) { 578 if (!vma || vma->vm_start > start)
579 return -EFAULT;
580
581 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
574 next = vma->vm_next; 582 next = vma->vm_next;
575 if (vma->vm_start < start) 583 vmstart = max(start, vma->vm_start);
576 err = split_vma(vma->vm_mm, vma, start, 1); 584 vmend = min(end, vma->vm_end);
577 if (!err && vma->vm_end > end) 585
578 err = split_vma(vma->vm_mm, vma, end, 0); 586 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
579 if (!err) 587 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
580 err = policy_vma(vma, new); 588 vma->anon_vma, vma->vm_file, pgoff, new_pol);
589 if (prev) {
590 vma = prev;
591 next = vma->vm_next;
592 continue;
593 }
594 if (vma->vm_start != vmstart) {
595 err = split_vma(vma->vm_mm, vma, vmstart, 1);
596 if (err)
597 goto out;
598 }
599 if (vma->vm_end != vmend) {
600 err = split_vma(vma->vm_mm, vma, vmend, 0);
601 if (err)
602 goto out;
603 }
604 err = policy_vma(vma, new_pol);
581 if (err) 605 if (err)
582 break; 606 goto out;
583 } 607 }
608
609 out:
584 return err; 610 return err;
585} 611}
586 612
@@ -780,9 +806,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
780 806
781 err = 0; 807 err = 0;
782 if (nmask) { 808 if (nmask) {
783 task_lock(current); 809 if (mpol_store_user_nodemask(pol)) {
784 get_policy_nodemask(pol, nmask); 810 *nmask = pol->w.user_nodemask;
785 task_unlock(current); 811 } else {
812 task_lock(current);
813 get_policy_nodemask(pol, nmask);
814 task_unlock(current);
815 }
786 } 816 }
787 817
788 out: 818 out:
@@ -862,36 +892,36 @@ int do_migrate_pages(struct mm_struct *mm,
862 if (err) 892 if (err)
863 goto out; 893 goto out;
864 894
865/* 895 /*
866 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 896 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
867 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 897 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
868 * bit in 'tmp', and return that <source, dest> pair for migration. 898 * bit in 'tmp', and return that <source, dest> pair for migration.
869 * The pair of nodemasks 'to' and 'from' define the map. 899 * The pair of nodemasks 'to' and 'from' define the map.
870 * 900 *
871 * If no pair of bits is found that way, fallback to picking some 901 * If no pair of bits is found that way, fallback to picking some
872 * pair of 'source' and 'dest' bits that are not the same. If the 902 * pair of 'source' and 'dest' bits that are not the same. If the
873 * 'source' and 'dest' bits are the same, this represents a node 903 * 'source' and 'dest' bits are the same, this represents a node
874 * that will be migrating to itself, so no pages need move. 904 * that will be migrating to itself, so no pages need move.
875 * 905 *
876 * If no bits are left in 'tmp', or if all remaining bits left 906 * If no bits are left in 'tmp', or if all remaining bits left
877 * in 'tmp' correspond to the same bit in 'to', return false 907 * in 'tmp' correspond to the same bit in 'to', return false
878 * (nothing left to migrate). 908 * (nothing left to migrate).
879 * 909 *
880 * This lets us pick a pair of nodes to migrate between, such that 910 * This lets us pick a pair of nodes to migrate between, such that
881 * if possible the dest node is not already occupied by some other 911 * if possible the dest node is not already occupied by some other
882 * source node, minimizing the risk of overloading the memory on a 912 * source node, minimizing the risk of overloading the memory on a
883 * node that would happen if we migrated incoming memory to a node 913 * node that would happen if we migrated incoming memory to a node
884 * before migrating outgoing memory sourced from that same node. 914 * before migrating outgoing memory sourced from that same node.
885 * 915 *
886 * A single scan of tmp is sufficient. As we go, we remember the 916 * A single scan of tmp is sufficient. As we go, we remember the
887 * most recent <s, d> pair that moved (s != d). If we find a pair 917 * most recent <s, d> pair that moved (s != d). If we find a pair
888 * that not only moved, but what's better, moved to an empty slot 918 * that not only moved, but what's better, moved to an empty slot
889 * (d is not set in tmp), then we break out then, with that pair. 919 * (d is not set in tmp), then we break out then, with that pair.
890 * Otherwise when we finish scanning from_tmp, we at least have the 920 * Otherwise when we finish scanning from_tmp, we at least have the
891 * most recent <s, d> pair that moved. If we get all the way through 921 * most recent <s, d> pair that moved. If we get all the way through
892 * the scan of tmp without finding any node that moved, much less 922 * the scan of tmp without finding any node that moved, much less
893 * moved to an empty node, then there is nothing left worth migrating. 923 * moved to an empty node, then there is nothing left worth migrating.
894 */ 924 */
895 925
896 tmp = *from_nodes; 926 tmp = *from_nodes;
897 while (!nodes_empty(tmp)) { 927 while (!nodes_empty(tmp)) {
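The re-indented comment block describes how do_migrate_pages() repeatedly picks one <source, dest> pair, preferring a destination node that is not itself still waiting to be drained. A self-contained sketch of that selection over plain bitmasks follows; remap_node(), NNODES and the example masks are invented for illustration and stand in for the kernel's nodemask/node_remap machinery:

#include <stdio.h>

#define NNODES 8

/* Map the k-th set bit of 'from' to the k-th set bit of 'to'. */
static int remap_node(int s, unsigned long from, unsigned long to)
{
        int k = 0, d;

        for (d = 0; d < s; d++)
                if (from & (1UL << d))
                        k++;                    /* rank of s within 'from' */
        for (d = 0; d < NNODES; d++)
                if ((to & (1UL << d)) && k-- == 0)
                        return d;
        return s;                               /* no counterpart: stay put */
}

int main(void)
{
        unsigned long from = 0x0b, to = 0x34;   /* nodes {0,1,3} -> {2,4,5} */
        unsigned long tmp = from;
        int s, best_s = -1, best_d = -1;

        for (s = 0; s < NNODES; s++) {
                int d;

                if (!(tmp & (1UL << s)))
                        continue;
                d = remap_node(s, from, to);
                if (d == s)
                        continue;               /* would migrate to itself */
                best_s = s;
                best_d = d;                     /* remember the latest mover */
                if (!(tmp & (1UL << d)))
                        break;                  /* dest is not a pending source: best */
        }
        if (best_s >= 0)
                printf("migrate node %d -> node %d first\n", best_s, best_d);
        return 0;
}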
@@ -1047,7 +1077,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1047 if (!IS_ERR(vma)) { 1077 if (!IS_ERR(vma)) {
1048 int nr_failed = 0; 1078 int nr_failed = 0;
1049 1079
1050 err = mbind_range(vma, start, end, new); 1080 err = mbind_range(mm, start, end, new);
1051 1081
1052 if (!list_empty(&pagelist)) 1082 if (!list_empty(&pagelist))
1053 nr_failed = migrate_pages(&pagelist, new_vma_page, 1083 nr_failed = migrate_pages(&pagelist, new_vma_page,
@@ -1730,10 +1760,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
1730 1760
1731 if (!new) 1761 if (!new)
1732 return ERR_PTR(-ENOMEM); 1762 return ERR_PTR(-ENOMEM);
1763 rcu_read_lock();
1733 if (current_cpuset_is_being_rebound()) { 1764 if (current_cpuset_is_being_rebound()) {
1734 nodemask_t mems = cpuset_mems_allowed(current); 1765 nodemask_t mems = cpuset_mems_allowed(current);
1735 mpol_rebind_policy(old, &mems); 1766 mpol_rebind_policy(old, &mems);
1736 } 1767 }
1768 rcu_read_unlock();
1737 *new = *old; 1769 *new = *old;
1738 atomic_set(&new->refcnt, 1); 1770 atomic_set(&new->refcnt, 1);
1739 return new; 1771 return new;
@@ -2167,8 +2199,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2167 char *rest = nodelist; 2199 char *rest = nodelist;
2168 while (isdigit(*rest)) 2200 while (isdigit(*rest))
2169 rest++; 2201 rest++;
2170 if (!*rest) 2202 if (*rest)
2171 err = 0; 2203 goto out;
2172 } 2204 }
2173 break; 2205 break;
2174 case MPOL_INTERLEAVE: 2206 case MPOL_INTERLEAVE:
@@ -2177,7 +2209,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2177 */ 2209 */
2178 if (!nodelist) 2210 if (!nodelist)
2179 nodes = node_states[N_HIGH_MEMORY]; 2211 nodes = node_states[N_HIGH_MEMORY];
2180 err = 0;
2181 break; 2212 break;
2182 case MPOL_LOCAL: 2213 case MPOL_LOCAL:
2183 /* 2214 /*
@@ -2187,11 +2218,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2187 goto out; 2218 goto out;
2188 mode = MPOL_PREFERRED; 2219 mode = MPOL_PREFERRED;
2189 break; 2220 break;
2190 2221 case MPOL_DEFAULT:
2191 /* 2222 /*
2192 * case MPOL_BIND: mpol_new() enforces non-empty nodemask. 2223 * Insist on an empty nodelist
2193 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. 2224 */
2194 */ 2225 if (!nodelist)
2226 err = 0;
2227 goto out;
2228 case MPOL_BIND:
2229 /*
2230 * Insist on a nodelist
2231 */
2232 if (!nodelist)
2233 goto out;
2195 } 2234 }
2196 2235
2197 mode_flags = 0; 2236 mode_flags = 0;
@@ -2205,13 +2244,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2205 else if (!strcmp(flags, "relative")) 2244 else if (!strcmp(flags, "relative"))
2206 mode_flags |= MPOL_F_RELATIVE_NODES; 2245 mode_flags |= MPOL_F_RELATIVE_NODES;
2207 else 2246 else
2208 err = 1; 2247 goto out;
2209 } 2248 }
2210 2249
2211 new = mpol_new(mode, mode_flags, &nodes); 2250 new = mpol_new(mode, mode_flags, &nodes);
2212 if (IS_ERR(new)) 2251 if (IS_ERR(new))
2213 err = 1; 2252 goto out;
2214 else { 2253
2254 {
2215 int ret; 2255 int ret;
2216 NODEMASK_SCRATCH(scratch); 2256 NODEMASK_SCRATCH(scratch);
2217 if (scratch) { 2257 if (scratch) {
@@ -2222,13 +2262,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2222 ret = -ENOMEM; 2262 ret = -ENOMEM;
2223 NODEMASK_SCRATCH_FREE(scratch); 2263 NODEMASK_SCRATCH_FREE(scratch);
2224 if (ret) { 2264 if (ret) {
2225 err = 1;
2226 mpol_put(new); 2265 mpol_put(new);
2227 } else if (no_context) { 2266 goto out;
2228 /* save for contextualization */
2229 new->w.user_nodemask = nodes;
2230 } 2267 }
2231 } 2268 }
2269 err = 0;
2270 if (no_context) {
2271 /* save for contextualization */
2272 new->w.user_nodemask = nodes;
2273 }
2232 2274
2233out: 2275out:
2234 /* Restore string for error message */ 2276 /* Restore string for error message */
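Taken together, the mpol_parse_str() hunks converge on one convention: err starts out as failure, every invalid combination jumps to out, and success is recorded only once the whole string has been validated. The same shape in a small userspace sketch; parse_policy() and its two accepted modes are hypothetical, not the kernel parser:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_policy(char *str, int *mode, int *node)
{
        int err = 1;                            /* assume failure */
        char *nodelist = strchr(str, ':');

        if (nodelist)
                *nodelist++ = '\0';

        if (!strcmp(str, "default")) {
                if (nodelist)                   /* insist on an empty nodelist */
                        goto out;
                *mode = 0;
        } else if (!strcmp(str, "prefer")) {
                if (!nodelist)                  /* insist on a nodelist */
                        goto out;
                *mode = 1;
                *node = atoi(nodelist);
        } else {
                goto out;                       /* unknown mode */
        }
        err = 0;                                /* every check passed */
out:
        return err;
}

int main(void)
{
        char buf[] = "prefer:3";
        int mode = -1, node = -1;

        if (!parse_policy(buf, &mode, &node))
                printf("mode %d, node %d\n", mode, node);
        return 0;
}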
diff --git a/mm/migrate.c b/mm/migrate.c
index edb6101ed774..88000b89fc9a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -275,8 +275,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
275 */ 275 */
276static void migrate_page_copy(struct page *newpage, struct page *page) 276static void migrate_page_copy(struct page *newpage, struct page *page)
277{ 277{
278 int anon;
279
280 copy_highpage(newpage, page); 278 copy_highpage(newpage, page);
281 279
282 if (PageError(page)) 280 if (PageError(page))
@@ -313,8 +311,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
313 ClearPageSwapCache(page); 311 ClearPageSwapCache(page);
314 ClearPagePrivate(page); 312 ClearPagePrivate(page);
315 set_page_private(page, 0); 313 set_page_private(page, 0);
316 /* page->mapping contains a flag for PageAnon() */
317 anon = PageAnon(page);
318 page->mapping = NULL; 314 page->mapping = NULL;
319 315
320 /* 316 /*
diff --git a/mm/mlock.c b/mm/mlock.c
index 2b8335a89400..8f4e2dfceec1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -25,7 +25,7 @@ int can_do_mlock(void)
25{ 25{
26 if (capable(CAP_IPC_LOCK)) 26 if (capable(CAP_IPC_LOCK))
27 return 1; 27 return 1;
28 if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) 28 if (rlimit(RLIMIT_MEMLOCK) != 0)
29 return 1; 29 return 1;
30 return 0; 30 return 0;
31} 31}
@@ -487,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
487 locked = len >> PAGE_SHIFT; 487 locked = len >> PAGE_SHIFT;
488 locked += current->mm->locked_vm; 488 locked += current->mm->locked_vm;
489 489
490 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 490 lock_limit = rlimit(RLIMIT_MEMLOCK);
491 lock_limit >>= PAGE_SHIFT; 491 lock_limit >>= PAGE_SHIFT;
492 492
493 /* check against resource limits */ 493 /* check against resource limits */
@@ -550,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags)
550 550
551 down_write(&current->mm->mmap_sem); 551 down_write(&current->mm->mmap_sem);
552 552
553 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 553 lock_limit = rlimit(RLIMIT_MEMLOCK);
554 lock_limit >>= PAGE_SHIFT; 554 lock_limit >>= PAGE_SHIFT;
555 555
556 ret = -ENOMEM; 556 ret = -ENOMEM;
@@ -584,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user)
584 int allowed = 0; 584 int allowed = 0;
585 585
586 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 586 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
587 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 587 lock_limit = rlimit(RLIMIT_MEMLOCK);
588 if (lock_limit == RLIM_INFINITY) 588 if (lock_limit == RLIM_INFINITY)
589 allowed = 1; 589 allowed = 1;
590 lock_limit >>= PAGE_SHIFT; 590 lock_limit >>= PAGE_SHIFT;
@@ -618,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
618 618
619 down_write(&mm->mmap_sem); 619 down_write(&mm->mmap_sem);
620 620
621 lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 621 lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
622 vm = mm->total_vm + pgsz; 622 vm = mm->total_vm + pgsz;
623 if (lim < vm) 623 if (lim < vm)
624 goto out; 624 goto out;
625 625
626 lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 626 lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
627 vm = mm->locked_vm + pgsz; 627 vm = mm->locked_vm + pgsz;
628 if (lim < vm) 628 if (lim < vm)
629 goto out; 629 goto out;
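Every memlock check in this file now goes through the rlimit() helper, which returns the calling task's current soft limit, instead of open-coding current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur. For orientation, the equivalent test as seen from userspace (illustrative only; the kernel works in pages, hence the division by the page size):

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
        struct rlimit rl;
        unsigned long page = (unsigned long)sysconf(_SC_PAGESIZE);
        unsigned long want_pages = 64;          /* pages we would mlock() */

        if (getrlimit(RLIMIT_MEMLOCK, &rl))
                return 1;

        /* rl.rlim_cur is the value the in-kernel helper reads. */
        if (rl.rlim_cur != RLIM_INFINITY && want_pages > rl.rlim_cur / page)
                printf("locking %lu pages would exceed RLIMIT_MEMLOCK\n",
                       want_pages);
        else
                printf("within the memlock limit\n");
        return 0;
}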
diff --git a/mm/mmap.c b/mm/mmap.c
index ee2298936fe6..75557c639ad4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -265,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
265 * segment grow beyond its set limit in the case where the limit is 265 * segment grow beyond its set limit in the case where the limit is
266 * not page aligned -Ram Gupta 266 * not page aligned -Ram Gupta
267 */ 267 */
268 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 268 rlim = rlimit(RLIMIT_DATA);
269 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 269 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
270 (mm->end_data - mm->start_data) > rlim) 270 (mm->end_data - mm->start_data) > rlim)
271 goto out; 271 goto out;
@@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
437{ 437{
438 __vma_link_list(mm, vma, prev, rb_parent); 438 __vma_link_list(mm, vma, prev, rb_parent);
439 __vma_link_rb(mm, vma, rb_link, rb_parent); 439 __vma_link_rb(mm, vma, rb_link, rb_parent);
440 __anon_vma_link(vma);
441} 440}
442 441
443static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 442static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
499 * are necessary. The "insert" vma (if any) is to be inserted 498 * are necessary. The "insert" vma (if any) is to be inserted
500 * before we drop the necessary locks. 499 * before we drop the necessary locks.
501 */ 500 */
502void vma_adjust(struct vm_area_struct *vma, unsigned long start, 501int vma_adjust(struct vm_area_struct *vma, unsigned long start,
503 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 502 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
504{ 503{
505 struct mm_struct *mm = vma->vm_mm; 504 struct mm_struct *mm = vma->vm_mm;
@@ -542,6 +541,26 @@ again: remove_next = 1 + (end > next->vm_end);
542 } 541 }
543 } 542 }
544 543
544 /*
545 * When changing only vma->vm_end, we don't really need anon_vma lock.
546 */
547 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
548 anon_vma = vma->anon_vma;
549 if (anon_vma) {
550 /*
551 * Easily overlooked: when mprotect shifts the boundary,
552 * make sure the expanding vma has anon_vma set if the
553 * shrinking vma had, to cover any anon pages imported.
554 */
555 if (importer && !importer->anon_vma) {
556 /* Block reverse map lookups until things are set up. */
557 if (anon_vma_clone(importer, vma)) {
558 return -ENOMEM;
559 }
560 importer->anon_vma = anon_vma;
561 }
562 }
563
545 if (file) { 564 if (file) {
546 mapping = file->f_mapping; 565 mapping = file->f_mapping;
547 if (!(vma->vm_flags & VM_NONLINEAR)) 566 if (!(vma->vm_flags & VM_NONLINEAR))
@@ -567,25 +586,6 @@ again: remove_next = 1 + (end > next->vm_end);
567 } 586 }
568 } 587 }
569 588
570 /*
571 * When changing only vma->vm_end, we don't really need
572 * anon_vma lock.
573 */
574 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
575 anon_vma = vma->anon_vma;
576 if (anon_vma) {
577 spin_lock(&anon_vma->lock);
578 /*
579 * Easily overlooked: when mprotect shifts the boundary,
580 * make sure the expanding vma has anon_vma set if the
581 * shrinking vma had, to cover any anon pages imported.
582 */
583 if (importer && !importer->anon_vma) {
584 importer->anon_vma = anon_vma;
585 __anon_vma_link(importer);
586 }
587 }
588
589 if (root) { 589 if (root) {
590 flush_dcache_mmap_lock(mapping); 590 flush_dcache_mmap_lock(mapping);
591 vma_prio_tree_remove(vma, root); 591 vma_prio_tree_remove(vma, root);
@@ -616,8 +616,6 @@ again: remove_next = 1 + (end > next->vm_end);
616 __vma_unlink(mm, next, vma); 616 __vma_unlink(mm, next, vma);
617 if (file) 617 if (file)
618 __remove_shared_vm_struct(next, file, mapping); 618 __remove_shared_vm_struct(next, file, mapping);
619 if (next->anon_vma)
620 __anon_vma_merge(vma, next);
621 } else if (insert) { 619 } else if (insert) {
622 /* 620 /*
623 * split_vma has split insert from vma, and needs 621 * split_vma has split insert from vma, and needs
@@ -627,8 +625,6 @@ again: remove_next = 1 + (end > next->vm_end);
627 __insert_vm_struct(mm, insert); 625 __insert_vm_struct(mm, insert);
628 } 626 }
629 627
630 if (anon_vma)
631 spin_unlock(&anon_vma->lock);
632 if (mapping) 628 if (mapping)
633 spin_unlock(&mapping->i_mmap_lock); 629 spin_unlock(&mapping->i_mmap_lock);
634 630
@@ -638,6 +634,8 @@ again: remove_next = 1 + (end > next->vm_end);
638 if (next->vm_flags & VM_EXECUTABLE) 634 if (next->vm_flags & VM_EXECUTABLE)
639 removed_exe_file_vma(mm); 635 removed_exe_file_vma(mm);
640 } 636 }
637 if (next->anon_vma)
638 anon_vma_merge(vma, next);
641 mm->map_count--; 639 mm->map_count--;
642 mpol_put(vma_policy(next)); 640 mpol_put(vma_policy(next));
643 kmem_cache_free(vm_area_cachep, next); 641 kmem_cache_free(vm_area_cachep, next);
@@ -653,6 +651,8 @@ again: remove_next = 1 + (end > next->vm_end);
653 } 651 }
654 652
655 validate_mm(mm); 653 validate_mm(mm);
654
655 return 0;
656} 656}
657 657
658/* 658/*
@@ -759,6 +759,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
759{ 759{
760 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 760 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
761 struct vm_area_struct *area, *next; 761 struct vm_area_struct *area, *next;
762 int err;
762 763
763 /* 764 /*
764 * We later require that vma->vm_flags == vm_flags, 765 * We later require that vma->vm_flags == vm_flags,
@@ -792,11 +793,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
792 is_mergeable_anon_vma(prev->anon_vma, 793 is_mergeable_anon_vma(prev->anon_vma,
793 next->anon_vma)) { 794 next->anon_vma)) {
794 /* cases 1, 6 */ 795 /* cases 1, 6 */
795 vma_adjust(prev, prev->vm_start, 796 err = vma_adjust(prev, prev->vm_start,
796 next->vm_end, prev->vm_pgoff, NULL); 797 next->vm_end, prev->vm_pgoff, NULL);
797 } else /* cases 2, 5, 7 */ 798 } else /* cases 2, 5, 7 */
798 vma_adjust(prev, prev->vm_start, 799 err = vma_adjust(prev, prev->vm_start,
799 end, prev->vm_pgoff, NULL); 800 end, prev->vm_pgoff, NULL);
801 if (err)
802 return NULL;
800 return prev; 803 return prev;
801 } 804 }
802 805
@@ -808,11 +811,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
808 can_vma_merge_before(next, vm_flags, 811 can_vma_merge_before(next, vm_flags,
809 anon_vma, file, pgoff+pglen)) { 812 anon_vma, file, pgoff+pglen)) {
810 if (prev && addr < prev->vm_end) /* case 4 */ 813 if (prev && addr < prev->vm_end) /* case 4 */
811 vma_adjust(prev, prev->vm_start, 814 err = vma_adjust(prev, prev->vm_start,
812 addr, prev->vm_pgoff, NULL); 815 addr, prev->vm_pgoff, NULL);
813 else /* cases 3, 8 */ 816 else /* cases 3, 8 */
814 vma_adjust(area, addr, next->vm_end, 817 err = vma_adjust(area, addr, next->vm_end,
815 next->vm_pgoff - pglen, NULL); 818 next->vm_pgoff - pglen, NULL);
819 if (err)
820 return NULL;
816 return area; 821 return area;
817 } 822 }
818 823
@@ -967,7 +972,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
967 unsigned long locked, lock_limit; 972 unsigned long locked, lock_limit;
968 locked = len >> PAGE_SHIFT; 973 locked = len >> PAGE_SHIFT;
969 locked += mm->locked_vm; 974 locked += mm->locked_vm;
970 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 975 lock_limit = rlimit(RLIMIT_MEMLOCK);
971 lock_limit >>= PAGE_SHIFT; 976 lock_limit >>= PAGE_SHIFT;
972 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 977 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
973 return -EAGAIN; 978 return -EAGAIN;
@@ -1083,6 +1088,30 @@ out:
1083 return retval; 1088 return retval;
1084} 1089}
1085 1090
1091#ifdef __ARCH_WANT_SYS_OLD_MMAP
1092struct mmap_arg_struct {
1093 unsigned long addr;
1094 unsigned long len;
1095 unsigned long prot;
1096 unsigned long flags;
1097 unsigned long fd;
1098 unsigned long offset;
1099};
1100
1101SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1102{
1103 struct mmap_arg_struct a;
1104
1105 if (copy_from_user(&a, arg, sizeof(a)))
1106 return -EFAULT;
1107 if (a.offset & ~PAGE_MASK)
1108 return -EINVAL;
1109
1110 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1111 a.offset >> PAGE_SHIFT);
1112}
1113#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1114
1086/* 1115/*
1087 * Some shared mappings will want the pages marked read-only 1116 * Some shared mappings will want the pages marked read-only
1088 * to track write events. If so, we'll downgrade vm_page_prot 1117 * to track write events. If so, we'll downgrade vm_page_prot
@@ -1205,6 +1234,7 @@ munmap_back:
1205 vma->vm_flags = vm_flags; 1234 vma->vm_flags = vm_flags;
1206 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1235 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1207 vma->vm_pgoff = pgoff; 1236 vma->vm_pgoff = pgoff;
1237 INIT_LIST_HEAD(&vma->anon_vma_chain);
1208 1238
1209 if (file) { 1239 if (file) {
1210 error = -EINVAL; 1240 error = -EINVAL;
@@ -1265,13 +1295,8 @@ out:
1265 mm->total_vm += len >> PAGE_SHIFT; 1295 mm->total_vm += len >> PAGE_SHIFT;
1266 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1296 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1267 if (vm_flags & VM_LOCKED) { 1297 if (vm_flags & VM_LOCKED) {
1268 /* 1298 if (!mlock_vma_pages_range(vma, addr, addr + len))
1269 * makes pages present; downgrades, drops, reacquires mmap_sem 1299 mm->locked_vm += (len >> PAGE_SHIFT);
1270 */
1271 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1272 if (nr_pages < 0)
1273 return nr_pages; /* vma gone! */
1274 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1275 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1300 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1276 make_pages_present(addr, addr + len); 1301 make_pages_present(addr, addr + len);
1277 return addr; 1302 return addr;
@@ -1599,7 +1624,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1599 return -ENOMEM; 1624 return -ENOMEM;
1600 1625
1601 /* Stack limit test */ 1626 /* Stack limit test */
1602 if (size > rlim[RLIMIT_STACK].rlim_cur) 1627 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
1603 return -ENOMEM; 1628 return -ENOMEM;
1604 1629
1605 /* mlock limit tests */ 1630 /* mlock limit tests */
@@ -1607,7 +1632,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1607 unsigned long locked; 1632 unsigned long locked;
1608 unsigned long limit; 1633 unsigned long limit;
1609 locked = mm->locked_vm + grow; 1634 locked = mm->locked_vm + grow;
1610 limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 1635 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
1636 limit >>= PAGE_SHIFT;
1611 if (locked > limit && !capable(CAP_IPC_LOCK)) 1637 if (locked > limit && !capable(CAP_IPC_LOCK))
1612 return -ENOMEM; 1638 return -ENOMEM;
1613 } 1639 }
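The ACCESS_ONCE() wrappers added here make the compiler load the rlimit exactly once, so the shift and the comparison cannot see two different values if the limit is changed concurrently; in the kernel this is essentially a volatile read. A rough userspace stand-in (READ_ONCE_UL and the 4K page shift are assumptions of the sketch):

#include <stdio.h>

/* Rough stand-in for ACCESS_ONCE(): force a single volatile read. */
#define READ_ONCE_UL(x) (*(volatile unsigned long *)&(x))

static unsigned long memlock_cur = 8UL << 20;   /* pretend soft limit, in bytes */

int main(void)
{
        /* Read the limit once so the shift and the comparison cannot observe
         * two different values if another thread updates memlock_cur. */
        unsigned long limit = READ_ONCE_UL(memlock_cur) >> 12;  /* 4K pages */
        unsigned long locked = 1024;

        printf("%s\n", locked > limit ? "over the limit" : "within the limit");
        return 0;
}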
@@ -1754,8 +1780,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
1754 if (!prev || expand_stack(prev, addr)) 1780 if (!prev || expand_stack(prev, addr))
1755 return NULL; 1781 return NULL;
1756 if (prev->vm_flags & VM_LOCKED) { 1782 if (prev->vm_flags & VM_LOCKED) {
1757 if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) 1783 mlock_vma_pages_range(prev, addr, prev->vm_end);
1758 return NULL; /* vma gone! */
1759 } 1784 }
1760 return prev; 1785 return prev;
1761} 1786}
@@ -1783,8 +1808,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1783 if (expand_stack(vma, addr)) 1808 if (expand_stack(vma, addr))
1784 return NULL; 1809 return NULL;
1785 if (vma->vm_flags & VM_LOCKED) { 1810 if (vma->vm_flags & VM_LOCKED) {
1786 if (mlock_vma_pages_range(vma, addr, start) < 0) 1811 mlock_vma_pages_range(vma, addr, start);
1787 return NULL; /* vma gone! */
1788 } 1812 }
1789 return vma; 1813 return vma;
1790} 1814}
@@ -1871,6 +1895,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1871{ 1895{
1872 struct mempolicy *pol; 1896 struct mempolicy *pol;
1873 struct vm_area_struct *new; 1897 struct vm_area_struct *new;
1898 int err = -ENOMEM;
1874 1899
1875 if (is_vm_hugetlb_page(vma) && (addr & 1900 if (is_vm_hugetlb_page(vma) && (addr &
1876 ~(huge_page_mask(hstate_vma(vma))))) 1901 ~(huge_page_mask(hstate_vma(vma)))))
@@ -1878,11 +1903,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1878 1903
1879 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 1904 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1880 if (!new) 1905 if (!new)
1881 return -ENOMEM; 1906 goto out_err;
1882 1907
1883 /* most fields are the same, copy all, and then fixup */ 1908 /* most fields are the same, copy all, and then fixup */
1884 *new = *vma; 1909 *new = *vma;
1885 1910
1911 INIT_LIST_HEAD(&new->anon_vma_chain);
1912
1886 if (new_below) 1913 if (new_below)
1887 new->vm_end = addr; 1914 new->vm_end = addr;
1888 else { 1915 else {
@@ -1892,11 +1919,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1892 1919
1893 pol = mpol_dup(vma_policy(vma)); 1920 pol = mpol_dup(vma_policy(vma));
1894 if (IS_ERR(pol)) { 1921 if (IS_ERR(pol)) {
1895 kmem_cache_free(vm_area_cachep, new); 1922 err = PTR_ERR(pol);
1896 return PTR_ERR(pol); 1923 goto out_free_vma;
1897 } 1924 }
1898 vma_set_policy(new, pol); 1925 vma_set_policy(new, pol);
1899 1926
1927 if (anon_vma_clone(new, vma))
1928 goto out_free_mpol;
1929
1900 if (new->vm_file) { 1930 if (new->vm_file) {
1901 get_file(new->vm_file); 1931 get_file(new->vm_file);
1902 if (vma->vm_flags & VM_EXECUTABLE) 1932 if (vma->vm_flags & VM_EXECUTABLE)
@@ -1907,12 +1937,28 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1907 new->vm_ops->open(new); 1937 new->vm_ops->open(new);
1908 1938
1909 if (new_below) 1939 if (new_below)
1910 vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 1940 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
1911 ((addr - new->vm_start) >> PAGE_SHIFT), new); 1941 ((addr - new->vm_start) >> PAGE_SHIFT), new);
1912 else 1942 else
1913 vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 1943 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1914 1944
1915 return 0; 1945 /* Success. */
1946 if (!err)
1947 return 0;
1948
1949 /* Clean everything up if vma_adjust failed. */
1950 new->vm_ops->close(new);
1951 if (new->vm_file) {
1952 if (vma->vm_flags & VM_EXECUTABLE)
1953 removed_exe_file_vma(mm);
1954 fput(new->vm_file);
1955 }
1956 out_free_mpol:
1957 mpol_put(pol);
1958 out_free_vma:
1959 kmem_cache_free(vm_area_cachep, new);
1960 out_err:
1961 return err;
1916} 1962}
1917 1963
1918/* 1964/*
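__split_vma() now unwinds through stacked labels (out_free_mpol, out_free_vma, out_err) instead of freeing inline at every failure site, and the new vma_adjust() return value feeds the same path. The cleanup idiom itself, in a self-contained sketch with a made-up build_record() helper:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *build_record(const char *name)
{
        char *buf, *copy;

        buf = malloc(128);
        if (!buf)
                goto out_err;

        copy = malloc(strlen(name) + 1);
        if (!copy)
                goto out_free_buf;
        strcpy(copy, name);

        snprintf(buf, 128, "record:%s", copy);
        free(copy);
        return buf;                     /* success: only the result survives */

out_free_buf:
        free(buf);                      /* later failures undo earlier steps */
out_err:
        return NULL;
}

int main(void)
{
        char *r = build_record("vma");

        if (r) {
                puts(r);
                free(r);
        }
        return 0;
}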
@@ -2074,7 +2120,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2074 unsigned long locked, lock_limit; 2120 unsigned long locked, lock_limit;
2075 locked = len >> PAGE_SHIFT; 2121 locked = len >> PAGE_SHIFT;
2076 locked += mm->locked_vm; 2122 locked += mm->locked_vm;
2077 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2123 lock_limit = rlimit(RLIMIT_MEMLOCK);
2078 lock_limit >>= PAGE_SHIFT; 2124 lock_limit >>= PAGE_SHIFT;
2079 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 2125 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2080 return -EAGAIN; 2126 return -EAGAIN;
@@ -2122,6 +2168,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2122 return -ENOMEM; 2168 return -ENOMEM;
2123 } 2169 }
2124 2170
2171 INIT_LIST_HEAD(&vma->anon_vma_chain);
2125 vma->vm_mm = mm; 2172 vma->vm_mm = mm;
2126 vma->vm_start = addr; 2173 vma->vm_start = addr;
2127 vma->vm_end = addr + len; 2174 vma->vm_end = addr + len;
@@ -2258,10 +2305,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2258 if (new_vma) { 2305 if (new_vma) {
2259 *new_vma = *vma; 2306 *new_vma = *vma;
2260 pol = mpol_dup(vma_policy(vma)); 2307 pol = mpol_dup(vma_policy(vma));
2261 if (IS_ERR(pol)) { 2308 if (IS_ERR(pol))
2262 kmem_cache_free(vm_area_cachep, new_vma); 2309 goto out_free_vma;
2263 return NULL; 2310 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2264 } 2311 if (anon_vma_clone(new_vma, vma))
2312 goto out_free_mempol;
2265 vma_set_policy(new_vma, pol); 2313 vma_set_policy(new_vma, pol);
2266 new_vma->vm_start = addr; 2314 new_vma->vm_start = addr;
2267 new_vma->vm_end = addr + len; 2315 new_vma->vm_end = addr + len;
@@ -2277,6 +2325,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2277 } 2325 }
2278 } 2326 }
2279 return new_vma; 2327 return new_vma;
2328
2329 out_free_mempol:
2330 mpol_put(pol);
2331 out_free_vma:
2332 kmem_cache_free(vm_area_cachep, new_vma);
2333 return NULL;
2280} 2334}
2281 2335
2282/* 2336/*
@@ -2288,7 +2342,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2288 unsigned long cur = mm->total_vm; /* pages */ 2342 unsigned long cur = mm->total_vm; /* pages */
2289 unsigned long lim; 2343 unsigned long lim;
2290 2344
2291 lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 2345 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2292 2346
2293 if (cur + npages > lim) 2347 if (cur + npages > lim)
2294 return 0; 2348 return 0;
@@ -2354,6 +2408,7 @@ int install_special_mapping(struct mm_struct *mm,
2354 if (unlikely(vma == NULL)) 2408 if (unlikely(vma == NULL))
2355 return -ENOMEM; 2409 return -ENOMEM;
2356 2410
2411 INIT_LIST_HEAD(&vma->anon_vma_chain);
2357 vma->vm_mm = mm; 2412 vma->vm_mm = mm;
2358 vma->vm_start = addr; 2413 vma->vm_start = addr;
2359 vma->vm_end = addr + len; 2414 vma->vm_end = addr + len;
@@ -2454,6 +2509,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2454int mm_take_all_locks(struct mm_struct *mm) 2509int mm_take_all_locks(struct mm_struct *mm)
2455{ 2510{
2456 struct vm_area_struct *vma; 2511 struct vm_area_struct *vma;
2512 struct anon_vma_chain *avc;
2457 int ret = -EINTR; 2513 int ret = -EINTR;
2458 2514
2459 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2515 BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2471,7 +2527,8 @@ int mm_take_all_locks(struct mm_struct *mm)
2471 if (signal_pending(current)) 2527 if (signal_pending(current))
2472 goto out_unlock; 2528 goto out_unlock;
2473 if (vma->anon_vma) 2529 if (vma->anon_vma)
2474 vm_lock_anon_vma(mm, vma->anon_vma); 2530 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2531 vm_lock_anon_vma(mm, avc->anon_vma);
2475 } 2532 }
2476 2533
2477 ret = 0; 2534 ret = 0;
@@ -2526,13 +2583,15 @@ static void vm_unlock_mapping(struct address_space *mapping)
2526void mm_drop_all_locks(struct mm_struct *mm) 2583void mm_drop_all_locks(struct mm_struct *mm)
2527{ 2584{
2528 struct vm_area_struct *vma; 2585 struct vm_area_struct *vma;
2586 struct anon_vma_chain *avc;
2529 2587
2530 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2588 BUG_ON(down_read_trylock(&mm->mmap_sem));
2531 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 2589 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2532 2590
2533 for (vma = mm->mmap; vma; vma = vma->vm_next) { 2591 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2534 if (vma->anon_vma) 2592 if (vma->anon_vma)
2535 vm_unlock_anon_vma(vma->anon_vma); 2593 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2594 vm_unlock_anon_vma(avc->anon_vma);
2536 if (vma->vm_file && vma->vm_file->f_mapping) 2595 if (vma->vm_file && vma->vm_file->f_mapping)
2537 vm_unlock_mapping(vma->vm_file->f_mapping); 2596 vm_unlock_mapping(vma->vm_file->f_mapping);
2538 } 2597 }
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 0777654147c9..9e82e937000e 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,6 +53,7 @@ void unuse_mm(struct mm_struct *mm)
53 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
54 54
55 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm);
56 tsk->mm = NULL; 57 tsk->mm = NULL;
57 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
58 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
diff --git a/mm/mremap.c b/mm/mremap.c
index 845190898d59..e9c75efce609 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -285,7 +285,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
285 if (vma->vm_flags & VM_LOCKED) { 285 if (vma->vm_flags & VM_LOCKED) {
286 unsigned long locked, lock_limit; 286 unsigned long locked, lock_limit;
287 locked = mm->locked_vm << PAGE_SHIFT; 287 locked = mm->locked_vm << PAGE_SHIFT;
288 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 288 lock_limit = rlimit(RLIMIT_MEMLOCK);
289 locked += new_len - old_len; 289 locked += new_len - old_len;
290 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 290 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
291 goto Eagain; 291 goto Eagain;
@@ -460,8 +460,11 @@ unsigned long do_mremap(unsigned long addr,
460 if (vma_expandable(vma, new_len - old_len)) { 460 if (vma_expandable(vma, new_len - old_len)) {
461 int pages = (new_len - old_len) >> PAGE_SHIFT; 461 int pages = (new_len - old_len) >> PAGE_SHIFT;
462 462
463 vma_adjust(vma, vma->vm_start, 463 if (vma_adjust(vma, vma->vm_start, addr + new_len,
464 addr + new_len, vma->vm_pgoff, NULL); 464 vma->vm_pgoff, NULL)) {
465 ret = -ENOMEM;
466 goto out;
467 }
465 468
466 mm->total_vm += pages; 469 mm->total_vm += pages;
467 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 470 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index 48a2ecfaf059..63fa17d121f0 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
162 } 162 }
163 if (vmas) 163 if (vmas)
164 vmas[i] = vma; 164 vmas[i] = vma;
165 start += PAGE_SIZE; 165 start = (start + PAGE_SIZE) & PAGE_MASK;
166 } 166 }
167 167
168 return i; 168 return i;
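The nommu fix rounds start up to the next page boundary instead of just adding PAGE_SIZE, so an unaligned start can no longer carry its in-page offset into every later iteration. The arithmetic in isolation (4K page size assumed):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long start = 0x1234;

        /* Old loop: start += PAGE_SIZE keeps the 0x234 offset forever.   */
        /* Fixed form: advance, then drop the offset within the page.     */
        unsigned long next = (start + PAGE_SIZE) & PAGE_MASK;

        printf("start %#lx -> next page boundary %#lx\n", start, next);
        return 0;
}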
@@ -1040,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1040 if (ret != -ENOSYS) 1040 if (ret != -ENOSYS)
1041 return ret; 1041 return ret;
1042 1042
1043 /* getting an ENOSYS error indicates that direct mmap isn't 1043 /* getting -ENOSYS indicates that direct mmap isn't possible (as
1044 * possible (as opposed to tried but failed) so we'll fall 1044 * opposed to tried but failed) so we can only give a suitable error as
1045 * through to making a private copy of the data and mapping 1045 * it's not possible to make a private copy if MAP_SHARED was given */
1046 * that if we can */
1047 return -ENODEV; 1046 return -ENODEV;
1048} 1047}
1049 1048
@@ -1209,7 +1208,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1209 region->vm_flags = vm_flags; 1208 region->vm_flags = vm_flags;
1210 region->vm_pgoff = pgoff; 1209 region->vm_pgoff = pgoff;
1211 1210
1212 INIT_LIST_HEAD(&vma->anon_vma_node); 1211 INIT_LIST_HEAD(&vma->anon_vma_chain);
1213 vma->vm_flags = vm_flags; 1212 vma->vm_flags = vm_flags;
1214 vma->vm_pgoff = pgoff; 1213 vma->vm_pgoff = pgoff;
1215 1214
@@ -1428,6 +1427,30 @@ out:
1428 return retval; 1427 return retval;
1429} 1428}
1430 1429
1430#ifdef __ARCH_WANT_SYS_OLD_MMAP
1431struct mmap_arg_struct {
1432 unsigned long addr;
1433 unsigned long len;
1434 unsigned long prot;
1435 unsigned long flags;
1436 unsigned long fd;
1437 unsigned long offset;
1438};
1439
1440SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1441{
1442 struct mmap_arg_struct a;
1443
1444 if (copy_from_user(&a, arg, sizeof(a)))
1445 return -EFAULT;
1446 if (a.offset & ~PAGE_MASK)
1447 return -EINVAL;
1448
1449 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1450 a.offset >> PAGE_SHIFT);
1451}
1452#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1453
1431/* 1454/*
1432 * split a vma into two pieces at address 'addr', a new vma is allocated either 1455 * split a vma into two pieces at address 'addr', a new vma is allocated either
1433 * for the first part or the tail. 1456 * for the first part or the tail.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 237050478f28..9b223af6a147 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -401,8 +401,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
401 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 401 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
402 task_pid_nr(p), p->comm, 402 task_pid_nr(p), p->comm,
403 K(p->mm->total_vm), 403 K(p->mm->total_vm),
404 K(get_mm_counter(p->mm, anon_rss)), 404 K(get_mm_counter(p->mm, MM_ANONPAGES)),
405 K(get_mm_counter(p->mm, file_rss))); 405 K(get_mm_counter(p->mm, MM_FILEPAGES)));
406 task_unlock(p); 406 task_unlock(p);
407 407
408 /* 408 /*
@@ -473,6 +473,8 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
473 unsigned long points = 0; 473 unsigned long points = 0;
474 struct task_struct *p; 474 struct task_struct *p;
475 475
476 if (sysctl_panic_on_oom == 2)
477 panic("out of memory(memcg). panic_on_oom is selected.\n");
476 read_lock(&tasklist_lock); 478 read_lock(&tasklist_lock);
477retry: 479retry:
478 p = select_bad_process(&points, mem); 480 p = select_bad_process(&points, mem);
@@ -601,13 +603,6 @@ void pagefault_out_of_memory(void)
601 /* Got some memory back in the last second. */ 603 /* Got some memory back in the last second. */
602 return; 604 return;
603 605
604 /*
605 * If this is from memcg, oom-killer is already invoked.
606 * and not worth to go system-wide-oom.
607 */
608 if (mem_cgroup_oom_called(current))
609 goto rest_and_return;
610
611 if (sysctl_panic_on_oom) 606 if (sysctl_panic_on_oom)
612 panic("out of memory from page fault. panic_on_oom is selected.\n"); 607 panic("out of memory from page fault. panic_on_oom is selected.\n");
613 608
@@ -619,7 +614,6 @@ void pagefault_out_of_memory(void)
619 * Give "p" a good chance of killing itself before we 614 * Give "p" a good chance of killing itself before we
620 * retry to allocate memory. 615 * retry to allocate memory.
621 */ 616 */
622rest_and_return:
623 if (!test_thread_flag(TIF_MEMDIE)) 617 if (!test_thread_flag(TIF_MEMDIE))
624 schedule_timeout_uninterruptible(1); 618 schedule_timeout_uninterruptible(1);
625} 619}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8deb9d0fd5b1..d03c946d5566 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -50,6 +50,7 @@
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h> 51#include <linux/memory.h>
52#include <trace/events/kmem.h> 52#include <trace/events/kmem.h>
53#include <linux/ftrace_event.h>
53 54
54#include <asm/tlbflush.h> 55#include <asm/tlbflush.h>
55#include <asm/div64.h> 56#include <asm/div64.h>
@@ -76,6 +77,31 @@ unsigned long totalreserve_pages __read_mostly;
76int percpu_pagelist_fraction; 77int percpu_pagelist_fraction;
77gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 78gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
78 79
80#ifdef CONFIG_PM_SLEEP
81/*
82 * The following functions are used by the suspend/hibernate code to temporarily
83 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
84 * while devices are suspended. To avoid races with the suspend/hibernate code,
85 * they should always be called with pm_mutex held (gfp_allowed_mask also should
86 * only be modified with pm_mutex held, unless the suspend/hibernate code is
87 * guaranteed not to run in parallel with that modification).
88 */
89void set_gfp_allowed_mask(gfp_t mask)
90{
91 WARN_ON(!mutex_is_locked(&pm_mutex));
92 gfp_allowed_mask = mask;
93}
94
95gfp_t clear_gfp_allowed_mask(gfp_t mask)
96{
97 gfp_t ret = gfp_allowed_mask;
98
99 WARN_ON(!mutex_is_locked(&pm_mutex));
100 gfp_allowed_mask &= ~mask;
101 return ret;
102}
103#endif /* CONFIG_PM_SLEEP */
104
79#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 105#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
80int pageblock_order __read_mostly; 106int pageblock_order __read_mostly;
81#endif 107#endif
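The comment above spells out the contract for the new helpers: the suspend path clears the I/O-related bits out of gfp_allowed_mask, remembers the old mask, and restores it on resume, all under pm_mutex. A toy model of that save/clear/restore flow (single-threaded, so the locking the real code requires is left out):

#include <stdio.h>

static unsigned int allowed_mask = 0xff;        /* stand-in for gfp_allowed_mask */

static unsigned int clear_allowed(unsigned int mask)
{
        unsigned int old = allowed_mask;

        allowed_mask &= ~mask;                  /* e.g. drop the "may do I/O" bits */
        return old;                             /* caller restores this later */
}

int main(void)
{
        unsigned int saved = clear_allowed(0x30);

        printf("while suspended: %#x\n", allowed_mask);
        allowed_mask = saved;                   /* the restore step on resume */
        printf("after resume:    %#x\n", allowed_mask);
        return 0;
}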
@@ -263,10 +289,7 @@ static void bad_page(struct page *page)
263 289
264 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 290 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
265 current->comm, page_to_pfn(page)); 291 current->comm, page_to_pfn(page));
266 printk(KERN_ALERT 292 dump_page(page);
267 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
268 page, (void *)page->flags, page_count(page),
269 page_mapcount(page), page->mapping, page->index);
270 293
271 dump_stack(); 294 dump_stack();
272out: 295out:
@@ -530,7 +553,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
530 int batch_free = 0; 553 int batch_free = 0;
531 554
532 spin_lock(&zone->lock); 555 spin_lock(&zone->lock);
533 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 556 zone->all_unreclaimable = 0;
534 zone->pages_scanned = 0; 557 zone->pages_scanned = 0;
535 558
536 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 559 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -568,7 +591,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
568 int migratetype) 591 int migratetype)
569{ 592{
570 spin_lock(&zone->lock); 593 spin_lock(&zone->lock);
571 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 594 zone->all_unreclaimable = 0;
572 zone->pages_scanned = 0; 595 zone->pages_scanned = 0;
573 596
574 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 597 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
@@ -583,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
583 int bad = 0; 606 int bad = 0;
584 int wasMlocked = __TestClearPageMlocked(page); 607 int wasMlocked = __TestClearPageMlocked(page);
585 608
609 trace_mm_page_free_direct(page, order);
586 kmemcheck_free_shadow(page, order); 610 kmemcheck_free_shadow(page, order);
587 611
588 for (i = 0 ; i < (1 << order) ; ++i) 612 for (i = 0 ; i < (1 << order) ; ++i)
@@ -1009,10 +1033,10 @@ static void drain_pages(unsigned int cpu)
1009 struct per_cpu_pageset *pset; 1033 struct per_cpu_pageset *pset;
1010 struct per_cpu_pages *pcp; 1034 struct per_cpu_pages *pcp;
1011 1035
1012 pset = zone_pcp(zone, cpu); 1036 local_irq_save(flags);
1037 pset = per_cpu_ptr(zone->pageset, cpu);
1013 1038
1014 pcp = &pset->pcp; 1039 pcp = &pset->pcp;
1015 local_irq_save(flags);
1016 free_pcppages_bulk(zone, pcp->count, pcp); 1040 free_pcppages_bulk(zone, pcp->count, pcp);
1017 pcp->count = 0; 1041 pcp->count = 0;
1018 local_irq_restore(flags); 1042 local_irq_restore(flags);
@@ -1073,8 +1097,9 @@ void mark_free_pages(struct zone *zone)
1073 1097
1074/* 1098/*
1075 * Free a 0-order page 1099 * Free a 0-order page
1100 * cold == 1 ? free a cold page : free a hot page
1076 */ 1101 */
1077static void free_hot_cold_page(struct page *page, int cold) 1102void free_hot_cold_page(struct page *page, int cold)
1078{ 1103{
1079 struct zone *zone = page_zone(page); 1104 struct zone *zone = page_zone(page);
1080 struct per_cpu_pages *pcp; 1105 struct per_cpu_pages *pcp;
@@ -1082,6 +1107,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1082 int migratetype; 1107 int migratetype;
1083 int wasMlocked = __TestClearPageMlocked(page); 1108 int wasMlocked = __TestClearPageMlocked(page);
1084 1109
1110 trace_mm_page_free_direct(page, 0);
1085 kmemcheck_free_shadow(page, 0); 1111 kmemcheck_free_shadow(page, 0);
1086 1112
1087 if (PageAnon(page)) 1113 if (PageAnon(page))
@@ -1096,7 +1122,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1096 arch_free_page(page, 0); 1122 arch_free_page(page, 0);
1097 kernel_map_pages(page, 1, 0); 1123 kernel_map_pages(page, 1, 0);
1098 1124
1099 pcp = &zone_pcp(zone, get_cpu())->pcp;
1100 migratetype = get_pageblock_migratetype(page); 1125 migratetype = get_pageblock_migratetype(page);
1101 set_page_private(page, migratetype); 1126 set_page_private(page, migratetype);
1102 local_irq_save(flags); 1127 local_irq_save(flags);
@@ -1119,6 +1144,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1119 migratetype = MIGRATE_MOVABLE; 1144 migratetype = MIGRATE_MOVABLE;
1120 } 1145 }
1121 1146
1147 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1122 if (cold) 1148 if (cold)
1123 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1149 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1124 else 1150 else
@@ -1131,15 +1157,8 @@ static void free_hot_cold_page(struct page *page, int cold)
1131 1157
1132out: 1158out:
1133 local_irq_restore(flags); 1159 local_irq_restore(flags);
1134 put_cpu();
1135} 1160}
1136 1161
1137void free_hot_page(struct page *page)
1138{
1139 trace_mm_page_free_direct(page, 0);
1140 free_hot_cold_page(page, 0);
1141}
1142
1143/* 1162/*
1144 * split_page takes a non-compound higher-order page, and splits it into 1163 * split_page takes a non-compound higher-order page, and splits it into
1145 * n (1<<order) sub-pages: page[0..n] 1164 * n (1<<order) sub-pages: page[0..n]
@@ -1181,17 +1200,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1181 unsigned long flags; 1200 unsigned long flags;
1182 struct page *page; 1201 struct page *page;
1183 int cold = !!(gfp_flags & __GFP_COLD); 1202 int cold = !!(gfp_flags & __GFP_COLD);
1184 int cpu;
1185 1203
1186again: 1204again:
1187 cpu = get_cpu();
1188 if (likely(order == 0)) { 1205 if (likely(order == 0)) {
1189 struct per_cpu_pages *pcp; 1206 struct per_cpu_pages *pcp;
1190 struct list_head *list; 1207 struct list_head *list;
1191 1208
1192 pcp = &zone_pcp(zone, cpu)->pcp;
1193 list = &pcp->lists[migratetype];
1194 local_irq_save(flags); 1209 local_irq_save(flags);
1210 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1211 list = &pcp->lists[migratetype];
1195 if (list_empty(list)) { 1212 if (list_empty(list)) {
1196 pcp->count += rmqueue_bulk(zone, 0, 1213 pcp->count += rmqueue_bulk(zone, 0,
1197 pcp->batch, list, 1214 pcp->batch, list,
@@ -1232,7 +1249,6 @@ again:
1232 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1249 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1233 zone_statistics(preferred_zone, zone); 1250 zone_statistics(preferred_zone, zone);
1234 local_irq_restore(flags); 1251 local_irq_restore(flags);
1235 put_cpu();
1236 1252
1237 VM_BUG_ON(bad_range(zone, page)); 1253 VM_BUG_ON(bad_range(zone, page));
1238 if (prep_new_page(page, order, gfp_flags)) 1254 if (prep_new_page(page, order, gfp_flags))
@@ -1241,7 +1257,6 @@ again:
1241 1257
1242failed: 1258failed:
1243 local_irq_restore(flags); 1259 local_irq_restore(flags);
1244 put_cpu();
1245 return NULL; 1260 return NULL;
1246} 1261}
1247 1262
@@ -2013,9 +2028,8 @@ void __pagevec_free(struct pagevec *pvec)
2013void __free_pages(struct page *page, unsigned int order) 2028void __free_pages(struct page *page, unsigned int order)
2014{ 2029{
2015 if (put_page_testzero(page)) { 2030 if (put_page_testzero(page)) {
2016 trace_mm_page_free_direct(page, order);
2017 if (order == 0) 2031 if (order == 0)
2018 free_hot_page(page); 2032 free_hot_cold_page(page, 0);
2019 else 2033 else
2020 __free_pages_ok(page, order); 2034 __free_pages_ok(page, order);
2021 } 2035 }
@@ -2180,7 +2194,7 @@ void show_free_areas(void)
2180 for_each_online_cpu(cpu) { 2194 for_each_online_cpu(cpu) {
2181 struct per_cpu_pageset *pageset; 2195 struct per_cpu_pageset *pageset;
2182 2196
2183 pageset = zone_pcp(zone, cpu); 2197 pageset = per_cpu_ptr(zone->pageset, cpu);
2184 2198
2185 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2199 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2186 cpu, pageset->pcp.high, 2200 cpu, pageset->pcp.high,
@@ -2271,7 +2285,7 @@ void show_free_areas(void)
2271 K(zone_page_state(zone, NR_BOUNCE)), 2285 K(zone_page_state(zone, NR_BOUNCE)),
2272 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2286 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2273 zone->pages_scanned, 2287 zone->pages_scanned,
2274 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2288 (zone->all_unreclaimable ? "yes" : "no")
2275 ); 2289 );
2276 printk("lowmem_reserve[]:"); 2290 printk("lowmem_reserve[]:");
2277 for (i = 0; i < MAX_NR_ZONES; i++) 2291 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -2745,10 +2759,29 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2745 2759
2746#endif /* CONFIG_NUMA */ 2760#endif /* CONFIG_NUMA */
2747 2761
2762/*
2763 * Boot pageset table. One per cpu which is going to be used for all
2764 * zones and all nodes. The parameters will be set in such a way
2765 * that an item put on a list will immediately be handed over to
2766 * the buddy list. This is safe since pageset manipulation is done
2767 * with interrupts disabled.
2768 *
2769 * The boot_pagesets must be kept even after bootup is complete for
2770 * unused processors and/or zones. They do play a role for bootstrapping
2771 * hotplugged processors.
2772 *
2773 * zoneinfo_show() and maybe other functions do
2774 * not check if the processor is online before following the pageset pointer.
2775 * Other parts of the kernel may not check if the zone is available.
2776 */
2777static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2778static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2779
2748/* return value is int just for stop_machine() */ 2780/* return value is int just for stop_machine() */
2749static int __build_all_zonelists(void *dummy) 2781static int __build_all_zonelists(void *dummy)
2750{ 2782{
2751 int nid; 2783 int nid;
2784 int cpu;
2752 2785
2753#ifdef CONFIG_NUMA 2786#ifdef CONFIG_NUMA
2754 memset(node_load, 0, sizeof(node_load)); 2787 memset(node_load, 0, sizeof(node_load));
@@ -2759,6 +2792,23 @@ static int __build_all_zonelists(void *dummy)
2759 build_zonelists(pgdat); 2792 build_zonelists(pgdat);
2760 build_zonelist_cache(pgdat); 2793 build_zonelist_cache(pgdat);
2761 } 2794 }
2795
2796 /*
2797 * Initialize the boot_pagesets that are going to be used
2798 * for bootstrapping processors. The real pagesets for
2799 * each zone will be allocated later when the per cpu
2800 * allocator is available.
2801 *
2802 * boot_pagesets are used also for bootstrapping offline
2803 * cpus if the system is already booted because the pagesets
2804 * are needed to initialize allocators on a specific cpu too.
2805 * F.e. the percpu allocator needs the page allocator which
2806 * needs the percpu allocator in order to allocate its pagesets
2807 * (a chicken-egg dilemma).
2808 */
2809 for_each_possible_cpu(cpu)
2810 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
2811
2762 return 0; 2812 return 0;
2763} 2813}
2764 2814
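The chicken-and-egg note above is the reason boot_pageset exists: the page allocator must work before the per-cpu allocator can hand out real pagesets, so every CPU starts out pointing at a static bootstrap instance and is switched over later. A toy two-stage handover, with a plain array standing in for per-cpu data:

#include <stdio.h>
#include <stdlib.h>

#define NCPUS 4

struct pageset { int batch; };

static struct pageset boot_pageset;        /* static bootstrap instance */
static struct pageset *pageset[NCPUS];     /* one pointer per "cpu"     */

int main(void)
{
        int cpu;

        /* Early boot: every cpu points at the shared static pageset. */
        for (cpu = 0; cpu < NCPUS; cpu++)
                pageset[cpu] = &boot_pageset;

        /* Later, once the allocator is up, each cpu gets its own copy. */
        for (cpu = 0; cpu < NCPUS; cpu++) {
                struct pageset *p = malloc(sizeof(*p));

                if (!p)
                        return 1;
                p->batch = 31;
                pageset[cpu] = p;
        }

        printf("cpu0 batch after setup: %d\n", pageset[0]->batch);
        return 0;
}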
@@ -3096,121 +3146,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3096 pcp->batch = PAGE_SHIFT * 8; 3146 pcp->batch = PAGE_SHIFT * 8;
3097} 3147}
3098 3148
3099
3100#ifdef CONFIG_NUMA
3101/*
3102 * Boot pageset table. One per cpu which is going to be used for all
3103 * zones and all nodes. The parameters will be set in such a way
3104 * that an item put on a list will immediately be handed over to
3105 * the buddy list. This is safe since pageset manipulation is done
3106 * with interrupts disabled.
3107 *
3108 * Some NUMA counter updates may also be caught by the boot pagesets.
3109 *
3110 * The boot_pagesets must be kept even after bootup is complete for
3111 * unused processors and/or zones. They do play a role for bootstrapping
3112 * hotplugged processors.
3113 *
3114 * zoneinfo_show() and maybe other functions do
3115 * not check if the processor is online before following the pageset pointer.
3116 * Other parts of the kernel may not check if the zone is available.
3117 */
3118static struct per_cpu_pageset boot_pageset[NR_CPUS];
3119
3120/* 3149/*
3121 * Dynamically allocate memory for the 3150 * Allocate per cpu pagesets and initialize them.
3122 * per cpu pageset array in struct zone. 3151 * Before this call only boot pagesets were available.
 3152 * Boot pagesets will no longer be used by this processor
3153 * after setup_per_cpu_pageset().
3123 */ 3154 */
3124static int __cpuinit process_zones(int cpu) 3155void __init setup_per_cpu_pageset(void)
3125{ 3156{
3126 struct zone *zone, *dzone; 3157 struct zone *zone;
3127 int node = cpu_to_node(cpu); 3158 int cpu;
3128
3129 node_set_state(node, N_CPU); /* this node has a cpu */
3130 3159
3131 for_each_populated_zone(zone) { 3160 for_each_populated_zone(zone) {
3132 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 3161 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3133 GFP_KERNEL, node);
3134 if (!zone_pcp(zone, cpu))
3135 goto bad;
3136 3162
3137 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 3163 for_each_possible_cpu(cpu) {
3164 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3138 3165
3139 if (percpu_pagelist_fraction) 3166 setup_pageset(pcp, zone_batchsize(zone));
3140 setup_pagelist_highmark(zone_pcp(zone, cpu),
3141 (zone->present_pages / percpu_pagelist_fraction));
3142 }
3143 3167
3144 return 0; 3168 if (percpu_pagelist_fraction)
3145bad: 3169 setup_pagelist_highmark(pcp,
3146 for_each_zone(dzone) { 3170 (zone->present_pages /
3147 if (!populated_zone(dzone)) 3171 percpu_pagelist_fraction));
3148 continue; 3172 }
3149 if (dzone == zone)
3150 break;
3151 kfree(zone_pcp(dzone, cpu));
3152 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
3153 }
3154 return -ENOMEM;
3155}
3156
3157static inline void free_zone_pagesets(int cpu)
3158{
3159 struct zone *zone;
3160
3161 for_each_zone(zone) {
3162 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
3163
3164 /* Free per_cpu_pageset if it is slab allocated */
3165 if (pset != &boot_pageset[cpu])
3166 kfree(pset);
3167 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3168 }
3169}
3170
3171static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
3172 unsigned long action,
3173 void *hcpu)
3174{
3175 int cpu = (long)hcpu;
3176 int ret = NOTIFY_OK;
3177
3178 switch (action) {
3179 case CPU_UP_PREPARE:
3180 case CPU_UP_PREPARE_FROZEN:
3181 if (process_zones(cpu))
3182 ret = NOTIFY_BAD;
3183 break;
3184 case CPU_UP_CANCELED:
3185 case CPU_UP_CANCELED_FROZEN:
3186 case CPU_DEAD:
3187 case CPU_DEAD_FROZEN:
3188 free_zone_pagesets(cpu);
3189 break;
3190 default:
3191 break;
3192 } 3173 }
3193 return ret;
3194} 3174}
3195 3175
3196static struct notifier_block __cpuinitdata pageset_notifier =
3197 { &pageset_cpuup_callback, NULL, 0 };
3198
3199void __init setup_per_cpu_pageset(void)
3200{
3201 int err;
3202
3203 /* Initialize per_cpu_pageset for cpu 0.
3204 * A cpuup callback will do this for every cpu
3205 * as it comes online
3206 */
3207 err = process_zones(smp_processor_id());
3208 BUG_ON(err);
3209 register_cpu_notifier(&pageset_notifier);
3210}
3211
3212#endif
3213
3214static noinline __init_refok 3176static noinline __init_refok
3215int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3177int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3216{ 3178{
@@ -3260,11 +3222,11 @@ static int __zone_pcp_update(void *data)
3260 int cpu; 3222 int cpu;
3261 unsigned long batch = zone_batchsize(zone), flags; 3223 unsigned long batch = zone_batchsize(zone), flags;
3262 3224
3263 for (cpu = 0; cpu < NR_CPUS; cpu++) { 3225 for_each_possible_cpu(cpu) {
3264 struct per_cpu_pageset *pset; 3226 struct per_cpu_pageset *pset;
3265 struct per_cpu_pages *pcp; 3227 struct per_cpu_pages *pcp;
3266 3228
3267 pset = zone_pcp(zone, cpu); 3229 pset = per_cpu_ptr(zone->pageset, cpu);
3268 pcp = &pset->pcp; 3230 pcp = &pset->pcp;
3269 3231
3270 local_irq_save(flags); 3232 local_irq_save(flags);
@@ -3282,21 +3244,17 @@ void zone_pcp_update(struct zone *zone)
3282 3244
3283static __meminit void zone_pcp_init(struct zone *zone) 3245static __meminit void zone_pcp_init(struct zone *zone)
3284{ 3246{
3285 int cpu; 3247 /*
3286 unsigned long batch = zone_batchsize(zone); 3248 * per cpu subsystem is not up at this point. The following code
3249 * relies on the ability of the linker to provide the
3250 * offset of a (static) per cpu variable into the per cpu area.
3251 */
3252 zone->pageset = &boot_pageset;
3287 3253
3288 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3289#ifdef CONFIG_NUMA
3290 /* Early boot. Slab allocator not functional yet */
3291 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3292 setup_pageset(&boot_pageset[cpu],0);
3293#else
3294 setup_pageset(zone_pcp(zone,cpu), batch);
3295#endif
3296 }
3297 if (zone->present_pages) 3254 if (zone->present_pages)
3298 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 3255 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3299 zone->name, zone->present_pages, batch); 3256 zone->name, zone->present_pages,
3257 zone_batchsize(zone));
3300} 3258}
3301 3259
3302__meminit int init_currently_empty_zone(struct zone *zone, 3260__meminit int init_currently_empty_zone(struct zone *zone,
@@ -3435,6 +3393,61 @@ void __init free_bootmem_with_active_regions(int nid,
3435 } 3393 }
3436} 3394}
3437 3395
3396int __init add_from_early_node_map(struct range *range, int az,
3397 int nr_range, int nid)
3398{
3399 int i;
3400 u64 start, end;
3401
3402 /* need to go over early_node_map to find out good range for node */
3403 for_each_active_range_index_in_nid(i, nid) {
3404 start = early_node_map[i].start_pfn;
3405 end = early_node_map[i].end_pfn;
3406 nr_range = add_range(range, az, nr_range, start, end);
3407 }
3408 return nr_range;
3409}
3410
3411#ifdef CONFIG_NO_BOOTMEM
3412void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3413 u64 goal, u64 limit)
3414{
3415 int i;
3416 void *ptr;
3417
3418 /* need to go over early_node_map to find out good range for node */
3419 for_each_active_range_index_in_nid(i, nid) {
3420 u64 addr;
3421 u64 ei_start, ei_last;
3422
3423 ei_last = early_node_map[i].end_pfn;
3424 ei_last <<= PAGE_SHIFT;
3425 ei_start = early_node_map[i].start_pfn;
3426 ei_start <<= PAGE_SHIFT;
3427 addr = find_early_area(ei_start, ei_last,
3428 goal, limit, size, align);
3429
3430 if (addr == -1ULL)
3431 continue;
3432
3433#if 0
3434 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3435 nid,
3436 ei_start, ei_last, goal, limit, size,
3437 align, addr);
3438#endif
3439
3440 ptr = phys_to_virt(addr);
3441 memset(ptr, 0, size);
3442 reserve_early_without_check(addr, addr + size, "BOOTMEM");
3443 return ptr;
3444 }
3445
3446 return NULL;
3447}
3448#endif
3449
3450
3438void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3451void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3439{ 3452{
3440 int i; 3453 int i;
@@ -4377,8 +4390,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4377 for (i = 0; i < MAX_NR_ZONES; i++) { 4390 for (i = 0; i < MAX_NR_ZONES; i++) {
4378 if (i == ZONE_MOVABLE) 4391 if (i == ZONE_MOVABLE)
4379 continue; 4392 continue;
4380 printk(" %-8s %0#10lx -> %0#10lx\n", 4393 printk(" %-8s ", zone_names[i]);
4381 zone_names[i], 4394 if (arch_zone_lowest_possible_pfn[i] ==
4395 arch_zone_highest_possible_pfn[i])
4396 printk("empty\n");
4397 else
4398 printk("%0#10lx -> %0#10lx\n",
4382 arch_zone_lowest_possible_pfn[i], 4399 arch_zone_lowest_possible_pfn[i],
4383 arch_zone_highest_possible_pfn[i]); 4400 arch_zone_highest_possible_pfn[i]);
4384 } 4401 }
@@ -4467,7 +4484,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4467} 4484}
4468 4485
4469#ifndef CONFIG_NEED_MULTIPLE_NODES 4486#ifndef CONFIG_NEED_MULTIPLE_NODES
4470struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; 4487struct pglist_data __refdata contig_page_data = {
4488#ifndef CONFIG_NO_BOOTMEM
4489 .bdata = &bootmem_node_data[0]
4490#endif
4491 };
4471EXPORT_SYMBOL(contig_page_data); 4492EXPORT_SYMBOL(contig_page_data);
4472#endif 4493#endif
4473 4494
@@ -4810,10 +4831,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4810 if (!write || (ret == -EINVAL)) 4831 if (!write || (ret == -EINVAL))
4811 return ret; 4832 return ret;
4812 for_each_populated_zone(zone) { 4833 for_each_populated_zone(zone) {
4813 for_each_online_cpu(cpu) { 4834 for_each_possible_cpu(cpu) {
4814 unsigned long high; 4835 unsigned long high;
4815 high = zone->present_pages / percpu_pagelist_fraction; 4836 high = zone->present_pages / percpu_pagelist_fraction;
4816 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4837 setup_pagelist_highmark(
4838 per_cpu_ptr(zone->pageset, cpu), high);
4817 } 4839 }
4818 } 4840 }
4819 return 0; 4841 return 0;
@@ -5159,3 +5181,80 @@ bool is_free_buddy_page(struct page *page)
5159 return order < MAX_ORDER; 5181 return order < MAX_ORDER;
5160} 5182}
5161#endif 5183#endif
5184
5185static struct trace_print_flags pageflag_names[] = {
5186 {1UL << PG_locked, "locked" },
5187 {1UL << PG_error, "error" },
5188 {1UL << PG_referenced, "referenced" },
5189 {1UL << PG_uptodate, "uptodate" },
5190 {1UL << PG_dirty, "dirty" },
5191 {1UL << PG_lru, "lru" },
5192 {1UL << PG_active, "active" },
5193 {1UL << PG_slab, "slab" },
5194 {1UL << PG_owner_priv_1, "owner_priv_1" },
5195 {1UL << PG_arch_1, "arch_1" },
5196 {1UL << PG_reserved, "reserved" },
5197 {1UL << PG_private, "private" },
5198 {1UL << PG_private_2, "private_2" },
5199 {1UL << PG_writeback, "writeback" },
5200#ifdef CONFIG_PAGEFLAGS_EXTENDED
5201 {1UL << PG_head, "head" },
5202 {1UL << PG_tail, "tail" },
5203#else
5204 {1UL << PG_compound, "compound" },
5205#endif
5206 {1UL << PG_swapcache, "swapcache" },
5207 {1UL << PG_mappedtodisk, "mappedtodisk" },
5208 {1UL << PG_reclaim, "reclaim" },
5209 {1UL << PG_buddy, "buddy" },
5210 {1UL << PG_swapbacked, "swapbacked" },
5211 {1UL << PG_unevictable, "unevictable" },
5212#ifdef CONFIG_MMU
5213 {1UL << PG_mlocked, "mlocked" },
5214#endif
5215#ifdef CONFIG_ARCH_USES_PG_UNCACHED
5216 {1UL << PG_uncached, "uncached" },
5217#endif
5218#ifdef CONFIG_MEMORY_FAILURE
5219 {1UL << PG_hwpoison, "hwpoison" },
5220#endif
5221 {-1UL, NULL },
5222};
5223
5224static void dump_page_flags(unsigned long flags)
5225{
5226 const char *delim = "";
5227 unsigned long mask;
5228 int i;
5229
5230 printk(KERN_ALERT "page flags: %#lx(", flags);
5231
5232 /* remove zone id */
5233 flags &= (1UL << NR_PAGEFLAGS) - 1;
5234
5235 for (i = 0; pageflag_names[i].name && flags; i++) {
5236
5237 mask = pageflag_names[i].mask;
5238 if ((flags & mask) != mask)
5239 continue;
5240
5241 flags &= ~mask;
5242 printk("%s%s", delim, pageflag_names[i].name);
5243 delim = "|";
5244 }
5245
5246 /* check for left over flags */
5247 if (flags)
5248 printk("%s%#lx", delim, flags);
5249
5250 printk(")\n");
5251}
5252
5253void dump_page(struct page *page)
5254{
5255 printk(KERN_ALERT
5256 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5257 page, page_count(page), page_mapcount(page),
5258 page->mapping, page->index);
5259 dump_page_flags(page->flags);
5260}
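
The dump_page_flags() helper added above walks a sentinel-terminated table of {mask, name} pairs, clears each matched mask, and prints whatever bits remain as a raw number. The same table-driven decoding is easy to demonstrate outside the kernel; the following is a minimal standalone C sketch with made-up flag names and values (not the kernel's PG_* bits), shown only to illustrate the loop:

#include <stdio.h>

struct flag_name {
        unsigned long mask;
        const char *name;
};

/* Made-up flag table, terminated by a NULL name like pageflag_names[]. */
static const struct flag_name names[] = {
        { 1UL << 0, "locked"   },
        { 1UL << 1, "dirty"    },
        { 1UL << 2, "uptodate" },
        { -1UL,     NULL       },
};

static void dump_flags(unsigned long flags)
{
        const char *delim = "";
        int i;

        printf("flags: %#lx(", flags);
        for (i = 0; names[i].name && flags; i++) {
                unsigned long mask = names[i].mask;

                if ((flags & mask) != mask)
                        continue;
                flags &= ~mask;                 /* consume the printed bits */
                printf("%s%s", delim, names[i].name);
                delim = "|";
        }
        if (flags)                              /* whatever is left is unknown */
                printf("%s%#lx", delim, flags);
        printf(")\n");
}

int main(void)
{
        dump_flags((1UL << 0) | (1UL << 2) | (1UL << 5));
        return 0;
}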
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3d535d594826..6c0081441a32 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -284,6 +284,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex);
284struct swap_cgroup_ctrl { 284struct swap_cgroup_ctrl {
285 struct page **map; 285 struct page **map;
286 unsigned long length; 286 unsigned long length;
287 spinlock_t lock;
287}; 288};
288 289
289struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 290struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
@@ -335,6 +336,43 @@ not_enough_page:
335} 336}
336 337
337/** 338/**
339 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
340 * @ent: swap entry to be cmpxchged
341 * @old: old id
342 * @new: new id
343 *
344 * Returns old id at success, 0 at failure.
345 * (There is no mem_cgroup using 0 as its id)
346 */
347unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
348 unsigned short old, unsigned short new)
349{
350 int type = swp_type(ent);
351 unsigned long offset = swp_offset(ent);
352 unsigned long idx = offset / SC_PER_PAGE;
353 unsigned long pos = offset & SC_POS_MASK;
354 struct swap_cgroup_ctrl *ctrl;
355 struct page *mappage;
356 struct swap_cgroup *sc;
357 unsigned long flags;
358 unsigned short retval;
359
360 ctrl = &swap_cgroup_ctrl[type];
361
362 mappage = ctrl->map[idx];
363 sc = page_address(mappage);
364 sc += pos;
365 spin_lock_irqsave(&ctrl->lock, flags);
366 retval = sc->id;
367 if (retval == old)
368 sc->id = new;
369 else
370 retval = 0;
371 spin_unlock_irqrestore(&ctrl->lock, flags);
372 return retval;
373}
374
375/**
338 * swap_cgroup_record - record mem_cgroup for this swp_entry. 376 * swap_cgroup_record - record mem_cgroup for this swp_entry.
339 * @ent: swap entry to be recorded into 377 * @ent: swap entry to be recorded into
340 * @mem: mem_cgroup to be recorded 378 * @mem: mem_cgroup to be recorded
@@ -352,14 +390,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
352 struct page *mappage; 390 struct page *mappage;
353 struct swap_cgroup *sc; 391 struct swap_cgroup *sc;
354 unsigned short old; 392 unsigned short old;
393 unsigned long flags;
355 394
356 ctrl = &swap_cgroup_ctrl[type]; 395 ctrl = &swap_cgroup_ctrl[type];
357 396
358 mappage = ctrl->map[idx]; 397 mappage = ctrl->map[idx];
359 sc = page_address(mappage); 398 sc = page_address(mappage);
360 sc += pos; 399 sc += pos;
400 spin_lock_irqsave(&ctrl->lock, flags);
361 old = sc->id; 401 old = sc->id;
362 sc->id = id; 402 sc->id = id;
403 spin_unlock_irqrestore(&ctrl->lock, flags);
363 404
364 return old; 405 return old;
365} 406}
@@ -411,6 +452,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
411 mutex_lock(&swap_cgroup_mutex); 452 mutex_lock(&swap_cgroup_mutex);
412 ctrl->length = length; 453 ctrl->length = length;
413 ctrl->map = array; 454 ctrl->map = array;
455 spin_lock_init(&ctrl->lock);
414 if (swap_cgroup_prepare(type)) { 456 if (swap_cgroup_prepare(type)) {
415 /* memory shortage */ 457 /* memory shortage */
416 ctrl->map = NULL; 458 ctrl->map = NULL;
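
The page_cgroup.c hunks above add a per-type spinlock so that swap_cgroup_cmpxchg() and swap_cgroup_record() perform their read-modify-write of sc->id atomically with respect to each other. A rough userspace analogue of that locked compare-and-exchange, using a pthread mutex in place of spin_lock_irqsave() and a toy struct instead of the real swap_cgroup map, could look like this:

#include <pthread.h>
#include <stdio.h>

/* Toy record guarded by a lock, mirroring swap_cgroup_ctrl + swap_cgroup. */
struct entry {
        pthread_mutex_t lock;
        unsigned short id;
};

/* Returns the old id on success, 0 on failure, like swap_cgroup_cmpxchg(). */
static unsigned short entry_cmpxchg(struct entry *e,
                                    unsigned short old, unsigned short new)
{
        unsigned short ret;

        pthread_mutex_lock(&e->lock);
        ret = e->id;
        if (ret == old)
                e->id = new;    /* nobody raced with us, commit the update */
        else
                ret = 0;        /* lost the race, caller decides what to do */
        pthread_mutex_unlock(&e->lock);
        return ret;
}

int main(void)
{
        struct entry e = { PTHREAD_MUTEX_INITIALIZER, 5 };

        printf("cmpxchg 5->7: %d\n", entry_cmpxchg(&e, 5, 7));  /* prints 5 */
        printf("cmpxchg 5->9: %d\n", entry_cmpxchg(&e, 5, 9));  /* prints 0 */
        return 0;
}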
diff --git a/mm/percpu.c b/mm/percpu.c
index 083e7c91e5f6..768419d44ad7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,13 +80,15 @@
80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
81#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
82#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
83 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ 83 (void __percpu *)((unsigned long)(addr) - \
84 + (unsigned long)__per_cpu_start) 84 (unsigned long)pcpu_base_addr + \
85 (unsigned long)__per_cpu_start)
85#endif 86#endif
86#ifndef __pcpu_ptr_to_addr 87#ifndef __pcpu_ptr_to_addr
87#define __pcpu_ptr_to_addr(ptr) \ 88#define __pcpu_ptr_to_addr(ptr) \
88 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ 89 (void __force *)((unsigned long)(ptr) + \
89 - (unsigned long)__per_cpu_start) 90 (unsigned long)pcpu_base_addr - \
91 (unsigned long)__per_cpu_start)
90#endif 92#endif
91 93
92struct pcpu_chunk { 94struct pcpu_chunk {
@@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
913 int rs, re; 915 int rs, re;
914 916
915 /* quick path, check whether it's empty already */ 917 /* quick path, check whether it's empty already */
916 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 918 rs = page_start;
917 if (rs == page_start && re == page_end) 919 pcpu_next_unpop(chunk, &rs, &re, page_end);
918 return; 920 if (rs == page_start && re == page_end)
919 break; 921 return;
920 }
921 922
922 /* immutable chunks can't be depopulated */ 923 /* immutable chunks can't be depopulated */
923 WARN_ON(chunk->immutable); 924 WARN_ON(chunk->immutable);
@@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
968 int rs, re, rc; 969 int rs, re, rc;
969 970
970 /* quick path, check whether all pages are already there */ 971 /* quick path, check whether all pages are already there */
971 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { 972 rs = page_start;
972 if (rs == page_start && re == page_end) 973 pcpu_next_pop(chunk, &rs, &re, page_end);
973 goto clear; 974 if (rs == page_start && re == page_end)
974 break; 975 goto clear;
975 }
976 976
977 /* need to allocate and map pages, this chunk can't be immutable */ 977 /* need to allocate and map pages, this chunk can't be immutable */
978 WARN_ON(chunk->immutable); 978 WARN_ON(chunk->immutable);
@@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
1067 * RETURNS: 1067 * RETURNS:
1068 * Percpu pointer to the allocated area on success, NULL on failure. 1068 * Percpu pointer to the allocated area on success, NULL on failure.
1069 */ 1069 */
1070static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1070static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
1071{ 1071{
1072 static int warn_limit = 10; 1072 static int warn_limit = 10;
1073 struct pcpu_chunk *chunk; 1073 struct pcpu_chunk *chunk;
@@ -1196,7 +1196,7 @@ fail_unlock_mutex:
1196 * RETURNS: 1196 * RETURNS:
1197 * Percpu pointer to the allocated area on success, NULL on failure. 1197 * Percpu pointer to the allocated area on success, NULL on failure.
1198 */ 1198 */
1199void *__alloc_percpu(size_t size, size_t align) 1199void __percpu *__alloc_percpu(size_t size, size_t align)
1200{ 1200{
1201 return pcpu_alloc(size, align, false); 1201 return pcpu_alloc(size, align, false);
1202} 1202}
@@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
1217 * RETURNS: 1217 * RETURNS:
1218 * Percpu pointer to the allocated area on success, NULL on failure. 1218 * Percpu pointer to the allocated area on success, NULL on failure.
1219 */ 1219 */
1220void *__alloc_reserved_percpu(size_t size, size_t align) 1220void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1221{ 1221{
1222 return pcpu_alloc(size, align, true); 1222 return pcpu_alloc(size, align, true);
1223} 1223}
@@ -1269,7 +1269,7 @@ static void pcpu_reclaim(struct work_struct *work)
1269 * CONTEXT: 1269 * CONTEXT:
1270 * Can be called from atomic context. 1270 * Can be called from atomic context.
1271 */ 1271 */
1272void free_percpu(void *ptr) 1272void free_percpu(void __percpu *ptr)
1273{ 1273{
1274 void *addr; 1274 void *addr;
1275 struct pcpu_chunk *chunk; 1275 struct pcpu_chunk *chunk;
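
Aside from trimming the quick paths to a single pcpu_next_unpop()/pcpu_next_pop() probe, the percpu.c changes mostly add __percpu/__force markers, which are sparse annotations that compile away in a normal build; the underlying __addr_to_pcpu_ptr()/__pcpu_ptr_to_addr() macros remain a plain offset translation between a chunk address and its __per_cpu_start-relative pointer. The round trip can be sketched in ordinary C with invented base and start values:

#include <assert.h>
#include <stdio.h>

/* Invented stand-ins for pcpu_base_addr and __per_cpu_start. */
static unsigned long pcpu_base_addr = 0x100000UL;
static unsigned long per_cpu_start = 0x2000UL;

static void *addr_to_pcpu_ptr(void *addr)
{
        return (void *)((unsigned long)addr - pcpu_base_addr + per_cpu_start);
}

static void *pcpu_ptr_to_addr(void *ptr)
{
        return (void *)((unsigned long)ptr + pcpu_base_addr - per_cpu_start);
}

int main(void)
{
        void *addr = (void *)0x100040UL;        /* an address inside the chunk */
        void *ptr = addr_to_pcpu_ptr(addr);

        assert(pcpu_ptr_to_addr(ptr) == addr);  /* the mapping is invertible */
        printf("chunk addr %p <-> percpu ptr %p\n", addr, ptr);
        return 0;
}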
diff --git a/mm/readahead.c b/mm/readahead.c
index 033bc135a41f..337b20e946f6 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -501,6 +501,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
501 if (!ra->ra_pages) 501 if (!ra->ra_pages)
502 return; 502 return;
503 503
504 /* be dumb */
505 if (filp->f_mode & FMODE_RANDOM) {
506 force_page_cache_readahead(mapping, filp, offset, req_size);
507 return;
508 }
509
504 /* do read-ahead */ 510 /* do read-ahead */
505 ondemand_readahead(mapping, ra, filp, false, offset, req_size); 511 ondemand_readahead(mapping, ra, filp, false, offset, req_size);
506} 512}
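
The readahead.c hunk makes page_cache_sync_readahead() skip the on-demand heuristics and force-read only the requested window when the struct file is marked FMODE_RANDOM; judging by the fadvise.c entry in the diffstat, that flag is expected to be set from userspace via posix_fadvise(POSIX_FADV_RANDOM). A hedged usage sketch (file path and offsets are purely illustrative):

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        int fd = open("/tmp/datafile", O_RDONLY);       /* example path */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Mark the file as randomly accessed; readahead turns "dumb". */
        if (posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM) != 0)
                fprintf(stderr, "posix_fadvise failed\n");

        /* Each read now pulls in roughly the requested window, no lookahead. */
        if (pread(fd, buf, sizeof(buf), 1024 * 1024) < 0)
                perror("pread");

        close(fd);
        return 0;
}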
diff --git a/mm/rmap.c b/mm/rmap.c
index 278cd277bdec..fcd593c9c997 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,7 @@
62#include "internal.h" 62#include "internal.h"
63 63
64static struct kmem_cache *anon_vma_cachep; 64static struct kmem_cache *anon_vma_cachep;
65static struct kmem_cache *anon_vma_chain_cachep;
65 66
66static inline struct anon_vma *anon_vma_alloc(void) 67static inline struct anon_vma *anon_vma_alloc(void)
67{ 68{
@@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma)
73 kmem_cache_free(anon_vma_cachep, anon_vma); 74 kmem_cache_free(anon_vma_cachep, anon_vma);
74} 75}
75 76
77static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
78{
79 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
80}
81
82void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
83{
84 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
85}
86
76/** 87/**
77 * anon_vma_prepare - attach an anon_vma to a memory region 88 * anon_vma_prepare - attach an anon_vma to a memory region
78 * @vma: the memory region in question 89 * @vma: the memory region in question
@@ -103,18 +114,23 @@ void anon_vma_free(struct anon_vma *anon_vma)
103int anon_vma_prepare(struct vm_area_struct *vma) 114int anon_vma_prepare(struct vm_area_struct *vma)
104{ 115{
105 struct anon_vma *anon_vma = vma->anon_vma; 116 struct anon_vma *anon_vma = vma->anon_vma;
117 struct anon_vma_chain *avc;
106 118
107 might_sleep(); 119 might_sleep();
108 if (unlikely(!anon_vma)) { 120 if (unlikely(!anon_vma)) {
109 struct mm_struct *mm = vma->vm_mm; 121 struct mm_struct *mm = vma->vm_mm;
110 struct anon_vma *allocated; 122 struct anon_vma *allocated;
111 123
124 avc = anon_vma_chain_alloc();
125 if (!avc)
126 goto out_enomem;
127
112 anon_vma = find_mergeable_anon_vma(vma); 128 anon_vma = find_mergeable_anon_vma(vma);
113 allocated = NULL; 129 allocated = NULL;
114 if (!anon_vma) { 130 if (!anon_vma) {
115 anon_vma = anon_vma_alloc(); 131 anon_vma = anon_vma_alloc();
116 if (unlikely(!anon_vma)) 132 if (unlikely(!anon_vma))
117 return -ENOMEM; 133 goto out_enomem_free_avc;
118 allocated = anon_vma; 134 allocated = anon_vma;
119 } 135 }
120 spin_lock(&anon_vma->lock); 136 spin_lock(&anon_vma->lock);
@@ -123,53 +139,113 @@ int anon_vma_prepare(struct vm_area_struct *vma)
123 spin_lock(&mm->page_table_lock); 139 spin_lock(&mm->page_table_lock);
124 if (likely(!vma->anon_vma)) { 140 if (likely(!vma->anon_vma)) {
125 vma->anon_vma = anon_vma; 141 vma->anon_vma = anon_vma;
126 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 142 avc->anon_vma = anon_vma;
143 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head);
127 allocated = NULL; 146 allocated = NULL;
128 } 147 }
129 spin_unlock(&mm->page_table_lock); 148 spin_unlock(&mm->page_table_lock);
130 149
131 spin_unlock(&anon_vma->lock); 150 spin_unlock(&anon_vma->lock);
132 if (unlikely(allocated)) 151 if (unlikely(allocated)) {
133 anon_vma_free(allocated); 152 anon_vma_free(allocated);
153 anon_vma_chain_free(avc);
154 }
134 } 155 }
135 return 0; 156 return 0;
157
158 out_enomem_free_avc:
159 anon_vma_chain_free(avc);
160 out_enomem:
161 return -ENOMEM;
136} 162}
137 163
138void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) 164static void anon_vma_chain_link(struct vm_area_struct *vma,
165 struct anon_vma_chain *avc,
166 struct anon_vma *anon_vma)
139{ 167{
140 BUG_ON(vma->anon_vma != next->anon_vma); 168 avc->vma = vma;
141 list_del(&next->anon_vma_node); 169 avc->anon_vma = anon_vma;
170 list_add(&avc->same_vma, &vma->anon_vma_chain);
171
172 spin_lock(&anon_vma->lock);
173 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
174 spin_unlock(&anon_vma->lock);
142} 175}
143 176
144void __anon_vma_link(struct vm_area_struct *vma) 177/*
178 * Attach the anon_vmas from src to dst.
179 * Returns 0 on success, -ENOMEM on failure.
180 */
181int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
145{ 182{
146 struct anon_vma *anon_vma = vma->anon_vma; 183 struct anon_vma_chain *avc, *pavc;
147 184
148 if (anon_vma) 185 list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
149 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 186 avc = anon_vma_chain_alloc();
187 if (!avc)
188 goto enomem_failure;
189 anon_vma_chain_link(dst, avc, pavc->anon_vma);
190 }
191 return 0;
192
193 enomem_failure:
194 unlink_anon_vmas(dst);
195 return -ENOMEM;
150} 196}
151 197
152void anon_vma_link(struct vm_area_struct *vma) 198/*
199 * Attach vma to its own anon_vma, as well as to the anon_vmas that
200 * the corresponding VMA in the parent process is attached to.
201 * Returns 0 on success, non-zero on failure.
202 */
203int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
153{ 204{
154 struct anon_vma *anon_vma = vma->anon_vma; 205 struct anon_vma_chain *avc;
206 struct anon_vma *anon_vma;
155 207
156 if (anon_vma) { 208 /* Don't bother if the parent process has no anon_vma here. */
157 spin_lock(&anon_vma->lock); 209 if (!pvma->anon_vma)
158 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 210 return 0;
159 spin_unlock(&anon_vma->lock); 211
160 } 212 /*
213 * First, attach the new VMA to the parent VMA's anon_vmas,
214 * so rmap can find non-COWed pages in child processes.
215 */
216 if (anon_vma_clone(vma, pvma))
217 return -ENOMEM;
218
219 /* Then add our own anon_vma. */
220 anon_vma = anon_vma_alloc();
221 if (!anon_vma)
222 goto out_error;
223 avc = anon_vma_chain_alloc();
224 if (!avc)
225 goto out_error_free_anon_vma;
226 anon_vma_chain_link(vma, avc, anon_vma);
227 /* Mark this anon_vma as the one where our new (COWed) pages go. */
228 vma->anon_vma = anon_vma;
229
230 return 0;
231
232 out_error_free_anon_vma:
233 anon_vma_free(anon_vma);
234 out_error:
235 return -ENOMEM;
161} 236}
162 237
163void anon_vma_unlink(struct vm_area_struct *vma) 238static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
164{ 239{
165 struct anon_vma *anon_vma = vma->anon_vma; 240 struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
166 int empty; 241 int empty;
167 242
243 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
168 if (!anon_vma) 244 if (!anon_vma)
169 return; 245 return;
170 246
171 spin_lock(&anon_vma->lock); 247 spin_lock(&anon_vma->lock);
172 list_del(&vma->anon_vma_node); 248 list_del(&anon_vma_chain->same_anon_vma);
173 249
174 /* We must garbage collect the anon_vma if it's empty */ 250 /* We must garbage collect the anon_vma if it's empty */
175 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); 251 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
@@ -179,6 +255,18 @@ void anon_vma_unlink(struct vm_area_struct *vma)
179 anon_vma_free(anon_vma); 255 anon_vma_free(anon_vma);
180} 256}
181 257
258void unlink_anon_vmas(struct vm_area_struct *vma)
259{
260 struct anon_vma_chain *avc, *next;
261
262 /* Unlink each anon_vma chained to the VMA. */
263 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
264 anon_vma_unlink(avc);
265 list_del(&avc->same_vma);
266 anon_vma_chain_free(avc);
267 }
268}
269
182static void anon_vma_ctor(void *data) 270static void anon_vma_ctor(void *data)
183{ 271{
184 struct anon_vma *anon_vma = data; 272 struct anon_vma *anon_vma = data;
@@ -192,6 +280,7 @@ void __init anon_vma_init(void)
192{ 280{
193 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 281 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
194 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 282 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
283 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
195} 284}
196 285
197/* 286/*
@@ -396,7 +485,7 @@ static int page_referenced_anon(struct page *page,
396{ 485{
397 unsigned int mapcount; 486 unsigned int mapcount;
398 struct anon_vma *anon_vma; 487 struct anon_vma *anon_vma;
399 struct vm_area_struct *vma; 488 struct anon_vma_chain *avc;
400 int referenced = 0; 489 int referenced = 0;
401 490
402 anon_vma = page_lock_anon_vma(page); 491 anon_vma = page_lock_anon_vma(page);
@@ -404,7 +493,8 @@ static int page_referenced_anon(struct page *page,
404 return referenced; 493 return referenced;
405 494
406 mapcount = page_mapcount(page); 495 mapcount = page_mapcount(page);
407 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 496 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
497 struct vm_area_struct *vma = avc->vma;
408 unsigned long address = vma_address(page, vma); 498 unsigned long address = vma_address(page, vma);
409 if (address == -EFAULT) 499 if (address == -EFAULT)
410 continue; 500 continue;
@@ -511,9 +601,6 @@ int page_referenced(struct page *page,
511 int referenced = 0; 601 int referenced = 0;
512 int we_locked = 0; 602 int we_locked = 0;
513 603
514 if (TestClearPageReferenced(page))
515 referenced++;
516
517 *vm_flags = 0; 604 *vm_flags = 0;
518 if (page_mapped(page) && page_rmapping(page)) { 605 if (page_mapped(page) && page_rmapping(page)) {
519 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 606 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
@@ -614,6 +701,30 @@ int page_mkclean(struct page *page)
614EXPORT_SYMBOL_GPL(page_mkclean); 701EXPORT_SYMBOL_GPL(page_mkclean);
615 702
616/** 703/**
704 * page_move_anon_rmap - move a page to our anon_vma
705 * @page: the page to move to our anon_vma
706 * @vma: the vma the page belongs to
707 * @address: the user virtual address mapped
708 *
709 * When a page belongs exclusively to one process after a COW event,
710 * that page can be moved into the anon_vma that belongs to just that
711 * process, so the rmap code will not search the parent or sibling
712 * processes.
713 */
714void page_move_anon_rmap(struct page *page,
715 struct vm_area_struct *vma, unsigned long address)
716{
717 struct anon_vma *anon_vma = vma->anon_vma;
718
719 VM_BUG_ON(!PageLocked(page));
720 VM_BUG_ON(!anon_vma);
721 VM_BUG_ON(page->index != linear_page_index(vma, address));
722
723 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
724 page->mapping = (struct address_space *) anon_vma;
725}
726
727/**
617 * __page_set_anon_rmap - setup new anonymous rmap 728 * __page_set_anon_rmap - setup new anonymous rmap
618 * @page: the page to add the mapping to 729 * @page: the page to add the mapping to
619 * @vma: the vm area in which the mapping is added 730 * @vma: the vm area in which the mapping is added
@@ -652,9 +763,6 @@ static void __page_check_anon_rmap(struct page *page,
652 * are initially only visible via the pagetables, and the pte is locked 763 * are initially only visible via the pagetables, and the pte is locked
653 * over the call to page_add_new_anon_rmap. 764 * over the call to page_add_new_anon_rmap.
654 */ 765 */
655 struct anon_vma *anon_vma = vma->anon_vma;
656 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
657 BUG_ON(page->mapping != (struct address_space *)anon_vma);
658 BUG_ON(page->index != linear_page_index(vma, address)); 766 BUG_ON(page->index != linear_page_index(vma, address));
659#endif 767#endif
660} 768}
@@ -815,9 +923,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
815 923
816 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 924 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
817 if (PageAnon(page)) 925 if (PageAnon(page))
818 dec_mm_counter(mm, anon_rss); 926 dec_mm_counter(mm, MM_ANONPAGES);
819 else 927 else
820 dec_mm_counter(mm, file_rss); 928 dec_mm_counter(mm, MM_FILEPAGES);
821 set_pte_at(mm, address, pte, 929 set_pte_at(mm, address, pte,
822 swp_entry_to_pte(make_hwpoison_entry(page))); 930 swp_entry_to_pte(make_hwpoison_entry(page)));
823 } else if (PageAnon(page)) { 931 } else if (PageAnon(page)) {
@@ -839,7 +947,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
839 list_add(&mm->mmlist, &init_mm.mmlist); 947 list_add(&mm->mmlist, &init_mm.mmlist);
840 spin_unlock(&mmlist_lock); 948 spin_unlock(&mmlist_lock);
841 } 949 }
842 dec_mm_counter(mm, anon_rss); 950 dec_mm_counter(mm, MM_ANONPAGES);
951 inc_mm_counter(mm, MM_SWAPENTS);
843 } else if (PAGE_MIGRATION) { 952 } else if (PAGE_MIGRATION) {
844 /* 953 /*
845 * Store the pfn of the page in a special migration 954 * Store the pfn of the page in a special migration
@@ -857,7 +966,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
857 entry = make_migration_entry(page, pte_write(pteval)); 966 entry = make_migration_entry(page, pte_write(pteval));
858 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 967 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
859 } else 968 } else
860 dec_mm_counter(mm, file_rss); 969 dec_mm_counter(mm, MM_FILEPAGES);
861 970
862 page_remove_rmap(page); 971 page_remove_rmap(page);
863 page_cache_release(page); 972 page_cache_release(page);
@@ -996,7 +1105,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
996 1105
997 page_remove_rmap(page); 1106 page_remove_rmap(page);
998 page_cache_release(page); 1107 page_cache_release(page);
999 dec_mm_counter(mm, file_rss); 1108 dec_mm_counter(mm, MM_FILEPAGES);
1000 (*mapcount)--; 1109 (*mapcount)--;
1001 } 1110 }
1002 pte_unmap_unlock(pte - 1, ptl); 1111 pte_unmap_unlock(pte - 1, ptl);
@@ -1024,14 +1133,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1024static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1133static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1025{ 1134{
1026 struct anon_vma *anon_vma; 1135 struct anon_vma *anon_vma;
1027 struct vm_area_struct *vma; 1136 struct anon_vma_chain *avc;
1028 int ret = SWAP_AGAIN; 1137 int ret = SWAP_AGAIN;
1029 1138
1030 anon_vma = page_lock_anon_vma(page); 1139 anon_vma = page_lock_anon_vma(page);
1031 if (!anon_vma) 1140 if (!anon_vma)
1032 return ret; 1141 return ret;
1033 1142
1034 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1143 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1144 struct vm_area_struct *vma = avc->vma;
1035 unsigned long address = vma_address(page, vma); 1145 unsigned long address = vma_address(page, vma);
1036 if (address == -EFAULT) 1146 if (address == -EFAULT)
1037 continue; 1147 continue;
@@ -1222,7 +1332,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1222 struct vm_area_struct *, unsigned long, void *), void *arg) 1332 struct vm_area_struct *, unsigned long, void *), void *arg)
1223{ 1333{
1224 struct anon_vma *anon_vma; 1334 struct anon_vma *anon_vma;
1225 struct vm_area_struct *vma; 1335 struct anon_vma_chain *avc;
1226 int ret = SWAP_AGAIN; 1336 int ret = SWAP_AGAIN;
1227 1337
1228 /* 1338 /*
@@ -1237,7 +1347,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1237 if (!anon_vma) 1347 if (!anon_vma)
1238 return ret; 1348 return ret;
1239 spin_lock(&anon_vma->lock); 1349 spin_lock(&anon_vma->lock);
1240 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1350 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1351 struct vm_area_struct *vma = avc->vma;
1241 unsigned long address = vma_address(page, vma); 1352 unsigned long address = vma_address(page, vma);
1242 if (address == -EFAULT) 1353 if (address == -EFAULT)
1243 continue; 1354 continue;
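
The rmap.c rework replaces the single vma->anon_vma_node link with anon_vma_chain elements, each of which sits on two lists at once: same_vma (every anon_vma a VMA is attached to) and same_anon_vma (every VMA attached to an anon_vma). That many-to-many linkage is what lets anon_vma_fork() attach a child VMA both to the parent's anon_vmas and to its own. It can be modelled in a few dozen lines of standalone C; the structure names below mirror the kernel's, but the code is only a sketch, not the kernel implementation:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct list { struct list *next, *prev; };

static void list_init(struct list *l) { l->next = l->prev = l; }

static void list_add(struct list *n, struct list *head)
{
        n->next = head->next;
        n->prev = head;
        head->next->prev = n;
        head->next = n;
}

struct vma      { struct list anon_vma_chain; const char *name; };
struct anon_vma { struct list head;           const char *name; };

/* One chain element joins one vma to one anon_vma and sits on both lists. */
struct anon_vma_chain {
        struct vma *vma;
        struct anon_vma *anon_vma;
        struct list same_vma;           /* linked into vma->anon_vma_chain */
        struct list same_anon_vma;      /* linked into anon_vma->head */
};

static void chain_link(struct vma *vma, struct anon_vma *av)
{
        struct anon_vma_chain *avc = malloc(sizeof(*avc));

        if (!avc)
                exit(1);
        avc->vma = vma;
        avc->anon_vma = av;
        list_add(&avc->same_vma, &vma->anon_vma_chain);
        list_add(&avc->same_anon_vma, &av->head);
}

int main(void)
{
        struct vma child = { .name = "child vma" };
        struct anon_vma parent_av = { .name = "parent anon_vma" };
        struct anon_vma own_av = { .name = "own anon_vma" };
        struct list *pos;

        list_init(&child.anon_vma_chain);
        list_init(&parent_av.head);
        list_init(&own_av.head);

        /* Like anon_vma_fork(): link to the parent's anon_vma, then our own. */
        chain_link(&child, &parent_av);
        chain_link(&child, &own_av);

        for (pos = child.anon_vma_chain.next; pos != &child.anon_vma_chain;
             pos = pos->next) {
                struct anon_vma_chain *avc = (void *)((char *)pos -
                                offsetof(struct anon_vma_chain, same_vma));
                printf("%s is attached to %s\n", avc->vma->name,
                       avc->anon_vma->name);
        }
        return 0;
}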
diff --git a/mm/slab.c b/mm/slab.c
index 7451bdacaf18..a9f325b28bed 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to,
935 935
936 from->avail -= nr; 936 from->avail -= nr;
937 to->avail += nr; 937 to->avail += nr;
938 to->touched = 1;
939 return nr; 938 return nr;
940} 939}
941 940
@@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
983 982
984 if (limit > 1) 983 if (limit > 1)
985 limit = 12; 984 limit = 12;
986 ac_ptr = kmalloc_node(memsize, gfp, node); 985 ac_ptr = kzalloc_node(memsize, gfp, node);
987 if (ac_ptr) { 986 if (ac_ptr) {
988 for_each_node(i) { 987 for_each_node(i) {
989 if (i == node || !node_online(i)) { 988 if (i == node || !node_online(i))
990 ac_ptr[i] = NULL;
991 continue; 989 continue;
992 }
993 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); 990 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
994 if (!ac_ptr[i]) { 991 if (!ac_ptr[i]) {
995 for (i--; i >= 0; i--) 992 for (i--; i >= 0; i--)
@@ -2963,8 +2960,10 @@ retry:
2963 spin_lock(&l3->list_lock); 2960 spin_lock(&l3->list_lock);
2964 2961
2965 /* See if we can refill from the shared array */ 2962 /* See if we can refill from the shared array */
2966 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) 2963 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
2964 l3->shared->touched = 1;
2967 goto alloc_done; 2965 goto alloc_done;
2966 }
2968 2967
2969 while (batchcount > 0) { 2968 while (batchcount > 0) {
2970 struct list_head *entry; 2969 struct list_head *entry;
@@ -3101,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3101 if (cachep == &cache_cache) 3100 if (cachep == &cache_cache)
3102 return false; 3101 return false;
3103 3102
3104 return should_failslab(obj_size(cachep), flags); 3103 return should_failslab(obj_size(cachep), flags, cachep->flags);
3105} 3104}
3106 3105
3107static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3106static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
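
The alloc_alien_cache() hunk above swaps kmalloc_node() plus hand-NULLing of skipped slots for kzalloc_node(), so the pointer array starts out zeroed and only the entries that are actually allocated need to be written. A trivial userspace illustration of the same simplification, with calloc() standing in for kzalloc_node():

#include <stdlib.h>

#define N 8

/* Before: allocate, then explicitly NULL every slot we might skip. */
static void **alloc_explicit(void)
{
        void **p = malloc(N * sizeof(*p));
        int i;

        if (!p)
                return NULL;
        for (i = 0; i < N; i++)
                p[i] = NULL;    /* easy to forget when slots are conditional */
        return p;
}

/* After: calloc() (kzalloc_node() in the kernel) returns zeroed memory,
 * so only the slots that are really populated need to be written. */
static void **alloc_zeroed(void)
{
        return calloc(N, sizeof(void *));
}

int main(void)
{
        free(alloc_explicit());
        free(alloc_zeroed());
        return 0;
}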
diff --git a/mm/slub.c b/mm/slub.c
index 8d71aaf888d7..b364844a1068 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -151,7 +151,8 @@
151 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
152 */ 152 */
153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) 154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
155 SLAB_FAILSLAB)
155 156
156#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
157 SLAB_CACHE_DMA | SLAB_NOTRACK) 158 SLAB_CACHE_DMA | SLAB_NOTRACK)
@@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
217 218
218#endif 219#endif
219 220
220static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) 221static inline void stat(struct kmem_cache *s, enum stat_item si)
221{ 222{
222#ifdef CONFIG_SLUB_STATS 223#ifdef CONFIG_SLUB_STATS
223 c->stat[si]++; 224 __this_cpu_inc(s->cpu_slab->stat[si]);
224#endif 225#endif
225} 226}
226 227
@@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
242#endif 243#endif
243} 244}
244 245
245static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
246{
247#ifdef CONFIG_SMP
248 return s->cpu_slab[cpu];
249#else
250 return &s->cpu_slab;
251#endif
252}
253
254/* Verify that a pointer has an address that is valid within a slab page */ 246/* Verify that a pointer has an address that is valid within a slab page */
255static inline int check_valid_pointer(struct kmem_cache *s, 247static inline int check_valid_pointer(struct kmem_cache *s,
256 struct page *page, const void *object) 248 struct page *page, const void *object)
@@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s,
269 return 1; 261 return 1;
270} 262}
271 263
272/*
273 * Slow version of get and set free pointer.
274 *
275 * This version requires touching the cache lines of kmem_cache which
276 * we avoid to do in the fast alloc free paths. There we obtain the offset
277 * from the page struct.
278 */
279static inline void *get_freepointer(struct kmem_cache *s, void *object) 264static inline void *get_freepointer(struct kmem_cache *s, void *object)
280{ 265{
281 return *(void **)(object + s->offset); 266 return *(void **)(object + s->offset);
@@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str)
1020 case 't': 1005 case 't':
1021 slub_debug |= SLAB_TRACE; 1006 slub_debug |= SLAB_TRACE;
1022 break; 1007 break;
1008 case 'a':
1009 slub_debug |= SLAB_FAILSLAB;
1010 break;
1023 default: 1011 default:
1024 printk(KERN_ERR "slub_debug option '%c' " 1012 printk(KERN_ERR "slub_debug option '%c' "
1025 "unknown. skipped\n", *str); 1013 "unknown. skipped\n", *str);
@@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1124 if (!page) 1112 if (!page)
1125 return NULL; 1113 return NULL;
1126 1114
1127 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1115 stat(s, ORDER_FALLBACK);
1128 } 1116 }
1129 1117
1130 if (kmemcheck_enabled 1118 if (kmemcheck_enabled
@@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1422static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1410static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1423{ 1411{
1424 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1412 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1425 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1426 1413
1427 __ClearPageSlubFrozen(page); 1414 __ClearPageSlubFrozen(page);
1428 if (page->inuse) { 1415 if (page->inuse) {
1429 1416
1430 if (page->freelist) { 1417 if (page->freelist) {
1431 add_partial(n, page, tail); 1418 add_partial(n, page, tail);
1432 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1419 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1433 } else { 1420 } else {
1434 stat(c, DEACTIVATE_FULL); 1421 stat(s, DEACTIVATE_FULL);
1435 if (SLABDEBUG && PageSlubDebug(page) && 1422 if (SLABDEBUG && PageSlubDebug(page) &&
1436 (s->flags & SLAB_STORE_USER)) 1423 (s->flags & SLAB_STORE_USER))
1437 add_full(n, page); 1424 add_full(n, page);
1438 } 1425 }
1439 slab_unlock(page); 1426 slab_unlock(page);
1440 } else { 1427 } else {
1441 stat(c, DEACTIVATE_EMPTY); 1428 stat(s, DEACTIVATE_EMPTY);
1442 if (n->nr_partial < s->min_partial) { 1429 if (n->nr_partial < s->min_partial) {
1443 /* 1430 /*
1444 * Adding an empty slab to the partial slabs in order 1431 * Adding an empty slab to the partial slabs in order
@@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1454 slab_unlock(page); 1441 slab_unlock(page);
1455 } else { 1442 } else {
1456 slab_unlock(page); 1443 slab_unlock(page);
1457 stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); 1444 stat(s, FREE_SLAB);
1458 discard_slab(s, page); 1445 discard_slab(s, page);
1459 } 1446 }
1460 } 1447 }
@@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1469 int tail = 1; 1456 int tail = 1;
1470 1457
1471 if (page->freelist) 1458 if (page->freelist)
1472 stat(c, DEACTIVATE_REMOTE_FREES); 1459 stat(s, DEACTIVATE_REMOTE_FREES);
1473 /* 1460 /*
1474 * Merge cpu freelist into slab freelist. Typically we get here 1461 * Merge cpu freelist into slab freelist. Typically we get here
1475 * because both freelists are empty. So this is unlikely 1462 * because both freelists are empty. So this is unlikely
@@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1482 1469
1483 /* Retrieve object from cpu_freelist */ 1470 /* Retrieve object from cpu_freelist */
1484 object = c->freelist; 1471 object = c->freelist;
1485 c->freelist = c->freelist[c->offset]; 1472 c->freelist = get_freepointer(s, c->freelist);
1486 1473
1487 /* And put onto the regular freelist */ 1474 /* And put onto the regular freelist */
1488 object[c->offset] = page->freelist; 1475 set_freepointer(s, object, page->freelist);
1489 page->freelist = object; 1476 page->freelist = object;
1490 page->inuse--; 1477 page->inuse--;
1491 } 1478 }
@@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1495 1482
1496static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1483static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1497{ 1484{
1498 stat(c, CPUSLAB_FLUSH); 1485 stat(s, CPUSLAB_FLUSH);
1499 slab_lock(c->page); 1486 slab_lock(c->page);
1500 deactivate_slab(s, c); 1487 deactivate_slab(s, c);
1501} 1488}
@@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1507 */ 1494 */
1508static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1495static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1509{ 1496{
1510 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1497 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1511 1498
1512 if (likely(c && c->page)) 1499 if (likely(c && c->page))
1513 flush_slab(s, c); 1500 flush_slab(s, c);
@@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1635 if (unlikely(!node_match(c, node))) 1622 if (unlikely(!node_match(c, node)))
1636 goto another_slab; 1623 goto another_slab;
1637 1624
1638 stat(c, ALLOC_REFILL); 1625 stat(s, ALLOC_REFILL);
1639 1626
1640load_freelist: 1627load_freelist:
1641 object = c->page->freelist; 1628 object = c->page->freelist;
@@ -1644,13 +1631,13 @@ load_freelist:
1644 if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) 1631 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1645 goto debug; 1632 goto debug;
1646 1633
1647 c->freelist = object[c->offset]; 1634 c->freelist = get_freepointer(s, object);
1648 c->page->inuse = c->page->objects; 1635 c->page->inuse = c->page->objects;
1649 c->page->freelist = NULL; 1636 c->page->freelist = NULL;
1650 c->node = page_to_nid(c->page); 1637 c->node = page_to_nid(c->page);
1651unlock_out: 1638unlock_out:
1652 slab_unlock(c->page); 1639 slab_unlock(c->page);
1653 stat(c, ALLOC_SLOWPATH); 1640 stat(s, ALLOC_SLOWPATH);
1654 return object; 1641 return object;
1655 1642
1656another_slab: 1643another_slab:
@@ -1660,7 +1647,7 @@ new_slab:
1660 new = get_partial(s, gfpflags, node); 1647 new = get_partial(s, gfpflags, node);
1661 if (new) { 1648 if (new) {
1662 c->page = new; 1649 c->page = new;
1663 stat(c, ALLOC_FROM_PARTIAL); 1650 stat(s, ALLOC_FROM_PARTIAL);
1664 goto load_freelist; 1651 goto load_freelist;
1665 } 1652 }
1666 1653
@@ -1673,8 +1660,8 @@ new_slab:
1673 local_irq_disable(); 1660 local_irq_disable();
1674 1661
1675 if (new) { 1662 if (new) {
1676 c = get_cpu_slab(s, smp_processor_id()); 1663 c = __this_cpu_ptr(s->cpu_slab);
1677 stat(c, ALLOC_SLAB); 1664 stat(s, ALLOC_SLAB);
1678 if (c->page) 1665 if (c->page)
1679 flush_slab(s, c); 1666 flush_slab(s, c);
1680 slab_lock(new); 1667 slab_lock(new);
@@ -1690,7 +1677,7 @@ debug:
1690 goto another_slab; 1677 goto another_slab;
1691 1678
1692 c->page->inuse++; 1679 c->page->inuse++;
1693 c->page->freelist = object[c->offset]; 1680 c->page->freelist = get_freepointer(s, object);
1694 c->node = -1; 1681 c->node = -1;
1695 goto unlock_out; 1682 goto unlock_out;
1696} 1683}
@@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1711 void **object; 1698 void **object;
1712 struct kmem_cache_cpu *c; 1699 struct kmem_cache_cpu *c;
1713 unsigned long flags; 1700 unsigned long flags;
1714 unsigned int objsize;
1715 1701
1716 gfpflags &= gfp_allowed_mask; 1702 gfpflags &= gfp_allowed_mask;
1717 1703
1718 lockdep_trace_alloc(gfpflags); 1704 lockdep_trace_alloc(gfpflags);
1719 might_sleep_if(gfpflags & __GFP_WAIT); 1705 might_sleep_if(gfpflags & __GFP_WAIT);
1720 1706
1721 if (should_failslab(s->objsize, gfpflags)) 1707 if (should_failslab(s->objsize, gfpflags, s->flags))
1722 return NULL; 1708 return NULL;
1723 1709
1724 local_irq_save(flags); 1710 local_irq_save(flags);
1725 c = get_cpu_slab(s, smp_processor_id()); 1711 c = __this_cpu_ptr(s->cpu_slab);
1726 objsize = c->objsize; 1712 object = c->freelist;
1727 if (unlikely(!c->freelist || !node_match(c, node))) 1713 if (unlikely(!object || !node_match(c, node)))
1728 1714
1729 object = __slab_alloc(s, gfpflags, node, addr, c); 1715 object = __slab_alloc(s, gfpflags, node, addr, c);
1730 1716
1731 else { 1717 else {
1732 object = c->freelist; 1718 c->freelist = get_freepointer(s, object);
1733 c->freelist = object[c->offset]; 1719 stat(s, ALLOC_FASTPATH);
1734 stat(c, ALLOC_FASTPATH);
1735 } 1720 }
1736 local_irq_restore(flags); 1721 local_irq_restore(flags);
1737 1722
1738 if (unlikely(gfpflags & __GFP_ZERO) && object) 1723 if (unlikely(gfpflags & __GFP_ZERO) && object)
1739 memset(object, 0, objsize); 1724 memset(object, 0, s->objsize);
1740 1725
1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); 1726 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize);
1742 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); 1727 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1743 1728
1744 return object; 1729 return object;
1745} 1730}
@@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1794 * handling required then we can return immediately. 1779 * handling required then we can return immediately.
1795 */ 1780 */
1796static void __slab_free(struct kmem_cache *s, struct page *page, 1781static void __slab_free(struct kmem_cache *s, struct page *page,
1797 void *x, unsigned long addr, unsigned int offset) 1782 void *x, unsigned long addr)
1798{ 1783{
1799 void *prior; 1784 void *prior;
1800 void **object = (void *)x; 1785 void **object = (void *)x;
1801 struct kmem_cache_cpu *c;
1802 1786
1803 c = get_cpu_slab(s, raw_smp_processor_id()); 1787 stat(s, FREE_SLOWPATH);
1804 stat(c, FREE_SLOWPATH);
1805 slab_lock(page); 1788 slab_lock(page);
1806 1789
1807 if (unlikely(SLABDEBUG && PageSlubDebug(page))) 1790 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1808 goto debug; 1791 goto debug;
1809 1792
1810checks_ok: 1793checks_ok:
1811 prior = object[offset] = page->freelist; 1794 prior = page->freelist;
1795 set_freepointer(s, object, prior);
1812 page->freelist = object; 1796 page->freelist = object;
1813 page->inuse--; 1797 page->inuse--;
1814 1798
1815 if (unlikely(PageSlubFrozen(page))) { 1799 if (unlikely(PageSlubFrozen(page))) {
1816 stat(c, FREE_FROZEN); 1800 stat(s, FREE_FROZEN);
1817 goto out_unlock; 1801 goto out_unlock;
1818 } 1802 }
1819 1803
@@ -1826,7 +1810,7 @@ checks_ok:
1826 */ 1810 */
1827 if (unlikely(!prior)) { 1811 if (unlikely(!prior)) {
1828 add_partial(get_node(s, page_to_nid(page)), page, 1); 1812 add_partial(get_node(s, page_to_nid(page)), page, 1);
1829 stat(c, FREE_ADD_PARTIAL); 1813 stat(s, FREE_ADD_PARTIAL);
1830 } 1814 }
1831 1815
1832out_unlock: 1816out_unlock:
@@ -1839,10 +1823,10 @@ slab_empty:
1839 * Slab still on the partial list. 1823 * Slab still on the partial list.
1840 */ 1824 */
1841 remove_partial(s, page); 1825 remove_partial(s, page);
1842 stat(c, FREE_REMOVE_PARTIAL); 1826 stat(s, FREE_REMOVE_PARTIAL);
1843 } 1827 }
1844 slab_unlock(page); 1828 slab_unlock(page);
1845 stat(c, FREE_SLAB); 1829 stat(s, FREE_SLAB);
1846 discard_slab(s, page); 1830 discard_slab(s, page);
1847 return; 1831 return;
1848 1832
@@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s,
1872 1856
1873 kmemleak_free_recursive(x, s->flags); 1857 kmemleak_free_recursive(x, s->flags);
1874 local_irq_save(flags); 1858 local_irq_save(flags);
1875 c = get_cpu_slab(s, smp_processor_id()); 1859 c = __this_cpu_ptr(s->cpu_slab);
1876 kmemcheck_slab_free(s, object, c->objsize); 1860 kmemcheck_slab_free(s, object, s->objsize);
1877 debug_check_no_locks_freed(object, c->objsize); 1861 debug_check_no_locks_freed(object, s->objsize);
1878 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1862 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1879 debug_check_no_obj_freed(object, c->objsize); 1863 debug_check_no_obj_freed(object, s->objsize);
1880 if (likely(page == c->page && c->node >= 0)) { 1864 if (likely(page == c->page && c->node >= 0)) {
1881 object[c->offset] = c->freelist; 1865 set_freepointer(s, object, c->freelist);
1882 c->freelist = object; 1866 c->freelist = object;
1883 stat(c, FREE_FASTPATH); 1867 stat(s, FREE_FASTPATH);
1884 } else 1868 } else
1885 __slab_free(s, page, x, addr, c->offset); 1869 __slab_free(s, page, x, addr);
1886 1870
1887 local_irq_restore(flags); 1871 local_irq_restore(flags);
1888} 1872}
@@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags,
2069 return ALIGN(align, sizeof(void *)); 2053 return ALIGN(align, sizeof(void *));
2070} 2054}
2071 2055
2072static void init_kmem_cache_cpu(struct kmem_cache *s,
2073 struct kmem_cache_cpu *c)
2074{
2075 c->page = NULL;
2076 c->freelist = NULL;
2077 c->node = 0;
2078 c->offset = s->offset / sizeof(void *);
2079 c->objsize = s->objsize;
2080#ifdef CONFIG_SLUB_STATS
2081 memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
2082#endif
2083}
2084
2085static void 2056static void
2086init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2057init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2087{ 2058{
@@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2095#endif 2066#endif
2096} 2067}
2097 2068
2098#ifdef CONFIG_SMP 2069static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
2099/*
2100 * Per cpu array for per cpu structures.
2101 *
2102 * The per cpu array places all kmem_cache_cpu structures from one processor
2103 * close together meaning that it becomes possible that multiple per cpu
2104 * structures are contained in one cacheline. This may be particularly
2105 * beneficial for the kmalloc caches.
2106 *
2107 * A desktop system typically has around 60-80 slabs. With 100 here we are
2108 * likely able to get per cpu structures for all caches from the array defined
2109 * here. We must be able to cover all kmalloc caches during bootstrap.
2110 *
2111 * If the per cpu array is exhausted then fall back to kmalloc
2112 * of individual cachelines. No sharing is possible then.
2113 */
2114#define NR_KMEM_CACHE_CPU 100
2115
2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2117 kmem_cache_cpu);
2118
2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
2121
2122static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
2123 int cpu, gfp_t flags)
2124{
2125 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
2126
2127 if (c)
2128 per_cpu(kmem_cache_cpu_free, cpu) =
2129 (void *)c->freelist;
2130 else {
2131 /* Table overflow: So allocate ourselves */
2132 c = kmalloc_node(
2133 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
2134 flags, cpu_to_node(cpu));
2135 if (!c)
2136 return NULL;
2137 }
2138
2139 init_kmem_cache_cpu(s, c);
2140 return c;
2141}
2142
2143static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
2144{
2145 if (c < per_cpu(kmem_cache_cpu, cpu) ||
2146 c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2147 kfree(c);
2148 return;
2149 }
2150 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
2151 per_cpu(kmem_cache_cpu_free, cpu) = c;
2152}
2153
2154static void free_kmem_cache_cpus(struct kmem_cache *s)
2155{
2156 int cpu;
2157
2158 for_each_online_cpu(cpu) {
2159 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2160
2161 if (c) {
2162 s->cpu_slab[cpu] = NULL;
2163 free_kmem_cache_cpu(c, cpu);
2164 }
2165 }
2166}
2167
2168static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2169{
2170 int cpu;
2171
2172 for_each_online_cpu(cpu) {
2173 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2174
2175 if (c)
2176 continue;
2177
2178 c = alloc_kmem_cache_cpu(s, cpu, flags);
2179 if (!c) {
2180 free_kmem_cache_cpus(s);
2181 return 0;
2182 }
2183 s->cpu_slab[cpu] = c;
2184 }
2185 return 1;
2186}
2187
2188/*
2189 * Initialize the per cpu array.
2190 */
2191static void init_alloc_cpu_cpu(int cpu)
2192{
2193 int i;
2194 2070
2195 if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) 2071static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2196 return;
2197
2198 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2199 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2200
2201 cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
2202}
2203
2204static void __init init_alloc_cpu(void)
2205{ 2072{
2206 int cpu; 2073 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
2207 2074 /*
2208 for_each_online_cpu(cpu) 2075 * Boot time creation of the kmalloc array. Use static per cpu data
2209 init_alloc_cpu_cpu(cpu); 2076 * since the per cpu allocator is not available yet.
2210 } 2077 */
2078 s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
2079 else
2080 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2211 2081
2212#else 2082 if (!s->cpu_slab)
2213static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} 2083 return 0;
2214static inline void init_alloc_cpu(void) {}
2215 2084
2216static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2217{
2218 init_kmem_cache_cpu(s, &s->cpu_slab);
2219 return 1; 2085 return 1;
2220} 2086}
2221#endif
2222 2087
2223#ifdef CONFIG_NUMA 2088#ifdef CONFIG_NUMA
2224/* 2089/*
@@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2287 int node; 2152 int node;
2288 int local_node; 2153 int local_node;
2289 2154
2290 if (slab_state >= UP) 2155 if (slab_state >= UP && (s < kmalloc_caches ||
2156 s > kmalloc_caches + KMALLOC_CACHES))
2291 local_node = page_to_nid(virt_to_page(s)); 2157 local_node = page_to_nid(virt_to_page(s));
2292 else 2158 else
2293 local_node = 0; 2159 local_node = 0;
@@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2502 2368
2503 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2369 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2504 return 1; 2370 return 1;
2371
2505 free_kmem_cache_nodes(s); 2372 free_kmem_cache_nodes(s);
2506error: 2373error:
2507 if (flags & SLAB_PANIC) 2374 if (flags & SLAB_PANIC)
@@ -2609,9 +2476,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2609 int node; 2476 int node;
2610 2477
2611 flush_all(s); 2478 flush_all(s);
2612 2479 free_percpu(s->cpu_slab);
2613 /* Attempt to free all objects */ 2480 /* Attempt to free all objects */
2614 free_kmem_cache_cpus(s);
2615 for_each_node_state(node, N_NORMAL_MEMORY) { 2481 for_each_node_state(node, N_NORMAL_MEMORY) {
2616 struct kmem_cache_node *n = get_node(s, node); 2482 struct kmem_cache_node *n = get_node(s, node);
2617 2483
@@ -2651,7 +2517,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2651 * Kmalloc subsystem 2517 * Kmalloc subsystem
2652 *******************************************************************/ 2518 *******************************************************************/
2653 2519
2654struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; 2520struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
2655EXPORT_SYMBOL(kmalloc_caches); 2521EXPORT_SYMBOL(kmalloc_caches);
2656 2522
2657static int __init setup_slub_min_order(char *str) 2523static int __init setup_slub_min_order(char *str)
@@ -2741,6 +2607,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2741 char *text; 2607 char *text;
2742 size_t realsize; 2608 size_t realsize;
2743 unsigned long slabflags; 2609 unsigned long slabflags;
2610 int i;
2744 2611
2745 s = kmalloc_caches_dma[index]; 2612 s = kmalloc_caches_dma[index];
2746 if (s) 2613 if (s)
@@ -2760,7 +2627,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2760 realsize = kmalloc_caches[index].objsize; 2627 realsize = kmalloc_caches[index].objsize;
2761 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", 2628 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2762 (unsigned int)realsize); 2629 (unsigned int)realsize);
2763 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2630
2631 s = NULL;
2632 for (i = 0; i < KMALLOC_CACHES; i++)
2633 if (!kmalloc_caches[i].size)
2634 break;
2635
2636 BUG_ON(i >= KMALLOC_CACHES);
2637 s = kmalloc_caches + i;
2764 2638
2765 /* 2639 /*
2766 * Must defer sysfs creation to a workqueue because we don't know 2640 * Must defer sysfs creation to a workqueue because we don't know
@@ -2772,9 +2646,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2772 if (slab_state >= SYSFS) 2646 if (slab_state >= SYSFS)
2773 slabflags |= __SYSFS_ADD_DEFERRED; 2647 slabflags |= __SYSFS_ADD_DEFERRED;
2774 2648
2775 if (!s || !text || !kmem_cache_open(s, flags, text, 2649 if (!text || !kmem_cache_open(s, flags, text,
2776 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { 2650 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2777 kfree(s); 2651 s->size = 0;
2778 kfree(text); 2652 kfree(text);
2779 goto unlock_out; 2653 goto unlock_out;
2780 } 2654 }
@@ -3086,7 +2960,7 @@ static void slab_mem_offline_callback(void *arg)
3086 /* 2960 /*
3087 * if n->nr_slabs > 0, slabs still exist on the node 2961 * if n->nr_slabs > 0, slabs still exist on the node
3088 * that is going down. We were unable to free them, 2962 * that is going down. We were unable to free them,
3089 * and offline_pages() function shoudn't call this 2963 * and offline_pages() function shouldn't call this
3090 * callback. So, we must fail. 2964 * callback. So, we must fail.
3091 */ 2965 */
3092 BUG_ON(slabs_node(s, offline_node)); 2966 BUG_ON(slabs_node(s, offline_node));
@@ -3176,8 +3050,6 @@ void __init kmem_cache_init(void)
3176 int i; 3050 int i;
3177 int caches = 0; 3051 int caches = 0;
3178 3052
3179 init_alloc_cpu();
3180
3181#ifdef CONFIG_NUMA 3053#ifdef CONFIG_NUMA
3182 /* 3054 /*
3183 * Must first have the slab cache available for the allocations of the 3055 * Must first have the slab cache available for the allocations of the
@@ -3261,8 +3133,10 @@ void __init kmem_cache_init(void)
3261 3133
3262#ifdef CONFIG_SMP 3134#ifdef CONFIG_SMP
3263 register_cpu_notifier(&slab_notifier); 3135 register_cpu_notifier(&slab_notifier);
3264 kmem_size = offsetof(struct kmem_cache, cpu_slab) + 3136#endif
3265 nr_cpu_ids * sizeof(struct kmem_cache_cpu *); 3137#ifdef CONFIG_NUMA
3138 kmem_size = offsetof(struct kmem_cache, node) +
3139 nr_node_ids * sizeof(struct kmem_cache_node *);
3266#else 3140#else
3267 kmem_size = sizeof(struct kmem_cache); 3141 kmem_size = sizeof(struct kmem_cache);
3268#endif 3142#endif
@@ -3351,22 +3225,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3351 down_write(&slub_lock); 3225 down_write(&slub_lock);
3352 s = find_mergeable(size, align, flags, name, ctor); 3226 s = find_mergeable(size, align, flags, name, ctor);
3353 if (s) { 3227 if (s) {
3354 int cpu;
3355
3356 s->refcount++; 3228 s->refcount++;
3357 /* 3229 /*
3358 * Adjust the object sizes so that we clear 3230 * Adjust the object sizes so that we clear
3359 * the complete object on kzalloc. 3231 * the complete object on kzalloc.
3360 */ 3232 */
3361 s->objsize = max(s->objsize, (int)size); 3233 s->objsize = max(s->objsize, (int)size);
3362
3363 /*
3364 * And then we need to update the object size in the
3365 * per cpu structures
3366 */
3367 for_each_online_cpu(cpu)
3368 get_cpu_slab(s, cpu)->objsize = s->objsize;
3369
3370 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3234 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3371 up_write(&slub_lock); 3235 up_write(&slub_lock);
3372 3236
@@ -3420,29 +3284,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3420 unsigned long flags; 3284 unsigned long flags;
3421 3285
3422 switch (action) { 3286 switch (action) {
3423 case CPU_UP_PREPARE:
3424 case CPU_UP_PREPARE_FROZEN:
3425 init_alloc_cpu_cpu(cpu);
3426 down_read(&slub_lock);
3427 list_for_each_entry(s, &slab_caches, list)
3428 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3429 GFP_KERNEL);
3430 up_read(&slub_lock);
3431 break;
3432
3433 case CPU_UP_CANCELED: 3287 case CPU_UP_CANCELED:
3434 case CPU_UP_CANCELED_FROZEN: 3288 case CPU_UP_CANCELED_FROZEN:
3435 case CPU_DEAD: 3289 case CPU_DEAD:
3436 case CPU_DEAD_FROZEN: 3290 case CPU_DEAD_FROZEN:
3437 down_read(&slub_lock); 3291 down_read(&slub_lock);
3438 list_for_each_entry(s, &slab_caches, list) { 3292 list_for_each_entry(s, &slab_caches, list) {
3439 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3440
3441 local_irq_save(flags); 3293 local_irq_save(flags);
3442 __flush_cpu_slab(s, cpu); 3294 __flush_cpu_slab(s, cpu);
3443 local_irq_restore(flags); 3295 local_irq_restore(flags);
3444 free_kmem_cache_cpu(c, cpu);
3445 s->cpu_slab[cpu] = NULL;
3446 } 3296 }
3447 up_read(&slub_lock); 3297 up_read(&slub_lock);
3448 break; 3298 break;
@@ -3928,7 +3778,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3928 int cpu; 3778 int cpu;
3929 3779
3930 for_each_possible_cpu(cpu) { 3780 for_each_possible_cpu(cpu) {
3931 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3781 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3932 3782
3933 if (!c || c->node < 0) 3783 if (!c || c->node < 0)
3934 continue; 3784 continue;
@@ -4171,6 +4021,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4171} 4021}
4172SLAB_ATTR(trace); 4022SLAB_ATTR(trace);
4173 4023
4024#ifdef CONFIG_FAILSLAB
4025static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4026{
4027 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4028}
4029
4030static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4031 size_t length)
4032{
4033 s->flags &= ~SLAB_FAILSLAB;
4034 if (buf[0] == '1')
4035 s->flags |= SLAB_FAILSLAB;
4036 return length;
4037}
4038SLAB_ATTR(failslab);
4039#endif
4040
4174static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4041static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4175{ 4042{
4176 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4043 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4353,7 +4220,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4353 return -ENOMEM; 4220 return -ENOMEM;
4354 4221
4355 for_each_online_cpu(cpu) { 4222 for_each_online_cpu(cpu) {
4356 unsigned x = get_cpu_slab(s, cpu)->stat[si]; 4223 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4357 4224
4358 data[cpu] = x; 4225 data[cpu] = x;
4359 sum += x; 4226 sum += x;
@@ -4376,7 +4243,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si)
4376 int cpu; 4243 int cpu;
4377 4244
4378 for_each_online_cpu(cpu) 4245 for_each_online_cpu(cpu)
4379 get_cpu_slab(s, cpu)->stat[si] = 0; 4246 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4380} 4247}
4381 4248
4382#define STAT_ATTR(si, text) \ 4249#define STAT_ATTR(si, text) \
@@ -4467,6 +4334,10 @@ static struct attribute *slab_attrs[] = {
4467 &deactivate_remote_frees_attr.attr, 4334 &deactivate_remote_frees_attr.attr,
4468 &order_fallback_attr.attr, 4335 &order_fallback_attr.attr,
4469#endif 4336#endif
4337#ifdef CONFIG_FAILSLAB
4338 &failslab_attr.attr,
4339#endif
4340
4470 NULL 4341 NULL
4471}; 4342};
4472 4343
@@ -4519,7 +4390,7 @@ static void kmem_cache_release(struct kobject *kobj)
4519 kfree(s); 4390 kfree(s);
4520} 4391}
4521 4392
4522static struct sysfs_ops slab_sysfs_ops = { 4393static const struct sysfs_ops slab_sysfs_ops = {
4523 .show = slab_attr_show, 4394 .show = slab_attr_show,
4524 .store = slab_attr_store, 4395 .store = slab_attr_store,
4525}; 4396};
@@ -4538,7 +4409,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
4538 return 0; 4409 return 0;
4539} 4410}
4540 4411
4541static struct kset_uevent_ops slab_uevent_ops = { 4412static const struct kset_uevent_ops slab_uevent_ops = {
4542 .filter = uevent_filter, 4413 .filter = uevent_filter,
4543}; 4414};
4544 4415
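
The mm/slub.c hunks above drop the hand-rolled per-cache cpu_slab[] array (and its get_cpu_slab()/alloc_kmem_cache_cpu() helpers) in favour of the generic percpu allocator, so per-CPU state is now reached with per_cpu_ptr(s->cpu_slab, cpu). The statistics readers keep the same shape: walk every online CPU, record its value, accumulate a total. A minimal userspace sketch of that pattern follows; the cpu_stat struct, NR_CPUS value and sample numbers are illustrative stand-ins, not kernel interfaces.

#include <stdio.h>

#define NR_CPUS 4

/* illustrative stand-in for one CPU's slab statistics slot */
struct cpu_stat {
	unsigned int stat;
};

static struct cpu_stat cpu_slab[NR_CPUS] = { {3}, {7}, {0}, {5} };

/*
 * Sum one per-CPU counter while keeping the per-CPU breakdown,
 * mirroring how show_stat() walks every online CPU.
 */
static unsigned int show_stat(unsigned int data[NR_CPUS])
{
	unsigned int sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		unsigned int x = cpu_slab[cpu].stat;	/* per_cpu_ptr() analogue */

		data[cpu] = x;
		sum += x;
	}
	return sum;
}

int main(void)
{
	unsigned int data[NR_CPUS];

	printf("total = %u\n", show_stat(data));	/* prints "total = 15" */
	return 0;
}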
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d9714bdcb4a3..392b9bb5bc01 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,9 +40,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 40 unsigned long align,
41 unsigned long goal) 41 unsigned long goal)
42{ 42{
43 return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); 43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
44} 44}
45 45
46static void *vmemmap_buf;
47static void *vmemmap_buf_end;
46 48
47void * __meminit vmemmap_alloc_block(unsigned long size, int node) 49void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 50{
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
64 __pa(MAX_DMA_ADDRESS)); 66 __pa(MAX_DMA_ADDRESS));
65} 67}
66 68
69/* callers need to pass the same size during the early stage */
70void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
71{
72 void *ptr;
73
74 if (!vmemmap_buf)
75 return vmemmap_alloc_block(size, node);
76
77	/* take from the buf */
78 ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
79 if (ptr + size > vmemmap_buf_end)
80 return vmemmap_alloc_block(size, node);
81
82 vmemmap_buf = ptr + size;
83
84 return ptr;
85}
86
67void __meminit vmemmap_verify(pte_t *pte, int node, 87void __meminit vmemmap_verify(pte_t *pte, int node,
68 unsigned long start, unsigned long end) 88 unsigned long start, unsigned long end)
69{ 89{
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
80 pte_t *pte = pte_offset_kernel(pmd, addr); 100 pte_t *pte = pte_offset_kernel(pmd, addr);
81 if (pte_none(*pte)) { 101 if (pte_none(*pte)) {
82 pte_t entry; 102 pte_t entry;
83 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 103 void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
84 if (!p) 104 if (!p)
85 return NULL; 105 return NULL;
86 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 106 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
163 183
164 return map; 184 return map;
165} 185}
186
187void __init sparse_mem_maps_populate_node(struct page **map_map,
188 unsigned long pnum_begin,
189 unsigned long pnum_end,
190 unsigned long map_count, int nodeid)
191{
192 unsigned long pnum;
193 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
194 void *vmemmap_buf_start;
195
196 size = ALIGN(size, PMD_SIZE);
197 vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
198 PMD_SIZE, __pa(MAX_DMA_ADDRESS));
199
200 if (vmemmap_buf_start) {
201 vmemmap_buf = vmemmap_buf_start;
202 vmemmap_buf_end = vmemmap_buf_start + size * map_count;
203 }
204
205 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
206 struct mem_section *ms;
207
208 if (!present_section_nr(pnum))
209 continue;
210
211 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
212 if (map_map[pnum])
213 continue;
214 ms = __nr_to_section(pnum);
215		printk(KERN_ERR "%s: sparsemem memory map backing failed, "
216 "some memory will not be available.\n", __func__);
217 ms->section_mem_map = 0;
218 }
219
220 if (vmemmap_buf_start) {
221		/* need to free the leftover buf */
222#ifdef CONFIG_NO_BOOTMEM
223 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
224 if (vmemmap_buf_start < vmemmap_buf) {
225 char name[15];
226
227 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
228 reserve_early_without_check(__pa(vmemmap_buf_start),
229 __pa(vmemmap_buf), name);
230 }
231#else
232 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
233#endif
234 vmemmap_buf = NULL;
235 vmemmap_buf_end = NULL;
236 }
237}
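
The new vmemmap_alloc_block_buf() above is a bump allocator: sparse_mem_maps_populate_node() reserves one large per-node chunk up front, each PAGE_SIZE request is carved off the front after aligning the cursor, and anything that does not fit falls back to vmemmap_alloc_block(); the unused tail is freed (or re-reserved) afterwards. Below is a small userspace sketch of that carve-with-fallback idea, assuming malloc-backed memory and a power-of-two block size; buf_init() and alloc_block_buf() are illustrative names, not kernel functions.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static char *buf, *buf_end;

/* set up the bump buffer once, like the per-node chunk in the patch */
static void buf_init(size_t total)
{
	buf = malloc(total);
	buf_end = buf ? buf + total : NULL;
}

/* carve an aligned block from the buffer, or fall back to malloc() */
static void *alloc_block_buf(size_t size)
{
	char *ptr;

	if (!buf)
		return malloc(size);

	/* round the cursor up to the (power-of-two) block size, like ALIGN() */
	ptr = (char *)(((uintptr_t)buf + size - 1) & ~((uintptr_t)size - 1));
	if (ptr + size > buf_end)
		return malloc(size);	/* buffer exhausted: plain fallback */

	buf = ptr + size;
	return ptr;
}

int main(void)
{
	buf_init(16 * 4096);

	void *a = alloc_block_buf(4096);
	void *b = alloc_block_buf(4096);

	printf("%p %p\n", a, b);	/* two consecutive 4k blocks from the buffer */
	return 0;
}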
diff --git a/mm/sparse.c b/mm/sparse.c
index 6ce4aab69e99..22896d589133 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void)
271 271
272#ifdef CONFIG_MEMORY_HOTREMOVE 272#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init 273static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 274sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
275 unsigned long count)
275{ 276{
276 unsigned long section_nr; 277 unsigned long section_nr;
277 278
@@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
286 * this problem. 287 * this problem.
287 */ 288 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 289 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr); 290 return alloc_bootmem_section(usemap_size() * count, section_nr);
290} 291}
291 292
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 293static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
329} 330}
330#else 331#else
331static unsigned long * __init 332static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 333sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
334 unsigned long count)
333{ 335{
334 return NULL; 336 return NULL;
335} 337}
@@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 341}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 342#endif /* CONFIG_MEMORY_HOTREMOVE */
341 343
342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 344static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
345 unsigned long pnum_begin,
346 unsigned long pnum_end,
347 unsigned long usemap_count, int nodeid)
343{ 348{
344 unsigned long *usemap; 349 void *usemap;
345 struct mem_section *ms = __nr_to_section(pnum); 350 unsigned long pnum;
346 int nid = sparse_early_nid(ms); 351 int size = usemap_size();
347
348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
349 if (usemap)
350 return usemap;
351 352
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 353 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
354 usemap_count);
353 if (usemap) { 355 if (usemap) {
354 check_usemap_section_nr(nid, usemap); 356 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
355 return usemap; 357 if (!present_section_nr(pnum))
358 continue;
359 usemap_map[pnum] = usemap;
360 usemap += size;
361 }
362 return;
356 } 363 }
357 364
358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 365 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
359 nid = 0; 366 if (usemap) {
367 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
368 if (!present_section_nr(pnum))
369 continue;
370 usemap_map[pnum] = usemap;
371 usemap += size;
372 check_usemap_section_nr(nodeid, usemap_map[pnum]);
373 }
374 return;
375 }
360 376
361 printk(KERN_WARNING "%s: allocation failed\n", __func__); 377 printk(KERN_WARNING "%s: allocation failed\n", __func__);
362 return NULL;
363} 378}
364 379
365#ifndef CONFIG_SPARSEMEM_VMEMMAP 380#ifndef CONFIG_SPARSEMEM_VMEMMAP
@@ -375,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
375 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 390 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
376 return map; 391 return map;
377} 392}
393void __init sparse_mem_maps_populate_node(struct page **map_map,
394 unsigned long pnum_begin,
395 unsigned long pnum_end,
396 unsigned long map_count, int nodeid)
397{
398 void *map;
399 unsigned long pnum;
400 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
401
402 map = alloc_remap(nodeid, size * map_count);
403 if (map) {
404 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
405 if (!present_section_nr(pnum))
406 continue;
407 map_map[pnum] = map;
408 map += size;
409 }
410 return;
411 }
412
413 size = PAGE_ALIGN(size);
414 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
415 if (map) {
416 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
417 if (!present_section_nr(pnum))
418 continue;
419 map_map[pnum] = map;
420 map += size;
421 }
422 return;
423 }
424
425 /* fallback */
426 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
427 struct mem_section *ms;
428
429 if (!present_section_nr(pnum))
430 continue;
431 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
432 if (map_map[pnum])
433 continue;
434 ms = __nr_to_section(pnum);
435		printk(KERN_ERR "%s: sparsemem memory map backing failed, "
436 "some memory will not be available.\n", __func__);
437 ms->section_mem_map = 0;
438 }
439}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 440#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 441
442#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
443static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
444 unsigned long pnum_begin,
445 unsigned long pnum_end,
446 unsigned long map_count, int nodeid)
447{
448 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
449 map_count, nodeid);
450}
451#else
380static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 452static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 453{
382 struct page *map; 454 struct page *map;
@@ -392,10 +464,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
392 ms->section_mem_map = 0; 464 ms->section_mem_map = 0;
393 return NULL; 465 return NULL;
394} 466}
467#endif
395 468
396void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) 469void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
397{ 470{
398} 471}
472
399/* 473/*
400 * Allocate the accumulated non-linear sections, allocate a mem_map 474 * Allocate the accumulated non-linear sections, allocate a mem_map
401 * for each and record the physical to section mapping. 475 * for each and record the physical to section mapping.
@@ -407,6 +481,14 @@ void __init sparse_init(void)
407 unsigned long *usemap; 481 unsigned long *usemap;
408 unsigned long **usemap_map; 482 unsigned long **usemap_map;
409 int size; 483 int size;
484 int nodeid_begin = 0;
485 unsigned long pnum_begin = 0;
486 unsigned long usemap_count;
487#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
488 unsigned long map_count;
489 int size2;
490 struct page **map_map;
491#endif
410 492
411 /* 493 /*
412 * map is using big page (aka 2M in x86 64 bit) 494 * map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +507,81 @@ void __init sparse_init(void)
425 panic("can not allocate usemap_map\n"); 507 panic("can not allocate usemap_map\n");
426 508
427 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 509 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
510 struct mem_section *ms;
511
428 if (!present_section_nr(pnum)) 512 if (!present_section_nr(pnum))
429 continue; 513 continue;
430 usemap_map[pnum] = sparse_early_usemap_alloc(pnum); 514 ms = __nr_to_section(pnum);
515 nodeid_begin = sparse_early_nid(ms);
516 pnum_begin = pnum;
517 break;
431 } 518 }
519 usemap_count = 1;
520 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
521 struct mem_section *ms;
522 int nodeid;
523
524 if (!present_section_nr(pnum))
525 continue;
526 ms = __nr_to_section(pnum);
527 nodeid = sparse_early_nid(ms);
528 if (nodeid == nodeid_begin) {
529 usemap_count++;
530 continue;
531 }
532		/* ok, we need to take care of sections from pnum_begin to pnum - 1 */
533 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
534 usemap_count, nodeid_begin);
535 /* new start, update count etc*/
536 nodeid_begin = nodeid;
537 pnum_begin = pnum;
538 usemap_count = 1;
539 }
540 /* ok, last chunk */
541 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
542 usemap_count, nodeid_begin);
543
544#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
545 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
546 map_map = alloc_bootmem(size2);
547 if (!map_map)
548 panic("can not allocate map_map\n");
549
550 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
551 struct mem_section *ms;
552
553 if (!present_section_nr(pnum))
554 continue;
555 ms = __nr_to_section(pnum);
556 nodeid_begin = sparse_early_nid(ms);
557 pnum_begin = pnum;
558 break;
559 }
560 map_count = 1;
561 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
562 struct mem_section *ms;
563 int nodeid;
564
565 if (!present_section_nr(pnum))
566 continue;
567 ms = __nr_to_section(pnum);
568 nodeid = sparse_early_nid(ms);
569 if (nodeid == nodeid_begin) {
570 map_count++;
571 continue;
572 }
573		/* ok, we need to take care of sections from pnum_begin to pnum - 1 */
574 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
575 map_count, nodeid_begin);
576 /* new start, update count etc*/
577 nodeid_begin = nodeid;
578 pnum_begin = pnum;
579 map_count = 1;
580 }
581 /* ok, last chunk */
582 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
583 map_count, nodeid_begin);
584#endif
432 585
433 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 586 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
434 if (!present_section_nr(pnum)) 587 if (!present_section_nr(pnum))
@@ -438,7 +591,11 @@ void __init sparse_init(void)
438 if (!usemap) 591 if (!usemap)
439 continue; 592 continue;
440 593
594#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
595 map = map_map[pnum];
596#else
441 map = sparse_early_mem_map_alloc(pnum); 597 map = sparse_early_mem_map_alloc(pnum);
598#endif
442 if (!map) 599 if (!map)
443 continue; 600 continue;
444 601
@@ -448,6 +605,9 @@ void __init sparse_init(void)
448 605
449 vmemmap_populate_print_last(); 606 vmemmap_populate_print_last();
450 607
608#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
609 free_bootmem(__pa(map_map), size2);
610#endif
451 free_bootmem(__pa(usemap_map), size); 611 free_bootmem(__pa(usemap_map), size);
452} 612}
453 613
diff --git a/mm/swap.c b/mm/swap.c
index 308e57d8d7ed..9036b89813ac 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -55,7 +55,7 @@ static void __page_cache_release(struct page *page)
55 del_page_from_lru(zone, page); 55 del_page_from_lru(zone, page);
56 spin_unlock_irqrestore(&zone->lru_lock, flags); 56 spin_unlock_irqrestore(&zone->lru_lock, flags);
57 } 57 }
58 free_hot_page(page); 58 free_hot_cold_page(page, 0);
59} 59}
60 60
61static void put_compound_page(struct page *page) 61static void put_compound_page(struct page *page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c0585b16418..6cd0a8f90dc7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
723 return p != NULL; 723 return p != NULL;
724} 724}
725 725
726#ifdef CONFIG_CGROUP_MEM_RES_CTLR
727/**
728 * mem_cgroup_count_swap_user - count the users of a swap entry
729 * @ent: the swap entry to be checked
730 * @pagep: pointer through which the entry's swap cache page is returned
731 *
732 * Returns the number of users of the swap entry. The count is valid only
733 * for swaps of anonymous pages.
734 * If the entry is found in the swap cache, the page is stored in *pagep
735 * with its refcount incremented.
736 */
737int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
738{
739 struct page *page;
740 struct swap_info_struct *p;
741 int count = 0;
742
743 page = find_get_page(&swapper_space, ent.val);
744 if (page)
745 count += page_mapcount(page);
746 p = swap_info_get(ent);
747 if (p) {
748 count += swap_count(p->swap_map[swp_offset(ent)]);
749 spin_unlock(&swap_lock);
750 }
751
752 *pagep = page;
753 return count;
754}
755#endif
756
726#ifdef CONFIG_HIBERNATION 757#ifdef CONFIG_HIBERNATION
727/* 758/*
728 * Find the swap type that corresponds to given device (if any). 759 * Find the swap type that corresponds to given device (if any).
@@ -840,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
840 goto out; 871 goto out;
841 } 872 }
842 873
843 inc_mm_counter(vma->vm_mm, anon_rss); 874 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
875 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
844 get_page(page); 876 get_page(page);
845 set_pte_at(vma->vm_mm, addr, pte, 877 set_pte_at(vma->vm_mm, addr, pte,
846 pte_mkold(mk_pte(page, vma->vm_page_prot))); 878 pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -1759,11 +1791,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1759 unsigned int type; 1791 unsigned int type;
1760 int i, prev; 1792 int i, prev;
1761 int error; 1793 int error;
1762 union swap_header *swap_header = NULL; 1794 union swap_header *swap_header;
1763 unsigned int nr_good_pages = 0; 1795 unsigned int nr_good_pages;
1764 int nr_extents = 0; 1796 int nr_extents = 0;
1765 sector_t span; 1797 sector_t span;
1766 unsigned long maxpages = 1; 1798 unsigned long maxpages;
1767 unsigned long swapfilepages; 1799 unsigned long swapfilepages;
1768 unsigned char *swap_map = NULL; 1800 unsigned char *swap_map = NULL;
1769 struct page *page = NULL; 1801 struct page *page = NULL;
@@ -1922,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1922 * swap pte. 1954 * swap pte.
1923 */ 1955 */
1924 maxpages = swp_offset(pte_to_swp_entry( 1956 maxpages = swp_offset(pte_to_swp_entry(
1925 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; 1957 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1926 if (maxpages > swap_header->info.last_page) 1958 if (maxpages > swap_header->info.last_page) {
1927 maxpages = swap_header->info.last_page; 1959 maxpages = swap_header->info.last_page + 1;
1960 /* p->max is an unsigned int: don't overflow it */
1961 if ((unsigned int)maxpages == 0)
1962 maxpages = UINT_MAX;
1963 }
1928 p->highest_bit = maxpages - 1; 1964 p->highest_bit = maxpages - 1;
1929 1965
1930 error = -EINVAL; 1966 error = -EINVAL;
@@ -1948,23 +1984,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1948 } 1984 }
1949 1985
1950 memset(swap_map, 0, maxpages); 1986 memset(swap_map, 0, maxpages);
1987 nr_good_pages = maxpages - 1; /* omit header page */
1988
1951 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1989 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1952 int page_nr = swap_header->info.badpages[i]; 1990 unsigned int page_nr = swap_header->info.badpages[i];
1953 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { 1991 if (page_nr == 0 || page_nr > swap_header->info.last_page) {
1954 error = -EINVAL; 1992 error = -EINVAL;
1955 goto bad_swap; 1993 goto bad_swap;
1956 } 1994 }
1957 swap_map[page_nr] = SWAP_MAP_BAD; 1995 if (page_nr < maxpages) {
1996 swap_map[page_nr] = SWAP_MAP_BAD;
1997 nr_good_pages--;
1998 }
1958 } 1999 }
1959 2000
1960 error = swap_cgroup_swapon(type, maxpages); 2001 error = swap_cgroup_swapon(type, maxpages);
1961 if (error) 2002 if (error)
1962 goto bad_swap; 2003 goto bad_swap;
1963 2004
1964 nr_good_pages = swap_header->info.last_page -
1965 swap_header->info.nr_badpages -
1966 1 /* header page */;
1967
1968 if (nr_good_pages) { 2005 if (nr_good_pages) {
1969 swap_map[0] = SWAP_MAP_BAD; 2006 swap_map[0] = SWAP_MAP_BAD;
1970 p->max = maxpages; 2007 p->max = maxpages;
@@ -2155,7 +2192,11 @@ void swap_shmem_alloc(swp_entry_t entry)
2155} 2192}
2156 2193
2157/* 2194/*
2158 * increase reference count of swap entry by 1. 2195 * Increase reference count of swap entry by 1.
2196 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
2197 * but could not be atomically allocated. Returns 0, just as if it succeeded,
2198 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
2199 * might occur if a page table entry has got corrupted.
2159 */ 2200 */
2160int swap_duplicate(swp_entry_t entry) 2201int swap_duplicate(swp_entry_t entry)
2161{ 2202{
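
The swapon() changes above rework the size accounting: maxpages is now the count of offsets a swap pte can encode (highest offset + 1), clamped to last_page + 1 from the header and guarded against unsigned int overflow, and nr_good_pages starts at maxpages - 1 and is decremented as in-range bad pages are marked, instead of being recomputed from nr_badpages afterwards. The arithmetic is easy to check in isolation; the sketch below uses hypothetical header values (last_page, badpages[], max_encodable) purely for illustration.

#include <limits.h>
#include <stdio.h>

#define SWAP_MAP_BAD 0x3f

int main(void)
{
	/* hypothetical values read from a swap header */
	unsigned long last_page = 1024;			/* last usable page index */
	unsigned int badpages[] = { 5, 700 };
	unsigned long max_encodable = 4096;		/* highest offset a swap pte holds */

	unsigned long maxpages = max_encodable + 1;
	if (maxpages > last_page) {
		maxpages = last_page + 1;		/* clamp to what the header covers */
		if ((unsigned int)maxpages == 0)	/* don't overflow an unsigned int */
			maxpages = UINT_MAX;
	}

	static unsigned char swap_map[8192];		/* big enough for this example */
	unsigned int nr_good_pages = maxpages - 1;	/* omit header page */

	for (unsigned int i = 0; i < sizeof(badpages) / sizeof(badpages[0]); i++) {
		unsigned int page_nr = badpages[i];

		if (page_nr == 0 || page_nr > last_page)
			return 1;			/* -EINVAL in the kernel */
		if (page_nr < maxpages) {		/* only mark what the map covers */
			swap_map[page_nr] = SWAP_MAP_BAD;
			nr_good_pages--;
		}
	}
	swap_map[0] = SWAP_MAP_BAD;			/* header page, as in swapon() */

	printf("maxpages=%lu good=%u\n", maxpages, nr_good_pages);
	return 0;
}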
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c26986c85ce0..79c809895fba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -262,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
262 return ret; 262 return ret;
263} 263}
264 264
265/* Called without lock on whether page is mapped, so answer is unstable */
266static inline int page_mapping_inuse(struct page *page)
267{
268 struct address_space *mapping;
269
270 /* Page is in somebody's page tables. */
271 if (page_mapped(page))
272 return 1;
273
274 /* Be more reluctant to reclaim swapcache than pagecache */
275 if (PageSwapCache(page))
276 return 1;
277
278 mapping = page_mapping(page);
279 if (!mapping)
280 return 0;
281
282 /* File is mmap'd by somebody? */
283 return mapping_mapped(mapping);
284}
285
286static inline int is_page_cache_freeable(struct page *page) 265static inline int is_page_cache_freeable(struct page *page)
287{ 266{
288 /* 267 /*
@@ -579,6 +558,65 @@ redo:
579 put_page(page); /* drop ref from isolate */ 558 put_page(page); /* drop ref from isolate */
580} 559}
581 560
561enum page_references {
562 PAGEREF_RECLAIM,
563 PAGEREF_RECLAIM_CLEAN,
564 PAGEREF_KEEP,
565 PAGEREF_ACTIVATE,
566};
567
568static enum page_references page_check_references(struct page *page,
569 struct scan_control *sc)
570{
571 int referenced_ptes, referenced_page;
572 unsigned long vm_flags;
573
574 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
575 referenced_page = TestClearPageReferenced(page);
576
577 /* Lumpy reclaim - ignore references */
578 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
579 return PAGEREF_RECLAIM;
580
581 /*
582 * Mlock lost the isolation race with us. Let try_to_unmap()
583 * move the page to the unevictable list.
584 */
585 if (vm_flags & VM_LOCKED)
586 return PAGEREF_RECLAIM;
587
588 if (referenced_ptes) {
589 if (PageAnon(page))
590 return PAGEREF_ACTIVATE;
591 /*
592 * All mapped pages start out with page table
593 * references from the instantiating fault, so we need
594 * to look twice if a mapped file page is used more
595 * than once.
596 *
597 * Mark it and spare it for another trip around the
598 * inactive list. Another page table reference will
599 * lead to its activation.
600 *
601 * Note: the mark is set for activated pages as well
602 * so that recently deactivated but used pages are
603 * quickly recovered.
604 */
605 SetPageReferenced(page);
606
607 if (referenced_page)
608 return PAGEREF_ACTIVATE;
609
610 return PAGEREF_KEEP;
611 }
612
613 /* Reclaim if clean, defer dirty pages to writeback */
614 if (referenced_page)
615 return PAGEREF_RECLAIM_CLEAN;
616
617 return PAGEREF_RECLAIM;
618}
619
582/* 620/*
583 * shrink_page_list() returns the number of reclaimed pages 621 * shrink_page_list() returns the number of reclaimed pages
584 */ 622 */
@@ -590,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
590 struct pagevec freed_pvec; 628 struct pagevec freed_pvec;
591 int pgactivate = 0; 629 int pgactivate = 0;
592 unsigned long nr_reclaimed = 0; 630 unsigned long nr_reclaimed = 0;
593 unsigned long vm_flags;
594 631
595 cond_resched(); 632 cond_resched();
596 633
597 pagevec_init(&freed_pvec, 1); 634 pagevec_init(&freed_pvec, 1);
598 while (!list_empty(page_list)) { 635 while (!list_empty(page_list)) {
636 enum page_references references;
599 struct address_space *mapping; 637 struct address_space *mapping;
600 struct page *page; 638 struct page *page;
601 int may_enter_fs; 639 int may_enter_fs;
602 int referenced;
603 640
604 cond_resched(); 641 cond_resched();
605 642
@@ -641,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
641 goto keep_locked; 678 goto keep_locked;
642 } 679 }
643 680
644 referenced = page_referenced(page, 1, 681 references = page_check_references(page, sc);
645 sc->mem_cgroup, &vm_flags); 682 switch (references) {
646 /* 683 case PAGEREF_ACTIVATE:
647 * In active use or really unfreeable? Activate it.
648 * If page which have PG_mlocked lost isoltation race,
649 * try_to_unmap moves it to unevictable list
650 */
651 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
652 referenced && page_mapping_inuse(page)
653 && !(vm_flags & VM_LOCKED))
654 goto activate_locked; 684 goto activate_locked;
685 case PAGEREF_KEEP:
686 goto keep_locked;
687 case PAGEREF_RECLAIM:
688 case PAGEREF_RECLAIM_CLEAN:
689 ; /* try to reclaim the page below */
690 }
655 691
656 /* 692 /*
657 * Anonymous process memory has backing store? 693 * Anonymous process memory has backing store?
@@ -685,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
685 } 721 }
686 722
687 if (PageDirty(page)) { 723 if (PageDirty(page)) {
688 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) 724 if (references == PAGEREF_RECLAIM_CLEAN)
689 goto keep_locked; 725 goto keep_locked;
690 if (!may_enter_fs) 726 if (!may_enter_fs)
691 goto keep_locked; 727 goto keep_locked;
@@ -1350,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1350 continue; 1386 continue;
1351 } 1387 }
1352 1388
1353 /* page_referenced clears PageReferenced */ 1389 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1354 if (page_mapping_inuse(page) &&
1355 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1356 nr_rotated++; 1390 nr_rotated++;
1357 /* 1391 /*
1358 * Identify referenced, file-backed active pages and 1392 * Identify referenced, file-backed active pages and
@@ -1501,6 +1535,13 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1501 unsigned long ap, fp; 1535 unsigned long ap, fp;
1502 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1536 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1503 1537
1538 /* If we have no swap space, do not bother scanning anon pages. */
1539 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1540 percent[0] = 0;
1541 percent[1] = 100;
1542 return;
1543 }
1544
1504 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1545 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1505 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1546 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1506 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1547 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1598,22 +1639,20 @@ static void shrink_zone(int priority, struct zone *zone,
1598 unsigned long nr_reclaimed = sc->nr_reclaimed; 1639 unsigned long nr_reclaimed = sc->nr_reclaimed;
1599 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1640 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1600 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1641 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1601 int noswap = 0;
1602 1642
1603 /* If we have no swap space, do not bother scanning anon pages. */ 1643 get_scan_ratio(zone, sc, percent);
1604 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1605 noswap = 1;
1606 percent[0] = 0;
1607 percent[1] = 100;
1608 } else
1609 get_scan_ratio(zone, sc, percent);
1610 1644
1611 for_each_evictable_lru(l) { 1645 for_each_evictable_lru(l) {
1612 int file = is_file_lru(l); 1646 int file = is_file_lru(l);
1613 unsigned long scan; 1647 unsigned long scan;
1614 1648
1649 if (percent[file] == 0) {
1650 nr[l] = 0;
1651 continue;
1652 }
1653
1615 scan = zone_nr_lru_pages(zone, sc, l); 1654 scan = zone_nr_lru_pages(zone, sc, l);
1616 if (priority || noswap) { 1655 if (priority) {
1617 scan >>= priority; 1656 scan >>= priority;
1618 scan = (scan * percent[file]) / 100; 1657 scan = (scan * percent[file]) / 100;
1619 } 1658 }
@@ -1694,8 +1733,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1694 continue; 1733 continue;
1695 note_zone_scanning_priority(zone, priority); 1734 note_zone_scanning_priority(zone, priority);
1696 1735
1697 if (zone_is_all_unreclaimable(zone) && 1736 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1698 priority != DEF_PRIORITY)
1699 continue; /* Let kswapd poll it */ 1737 continue; /* Let kswapd poll it */
1700 sc->all_unreclaimable = 0; 1738 sc->all_unreclaimable = 0;
1701 } else { 1739 } else {
@@ -1922,7 +1960,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1922 if (!populated_zone(zone)) 1960 if (!populated_zone(zone))
1923 continue; 1961 continue;
1924 1962
1925 if (zone_is_all_unreclaimable(zone)) 1963 if (zone->all_unreclaimable)
1926 continue; 1964 continue;
1927 1965
1928 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 1966 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
@@ -2012,8 +2050,7 @@ loop_again:
2012 if (!populated_zone(zone)) 2050 if (!populated_zone(zone))
2013 continue; 2051 continue;
2014 2052
2015 if (zone_is_all_unreclaimable(zone) && 2053 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2016 priority != DEF_PRIORITY)
2017 continue; 2054 continue;
2018 2055
2019 /* 2056 /*
@@ -2056,13 +2093,9 @@ loop_again:
2056 if (!populated_zone(zone)) 2093 if (!populated_zone(zone))
2057 continue; 2094 continue;
2058 2095
2059 if (zone_is_all_unreclaimable(zone) && 2096 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2060 priority != DEF_PRIORITY)
2061 continue; 2097 continue;
2062 2098
2063 if (!zone_watermark_ok(zone, order,
2064 high_wmark_pages(zone), end_zone, 0))
2065 all_zones_ok = 0;
2066 temp_priority[i] = priority; 2099 temp_priority[i] = priority;
2067 sc.nr_scanned = 0; 2100 sc.nr_scanned = 0;
2068 note_zone_scanning_priority(zone, priority); 2101 note_zone_scanning_priority(zone, priority);
@@ -2087,12 +2120,11 @@ loop_again:
2087 lru_pages); 2120 lru_pages);
2088 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2121 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2089 total_scanned += sc.nr_scanned; 2122 total_scanned += sc.nr_scanned;
2090 if (zone_is_all_unreclaimable(zone)) 2123 if (zone->all_unreclaimable)
2091 continue; 2124 continue;
2092 if (nr_slab == 0 && zone->pages_scanned >= 2125 if (nr_slab == 0 &&
2093 (zone_reclaimable_pages(zone) * 6)) 2126 zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
2094 zone_set_flag(zone, 2127 zone->all_unreclaimable = 1;
2095 ZONE_ALL_UNRECLAIMABLE);
2096 /* 2128 /*
2097 * If we've done a decent amount of scanning and 2129 * If we've done a decent amount of scanning and
2098 * the reclaim ratio is low, start doing writepage 2130 * the reclaim ratio is low, start doing writepage
@@ -2102,13 +2134,18 @@ loop_again:
2102 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2134 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2103 sc.may_writepage = 1; 2135 sc.may_writepage = 1;
2104 2136
2105 /* 2137 if (!zone_watermark_ok(zone, order,
2106 * We are still under min water mark. it mean we have 2138 high_wmark_pages(zone), end_zone, 0)) {
2107 * GFP_ATOMIC allocation failure risk. Hurry up! 2139 all_zones_ok = 0;
2108 */ 2140 /*
2109 if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), 2141 * We are still under min water mark. This
2110 end_zone, 0)) 2142 * means that we have a GFP_ATOMIC allocation
2111 has_under_min_watermark_zone = 1; 2143 * failure risk. Hurry up!
2144 */
2145 if (!zone_watermark_ok(zone, order,
2146 min_wmark_pages(zone), end_zone, 0))
2147 has_under_min_watermark_zone = 1;
2148 }
2112 2149
2113 } 2150 }
2114 if (all_zones_ok) 2151 if (all_zones_ok)
@@ -2550,6 +2587,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2550 * and RECLAIM_SWAP. 2587 * and RECLAIM_SWAP.
2551 */ 2588 */
2552 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 2589 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
2590 lockdep_set_current_reclaim_state(gfp_mask);
2553 reclaim_state.reclaimed_slab = 0; 2591 reclaim_state.reclaimed_slab = 0;
2554 p->reclaim_state = &reclaim_state; 2592 p->reclaim_state = &reclaim_state;
2555 2593
@@ -2593,6 +2631,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2593 2631
2594 p->reclaim_state = NULL; 2632 p->reclaim_state = NULL;
2595 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2633 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2634 lockdep_clear_current_reclaim_state();
2596 return sc.nr_reclaimed >= nr_pages; 2635 return sc.nr_reclaimed >= nr_pages;
2597} 2636}
2598 2637
@@ -2615,7 +2654,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2615 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 2654 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
2616 return ZONE_RECLAIM_FULL; 2655 return ZONE_RECLAIM_FULL;
2617 2656
2618 if (zone_is_all_unreclaimable(zone)) 2657 if (zone->all_unreclaimable)
2619 return ZONE_RECLAIM_FULL; 2658 return ZONE_RECLAIM_FULL;
2620 2659
2621 /* 2660 /*
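
page_check_references() above replaces the old page_mapping_inuse()/referenced heuristics with a single four-way decision driven by the pte reference count, the PG_referenced flag, whether the page is anonymous, VM_LOCKED and the lumpy-reclaim order. The decision table can be restated as a pure function, as in the sketch below; the int parameters stand in for what the kernel derives from struct page and struct scan_control, and the PG_referenced re-marking side effect is deliberately omitted.

#include <stdio.h>

enum page_references {
	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};

/*
 * Stand-alone restatement of the decision logic; the int arguments replace
 * what the kernel derives from struct page and struct scan_control.
 */
static enum page_references check_references(int referenced_ptes,
					     int referenced_page,
					     int page_is_anon,
					     int vm_locked,
					     int lumpy_reclaim)
{
	if (lumpy_reclaim)	/* sc->order > PAGE_ALLOC_COSTLY_ORDER */
		return PAGEREF_RECLAIM;

	if (vm_locked)		/* lost the isolation race with mlock */
		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		if (page_is_anon)
			return PAGEREF_ACTIVATE;
		/*
		 * Mapped file page: activate only on the second use, signalled
		 * by PG_referenced already being set.  (The kernel also
		 * re-marks PG_referenced at this point; omitted here.)
		 */
		if (referenced_page)
			return PAGEREF_ACTIVATE;
		return PAGEREF_KEEP;
	}

	/* unmapped: if recently used, reclaim only when clean (defer dirty writeback) */
	return referenced_page ? PAGEREF_RECLAIM_CLEAN : PAGEREF_RECLAIM;
}

int main(void)
{
	/* a mapped file page seen once: expect PAGEREF_KEEP (2) */
	printf("%d\n", check_references(1, 0, 0, 0, 0));
	return 0;
}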
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6051fbab67ba..7f760cbc73f3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void)
139 threshold = calculate_threshold(zone); 139 threshold = calculate_threshold(zone);
140 140
141 for_each_online_cpu(cpu) 141 for_each_online_cpu(cpu)
142 zone_pcp(zone, cpu)->stat_threshold = threshold; 142 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
143 = threshold;
143 } 144 }
144} 145}
145 146
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void)
149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 150void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150 int delta) 151 int delta)
151{ 152{
152 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 153 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
154
153 s8 *p = pcp->vm_stat_diff + item; 155 s8 *p = pcp->vm_stat_diff + item;
154 long x; 156 long x;
155 157
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
202 */ 204 */
203void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 205void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
204{ 206{
205 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 207 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
206 s8 *p = pcp->vm_stat_diff + item; 208 s8 *p = pcp->vm_stat_diff + item;
207 209
208 (*p)++; 210 (*p)++;
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
223 225
224void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 226void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
225{ 227{
226 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 228 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
227 s8 *p = pcp->vm_stat_diff + item; 229 s8 *p = pcp->vm_stat_diff + item;
228 230
229 (*p)--; 231 (*p)--;
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
300 for_each_populated_zone(zone) { 302 for_each_populated_zone(zone) {
301 struct per_cpu_pageset *p; 303 struct per_cpu_pageset *p;
302 304
303 p = zone_pcp(zone, cpu); 305 p = per_cpu_ptr(zone->pageset, cpu);
304 306
305 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 307 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
306 if (p->vm_stat_diff[i]) { 308 if (p->vm_stat_diff[i]) {
@@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
741 for_each_online_cpu(i) { 743 for_each_online_cpu(i) {
742 struct per_cpu_pageset *pageset; 744 struct per_cpu_pageset *pageset;
743 745
744 pageset = zone_pcp(zone, i); 746 pageset = per_cpu_ptr(zone->pageset, i);
745 seq_printf(m, 747 seq_printf(m,
746 "\n cpu: %i" 748 "\n cpu: %i"
747 "\n count: %i" 749 "\n count: %i"
@@ -761,7 +763,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
761 "\n prev_priority: %i" 763 "\n prev_priority: %i"
762 "\n start_pfn: %lu" 764 "\n start_pfn: %lu"
763 "\n inactive_ratio: %u", 765 "\n inactive_ratio: %u",
764 zone_is_all_unreclaimable(zone), 766 zone->all_unreclaimable,
765 zone->prev_priority, 767 zone->prev_priority,
766 zone->zone_start_pfn, 768 zone->zone_start_pfn,
767 zone->inactive_ratio); 769 zone->inactive_ratio);
@@ -906,6 +908,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
906 case CPU_ONLINE: 908 case CPU_ONLINE:
907 case CPU_ONLINE_FROZEN: 909 case CPU_ONLINE_FROZEN:
908 start_cpu_timer(cpu); 910 start_cpu_timer(cpu);
911 node_set_state(cpu_to_node(cpu), N_CPU);
909 break; 912 break;
910 case CPU_DOWN_PREPARE: 913 case CPU_DOWN_PREPARE:
911 case CPU_DOWN_PREPARE_FROZEN: 914 case CPU_DOWN_PREPARE_FROZEN:
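
The mm/vmstat.c conversion above only changes how the per-CPU pageset is reached (this_cpu_ptr()/per_cpu_ptr() on zone->pageset instead of zone_pcp()); the counter scheme itself is untouched: each CPU buffers a small signed delta in vm_stat_diff[] and folds it into the global counter once it crosses stat_threshold. A minimal single-threaded sketch of that delta-and-fold scheme follows, with an illustrative threshold and item count.

#include <stdio.h>

#define NR_ITEMS 2

static long global_stat[NR_ITEMS];

struct per_cpu_stats {
	signed char vm_stat_diff[NR_ITEMS];
	int stat_threshold;
};

/*
 * Analogue of __mod_zone_page_state(): buffer small deltas per CPU and
 * fold them into the global counter when they exceed the threshold.
 */
static void mod_state(struct per_cpu_stats *pcp, int item, int delta)
{
	long x = pcp->vm_stat_diff[item] + delta;

	if (x > pcp->stat_threshold || x < -pcp->stat_threshold) {
		global_stat[item] += x;
		x = 0;
	}
	pcp->vm_stat_diff[item] = x;
}

int main(void)
{
	struct per_cpu_stats cpu0 = { {0, 0}, 8 };

	for (int i = 0; i < 20; i++)
		mod_state(&cpu0, 0, 1);

	/* folds happen at +9 and +18: prints "global=18 pending=2" */
	printf("global=%ld pending=%d\n", global_stat[0], cpu0.vm_stat_diff[0]);
	return 0;
}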