path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    4
-rw-r--r--  mm/bootmem.c          |  195
-rw-r--r--  mm/fadvise.c          |   10
-rw-r--r--  mm/failslab.c         |   18
-rw-r--r--  mm/filemap.c          |    4
-rw-r--r--  mm/filemap_xip.c      |    2
-rw-r--r--  mm/fremap.c           |    2
-rw-r--r--  mm/highmem.c          |    2
-rw-r--r--  mm/hugetlb.c          |    4
-rw-r--r--  mm/ksm.c              |   12
-rw-r--r--  mm/memcontrol.c       | 1388
-rw-r--r--  mm/memory-failure.c   |    5
-rw-r--r--  mm/memory.c           |  180
-rw-r--r--  mm/memory_hotplug.c   |   10
-rw-r--r--  mm/mempolicy.c        |  112
-rw-r--r--  mm/migrate.c          |    6
-rw-r--r--  mm/mlock.c            |   12
-rw-r--r--  mm/mmap.c             |  175
-rw-r--r--  mm/mmu_context.c      |    3
-rw-r--r--  mm/mremap.c           |    9
-rw-r--r--  mm/nommu.c            |   30
-rw-r--r--  mm/oom_kill.c         |   14
-rw-r--r--  mm/page_alloc.c       |  401
-rw-r--r--  mm/page_cgroup.c      |   34
-rw-r--r--  mm/percpu.c           |   36
-rw-r--r--  mm/readahead.c        |    6
-rw-r--r--  mm/rmap.c             |  185
-rw-r--r--  mm/slab.c             |   13
-rw-r--r--  mm/slub.c             |  343
-rw-r--r--  mm/sparse-vmemmap.c   |   76
-rw-r--r--  mm/sparse.c           |  196
-rw-r--r--  mm/swap.c             |    2
-rw-r--r--  mm/swapfile.c         |   71
-rw-r--r--  mm/vmscan.c           |  177
-rw-r--r--  mm/vmstat.c           |   17
35 files changed, 2777 insertions, 977 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d34c2b971032..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
115config SPARSEMEM_VMEMMAP_ENABLE 115config SPARSEMEM_VMEMMAP_ENABLE
116 bool 116 bool
117 117
118config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
119 def_bool y
120 depends on SPARSEMEM && X86_64
121
118config SPARSEMEM_VMEMMAP 122config SPARSEMEM_VMEMMAP
119 bool "Sparse Memory virtual memmap" 123 bool "Sparse Memory virtual memmap"
120 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE 124 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..d7c791ef0036 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -13,6 +13,7 @@
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kmemleak.h> 15#include <linux/kmemleak.h>
16#include <linux/range.h>
16 17
17#include <asm/bug.h> 18#include <asm/bug.h>
18#include <asm/io.h> 19#include <asm/io.h>
@@ -32,6 +33,7 @@ unsigned long max_pfn;
32unsigned long saved_max_pfn; 33unsigned long saved_max_pfn;
33#endif 34#endif
34 35
36#ifndef CONFIG_NO_BOOTMEM
35bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 37bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
36 38
37static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 39static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
142 min_low_pfn = start; 144 min_low_pfn = start;
143 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 145 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
144} 146}
145 147#endif
146/* 148/*
147 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
148 * @addr: starting address of the range 150 * @addr: starting address of the range
@@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
167 } 169 }
168} 170}
169 171
172#ifdef CONFIG_NO_BOOTMEM
173static void __init __free_pages_memory(unsigned long start, unsigned long end)
174{
175 int i;
176 unsigned long start_aligned, end_aligned;
177 int order = ilog2(BITS_PER_LONG);
178
179 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
180 end_aligned = end & ~(BITS_PER_LONG - 1);
181
182 if (end_aligned <= start_aligned) {
183#if 1
184 printk(KERN_DEBUG " %lx - %lx\n", start, end);
185#endif
186 for (i = start; i < end; i++)
187 __free_pages_bootmem(pfn_to_page(i), 0);
188
189 return;
190 }
191
192#if 1
193 printk(KERN_DEBUG " %lx %lx - %lx %lx\n",
194 start, start_aligned, end_aligned, end);
195#endif
196 for (i = start; i < start_aligned; i++)
197 __free_pages_bootmem(pfn_to_page(i), 0);
198
199 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
200 __free_pages_bootmem(pfn_to_page(i), order);
201
202 for (i = end_aligned; i < end; i++)
203 __free_pages_bootmem(pfn_to_page(i), 0);
204}
205
206unsigned long __init free_all_memory_core_early(int nodeid)
207{
208 int i;
209 u64 start, end;
210 unsigned long count = 0;
211 struct range *range = NULL;
212 int nr_range;
213
214 nr_range = get_free_all_memory_range(&range, nodeid);
215
216 for (i = 0; i < nr_range; i++) {
217 start = range[i].start;
218 end = range[i].end;
219 count += end - start;
220 __free_pages_memory(start, end);
221 }
222
223 return count;
224}
225#else
170static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 226static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
171{ 227{
172 int aligned; 228 int aligned;
@@ -227,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
227 283
228 return count; 284 return count;
229} 285}
286#endif
230 287
231/** 288/**
232 * free_all_bootmem_node - release a node's free pages to the buddy allocator 289 * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -237,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
237unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 294unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
238{ 295{
239 register_page_bootmem_info_node(pgdat); 296 register_page_bootmem_info_node(pgdat);
297#ifdef CONFIG_NO_BOOTMEM
298 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
299 return 0;
300#else
240 return free_all_bootmem_core(pgdat->bdata); 301 return free_all_bootmem_core(pgdat->bdata);
302#endif
241} 303}
242 304
243/** 305/**
@@ -247,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
247 */ 309 */
248unsigned long __init free_all_bootmem(void) 310unsigned long __init free_all_bootmem(void)
249{ 311{
312#ifdef CONFIG_NO_BOOTMEM
313 return free_all_memory_core_early(NODE_DATA(0)->node_id);
314#else
250 return free_all_bootmem_core(NODE_DATA(0)->bdata); 315 return free_all_bootmem_core(NODE_DATA(0)->bdata);
316#endif
251} 317}
252 318
319#ifndef CONFIG_NO_BOOTMEM
253static void __init __free(bootmem_data_t *bdata, 320static void __init __free(bootmem_data_t *bdata,
254 unsigned long sidx, unsigned long eidx) 321 unsigned long sidx, unsigned long eidx)
255{ 322{
@@ -344,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
344 } 411 }
345 BUG(); 412 BUG();
346} 413}
414#endif
347 415
348/** 416/**
349 * free_bootmem_node - mark a page range as usable 417 * free_bootmem_node - mark a page range as usable
@@ -358,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
358void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 426void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
359 unsigned long size) 427 unsigned long size)
360{ 428{
429#ifdef CONFIG_NO_BOOTMEM
430 free_early(physaddr, physaddr + size);
431#if 0
432 printk(KERN_DEBUG "free %lx %lx\n", physaddr, size);
433#endif
434#else
361 unsigned long start, end; 435 unsigned long start, end;
362 436
363 kmemleak_free_part(__va(physaddr), size); 437 kmemleak_free_part(__va(physaddr), size);
@@ -366,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
366 end = PFN_DOWN(physaddr + size); 440 end = PFN_DOWN(physaddr + size);
367 441
368 mark_bootmem_node(pgdat->bdata, start, end, 0, 0); 442 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
443#endif
369} 444}
370 445
371/** 446/**
@@ -379,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
379 */ 454 */
380void __init free_bootmem(unsigned long addr, unsigned long size) 455void __init free_bootmem(unsigned long addr, unsigned long size)
381{ 456{
457#ifdef CONFIG_NO_BOOTMEM
458 free_early(addr, addr + size);
459#if 0
460 printk(KERN_DEBUG "free %lx %lx\n", addr, size);
461#endif
462#else
382 unsigned long start, end; 463 unsigned long start, end;
383 464
384 kmemleak_free_part(__va(addr), size); 465 kmemleak_free_part(__va(addr), size);
@@ -387,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
387 end = PFN_DOWN(addr + size); 468 end = PFN_DOWN(addr + size);
388 469
389 mark_bootmem(start, end, 0, 0); 470 mark_bootmem(start, end, 0, 0);
471#endif
390} 472}
391 473
392/** 474/**
@@ -403,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
403int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 485int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
404 unsigned long size, int flags) 486 unsigned long size, int flags)
405{ 487{
488#ifdef CONFIG_NO_BOOTMEM
489 panic("no bootmem");
490 return 0;
491#else
406 unsigned long start, end; 492 unsigned long start, end;
407 493
408 start = PFN_DOWN(physaddr); 494 start = PFN_DOWN(physaddr);
409 end = PFN_UP(physaddr + size); 495 end = PFN_UP(physaddr + size);
410 496
411 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 497 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
498#endif
412} 499}
413 500
414/** 501/**
@@ -424,14 +511,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
424int __init reserve_bootmem(unsigned long addr, unsigned long size, 511int __init reserve_bootmem(unsigned long addr, unsigned long size,
425 int flags) 512 int flags)
426{ 513{
514#ifdef CONFIG_NO_BOOTMEM
515 panic("no bootmem");
516 return 0;
517#else
427 unsigned long start, end; 518 unsigned long start, end;
428 519
429 start = PFN_DOWN(addr); 520 start = PFN_DOWN(addr);
430 end = PFN_UP(addr + size); 521 end = PFN_UP(addr + size);
431 522
432 return mark_bootmem(start, end, 1, flags); 523 return mark_bootmem(start, end, 1, flags);
524#endif
433} 525}
434 526
527#ifndef CONFIG_NO_BOOTMEM
435static unsigned long __init align_idx(struct bootmem_data *bdata, 528static unsigned long __init align_idx(struct bootmem_data *bdata,
436 unsigned long idx, unsigned long step) 529 unsigned long idx, unsigned long step)
437{ 530{
@@ -582,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
582#endif 675#endif
583 return NULL; 676 return NULL;
584} 677}
678#endif
585 679
586static void * __init ___alloc_bootmem_nopanic(unsigned long size, 680static void * __init ___alloc_bootmem_nopanic(unsigned long size,
587 unsigned long align, 681 unsigned long align,
588 unsigned long goal, 682 unsigned long goal,
589 unsigned long limit) 683 unsigned long limit)
590{ 684{
685#ifdef CONFIG_NO_BOOTMEM
686 void *ptr;
687
688 if (WARN_ON_ONCE(slab_is_available()))
689 return kzalloc(size, GFP_NOWAIT);
690
691restart:
692
693 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
694
695 if (ptr)
696 return ptr;
697
698 if (goal != 0) {
699 goal = 0;
700 goto restart;
701 }
702
703 return NULL;
704#else
591 bootmem_data_t *bdata; 705 bootmem_data_t *bdata;
592 void *region; 706 void *region;
593 707
@@ -613,6 +727,7 @@ restart:
613 } 727 }
614 728
615 return NULL; 729 return NULL;
730#endif
616} 731}
617 732
618/** 733/**
@@ -631,7 +746,13 @@ restart:
631void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 746void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
632 unsigned long goal) 747 unsigned long goal)
633{ 748{
634 return ___alloc_bootmem_nopanic(size, align, goal, 0); 749 unsigned long limit = 0;
750
751#ifdef CONFIG_NO_BOOTMEM
752 limit = -1UL;
753#endif
754
755 return ___alloc_bootmem_nopanic(size, align, goal, limit);
635} 756}
636 757
637static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, 758static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -665,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
665void * __init __alloc_bootmem(unsigned long size, unsigned long align, 786void * __init __alloc_bootmem(unsigned long size, unsigned long align,
666 unsigned long goal) 787 unsigned long goal)
667{ 788{
668 return ___alloc_bootmem(size, align, goal, 0); 789 unsigned long limit = 0;
790
791#ifdef CONFIG_NO_BOOTMEM
792 limit = -1UL;
793#endif
794
795 return ___alloc_bootmem(size, align, goal, limit);
669} 796}
670 797
798#ifndef CONFIG_NO_BOOTMEM
671static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 799static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
672 unsigned long size, unsigned long align, 800 unsigned long size, unsigned long align,
673 unsigned long goal, unsigned long limit) 801 unsigned long goal, unsigned long limit)
@@ -684,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
684 812
685 return ___alloc_bootmem(size, align, goal, limit); 813 return ___alloc_bootmem(size, align, goal, limit);
686} 814}
815#endif
687 816
688/** 817/**
689 * __alloc_bootmem_node - allocate boot memory from a specific node 818 * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -706,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
706 if (WARN_ON_ONCE(slab_is_available())) 835 if (WARN_ON_ONCE(slab_is_available()))
707 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 836 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
708 837
838#ifdef CONFIG_NO_BOOTMEM
839 return __alloc_memory_core_early(pgdat->node_id, size, align,
840 goal, -1ULL);
841#else
709 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 842 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
843#endif
844}
845
846void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
847 unsigned long align, unsigned long goal)
848{
849#ifdef MAX_DMA32_PFN
850 unsigned long end_pfn;
851
852 if (WARN_ON_ONCE(slab_is_available()))
853 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
854
855 /* update goal according ...MAX_DMA32_PFN */
856 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
857
858 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
859 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
860 void *ptr;
861 unsigned long new_goal;
862
863 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
864#ifdef CONFIG_NO_BOOTMEM
865 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
866 new_goal, -1ULL);
867#else
868 ptr = alloc_bootmem_core(pgdat->bdata, size, align,
869 new_goal, 0);
870#endif
871 if (ptr)
872 return ptr;
873 }
874#endif
875
876 return __alloc_bootmem_node(pgdat, size, align, goal);
877
710} 878}
711 879
712#ifdef CONFIG_SPARSEMEM 880#ifdef CONFIG_SPARSEMEM
@@ -720,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
720void * __init alloc_bootmem_section(unsigned long size, 888void * __init alloc_bootmem_section(unsigned long size,
721 unsigned long section_nr) 889 unsigned long section_nr)
722{ 890{
891#ifdef CONFIG_NO_BOOTMEM
892 unsigned long pfn, goal, limit;
893
894 pfn = section_nr_to_pfn(section_nr);
895 goal = pfn << PAGE_SHIFT;
896 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
897
898 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
899 SMP_CACHE_BYTES, goal, limit);
900#else
723 bootmem_data_t *bdata; 901 bootmem_data_t *bdata;
724 unsigned long pfn, goal, limit; 902 unsigned long pfn, goal, limit;
725 903
@@ -729,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size,
729 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 907 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
730 908
731 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 909 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
910#endif
732} 911}
733#endif 912#endif
734 913
@@ -740,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
740 if (WARN_ON_ONCE(slab_is_available())) 919 if (WARN_ON_ONCE(slab_is_available()))
741 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 920 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
742 921
922#ifdef CONFIG_NO_BOOTMEM
923 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
924 goal, -1ULL);
925#else
743 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 926 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
744 if (ptr) 927 if (ptr)
745 return ptr; 928 return ptr;
746 929
747 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 930 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
931#endif
748 if (ptr) 932 if (ptr)
749 return ptr; 933 return ptr;
750 934
@@ -795,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
795 if (WARN_ON_ONCE(slab_is_available())) 979 if (WARN_ON_ONCE(slab_is_available()))
796 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 980 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
797 981
982#ifdef CONFIG_NO_BOOTMEM
983 return __alloc_memory_core_early(pgdat->node_id, size, align,
984 goal, ARCH_LOW_ADDRESS_LIMIT);
985#else
798 return ___alloc_bootmem_node(pgdat->bdata, size, align, 986 return ___alloc_bootmem_node(pgdat->bdata, size, align,
799 goal, ARCH_LOW_ADDRESS_LIMIT); 987 goal, ARCH_LOW_ADDRESS_LIMIT);
988#endif
800} 989}
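The NO_BOOTMEM path added above does not walk a bootmem bitmap: __free_pages_memory() hands the unaligned head and tail of each PFN range to the buddy allocator one page at a time, and releases the BITS_PER_LONG-aligned middle in order-ilog2(BITS_PER_LONG) blocks. The standalone userspace sketch below (illustrative only, not kernel code; the helper name and the sample range are made up) replays the same alignment arithmetic:

/*
 * Userspace sketch of the range-splitting arithmetic used by
 * __free_pages_memory() above.  It only prints which PFNs would be freed
 * as single pages and which as BITS_PER_LONG-sized blocks; it does not
 * touch any kernel interface.
 */
#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(long))

static void show_free_pattern(unsigned long start, unsigned long end)
{
	unsigned long mask = BITS_PER_LONG - 1;
	unsigned long start_aligned = (start + mask) & ~mask;
	unsigned long end_aligned = end & ~mask;
	unsigned long i;

	if (end_aligned <= start_aligned) {
		/* Range too small for an aligned block: free page by page. */
		for (i = start; i < end; i++)
			printf("order-0 free at pfn %lu\n", i);
		return;
	}

	for (i = start; i < start_aligned; i++)		/* unaligned head */
		printf("order-0 free at pfn %lu\n", i);
	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
		printf("block free (%d pages) at pfn %lu\n", BITS_PER_LONG, i);
	for (i = end_aligned; i < end; i++)		/* unaligned tail */
		printf("order-0 free at pfn %lu\n", i);
}

int main(void)
{
	show_free_pattern(5, 200);	/* arbitrary example range */
	return 0;
}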
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
77 switch (advice) { 77 switch (advice) {
78 case POSIX_FADV_NORMAL: 78 case POSIX_FADV_NORMAL:
79 file->f_ra.ra_pages = bdi->ra_pages; 79 file->f_ra.ra_pages = bdi->ra_pages;
80 spin_lock(&file->f_lock);
81 file->f_mode &= ~FMODE_RANDOM;
82 spin_unlock(&file->f_lock);
80 break; 83 break;
81 case POSIX_FADV_RANDOM: 84 case POSIX_FADV_RANDOM:
82 file->f_ra.ra_pages = 0; 85 spin_lock(&file->f_lock);
86 file->f_mode |= FMODE_RANDOM;
87 spin_unlock(&file->f_lock);
83 break; 88 break;
84 case POSIX_FADV_SEQUENTIAL: 89 case POSIX_FADV_SEQUENTIAL:
85 file->f_ra.ra_pages = bdi->ra_pages * 2; 90 file->f_ra.ra_pages = bdi->ra_pages * 2;
91 spin_lock(&file->f_lock);
92 file->f_mode &= ~FMODE_RANDOM;
93 spin_unlock(&file->f_lock);
86 break; 94 break;
87 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
88 if (!mapping->a_ops->readpage) { 96 if (!mapping->a_ops->readpage) {
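With the hunk above, POSIX_FADV_RANDOM no longer zeroes the per-file readahead window; it sets FMODE_RANDOM under file->f_lock, and POSIX_FADV_NORMAL/SEQUENTIAL clear it again. A minimal userspace caller that exercises these branches might look like the sketch below (error handling kept to a minimum; the file path is arbitrary):

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int err;
	int fd = open("/etc/hostname", O_RDONLY);	/* any readable file will do */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Hint random access: the kernel now sets FMODE_RANDOM under f_lock. */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
	if (err)
		fprintf(stderr, "POSIX_FADV_RANDOM: %s\n", strerror(err));

	/* Back to the default: FMODE_RANDOM is cleared, normal readahead restored. */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_NORMAL);
	if (err)
		fprintf(stderr, "POSIX_FADV_NORMAL: %s\n", strerror(err));

	close(fd);
	return 0;
}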
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..bb41f98dd8b7 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,22 @@
1#include <linux/fault-inject.h> 1#include <linux/fault-inject.h>
2#include <linux/gfp.h> 2#include <linux/gfp.h>
3#include <linux/slab.h>
3 4
4static struct { 5static struct {
5 struct fault_attr attr; 6 struct fault_attr attr;
6 u32 ignore_gfp_wait; 7 u32 ignore_gfp_wait;
8 int cache_filter;
7#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 9#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
8 struct dentry *ignore_gfp_wait_file; 10 struct dentry *ignore_gfp_wait_file;
11 struct dentry *cache_filter_file;
9#endif 12#endif
10} failslab = { 13} failslab = {
11 .attr = FAULT_ATTR_INITIALIZER, 14 .attr = FAULT_ATTR_INITIALIZER,
12 .ignore_gfp_wait = 1, 15 .ignore_gfp_wait = 1,
16 .cache_filter = 0,
13}; 17};
14 18
15bool should_failslab(size_t size, gfp_t gfpflags) 19bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
16{ 20{
17 if (gfpflags & __GFP_NOFAIL) 21 if (gfpflags & __GFP_NOFAIL)
18 return false; 22 return false;
@@ -20,6 +24,9 @@ bool should_failslab(size_t size, gfp_t gfpflags)
20 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) 24 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
21 return false; 25 return false;
22 26
27 if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
28 return false;
29
23 return should_fail(&failslab.attr, size); 30 return should_fail(&failslab.attr, size);
24} 31}
25 32
@@ -30,7 +37,6 @@ static int __init setup_failslab(char *str)
30__setup("failslab=", setup_failslab); 37__setup("failslab=", setup_failslab);
31 38
32#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 39#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
33
34static int __init failslab_debugfs_init(void) 40static int __init failslab_debugfs_init(void)
35{ 41{
36 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 42 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
@@ -46,8 +52,14 @@ static int __init failslab_debugfs_init(void)
46 debugfs_create_bool("ignore-gfp-wait", mode, dir, 52 debugfs_create_bool("ignore-gfp-wait", mode, dir,
47 &failslab.ignore_gfp_wait); 53 &failslab.ignore_gfp_wait);
48 54
49 if (!failslab.ignore_gfp_wait_file) { 55 failslab.cache_filter_file =
56 debugfs_create_bool("cache-filter", mode, dir,
57 &failslab.cache_filter);
58
59 if (!failslab.ignore_gfp_wait_file ||
60 !failslab.cache_filter_file) {
50 err = -ENOMEM; 61 err = -ENOMEM;
62 debugfs_remove(failslab.cache_filter_file);
51 debugfs_remove(failslab.ignore_gfp_wait_file); 63 debugfs_remove(failslab.ignore_gfp_wait_file);
52 cleanup_fault_attr_dentries(&failslab.attr); 64 cleanup_fault_attr_dentries(&failslab.attr);
53 } 65 }
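The failslab changes above add a "cache-filter" debugfs knob and a cache_flags argument to should_failslab(): when the filter is enabled, failures are injected only into slab caches created with SLAB_FAILSLAB. The standalone sketch below mirrors that decision logic in userspace; the flag values are illustrative stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define GFP_NOFAIL    (1u << 0)	/* stand-in for the kernel's __GFP_NOFAIL */
#define GFP_WAIT      (1u << 1)	/* stand-in for __GFP_WAIT */
#define SLAB_FAILSLAB (1u << 2)	/* stand-in for the per-cache opt-in flag */

struct failslab_cfg {
	bool ignore_gfp_wait;
	bool cache_filter;
};

static bool should_failslab(const struct failslab_cfg *cfg,
			    unsigned int gfpflags, unsigned int cache_flags)
{
	if (gfpflags & GFP_NOFAIL)
		return false;
	if (cfg->ignore_gfp_wait && (gfpflags & GFP_WAIT))
		return false;
	/* New check: skip caches that did not opt in via SLAB_FAILSLAB. */
	if (cfg->cache_filter && !(cache_flags & SLAB_FAILSLAB))
		return false;
	return true;	/* the kernel would now consult the fault_attr state */
}

int main(void)
{
	struct failslab_cfg cfg = { .ignore_gfp_wait = true, .cache_filter = true };

	printf("opted-in cache:  %d\n", should_failslab(&cfg, 0, SLAB_FAILSLAB));
	printf("ordinary cache:  %d\n", should_failslab(&cfg, 0, 0));
	return 0;
}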
diff --git a/mm/filemap.c b/mm/filemap.c
index 698ea80f2102..045b31c37653 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1117,7 +1117,7 @@ readpage:
1117 if (!PageUptodate(page)) { 1117 if (!PageUptodate(page)) {
1118 if (page->mapping == NULL) { 1118 if (page->mapping == NULL) {
1119 /* 1119 /*
1120 * invalidate_inode_pages got it 1120 * invalidate_mapping_pages got it
1121 */ 1121 */
1122 unlock_page(page); 1122 unlock_page(page);
1123 page_cache_release(page); 1123 page_cache_release(page);
@@ -1986,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count);
1986inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) 1986inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1987{ 1987{
1988 struct inode *inode = file->f_mapping->host; 1988 struct inode *inode = file->f_mapping->host;
1989 unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 1989 unsigned long limit = rlimit(RLIMIT_FSIZE);
1990 1990
1991 if (unlikely(*pos < 0)) 1991 if (unlikely(*pos < 0))
1992 return -EINVAL; 1992 return -EINVAL;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d71bb8..78b94f0b6d5d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,7 +194,7 @@ retry:
194 flush_cache_page(vma, address, pte_pfn(*pte)); 194 flush_cache_page(vma, address, pte_pfn(*pte));
195 pteval = ptep_clear_flush_notify(vma, address, pte); 195 pteval = ptep_clear_flush_notify(vma, address, pte);
196 page_remove_rmap(page); 196 page_remove_rmap(page);
197 dec_mm_counter(mm, file_rss); 197 dec_mm_counter(mm, MM_FILEPAGES);
198 BUG_ON(pte_dirty(pteval)); 198 BUG_ON(pte_dirty(pteval));
199 pte_unmap_unlock(pte, ptl); 199 pte_unmap_unlock(pte, ptl);
200 page_cache_release(page); 200 page_cache_release(page);
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85abbb39..46f5dacf90a2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
40 page_remove_rmap(page); 40 page_remove_rmap(page);
41 page_cache_release(page); 41 page_cache_release(page);
42 update_hiwater_rss(mm); 42 update_hiwater_rss(mm);
43 dec_mm_counter(mm, file_rss); 43 dec_mm_counter(mm, MM_FILEPAGES);
44 } 44 }
45 } else { 45 } else {
46 if (!pte_file(pte)) 46 if (!pte_file(pte))
diff --git a/mm/highmem.c b/mm/highmem.c
index 9c1e627f282e..bed8a8bfd01f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -220,7 +220,7 @@ EXPORT_SYMBOL(kmap_high);
220 * @page: &struct page to pin 220 * @page: &struct page to pin
221 * 221 *
222 * Returns the page's current virtual memory address, or NULL if no mapping 222 * Returns the page's current virtual memory address, or NULL if no mapping
223 * exists. When and only when a non null address is returned then a 223 * exists. If and only if a non null address is returned then a
224 * matching call to kunmap_high() is necessary. 224 * matching call to kunmap_high() is necessary.
225 * 225 *
226 * This can be called from any context. 226 * This can be called from any context.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d16fa6b8c2d..3a5aeb37c110 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2087,7 +2087,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2087 2087
2088 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2088 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2089 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2089 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
2090 update_mmu_cache(vma, address, entry); 2090 update_mmu_cache(vma, address, ptep);
2091 } 2091 }
2092} 2092}
2093 2093
@@ -2558,7 +2558,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2558 entry = pte_mkyoung(entry); 2558 entry = pte_mkyoung(entry);
2559 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 2559 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2560 flags & FAULT_FLAG_WRITE)) 2560 flags & FAULT_FLAG_WRITE))
2561 update_mmu_cache(vma, address, entry); 2561 update_mmu_cache(vma, address, ptep);
2562 2562
2563out_page_table_lock: 2563out_page_table_lock:
2564 spin_unlock(&mm->page_table_lock); 2564 spin_unlock(&mm->page_table_lock);
diff --git a/mm/ksm.c b/mm/ksm.c
index 56a0da1f9979..a93f1b7f508c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1563again: 1563again:
1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1565 struct anon_vma *anon_vma = rmap_item->anon_vma; 1565 struct anon_vma *anon_vma = rmap_item->anon_vma;
1566 struct anon_vma_chain *vmac;
1566 struct vm_area_struct *vma; 1567 struct vm_area_struct *vma;
1567 1568
1568 spin_lock(&anon_vma->lock); 1569 spin_lock(&anon_vma->lock);
1569 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1570 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1571 vma = vmac->vma;
1570 if (rmap_item->address < vma->vm_start || 1572 if (rmap_item->address < vma->vm_start ||
1571 rmap_item->address >= vma->vm_end) 1573 rmap_item->address >= vma->vm_end)
1572 continue; 1574 continue;
@@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1614again: 1616again:
1615 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1617 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1616 struct anon_vma *anon_vma = rmap_item->anon_vma; 1618 struct anon_vma *anon_vma = rmap_item->anon_vma;
1619 struct anon_vma_chain *vmac;
1617 struct vm_area_struct *vma; 1620 struct vm_area_struct *vma;
1618 1621
1619 spin_lock(&anon_vma->lock); 1622 spin_lock(&anon_vma->lock);
1620 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1623 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1624 vma = vmac->vma;
1621 if (rmap_item->address < vma->vm_start || 1625 if (rmap_item->address < vma->vm_start ||
1622 rmap_item->address >= vma->vm_end) 1626 rmap_item->address >= vma->vm_end)
1623 continue; 1627 continue;
@@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1664again: 1668again:
1665 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1669 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1666 struct anon_vma *anon_vma = rmap_item->anon_vma; 1670 struct anon_vma *anon_vma = rmap_item->anon_vma;
1671 struct anon_vma_chain *vmac;
1667 struct vm_area_struct *vma; 1672 struct vm_area_struct *vma;
1668 1673
1669 spin_lock(&anon_vma->lock); 1674 spin_lock(&anon_vma->lock);
1670 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1675 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1676 vma = vmac->vma;
1671 if (rmap_item->address < vma->vm_start || 1677 if (rmap_item->address < vma->vm_start ||
1672 rmap_item->address >= vma->vm_end) 1678 rmap_item->address >= vma->vm_end)
1673 continue; 1679 continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 954032b80bed..7973b5221fb8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov
12 *
9 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by 14 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or 15 * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +25,7 @@
21#include <linux/memcontrol.h> 25#include <linux/memcontrol.h>
22#include <linux/cgroup.h> 26#include <linux/cgroup.h>
23#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/hugetlb.h>
24#include <linux/pagemap.h> 29#include <linux/pagemap.h>
25#include <linux/smp.h> 30#include <linux/smp.h>
26#include <linux/page-flags.h> 31#include <linux/page-flags.h>
@@ -32,7 +37,10 @@
32#include <linux/rbtree.h> 37#include <linux/rbtree.h>
33#include <linux/slab.h> 38#include <linux/slab.h>
34#include <linux/swap.h> 39#include <linux/swap.h>
40#include <linux/swapops.h>
35#include <linux/spinlock.h> 41#include <linux/spinlock.h>
42#include <linux/eventfd.h>
43#include <linux/sort.h>
36#include <linux/fs.h> 44#include <linux/fs.h>
37#include <linux/seq_file.h> 45#include <linux/seq_file.h>
38#include <linux/vmalloc.h> 46#include <linux/vmalloc.h>
@@ -55,7 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
55#define do_swap_account (0) 63#define do_swap_account (0)
56#endif 64#endif
57 65
58#define SOFTLIMIT_EVENTS_THRESH (1000) 66/*
67 * Per memcg event counter is incremented at every pagein/pageout. This counter
 68 * is used to trigger some periodic events. This is straightforward and better
 69 * than using jiffies etc. to handle periodic memcg events.
70 *
71 * These values will be used as !((event) & ((1 <<(thresh)) - 1))
72 */
73#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
74#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
59 75
60/* 76/*
61 * Statistics for memory cgroup. 77 * Statistics for memory cgroup.
@@ -69,62 +85,16 @@ enum mem_cgroup_stat_index {
69 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 85 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 86 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 87 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 88 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
89 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
74 90
75 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
76}; 92};
77 93
78struct mem_cgroup_stat_cpu { 94struct mem_cgroup_stat_cpu {
79 s64 count[MEM_CGROUP_STAT_NSTATS]; 95 s64 count[MEM_CGROUP_STAT_NSTATS];
80} ____cacheline_aligned_in_smp;
81
82struct mem_cgroup_stat {
83 struct mem_cgroup_stat_cpu cpustat[0];
84}; 96};
85 97
86static inline void
87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 stat->count[idx] = 0;
91}
92
93static inline s64
94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 enum mem_cgroup_stat_index idx)
96{
97 return stat->count[idx];
98}
99
100/*
101 * For accounting under irq disable, no need for increment preempt count.
102 */
103static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
104 enum mem_cgroup_stat_index idx, int val)
105{
106 stat->count[idx] += val;
107}
108
109static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
110 enum mem_cgroup_stat_index idx)
111{
112 int cpu;
113 s64 ret = 0;
114 for_each_possible_cpu(cpu)
115 ret += stat->cpustat[cpu].count[idx];
116 return ret;
117}
118
119static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
120{
121 s64 ret;
122
123 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
124 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
125 return ret;
126}
127
128/* 98/*
129 * per-zone information in memory controller. 99 * per-zone information in memory controller.
130 */ 100 */
@@ -174,6 +144,22 @@ struct mem_cgroup_tree {
174 144
175static struct mem_cgroup_tree soft_limit_tree __read_mostly; 145static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176 146
147struct mem_cgroup_threshold {
148 struct eventfd_ctx *eventfd;
149 u64 threshold;
150};
151
152struct mem_cgroup_threshold_ary {
153 /* An array index points to threshold just below usage. */
154 atomic_t current_threshold;
155 /* Size of entries[] */
156 unsigned int size;
157 /* Array of thresholds */
158 struct mem_cgroup_threshold entries[0];
159};
160
161static void mem_cgroup_threshold(struct mem_cgroup *mem);
162
177/* 163/*
178 * The memory controller data structure. The memory controller controls both 164 * The memory controller data structure. The memory controller controls both
179 * page cache and RSS per cgroup. We would eventually like to provide 165 * page cache and RSS per cgroup. We would eventually like to provide
@@ -217,7 +203,7 @@ struct mem_cgroup {
217 * Should the accounting and control be hierarchical, per subtree? 203 * Should the accounting and control be hierarchical, per subtree?
218 */ 204 */
219 bool use_hierarchy; 205 bool use_hierarchy;
220 unsigned long last_oom_jiffies; 206 atomic_t oom_lock;
221 atomic_t refcnt; 207 atomic_t refcnt;
222 208
223 unsigned int swappiness; 209 unsigned int swappiness;
@@ -225,10 +211,48 @@ struct mem_cgroup {
225 /* set when res.limit == memsw.limit */ 211 /* set when res.limit == memsw.limit */
226 bool memsw_is_minimum; 212 bool memsw_is_minimum;
227 213
214 /* protect arrays of thresholds */
215 struct mutex thresholds_lock;
216
217 /* thresholds for memory usage. RCU-protected */
218 struct mem_cgroup_threshold_ary *thresholds;
219
220 /* thresholds for mem+swap usage. RCU-protected */
221 struct mem_cgroup_threshold_ary *memsw_thresholds;
222
228 /* 223 /*
229 * statistics. This must be placed at the end of memcg. 224 * Should we move charges of a task when a task is moved into this
225 * mem_cgroup ? And what type of charges should we move ?
230 */ 226 */
231 struct mem_cgroup_stat stat; 227 unsigned long move_charge_at_immigrate;
228
229 /*
230 * percpu counter.
231 */
232 struct mem_cgroup_stat_cpu *stat;
233};
234
235/* Stuffs for move charges at task migration. */
236/*
 237 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
238 * left-shifted bitmap of these types.
239 */
240enum move_type {
241 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
242 NR_MOVE_TYPE,
243};
244
245/* "mc" and its members are protected by cgroup_mutex */
246static struct move_charge_struct {
247 struct mem_cgroup *from;
248 struct mem_cgroup *to;
249 unsigned long precharge;
250 unsigned long moved_charge;
251 unsigned long moved_swap;
252 struct task_struct *moving_task; /* a task moving charges */
253 wait_queue_head_t waitq; /* a waitq for other context */
254} mc = {
255 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
232}; 256};
233 257
234/* 258/*
@@ -371,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
371 spin_unlock(&mctz->lock); 395 spin_unlock(&mctz->lock);
372} 396}
373 397
374static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
375{
376 bool ret = false;
377 int cpu;
378 s64 val;
379 struct mem_cgroup_stat_cpu *cpustat;
380
381 cpu = get_cpu();
382 cpustat = &mem->stat.cpustat[cpu];
383 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
384 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
385 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
386 ret = true;
387 }
388 put_cpu();
389 return ret;
390}
391 398
392static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 399static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
393{ 400{
@@ -481,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
481 return mz; 488 return mz;
482} 489}
483 490
491static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
492 enum mem_cgroup_stat_index idx)
493{
494 int cpu;
495 s64 val = 0;
496
497 for_each_possible_cpu(cpu)
498 val += per_cpu(mem->stat->count[idx], cpu);
499 return val;
500}
501
502static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
503{
504 s64 ret;
505
506 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
507 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
508 return ret;
509}
510
484static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 511static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
485 bool charge) 512 bool charge)
486{ 513{
487 int val = (charge) ? 1 : -1; 514 int val = (charge) ? 1 : -1;
488 struct mem_cgroup_stat *stat = &mem->stat; 515 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
489 struct mem_cgroup_stat_cpu *cpustat;
490 int cpu = get_cpu();
491
492 cpustat = &stat->cpustat[cpu];
493 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
494 put_cpu();
495} 516}
496 517
497static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 518static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -499,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
499 bool charge) 520 bool charge)
500{ 521{
501 int val = (charge) ? 1 : -1; 522 int val = (charge) ? 1 : -1;
502 struct mem_cgroup_stat *stat = &mem->stat;
503 struct mem_cgroup_stat_cpu *cpustat;
504 int cpu = get_cpu();
505 523
506 cpustat = &stat->cpustat[cpu]; 524 preempt_disable();
525
507 if (PageCgroupCache(pc)) 526 if (PageCgroupCache(pc))
508 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 527 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
509 else 528 else
510 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); 529 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
511 530
512 if (charge) 531 if (charge)
513 __mem_cgroup_stat_add_safe(cpustat, 532 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
514 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
515 else 533 else
516 __mem_cgroup_stat_add_safe(cpustat, 534 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
517 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 535 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
518 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); 536
519 put_cpu(); 537 preempt_enable();
520} 538}
521 539
522static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 540static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -534,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
534 return total; 552 return total;
535} 553}
536 554
555static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
556{
557 s64 val;
558
559 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
560
561 return !(val & ((1 << event_mask_shift) - 1));
562}
563
564/*
565 * Check events in order.
566 *
567 */
568static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
569{
570 /* threshold event is triggered in finer grain than soft limit */
571 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
572 mem_cgroup_threshold(mem);
573 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
574 mem_cgroup_update_tree(mem, page);
575 }
576}
577
537static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 578static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
538{ 579{
539 return container_of(cgroup_subsys_state(cont, 580 return container_of(cgroup_subsys_state(cont,
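The new __memcg_event_check() above relies on the per-cpu MEM_CGROUP_EVENTS counter being incremented on every pagein/pageout, so !(val & ((1 << thresh) - 1)) is true exactly once every 2^thresh events: every 128 events for the threshold check and every 1024 for the soft-limit check. A tiny standalone demonstration of that arithmetic, assuming nothing beyond the constants shown above:

#include <stdio.h>

#define THRESHOLDS_EVENTS_THRESH 7	/* once in 128 */
#define SOFTLIMIT_EVENTS_THRESH 10	/* once in 1024 */

static int event_check(unsigned long val, int thresh)
{
	/* Mirrors __memcg_event_check(): true when the low bits are all zero. */
	return !(val & ((1UL << thresh) - 1));
}

int main(void)
{
	unsigned long val, hits = 0;

	for (val = 1; val <= 100000; val++)
		if (event_check(val, THRESHOLDS_EVENTS_THRESH))
			hits++;

	/* 100000 / 128 = 781 (integer division), so 781 hits are expected. */
	printf("threshold check fired %lu times in 100000 events\n", hits);
	return 0;
}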
@@ -1000,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1000} 1041}
1001 1042
1002/** 1043/**
1003 * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. 1044 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1004 * @memcg: The memory cgroup that went over limit 1045 * @memcg: The memory cgroup that went over limit
1005 * @p: Task that is going to be killed 1046 * @p: Task that is going to be killed
1006 * 1047 *
@@ -1174,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1174 } 1215 }
1175 } 1216 }
1176 } 1217 }
1177 if (!mem_cgroup_local_usage(&victim->stat)) { 1218 if (!mem_cgroup_local_usage(victim)) {
1178 /* this cgroup's local usage == 0 */ 1219 /* this cgroup's local usage == 0 */
1179 css_put(&victim->css); 1220 css_put(&victim->css);
1180 continue; 1221 continue;
@@ -1205,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1205 return total; 1246 return total;
1206} 1247}
1207 1248
1208bool mem_cgroup_oom_called(struct task_struct *task) 1249static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1209{ 1250{
1210 bool ret = false; 1251 int *val = (int *)data;
1211 struct mem_cgroup *mem; 1252 int x;
1212 struct mm_struct *mm; 1253 /*
1254 * Logically, we can stop scanning immediately when we find
1255 * a memcg is already locked. But considering unlock ops and
1256 * creation/removal of memcg, scanning them all is the simpler operation.
1257 */
1258 x = atomic_inc_return(&mem->oom_lock);
1259 *val = max(x, *val);
1260 return 0;
1261}
1262/*
1263 * Check OOM-Killer is already running under our hierarchy.
1264 * If someone is running, return false.
1265 */
1266static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1267{
1268 int lock_count = 0;
1213 1269
1214 rcu_read_lock(); 1270 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
1215 mm = task->mm; 1271
1216 if (!mm) 1272 if (lock_count == 1)
1217 mm = &init_mm; 1273 return true;
1218 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1274 return false;
1219 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
1220 ret = true;
1221 rcu_read_unlock();
1222 return ret;
1223} 1275}
1224 1276
1225static int record_last_oom_cb(struct mem_cgroup *mem, void *data) 1277static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1226{ 1278{
1227 mem->last_oom_jiffies = jiffies; 1279 /*
1280 * When a new child is created while the hierarchy is under oom,
1281 * mem_cgroup_oom_lock() may not be called. We have to use
1282 * atomic_add_unless() here.
1283 */
1284 atomic_add_unless(&mem->oom_lock, -1, 0);
1228 return 0; 1285 return 0;
1229} 1286}
1230 1287
1231static void record_last_oom(struct mem_cgroup *mem) 1288static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1232{ 1289{
1233 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); 1290 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1291}
1292
1293static DEFINE_MUTEX(memcg_oom_mutex);
1294static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1295
1296/*
1297 * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
1298 */
1299bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1300{
1301 DEFINE_WAIT(wait);
1302 bool locked;
1303
1304 /* At first, try to OOM lock hierarchy under mem.*/
1305 mutex_lock(&memcg_oom_mutex);
1306 locked = mem_cgroup_oom_lock(mem);
1307 /*
1308 * Even if signal_pending(), we can't quit charge() loop without
1309 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1310 * under OOM is always welcome, so use TASK_KILLABLE here.
1311 */
1312 if (!locked)
1313 prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
1314 mutex_unlock(&memcg_oom_mutex);
1315
1316 if (locked)
1317 mem_cgroup_out_of_memory(mem, mask);
1318 else {
1319 schedule();
1320 finish_wait(&memcg_oom_waitq, &wait);
1321 }
1322 mutex_lock(&memcg_oom_mutex);
1323 mem_cgroup_oom_unlock(mem);
1324 /*
1325 * Here, we use a global waitq ... would a more fine-grained waitq be better?
1326 * Assume the following hierarchy.
1327 * A/
1328 * 01
1329 * 02
1330 * Assume OOM happens in both A and 01 at the same time. They are
1331 * mutually exclusive by the lock. (A kill in 01 helps A.)
1332 * When we use a per-memcg waitq, we have to wake up waiters on A and 02
1333 * in addition to the waiters on 01. We use a global waitq to avoid that mess.
1334 * It will not be a big problem.
1335 * (And a task may be moved to other groups while it's waiting for OOM.)
1336 */
1337 wake_up_all(&memcg_oom_waitq);
1338 mutex_unlock(&memcg_oom_mutex);
1339
1340 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1341 return false;
1342 /* Give chance to dying process */
1343 schedule_timeout(1);
1344 return true;
1234} 1345}
1235 1346
1236/* 1347/*
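mem_cgroup_oom_lock() above takes the hierarchy-wide OOM lock by incrementing every memcg's oom_lock counter and keeping the maximum value seen: only if that maximum is 1 was no memcg in the subtree already locked by another OOM path. The standalone sketch below models the same increment-and-take-max test with plain ints (not the kernel's atomic_t), for a made-up subtree of three memcgs:

#include <stdbool.h>
#include <stdio.h>

#define NR_MEMCG 3

static int oom_lock[NR_MEMCG];	/* one counter per memcg in the subtree */

static bool hierarchy_oom_lock(void)
{
	int i, lock_count = 0;

	for (i = 0; i < NR_MEMCG; i++) {
		int x = ++oom_lock[i];	/* atomic_inc_return() in the kernel */
		if (x > lock_count)
			lock_count = x;
	}
	return lock_count == 1;	/* 1 means nobody else held any of them */
}

static void hierarchy_oom_unlock(void)
{
	int i;

	for (i = 0; i < NR_MEMCG; i++)
		if (oom_lock[i] > 0)	/* atomic_add_unless(..., -1, 0) in the kernel */
			oom_lock[i]--;
}

int main(void)
{
	printf("first locker wins:   %d\n", hierarchy_oom_lock());	/* prints 1 */
	printf("second locker loses: %d\n", hierarchy_oom_lock());	/* prints 0 */
	hierarchy_oom_unlock();
	hierarchy_oom_unlock();
	return 0;
}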
@@ -1240,9 +1351,6 @@ static void record_last_oom(struct mem_cgroup *mem)
1240void mem_cgroup_update_file_mapped(struct page *page, int val) 1351void mem_cgroup_update_file_mapped(struct page *page, int val)
1241{ 1352{
1242 struct mem_cgroup *mem; 1353 struct mem_cgroup *mem;
1243 struct mem_cgroup_stat *stat;
1244 struct mem_cgroup_stat_cpu *cpustat;
1245 int cpu;
1246 struct page_cgroup *pc; 1354 struct page_cgroup *pc;
1247 1355
1248 pc = lookup_page_cgroup(page); 1356 pc = lookup_page_cgroup(page);
@@ -1258,13 +1366,10 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
1258 goto done; 1366 goto done;
1259 1367
1260 /* 1368 /*
1261 * Preemption is already disabled, we don't need get_cpu() 1369 * Preemption is already disabled. We can use __this_cpu_xxx
1262 */ 1370 */
1263 cpu = smp_processor_id(); 1371 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
1264 stat = &mem->stat;
1265 cpustat = &stat->cpustat[cpu];
1266 1372
1267 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1268done: 1373done:
1269 unlock_page_cgroup(pc); 1374 unlock_page_cgroup(pc);
1270} 1375}
@@ -1401,19 +1506,21 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1401 * oom-killer can be invoked. 1506 * oom-killer can be invoked.
1402 */ 1507 */
1403static int __mem_cgroup_try_charge(struct mm_struct *mm, 1508static int __mem_cgroup_try_charge(struct mm_struct *mm,
1404 gfp_t gfp_mask, struct mem_cgroup **memcg, 1509 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1405 bool oom, struct page *page)
1406{ 1510{
1407 struct mem_cgroup *mem, *mem_over_limit; 1511 struct mem_cgroup *mem, *mem_over_limit;
1408 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1512 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1409 struct res_counter *fail_res; 1513 struct res_counter *fail_res;
1410 int csize = CHARGE_SIZE; 1514 int csize = CHARGE_SIZE;
1411 1515
1412 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1516 /*
1413 /* Don't account this! */ 1517 * Unlike the global VM's OOM kill, we are not in a system-wide
1414 *memcg = NULL; 1518 * memory shortage. So, allow a dying process to go ahead, in addition
1415 return 0; 1519 * to the MEMDIE process.
1416 } 1520 */
1521 if (unlikely(test_thread_flag(TIF_MEMDIE)
1522 || fatal_signal_pending(current)))
1523 goto bypass;
1417 1524
1418 /* 1525 /*
1419 * We always charge the cgroup the mm_struct belongs to. 1526 * We always charge the cgroup the mm_struct belongs to.
@@ -1440,7 +1547,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1440 unsigned long flags = 0; 1547 unsigned long flags = 0;
1441 1548
1442 if (consume_stock(mem)) 1549 if (consume_stock(mem))
1443 goto charged; 1550 goto done;
1444 1551
1445 ret = res_counter_charge(&mem->res, csize, &fail_res); 1552 ret = res_counter_charge(&mem->res, csize, &fail_res);
1446 if (likely(!ret)) { 1553 if (likely(!ret)) {
@@ -1483,28 +1590,70 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1483 if (mem_cgroup_check_under_limit(mem_over_limit)) 1590 if (mem_cgroup_check_under_limit(mem_over_limit))
1484 continue; 1591 continue;
1485 1592
1593 /* try to avoid oom while someone is moving charge */
1594 if (mc.moving_task && current != mc.moving_task) {
1595 struct mem_cgroup *from, *to;
1596 bool do_continue = false;
1597 /*
1598 * There is a small race that "from" or "to" can be
1599 * freed by rmdir, so we use css_tryget().
1600 */
1601 rcu_read_lock();
1602 from = mc.from;
1603 to = mc.to;
1604 if (from && css_tryget(&from->css)) {
1605 if (mem_over_limit->use_hierarchy)
1606 do_continue = css_is_ancestor(
1607 &from->css,
1608 &mem_over_limit->css);
1609 else
1610 do_continue = (from == mem_over_limit);
1611 css_put(&from->css);
1612 }
1613 if (!do_continue && to && css_tryget(&to->css)) {
1614 if (mem_over_limit->use_hierarchy)
1615 do_continue = css_is_ancestor(
1616 &to->css,
1617 &mem_over_limit->css);
1618 else
1619 do_continue = (to == mem_over_limit);
1620 css_put(&to->css);
1621 }
1622 rcu_read_unlock();
1623 if (do_continue) {
1624 DEFINE_WAIT(wait);
1625 prepare_to_wait(&mc.waitq, &wait,
1626 TASK_INTERRUPTIBLE);
1627 /* moving charge context might have finished. */
1628 if (mc.moving_task)
1629 schedule();
1630 finish_wait(&mc.waitq, &wait);
1631 continue;
1632 }
1633 }
1634
1486 if (!nr_retries--) { 1635 if (!nr_retries--) {
1487 if (oom) { 1636 if (!oom)
1488 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1637 goto nomem;
1489 record_last_oom(mem_over_limit); 1638 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1639 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1640 continue;
1490 } 1641 }
1491 goto nomem; 1642 /* When we reach here, current task is dying .*/
1643 css_put(&mem->css);
1644 goto bypass;
1492 } 1645 }
1493 } 1646 }
1494 if (csize > PAGE_SIZE) 1647 if (csize > PAGE_SIZE)
1495 refill_stock(mem, csize - PAGE_SIZE); 1648 refill_stock(mem, csize - PAGE_SIZE);
1496charged:
1497 /*
1498 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1499 * if they exceeds softlimit.
1500 */
1501 if (mem_cgroup_soft_limit_check(mem))
1502 mem_cgroup_update_tree(mem, page);
1503done: 1649done:
1504 return 0; 1650 return 0;
1505nomem: 1651nomem:
1506 css_put(&mem->css); 1652 css_put(&mem->css);
1507 return -ENOMEM; 1653 return -ENOMEM;
1654bypass:
1655 *memcg = NULL;
1656 return 0;
1508} 1657}
1509 1658
1510/* 1659/*
@@ -1512,14 +1661,23 @@ nomem:
1512 * This function is for that and do uncharge, put css's refcnt. 1661 * This function is for that and do uncharge, put css's refcnt.
1513 * gotten by try_charge(). 1662 * gotten by try_charge().
1514 */ 1663 */
1515static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 1664static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1665 unsigned long count)
1516{ 1666{
1517 if (!mem_cgroup_is_root(mem)) { 1667 if (!mem_cgroup_is_root(mem)) {
1518 res_counter_uncharge(&mem->res, PAGE_SIZE); 1668 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1519 if (do_swap_account) 1669 if (do_swap_account)
1520 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1670 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1671 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1672 WARN_ON_ONCE(count > INT_MAX);
1673 __css_put(&mem->css, (int)count);
1521 } 1674 }
1522 css_put(&mem->css); 1675 /* we don't need css_put for root */
1676}
1677
1678static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1679{
1680 __mem_cgroup_cancel_charge(mem, 1);
1523} 1681}
1524 1682
1525/* 1683/*
@@ -1615,6 +1773,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1615 mem_cgroup_charge_statistics(mem, pc, true); 1773 mem_cgroup_charge_statistics(mem, pc, true);
1616 1774
1617 unlock_page_cgroup(pc); 1775 unlock_page_cgroup(pc);
1776 /*
1777 * "charge_statistics" updated event counter. Then, check it.
1778 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1779 * if they exceeds softlimit.
1780 */
1781 memcg_check_events(mem, pc->page);
1618} 1782}
1619 1783
1620/** 1784/**
@@ -1622,22 +1786,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1622 * @pc: page_cgroup of the page. 1786 * @pc: page_cgroup of the page.
1623 * @from: mem_cgroup which the page is moved from. 1787 * @from: mem_cgroup which the page is moved from.
1624 * @to: mem_cgroup which the page is moved to. @from != @to. 1788 * @to: mem_cgroup which the page is moved to. @from != @to.
1789 * @uncharge: whether we should call uncharge and css_put against @from.
1625 * 1790 *
1626 * The caller must confirm following. 1791 * The caller must confirm following.
1627 * - page is not on LRU (isolate_page() is useful.) 1792 * - page is not on LRU (isolate_page() is useful.)
1628 * - the pc is locked, used, and ->mem_cgroup points to @from. 1793 * - the pc is locked, used, and ->mem_cgroup points to @from.
1629 * 1794 *
1630 * This function does "uncharge" from old cgroup but doesn't do "charge" to 1795 * This function doesn't do "charge" or css_get to the new cgroup. It should be
1631 * new cgroup. It should be done by a caller. 1796 * done by a caller (__mem_cgroup_try_charge would be useful). If @uncharge is
1797 * true, this function does "uncharge" from old cgroup, but it doesn't if
1798 * @uncharge is false, so a caller should do "uncharge".
1632 */ 1799 */
1633 1800
1634static void __mem_cgroup_move_account(struct page_cgroup *pc, 1801static void __mem_cgroup_move_account(struct page_cgroup *pc,
1635 struct mem_cgroup *from, struct mem_cgroup *to) 1802 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1636{ 1803{
1637 struct page *page; 1804 struct page *page;
1638 int cpu;
1639 struct mem_cgroup_stat *stat;
1640 struct mem_cgroup_stat_cpu *cpustat;
1641 1805
1642 VM_BUG_ON(from == to); 1806 VM_BUG_ON(from == to);
1643 VM_BUG_ON(PageLRU(pc->page)); 1807 VM_BUG_ON(PageLRU(pc->page));
@@ -1645,38 +1809,28 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1645 VM_BUG_ON(!PageCgroupUsed(pc)); 1809 VM_BUG_ON(!PageCgroupUsed(pc));
1646 VM_BUG_ON(pc->mem_cgroup != from); 1810 VM_BUG_ON(pc->mem_cgroup != from);
1647 1811
1648 if (!mem_cgroup_is_root(from))
1649 res_counter_uncharge(&from->res, PAGE_SIZE);
1650 mem_cgroup_charge_statistics(from, pc, false);
1651
1652 page = pc->page; 1812 page = pc->page;
1653 if (page_mapped(page) && !PageAnon(page)) { 1813 if (page_mapped(page) && !PageAnon(page)) {
1654 cpu = smp_processor_id(); 1814 /* Update mapped_file data for mem_cgroup */
1655 /* Update mapped_file data for mem_cgroup "from" */ 1815 preempt_disable();
1656 stat = &from->stat; 1816 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1657 cpustat = &stat->cpustat[cpu]; 1817 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1658 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, 1818 preempt_enable();
1659 -1);
1660
1661 /* Update mapped_file data for mem_cgroup "to" */
1662 stat = &to->stat;
1663 cpustat = &stat->cpustat[cpu];
1664 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1665 1);
1666 } 1819 }
1820 mem_cgroup_charge_statistics(from, pc, false);
1821 if (uncharge)
1822 /* This is not "cancel", but cancel_charge does all we need. */
1823 mem_cgroup_cancel_charge(from);
1667 1824
1668 if (do_swap_account && !mem_cgroup_is_root(from)) 1825 /* caller should have done css_get */
1669 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1670 css_put(&from->css);
1671
1672 css_get(&to->css);
1673 pc->mem_cgroup = to; 1826 pc->mem_cgroup = to;
1674 mem_cgroup_charge_statistics(to, pc, true); 1827 mem_cgroup_charge_statistics(to, pc, true);
1675 /* 1828 /*
1676 * We charge against "to", which may not have any tasks. Then, "to" 1829 * We charge against "to", which may not have any tasks. Then, "to"
1677 * can be under rmdir(). But in current implementation, caller of 1830 * can be under rmdir(). But in current implementation, caller of
1678 * this function is just force_empty() and it's garanteed that 1831 * this function is just force_empty() and move charge, so it's
1679 * "to" is never removed. So, we don't check rmdir status here. 1832 * garanteed that "to" is never removed. So, we don't check rmdir
1833 * status here.
1680 */ 1834 */
1681} 1835}
1682 1836
@@ -1685,15 +1839,20 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1685 * __mem_cgroup_move_account() 1839 * __mem_cgroup_move_account()
1686 */ 1840 */
1687static int mem_cgroup_move_account(struct page_cgroup *pc, 1841static int mem_cgroup_move_account(struct page_cgroup *pc,
1688 struct mem_cgroup *from, struct mem_cgroup *to) 1842 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1689{ 1843{
1690 int ret = -EINVAL; 1844 int ret = -EINVAL;
1691 lock_page_cgroup(pc); 1845 lock_page_cgroup(pc);
1692 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 1846 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1693 __mem_cgroup_move_account(pc, from, to); 1847 __mem_cgroup_move_account(pc, from, to, uncharge);
1694 ret = 0; 1848 ret = 0;
1695 } 1849 }
1696 unlock_page_cgroup(pc); 1850 unlock_page_cgroup(pc);
1851 /*
1852 * check events
1853 */
1854 memcg_check_events(to, pc->page);
1855 memcg_check_events(from, pc->page);
1697 return ret; 1856 return ret;
1698} 1857}
1699 1858
@@ -1722,15 +1881,13 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1722 goto put; 1881 goto put;
1723 1882
1724 parent = mem_cgroup_from_cont(pcg); 1883 parent = mem_cgroup_from_cont(pcg);
1725 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); 1884 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1726 if (ret || !parent) 1885 if (ret || !parent)
1727 goto put_back; 1886 goto put_back;
1728 1887
1729 ret = mem_cgroup_move_account(pc, child, parent); 1888 ret = mem_cgroup_move_account(pc, child, parent, true);
1730 if (!ret) 1889 if (ret)
1731 css_put(&parent->css); /* drop extra refcnt by try_charge() */ 1890 mem_cgroup_cancel_charge(parent);
1732 else
1733 mem_cgroup_cancel_charge(parent); /* does css_put */
1734put_back: 1891put_back:
1735 putback_lru_page(page); 1892 putback_lru_page(page);
1736put: 1893put:
@@ -1760,7 +1917,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1760 prefetchw(pc); 1917 prefetchw(pc);
1761 1918
1762 mem = memcg; 1919 mem = memcg;
1763 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); 1920 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1764 if (ret || !mem) 1921 if (ret || !mem)
1765 return ret; 1922 return ret;
1766 1923
@@ -1880,14 +2037,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1880 if (!mem) 2037 if (!mem)
1881 goto charge_cur_mm; 2038 goto charge_cur_mm;
1882 *ptr = mem; 2039 *ptr = mem;
1883 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); 2040 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1884 /* drop extra refcnt from tryget */ 2041 /* drop extra refcnt from tryget */
1885 css_put(&mem->css); 2042 css_put(&mem->css);
1886 return ret; 2043 return ret;
1887charge_cur_mm: 2044charge_cur_mm:
1888 if (unlikely(!mm)) 2045 if (unlikely(!mm))
1889 mm = &init_mm; 2046 mm = &init_mm;
1890 return __mem_cgroup_try_charge(mm, mask, ptr, true, page); 2047 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1891} 2048}
1892 2049
1893static void 2050static void
@@ -2064,8 +2221,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2064 mz = page_cgroup_zoneinfo(pc); 2221 mz = page_cgroup_zoneinfo(pc);
2065 unlock_page_cgroup(pc); 2222 unlock_page_cgroup(pc);
2066 2223
2067 if (mem_cgroup_soft_limit_check(mem)) 2224 memcg_check_events(mem, page);
2068 mem_cgroup_update_tree(mem, page);
2069 /* at swapout, this memcg will be accessed to record to swap */ 2225 /* at swapout, this memcg will be accessed to record to swap */
2070 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2226 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2071 css_put(&mem->css); 2227 css_put(&mem->css);
@@ -2192,6 +2348,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
2192 } 2348 }
2193 rcu_read_unlock(); 2349 rcu_read_unlock();
2194} 2350}
2351
2352/**
2353 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2354 * @entry: swap entry to be moved
2355 * @from: mem_cgroup which the entry is moved from
2356 * @to: mem_cgroup which the entry is moved to
2357 * @need_fixup: whether we should fixup res_counters and refcounts.
2358 *
2359 * It succeeds only when the swap_cgroup's record for this entry is the same
2360 * as the mem_cgroup's id of @from.
2361 *
2362 * Returns 0 on success, -EINVAL on failure.
2363 *
2364 * The caller must have charged to @to, IOW, called res_counter_charge() against
2365 * both res and memsw, and called css_get().
2366 */
2367static int mem_cgroup_move_swap_account(swp_entry_t entry,
2368 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2369{
2370 unsigned short old_id, new_id;
2371
2372 old_id = css_id(&from->css);
2373 new_id = css_id(&to->css);
2374
2375 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2376 mem_cgroup_swap_statistics(from, false);
2377 mem_cgroup_swap_statistics(to, true);
2378 /*
2379 * This function is only called from task migration context now.
2380 * It postpones res_counter and refcount handling till the end
2381 * of task migration(mem_cgroup_clear_mc()) for performance
2382 * improvement. But we cannot postpone mem_cgroup_get(to)
2383 * because if the process that has been moved to @to does
2384 * swap-in, the refcount of @to might be decreased to 0.
2385 */
2386 mem_cgroup_get(to);
2387 if (need_fixup) {
2388 if (!mem_cgroup_is_root(from))
2389 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2390 mem_cgroup_put(from);
2391 /*
2392 * we charged both to->res and to->memsw, so we should
2393 * uncharge to->res.
2394 */
2395 if (!mem_cgroup_is_root(to))
2396 res_counter_uncharge(&to->res, PAGE_SIZE);
2397 css_put(&to->css);
2398 }
2399 return 0;
2400 }
2401 return -EINVAL;
2402}
2403#else
2404static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2405 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2406{
2407 return -EINVAL;
2408}
2195#endif 2409#endif
2196 2410
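
The swap-record handover above turns entirely on one compare-and-swap: the record is rewritten from @from's css id to @to's only if it still holds @from's id at that moment, otherwise nothing is touched. A minimal userspace model of that idea (an illustrative sketch; the plain array and the GCC __sync builtin stand in for the swap_cgroup machinery):

#include <stdio.h>

static unsigned short swap_record[8];   /* stand-in for swap_cgroup entries */

static unsigned short record_cmpxchg(int slot, unsigned short old_id,
                                     unsigned short new_id)
{
        /* GCC/Clang builtin; plays the role of swap_cgroup_cmpxchg() */
        return __sync_val_compare_and_swap(&swap_record[slot], old_id, new_id);
}

static int move_swap_record(int slot, unsigned short from, unsigned short to)
{
        if (record_cmpxchg(slot, from, to) == from)
                return 0;       /* record named "from"; ownership moved */
        return -1;              /* record changed under us; leave it alone */
}

int main(void)
{
        swap_record[3] = 7;                                /* owned by id 7 */
        printf("first move:  %d\n", move_swap_record(3, 7, 9));   /*  0 */
        printf("second move: %d\n", move_swap_record(3, 7, 9));   /* -1 */
        printf("owner now:   %u\n", (unsigned)swap_record[3]);    /*  9 */
        return 0;
}
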
2197/* 2411/*
@@ -2216,8 +2430,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2216 unlock_page_cgroup(pc); 2430 unlock_page_cgroup(pc);
2217 2431
2218 if (mem) { 2432 if (mem) {
2219 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 2433 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
2220 page);
2221 css_put(&mem->css); 2434 css_put(&mem->css);
2222 } 2435 }
2223 *ptr = mem; 2436 *ptr = mem;
@@ -2545,7 +2758,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2545 pc = list_entry(list->prev, struct page_cgroup, lru); 2758 pc = list_entry(list->prev, struct page_cgroup, lru);
2546 if (busy == pc) { 2759 if (busy == pc) {
2547 list_move(&pc->lru, list); 2760 list_move(&pc->lru, list);
2548 busy = 0; 2761 busy = NULL;
2549 spin_unlock_irqrestore(&zone->lru_lock, flags); 2762 spin_unlock_irqrestore(&zone->lru_lock, flags);
2550 continue; 2763 continue;
2551 } 2764 }
@@ -2704,7 +2917,7 @@ static int
2704mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2917mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2705{ 2918{
2706 struct mem_cgroup_idx_data *d = data; 2919 struct mem_cgroup_idx_data *d = data;
2707 d->val += mem_cgroup_read_stat(&mem->stat, d->idx); 2920 d->val += mem_cgroup_read_stat(mem, d->idx);
2708 return 0; 2921 return 0;
2709} 2922}
2710 2923
@@ -2719,40 +2932,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2719 *val = d.val; 2932 *val = d.val;
2720} 2933}
2721 2934
2935static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
2936{
2937 u64 idx_val, val;
2938
2939 if (!mem_cgroup_is_root(mem)) {
2940 if (!swap)
2941 return res_counter_read_u64(&mem->res, RES_USAGE);
2942 else
2943 return res_counter_read_u64(&mem->memsw, RES_USAGE);
2944 }
2945
2946 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
2947 val = idx_val;
2948 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
2949 val += idx_val;
2950
2951 if (swap) {
2952 mem_cgroup_get_recursive_idx_stat(mem,
2953 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2954 val += idx_val;
2955 }
2956
2957 return val << PAGE_SHIFT;
2958}
2959
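
Since the root cgroup no longer maintains res_counters, its usage is rebuilt above from the recursive CACHE and RSS statistics (plus SWAPOUT for memsw) and converted from pages to bytes by shifting with PAGE_SHIFT. A tiny worked example of that arithmetic, assuming 4 KiB pages and made-up counter values:

#include <stdio.h>

int main(void)
{
        unsigned long long cache = 100, rss = 50, swap = 10;   /* pages */
        unsigned long long mem_usage   = (cache + rss) << 12;
        unsigned long long memsw_usage = (cache + rss + swap) << 12;

        printf("memory.usage_in_bytes ~ %llu\n", mem_usage);    /* 614400 */
        printf("memsw.usage_in_bytes  ~ %llu\n", memsw_usage);  /* 655360 */
        return 0;
}
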
2722static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2960static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2723{ 2961{
2724 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2962 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2725 u64 idx_val, val; 2963 u64 val;
2726 int type, name; 2964 int type, name;
2727 2965
2728 type = MEMFILE_TYPE(cft->private); 2966 type = MEMFILE_TYPE(cft->private);
2729 name = MEMFILE_ATTR(cft->private); 2967 name = MEMFILE_ATTR(cft->private);
2730 switch (type) { 2968 switch (type) {
2731 case _MEM: 2969 case _MEM:
2732 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2970 if (name == RES_USAGE)
2733 mem_cgroup_get_recursive_idx_stat(mem, 2971 val = mem_cgroup_usage(mem, false);
2734 MEM_CGROUP_STAT_CACHE, &idx_val); 2972 else
2735 val = idx_val;
2736 mem_cgroup_get_recursive_idx_stat(mem,
2737 MEM_CGROUP_STAT_RSS, &idx_val);
2738 val += idx_val;
2739 val <<= PAGE_SHIFT;
2740 } else
2741 val = res_counter_read_u64(&mem->res, name); 2973 val = res_counter_read_u64(&mem->res, name);
2742 break; 2974 break;
2743 case _MEMSWAP: 2975 case _MEMSWAP:
2744 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2976 if (name == RES_USAGE)
2745 mem_cgroup_get_recursive_idx_stat(mem, 2977 val = mem_cgroup_usage(mem, true);
2746 MEM_CGROUP_STAT_CACHE, &idx_val); 2978 else
2747 val = idx_val;
2748 mem_cgroup_get_recursive_idx_stat(mem,
2749 MEM_CGROUP_STAT_RSS, &idx_val);
2750 val += idx_val;
2751 mem_cgroup_get_recursive_idx_stat(mem,
2752 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2753 val += idx_val;
2754 val <<= PAGE_SHIFT;
2755 } else
2756 val = res_counter_read_u64(&mem->memsw, name); 2979 val = res_counter_read_u64(&mem->memsw, name);
2757 break; 2980 break;
2758 default: 2981 default:
@@ -2865,6 +3088,39 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2865 return 0; 3088 return 0;
2866} 3089}
2867 3090
3091static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3092 struct cftype *cft)
3093{
3094 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3095}
3096
3097#ifdef CONFIG_MMU
3098static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3099 struct cftype *cft, u64 val)
3100{
3101 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3102
3103 if (val >= (1 << NR_MOVE_TYPE))
3104 return -EINVAL;
3105 /*
3106	 * We check this value several times, both in can_attach() and
3107 * attach(), so we need cgroup lock to prevent this value from being
3108 * inconsistent.
3109 */
3110 cgroup_lock();
3111 mem->move_charge_at_immigrate = val;
3112 cgroup_unlock();
3113
3114 return 0;
3115}
3116#else
3117static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3118 struct cftype *cft, u64 val)
3119{
3120 return -ENOSYS;
3121}
3122#endif
3123
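
The new knob takes a bitmask; within this patch only bit 0 (MOVE_CHARGE_TYPE_ANON, used further down) is defined and selects anonymous pages. A hypothetical userspace snippet enabling it before migrating a task into the group (the cgroup mount path below is an example, not a fixed location):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        /* example path; depends on where the memory cgroup is mounted */
        const char *knob =
                "/cgroup/memory/grp0/memory.move_charge_at_immigrate";
        int fd = open(knob, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* bit 0 set: move charges of anonymous pages on task migration */
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}
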
2868 3124
2869/* For read statistics */ 3125/* For read statistics */
2870enum { 3126enum {
@@ -2910,18 +3166,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2910 s64 val; 3166 s64 val;
2911 3167
2912 /* per cpu stat */ 3168 /* per cpu stat */
2913 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); 3169 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
2914 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3170 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2915 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 3171 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
2916 s->stat[MCS_RSS] += val * PAGE_SIZE; 3172 s->stat[MCS_RSS] += val * PAGE_SIZE;
2917 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); 3173 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
2918 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3174 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
2919 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 3175 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
2920 s->stat[MCS_PGPGIN] += val; 3176 s->stat[MCS_PGPGIN] += val;
2921 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3177 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2922 s->stat[MCS_PGPGOUT] += val; 3178 s->stat[MCS_PGPGOUT] += val;
2923 if (do_swap_account) { 3179 if (do_swap_account) {
2924 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); 3180 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
2925 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3181 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2926 } 3182 }
2927 3183
@@ -3049,12 +3305,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3049 return 0; 3305 return 0;
3050} 3306}
3051 3307
3308static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3309{
3310 struct mem_cgroup_threshold_ary *t;
3311 u64 usage;
3312 int i;
3313
3314 rcu_read_lock();
3315 if (!swap)
3316 t = rcu_dereference(memcg->thresholds);
3317 else
3318 t = rcu_dereference(memcg->memsw_thresholds);
3319
3320 if (!t)
3321 goto unlock;
3322
3323 usage = mem_cgroup_usage(memcg, swap);
3324
3325 /*
3326	 * current_threshold points to the threshold just below usage.
3327	 * If that is no longer true, a threshold was crossed after the
3328	 * last call of __mem_cgroup_threshold().
3329 */
3330 i = atomic_read(&t->current_threshold);
3331
3332 /*
3333 * Iterate backward over array of thresholds starting from
3334 * current_threshold and check if a threshold is crossed.
3335	 * If none of the thresholds below usage is crossed, we read
3336 * only one element of the array here.
3337 */
3338 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3339 eventfd_signal(t->entries[i].eventfd, 1);
3340
3341 /* i = current_threshold + 1 */
3342 i++;
3343
3344 /*
3345 * Iterate forward over array of thresholds starting from
3346 * current_threshold+1 and check if a threshold is crossed.
3347	 * If none of the thresholds above usage is crossed, we read
3348 * only one element of the array here.
3349 */
3350 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3351 eventfd_signal(t->entries[i].eventfd, 1);
3352
3353 /* Update current_threshold */
3354 atomic_set(&t->current_threshold, i - 1);
3355unlock:
3356 rcu_read_unlock();
3357}
3358
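
The scan above depends on the thresholds array being sorted and on current_threshold tracking the largest entry at or below the last observed usage, so only the entries between the old and the new usage are visited and signalled. A standalone sketch of the same two-way walk, with eventfd_signal() replaced by printf (an illustration of the algorithm, not of the kernel data structures):

#include <stdio.h>

struct threshold { unsigned long long bytes; int id; };

static struct threshold entries[] = {
        { 100, 0 }, { 200, 1 }, { 300, 2 }, { 400, 3 },   /* kept sorted */
};
static const int nr_entries = 4;
static int current_threshold = -1;      /* no threshold below usage yet */

static void check_thresholds(unsigned long long usage)
{
        int i = current_threshold;

        /* walk down: thresholds the usage has fallen back below */
        for (; i >= 0 && entries[i].bytes > usage; i--)
                printf("signal (fell below) threshold %d\n", entries[i].id);

        /* walk up: thresholds the usage has newly reached or exceeded */
        for (i++; i < nr_entries && entries[i].bytes <= usage; i++)
                printf("signal (rose above) threshold %d\n", entries[i].id);

        current_threshold = i - 1;
}

int main(void)
{
        check_thresholds(250);  /* signals thresholds 0 and 1 */
        check_thresholds(150);  /* signals threshold 1 (usage fell below 200) */
        check_thresholds(450);  /* signals thresholds 1, 2 and 3 */
        return 0;
}
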
3359static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3360{
3361 __mem_cgroup_threshold(memcg, false);
3362 if (do_swap_account)
3363 __mem_cgroup_threshold(memcg, true);
3364}
3365
3366static int compare_thresholds(const void *a, const void *b)
3367{
3368 const struct mem_cgroup_threshold *_a = a;
3369 const struct mem_cgroup_threshold *_b = b;
3370
3371 return _a->threshold - _b->threshold;
3372}
3373
3374static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
3375 struct eventfd_ctx *eventfd, const char *args)
3376{
3377 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3378 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3379 int type = MEMFILE_TYPE(cft->private);
3380 u64 threshold, usage;
3381 int size;
3382 int i, ret;
3383
3384 ret = res_counter_memparse_write_strategy(args, &threshold);
3385 if (ret)
3386 return ret;
3387
3388 mutex_lock(&memcg->thresholds_lock);
3389 if (type == _MEM)
3390 thresholds = memcg->thresholds;
3391 else if (type == _MEMSWAP)
3392 thresholds = memcg->memsw_thresholds;
3393 else
3394 BUG();
3395
3396 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3397
3398 /* Check if a threshold crossed before adding a new one */
3399 if (thresholds)
3400 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3401
3402 if (thresholds)
3403 size = thresholds->size + 1;
3404 else
3405 size = 1;
3406
3407 /* Allocate memory for new array of thresholds */
3408 thresholds_new = kmalloc(sizeof(*thresholds_new) +
3409 size * sizeof(struct mem_cgroup_threshold),
3410 GFP_KERNEL);
3411 if (!thresholds_new) {
3412 ret = -ENOMEM;
3413 goto unlock;
3414 }
3415 thresholds_new->size = size;
3416
3417 /* Copy thresholds (if any) to new array */
3418 if (thresholds)
3419 memcpy(thresholds_new->entries, thresholds->entries,
3420 thresholds->size *
3421 sizeof(struct mem_cgroup_threshold));
3422 /* Add new threshold */
3423 thresholds_new->entries[size - 1].eventfd = eventfd;
3424 thresholds_new->entries[size - 1].threshold = threshold;
3425
3426	 /* Sort thresholds. Registering a new threshold isn't time-critical */
3427 sort(thresholds_new->entries, size,
3428 sizeof(struct mem_cgroup_threshold),
3429 compare_thresholds, NULL);
3430
3431 /* Find current threshold */
3432 atomic_set(&thresholds_new->current_threshold, -1);
3433 for (i = 0; i < size; i++) {
3434 if (thresholds_new->entries[i].threshold < usage) {
3435 /*
3436 * thresholds_new->current_threshold will not be used
3437 * until rcu_assign_pointer(), so it's safe to increment
3438 * it here.
3439 */
3440 atomic_inc(&thresholds_new->current_threshold);
3441 }
3442 }
3443
3444 if (type == _MEM)
3445 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3446 else
3447 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3448
3449	 /* To be sure that nobody uses the old thresholds array before freeing it */
3450 synchronize_rcu();
3451
3452 kfree(thresholds);
3453unlock:
3454 mutex_unlock(&memcg->thresholds_lock);
3455
3456 return ret;
3457}
3458
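
For context on how this hook is reached: registration goes through the cgroup core's event_control file, where userspace writes "<event_fd> <fd of memory.usage_in_bytes> <threshold>" after opening an eventfd. A sketch of that usage arming a 64 MiB threshold (the mount path is an assumption; error handling kept minimal):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        const char *grp = "/cgroup/memory/grp0";        /* example mount */
        char ctl[256], usage[256], cmd[128];
        uint64_t ticks;
        int efd, ufd, cfd;

        snprintf(ctl, sizeof(ctl), "%s/cgroup.event_control", grp);
        snprintf(usage, sizeof(usage), "%s/memory.usage_in_bytes", grp);

        efd = eventfd(0, 0);
        ufd = open(usage, O_RDONLY);
        cfd = open(ctl, O_WRONLY);
        if (efd < 0 || ufd < 0 || cfd < 0) {
                perror("setup");
                return 1;
        }

        /* arm a 64 MiB threshold on memory.usage_in_bytes */
        snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
        if (write(cfd, cmd, strlen(cmd)) < 0) {
                perror("register");
                return 1;
        }

        read(efd, &ticks, sizeof(ticks));       /* blocks until a crossing */
        printf("usage crossed 64M (%llu notifications)\n",
               (unsigned long long)ticks);
        return 0;
}
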
3459static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
3460 struct eventfd_ctx *eventfd)
3461{
3462 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3463 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3464 int type = MEMFILE_TYPE(cft->private);
3465 u64 usage;
3466 int size = 0;
3467 int i, j, ret;
3468
3469 mutex_lock(&memcg->thresholds_lock);
3470 if (type == _MEM)
3471 thresholds = memcg->thresholds;
3472 else if (type == _MEMSWAP)
3473 thresholds = memcg->memsw_thresholds;
3474 else
3475 BUG();
3476
3477 /*
3478	 * Something went wrong if we're trying to unregister a threshold
3479	 * when we don't have any thresholds.
3480 */
3481 BUG_ON(!thresholds);
3482
3483 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3484
3485 /* Check if a threshold crossed before removing */
3486 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3487
3488	 /* Calculate the new number of thresholds */
3489 for (i = 0; i < thresholds->size; i++) {
3490 if (thresholds->entries[i].eventfd != eventfd)
3491 size++;
3492 }
3493
3494 /* Set thresholds array to NULL if we don't have thresholds */
3495 if (!size) {
3496 thresholds_new = NULL;
3497 goto assign;
3498 }
3499
3500 /* Allocate memory for new array of thresholds */
3501 thresholds_new = kmalloc(sizeof(*thresholds_new) +
3502 size * sizeof(struct mem_cgroup_threshold),
3503 GFP_KERNEL);
3504 if (!thresholds_new) {
3505 ret = -ENOMEM;
3506 goto unlock;
3507 }
3508 thresholds_new->size = size;
3509
3510 /* Copy thresholds and find current threshold */
3511 atomic_set(&thresholds_new->current_threshold, -1);
3512 for (i = 0, j = 0; i < thresholds->size; i++) {
3513 if (thresholds->entries[i].eventfd == eventfd)
3514 continue;
3515
3516 thresholds_new->entries[j] = thresholds->entries[i];
3517 if (thresholds_new->entries[j].threshold < usage) {
3518 /*
3519 * thresholds_new->current_threshold will not be used
3520 * until rcu_assign_pointer(), so it's safe to increment
3521 * it here.
3522 */
3523 atomic_inc(&thresholds_new->current_threshold);
3524 }
3525 j++;
3526 }
3527
3528assign:
3529 if (type == _MEM)
3530 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3531 else
3532 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3533
3534	 /* To be sure that nobody uses the old thresholds array before freeing it */
3535 synchronize_rcu();
3536
3537 kfree(thresholds);
3538unlock:
3539 mutex_unlock(&memcg->thresholds_lock);
3540
3541 return ret;
3542}
3052 3543
3053static struct cftype mem_cgroup_files[] = { 3544static struct cftype mem_cgroup_files[] = {
3054 { 3545 {
3055 .name = "usage_in_bytes", 3546 .name = "usage_in_bytes",
3056 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3547 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3057 .read_u64 = mem_cgroup_read, 3548 .read_u64 = mem_cgroup_read,
3549 .register_event = mem_cgroup_register_event,
3550 .unregister_event = mem_cgroup_unregister_event,
3058 }, 3551 },
3059 { 3552 {
3060 .name = "max_usage_in_bytes", 3553 .name = "max_usage_in_bytes",
@@ -3098,6 +3591,11 @@ static struct cftype mem_cgroup_files[] = {
3098 .read_u64 = mem_cgroup_swappiness_read, 3591 .read_u64 = mem_cgroup_swappiness_read,
3099 .write_u64 = mem_cgroup_swappiness_write, 3592 .write_u64 = mem_cgroup_swappiness_write,
3100 }, 3593 },
3594 {
3595 .name = "move_charge_at_immigrate",
3596 .read_u64 = mem_cgroup_move_charge_read,
3597 .write_u64 = mem_cgroup_move_charge_write,
3598 },
3101}; 3599};
3102 3600
3103#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3601#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3106,6 +3604,8 @@ static struct cftype memsw_cgroup_files[] = {
3106 .name = "memsw.usage_in_bytes", 3604 .name = "memsw.usage_in_bytes",
3107 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3605 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3108 .read_u64 = mem_cgroup_read, 3606 .read_u64 = mem_cgroup_read,
3607 .register_event = mem_cgroup_register_event,
3608 .unregister_event = mem_cgroup_unregister_event,
3109 }, 3609 },
3110 { 3610 {
3111 .name = "memsw.max_usage_in_bytes", 3611 .name = "memsw.max_usage_in_bytes",
@@ -3180,17 +3680,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3180 kfree(mem->info.nodeinfo[node]); 3680 kfree(mem->info.nodeinfo[node]);
3181} 3681}
3182 3682
3183static int mem_cgroup_size(void)
3184{
3185 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
3186 return sizeof(struct mem_cgroup) + cpustat_size;
3187}
3188
3189static struct mem_cgroup *mem_cgroup_alloc(void) 3683static struct mem_cgroup *mem_cgroup_alloc(void)
3190{ 3684{
3191 struct mem_cgroup *mem; 3685 struct mem_cgroup *mem;
3192 int size = mem_cgroup_size(); 3686 int size = sizeof(struct mem_cgroup);
3193 3687
3688 /* Can be very big if MAX_NUMNODES is very big */
3194 if (size < PAGE_SIZE) 3689 if (size < PAGE_SIZE)
3195 mem = kmalloc(size, GFP_KERNEL); 3690 mem = kmalloc(size, GFP_KERNEL);
3196 else 3691 else
@@ -3198,6 +3693,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
3198 3693
3199 if (mem) 3694 if (mem)
3200 memset(mem, 0, size); 3695 memset(mem, 0, size);
3696 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3697 if (!mem->stat) {
3698 if (size < PAGE_SIZE)
3699 kfree(mem);
3700 else
3701 vfree(mem);
3702 mem = NULL;
3703 }
3201 return mem; 3704 return mem;
3202} 3705}
3203 3706
@@ -3222,7 +3725,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
3222 for_each_node_state(node, N_POSSIBLE) 3725 for_each_node_state(node, N_POSSIBLE)
3223 free_mem_cgroup_per_zone_info(mem, node); 3726 free_mem_cgroup_per_zone_info(mem, node);
3224 3727
3225 if (mem_cgroup_size() < PAGE_SIZE) 3728 free_percpu(mem->stat);
3729 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3226 kfree(mem); 3730 kfree(mem);
3227 else 3731 else
3228 vfree(mem); 3732 vfree(mem);
@@ -3233,9 +3737,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem)
3233 atomic_inc(&mem->refcnt); 3737 atomic_inc(&mem->refcnt);
3234} 3738}
3235 3739
3236static void mem_cgroup_put(struct mem_cgroup *mem) 3740static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
3237{ 3741{
3238 if (atomic_dec_and_test(&mem->refcnt)) { 3742 if (atomic_sub_and_test(count, &mem->refcnt)) {
3239 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3743 struct mem_cgroup *parent = parent_mem_cgroup(mem);
3240 __mem_cgroup_free(mem); 3744 __mem_cgroup_free(mem);
3241 if (parent) 3745 if (parent)
@@ -3243,6 +3747,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem)
3243 } 3747 }
3244} 3748}
3245 3749
3750static void mem_cgroup_put(struct mem_cgroup *mem)
3751{
3752 __mem_cgroup_put(mem, 1);
3753}
3754
3246/* 3755/*
3247 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 3756 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
3248 */ 3757 */
@@ -3319,7 +3828,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3319 INIT_WORK(&stock->work, drain_local_stock); 3828 INIT_WORK(&stock->work, drain_local_stock);
3320 } 3829 }
3321 hotcpu_notifier(memcg_stock_cpu_callback, 0); 3830 hotcpu_notifier(memcg_stock_cpu_callback, 0);
3322
3323 } else { 3831 } else {
3324 parent = mem_cgroup_from_cont(cont->parent); 3832 parent = mem_cgroup_from_cont(cont->parent);
3325 mem->use_hierarchy = parent->use_hierarchy; 3833 mem->use_hierarchy = parent->use_hierarchy;
@@ -3345,6 +3853,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3345 if (parent) 3853 if (parent)
3346 mem->swappiness = get_swappiness(parent); 3854 mem->swappiness = get_swappiness(parent);
3347 atomic_set(&mem->refcnt, 1); 3855 atomic_set(&mem->refcnt, 1);
3856 mem->move_charge_at_immigrate = 0;
3857 mutex_init(&mem->thresholds_lock);
3348 return &mem->css; 3858 return &mem->css;
3349free_out: 3859free_out:
3350 __mem_cgroup_free(mem); 3860 __mem_cgroup_free(mem);
@@ -3381,16 +3891,444 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
3381 return ret; 3891 return ret;
3382} 3892}
3383 3893
3894#ifdef CONFIG_MMU
3895/* Handlers for move charge at task migration. */
3896#define PRECHARGE_COUNT_AT_ONCE 256
3897static int mem_cgroup_do_precharge(unsigned long count)
3898{
3899 int ret = 0;
3900 int batch_count = PRECHARGE_COUNT_AT_ONCE;
3901 struct mem_cgroup *mem = mc.to;
3902
3903 if (mem_cgroup_is_root(mem)) {
3904 mc.precharge += count;
3905 /* we don't need css_get for root */
3906 return ret;
3907 }
3908 /* try to charge at once */
3909 if (count > 1) {
3910 struct res_counter *dummy;
3911 /*
3912 * "mem" cannot be under rmdir() because we've already checked
3913 * by cgroup_lock_live_cgroup() that it is not removed and we
3914 * are still under the same cgroup_mutex. So we can postpone
3915 * css_get().
3916 */
3917 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
3918 goto one_by_one;
3919 if (do_swap_account && res_counter_charge(&mem->memsw,
3920 PAGE_SIZE * count, &dummy)) {
3921 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
3922 goto one_by_one;
3923 }
3924 mc.precharge += count;
3925 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
3926 WARN_ON_ONCE(count > INT_MAX);
3927 __css_get(&mem->css, (int)count);
3928 return ret;
3929 }
3930one_by_one:
3931 /* fall back to one by one charge */
3932 while (count--) {
3933 if (signal_pending(current)) {
3934 ret = -EINTR;
3935 break;
3936 }
3937 if (!batch_count--) {
3938 batch_count = PRECHARGE_COUNT_AT_ONCE;
3939 cond_resched();
3940 }
3941 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
3942 if (ret || !mem)
3943 /* mem_cgroup_clear_mc() will do uncharge later */
3944 return -ENOMEM;
3945 mc.precharge++;
3946 }
3947 return ret;
3948}
3949#else /* !CONFIG_MMU */
3950static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
3951 struct cgroup *cgroup,
3952 struct task_struct *p,
3953 bool threadgroup)
3954{
3955 return 0;
3956}
3957static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
3958 struct cgroup *cgroup,
3959 struct task_struct *p,
3960 bool threadgroup)
3961{
3962}
3384static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3963static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3385 struct cgroup *cont, 3964 struct cgroup *cont,
3386 struct cgroup *old_cont, 3965 struct cgroup *old_cont,
3387 struct task_struct *p, 3966 struct task_struct *p,
3388 bool threadgroup) 3967 bool threadgroup)
3389{ 3968{
3969}
3970#endif
3971
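
mem_cgroup_do_precharge() above first tries to charge the whole batch with a single res_counter operation and only falls back to the slow one-page-at-a-time loop when the batch would overshoot a limit. A small userspace sketch of that strategy, with plain variables standing in for res_counters (illustrative only):

#include <stdio.h>

static unsigned long limit = 1000, charged;
static unsigned long precharge;

static int try_charge(unsigned long units)
{
        if (charged + units > limit)
                return -1;              /* would exceed the limit */
        charged += units;
        return 0;
}

static int do_precharge(unsigned long count)
{
        if (count > 1 && !try_charge(count)) {
                precharge += count;     /* batched: one accounting operation */
                return 0;
        }
        while (count--) {               /* fall back to one by one */
                if (try_charge(1))
                        return -1;      /* caller undoes what is in 'precharge' */
                precharge++;
        }
        return 0;
}

int main(void)
{
        printf("batch of 256: %d (precharge=%lu)\n", do_precharge(256), precharge);
        printf("batch of 900: %d (precharge=%lu)\n", do_precharge(900), precharge);
        return 0;
}
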
3972/**
3973 * is_target_pte_for_mc - check a pte whether it is valid for move charge
3974 * @vma: the vma the pte to be checked belongs to
3975 * @addr: the address corresponding to the pte to be checked
3976 * @ptent: the pte to be checked
3977 * @target: the pointer in which the target page or swap entry will be stored (can be NULL)
3978 *
3979 * Returns
3980 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
3981 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
3982 * move charge. If @target is not NULL, the page is stored in target->page
3983 * with an extra refcount taken (callers should handle it).
3984 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
3985 * target for charge migration. If @target is not NULL, the entry is stored
3986 * in target->ent.
3987 *
3988 * Called with pte lock held.
3989 */
3990union mc_target {
3991 struct page *page;
3992 swp_entry_t ent;
3993};
3994
3995enum mc_target_type {
3996 MC_TARGET_NONE, /* not used */
3997 MC_TARGET_PAGE,
3998 MC_TARGET_SWAP,
3999};
4000
4001static int is_target_pte_for_mc(struct vm_area_struct *vma,
4002 unsigned long addr, pte_t ptent, union mc_target *target)
4003{
4004 struct page *page = NULL;
4005 struct page_cgroup *pc;
4006 int ret = 0;
4007 swp_entry_t ent = { .val = 0 };
4008 int usage_count = 0;
4009 bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
4010 &mc.to->move_charge_at_immigrate);
4011
4012 if (!pte_present(ptent)) {
4013	 /* TODO: handle swap of shmem/tmpfs */
4014 if (pte_none(ptent) || pte_file(ptent))
4015 return 0;
4016 else if (is_swap_pte(ptent)) {
4017 ent = pte_to_swp_entry(ptent);
4018 if (!move_anon || non_swap_entry(ent))
4019 return 0;
4020 usage_count = mem_cgroup_count_swap_user(ent, &page);
4021 }
4022 } else {
4023 page = vm_normal_page(vma, addr, ptent);
4024 if (!page || !page_mapped(page))
4025 return 0;
4026 /*
4027 * TODO: We don't move charges of file(including shmem/tmpfs)
4028 * pages for now.
4029 */
4030 if (!move_anon || !PageAnon(page))
4031 return 0;
4032 if (!get_page_unless_zero(page))
4033 return 0;
4034 usage_count = page_mapcount(page);
4035 }
4036 if (usage_count > 1) {
4037 /*
4038 * TODO: We don't move charges of shared(used by multiple
4039 * processes) pages for now.
4040 */
4041 if (page)
4042 put_page(page);
4043 return 0;
4044 }
4045 if (page) {
4046 pc = lookup_page_cgroup(page);
4047 /*
4048 * Do only loose check w/o page_cgroup lock.
4049 * mem_cgroup_move_account() checks the pc is valid or not under
4050 * the lock.
4051 */
4052 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4053 ret = MC_TARGET_PAGE;
4054 if (target)
4055 target->page = page;
4056 }
4057 if (!ret || !target)
4058 put_page(page);
4059 }
4060	 /* fall through */
4061 if (ent.val && do_swap_account && !ret &&
4062 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4063 ret = MC_TARGET_SWAP;
4064 if (target)
4065 target->ent = ent;
4066 }
4067 return ret;
4068}
4069
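
Boiled down, the classifier above only picks anonymous, non-shared pages or swap entries that are still charged to the source cgroup, and distinguishes present pages from swapped-out entries. A condensed model of that decision table (fake flags rather than real pte/page handling):

#include <stdio.h>

enum mc_target_type { MC_TARGET_NONE, MC_TARGET_PAGE, MC_TARGET_SWAP };

struct fake_pte {
        int present;            /* pte_present()                 */
        int is_swap;            /* is_swap_pte()                 */
        int anon;               /* PageAnon() / anonymous swap   */
        int mapcount;           /* page_mapcount() / swap users  */
        int charged_to_from;    /* record still names the source */
};

static enum mc_target_type classify(const struct fake_pte *p, int move_anon)
{
        if (!move_anon || !p->anon || p->mapcount > 1 || !p->charged_to_from)
                return MC_TARGET_NONE;
        if (p->present)
                return MC_TARGET_PAGE;
        if (p->is_swap)
                return MC_TARGET_SWAP;
        return MC_TARGET_NONE;
}

int main(void)
{
        struct fake_pte anon_page = { 1, 0, 1, 1, 1 };
        struct fake_pte shared    = { 1, 0, 1, 2, 1 };
        struct fake_pte swapped   = { 0, 1, 1, 1, 1 };

        printf("%d %d %d\n",
               classify(&anon_page, 1),         /* 1: MC_TARGET_PAGE   */
               classify(&shared, 1),            /* 0: shared, skipped  */
               classify(&swapped, 1));          /* 2: MC_TARGET_SWAP   */
        return 0;
}
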
4070static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4071 unsigned long addr, unsigned long end,
4072 struct mm_walk *walk)
4073{
4074 struct vm_area_struct *vma = walk->private;
4075 pte_t *pte;
4076 spinlock_t *ptl;
4077
4078 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4079 for (; addr != end; pte++, addr += PAGE_SIZE)
4080 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4081 mc.precharge++; /* increment precharge temporarily */
4082 pte_unmap_unlock(pte - 1, ptl);
4083 cond_resched();
4084
4085 return 0;
4086}
4087
4088static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4089{
4090 unsigned long precharge;
4091 struct vm_area_struct *vma;
4092
4093 down_read(&mm->mmap_sem);
4094 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4095 struct mm_walk mem_cgroup_count_precharge_walk = {
4096 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4097 .mm = mm,
4098 .private = vma,
4099 };
4100 if (is_vm_hugetlb_page(vma))
4101 continue;
4102 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4103 if (vma->vm_flags & VM_SHARED)
4104 continue;
4105 walk_page_range(vma->vm_start, vma->vm_end,
4106 &mem_cgroup_count_precharge_walk);
4107 }
4108 up_read(&mm->mmap_sem);
4109
4110 precharge = mc.precharge;
4111 mc.precharge = 0;
4112
4113 return precharge;
4114}
4115
4116static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4117{
4118 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
4119}
4120
4121static void mem_cgroup_clear_mc(void)
4122{
4123 /* we must uncharge all the leftover precharges from mc.to */
4124 if (mc.precharge) {
4125 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4126 mc.precharge = 0;
4127 }
3390 /* 4128 /*
3391 * FIXME: It's better to move charges of this process from old 4129 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
3392 * memcg to new memcg. But it's just on TODO-List now. 4130 * we must uncharge here.
3393 */ 4131 */
4132 if (mc.moved_charge) {
4133 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4134 mc.moved_charge = 0;
4135 }
4136 /* we must fixup refcnts and charges */
4137 if (mc.moved_swap) {
4138 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4139 /* uncharge swap account from the old cgroup */
4140 if (!mem_cgroup_is_root(mc.from))
4141 res_counter_uncharge(&mc.from->memsw,
4142 PAGE_SIZE * mc.moved_swap);
4143 __mem_cgroup_put(mc.from, mc.moved_swap);
4144
4145 if (!mem_cgroup_is_root(mc.to)) {
4146 /*
4147 * we charged both to->res and to->memsw, so we should
4148 * uncharge to->res.
4149 */
4150 res_counter_uncharge(&mc.to->res,
4151 PAGE_SIZE * mc.moved_swap);
4152 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4153 __css_put(&mc.to->css, mc.moved_swap);
4154 }
4155 /* we've already done mem_cgroup_get(mc.to) */
4156
4157 mc.moved_swap = 0;
4158 }
4159 mc.from = NULL;
4160 mc.to = NULL;
4161 mc.moving_task = NULL;
4162 wake_up_all(&mc.waitq);
4163}
4164
4165static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4166 struct cgroup *cgroup,
4167 struct task_struct *p,
4168 bool threadgroup)
4169{
4170 int ret = 0;
4171 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4172
4173 if (mem->move_charge_at_immigrate) {
4174 struct mm_struct *mm;
4175 struct mem_cgroup *from = mem_cgroup_from_task(p);
4176
4177 VM_BUG_ON(from == mem);
4178
4179 mm = get_task_mm(p);
4180 if (!mm)
4181 return 0;
4182	 /* We move charges only when we move an owner of the mm */
4183 if (mm->owner == p) {
4184 VM_BUG_ON(mc.from);
4185 VM_BUG_ON(mc.to);
4186 VM_BUG_ON(mc.precharge);
4187 VM_BUG_ON(mc.moved_charge);
4188 VM_BUG_ON(mc.moved_swap);
4189 VM_BUG_ON(mc.moving_task);
4190 mc.from = from;
4191 mc.to = mem;
4192 mc.precharge = 0;
4193 mc.moved_charge = 0;
4194 mc.moved_swap = 0;
4195 mc.moving_task = current;
4196
4197 ret = mem_cgroup_precharge_mc(mm);
4198 if (ret)
4199 mem_cgroup_clear_mc();
4200 }
4201 mmput(mm);
4202 }
4203 return ret;
4204}
4205
4206static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4207 struct cgroup *cgroup,
4208 struct task_struct *p,
4209 bool threadgroup)
4210{
4211 mem_cgroup_clear_mc();
4212}
4213
4214static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4215 unsigned long addr, unsigned long end,
4216 struct mm_walk *walk)
4217{
4218 int ret = 0;
4219 struct vm_area_struct *vma = walk->private;
4220 pte_t *pte;
4221 spinlock_t *ptl;
4222
4223retry:
4224 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4225 for (; addr != end; addr += PAGE_SIZE) {
4226 pte_t ptent = *(pte++);
4227 union mc_target target;
4228 int type;
4229 struct page *page;
4230 struct page_cgroup *pc;
4231 swp_entry_t ent;
4232
4233 if (!mc.precharge)
4234 break;
4235
4236 type = is_target_pte_for_mc(vma, addr, ptent, &target);
4237 switch (type) {
4238 case MC_TARGET_PAGE:
4239 page = target.page;
4240 if (isolate_lru_page(page))
4241 goto put;
4242 pc = lookup_page_cgroup(page);
4243 if (!mem_cgroup_move_account(pc,
4244 mc.from, mc.to, false)) {
4245 mc.precharge--;
4246 /* we uncharge from mc.from later. */
4247 mc.moved_charge++;
4248 }
4249 putback_lru_page(page);
4250put: /* is_target_pte_for_mc() gets the page */
4251 put_page(page);
4252 break;
4253 case MC_TARGET_SWAP:
4254 ent = target.ent;
4255 if (!mem_cgroup_move_swap_account(ent,
4256 mc.from, mc.to, false)) {
4257 mc.precharge--;
4258 /* we fixup refcnts and charges later. */
4259 mc.moved_swap++;
4260 }
4261 break;
4262 default:
4263 break;
4264 }
4265 }
4266 pte_unmap_unlock(pte - 1, ptl);
4267 cond_resched();
4268
4269 if (addr != end) {
4270 /*
4271 * We have consumed all precharges we got in can_attach().
4272	 * We try to charge one by one, but don't do any additional
4273	 * charges to mc.to if we have already failed to charge once in
4274	 * the attach() phase.
4275 */
4276 ret = mem_cgroup_do_precharge(1);
4277 if (!ret)
4278 goto retry;
4279 }
4280
4281 return ret;
4282}
4283
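
When the precharge pool runs out in the middle of a range, the walker above releases the pte lock, asks for exactly one more charge and resumes where it stopped, and abandons the rest only if that single top-up fails. A stripped-down model of that control flow (locking and real charging omitted; the numbers are made up):

#include <stdio.h>

static int budget;

static int top_up(void)
{
        static int refills_left = 2;    /* pretend the limit allows 2 more */

        if (refills_left-- <= 0)
                return -1;
        budget++;
        return 0;
}

static int process_range(int nr_items)
{
        int i = 0;

retry:
        for (; i < nr_items; i++) {
                if (!budget)
                        break;
                budget--;               /* "move" one item */
                printf("moved item %d\n", i);
        }
        if (i < nr_items) {
                if (!top_up())
                        goto retry;     /* got one more charge, resume */
                return -1;              /* abandon the rest */
        }
        return 0;
}

int main(void)
{
        budget = 3;     /* "precharged" in a can_attach()-like phase */
        printf("result: %d\n", process_range(6));  /* moves items 0-4, then -1 */
        return 0;
}
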
4284static void mem_cgroup_move_charge(struct mm_struct *mm)
4285{
4286 struct vm_area_struct *vma;
4287
4288 lru_add_drain_all();
4289 down_read(&mm->mmap_sem);
4290 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4291 int ret;
4292 struct mm_walk mem_cgroup_move_charge_walk = {
4293 .pmd_entry = mem_cgroup_move_charge_pte_range,
4294 .mm = mm,
4295 .private = vma,
4296 };
4297 if (is_vm_hugetlb_page(vma))
4298 continue;
4299 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4300 if (vma->vm_flags & VM_SHARED)
4301 continue;
4302 ret = walk_page_range(vma->vm_start, vma->vm_end,
4303 &mem_cgroup_move_charge_walk);
4304 if (ret)
4305 /*
4306	 * This means we have consumed all precharges and failed
4307	 * to do an additional charge. Just abandon here.
4308 */
4309 break;
4310 }
4311 up_read(&mm->mmap_sem);
4312}
4313
4314static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4315 struct cgroup *cont,
4316 struct cgroup *old_cont,
4317 struct task_struct *p,
4318 bool threadgroup)
4319{
4320 struct mm_struct *mm;
4321
4322 if (!mc.to)
4323 /* no need to move charge */
4324 return;
4325
4326 mm = get_task_mm(p);
4327 if (mm) {
4328 mem_cgroup_move_charge(mm);
4329 mmput(mm);
4330 }
4331 mem_cgroup_clear_mc();
3394} 4332}
3395 4333
3396struct cgroup_subsys mem_cgroup_subsys = { 4334struct cgroup_subsys mem_cgroup_subsys = {
@@ -3400,6 +4338,8 @@ struct cgroup_subsys mem_cgroup_subsys = {
3400 .pre_destroy = mem_cgroup_pre_destroy, 4338 .pre_destroy = mem_cgroup_pre_destroy,
3401 .destroy = mem_cgroup_destroy, 4339 .destroy = mem_cgroup_destroy,
3402 .populate = mem_cgroup_populate, 4340 .populate = mem_cgroup_populate,
4341 .can_attach = mem_cgroup_can_attach,
4342 .cancel_attach = mem_cgroup_cancel_attach,
3403 .attach = mem_cgroup_move_task, 4343 .attach = mem_cgroup_move_task,
3404 .early_init = 0, 4344 .early_init = 0,
3405 .use_id = 1, 4345 .use_id = 1,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 17299fd4577c..d1f335162976 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -383,9 +383,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
383 if (av == NULL) /* Not actually mapped anymore */ 383 if (av == NULL) /* Not actually mapped anymore */
384 goto out; 384 goto out;
385 for_each_process (tsk) { 385 for_each_process (tsk) {
386 struct anon_vma_chain *vmac;
387
386 if (!task_early_kill(tsk)) 388 if (!task_early_kill(tsk))
387 continue; 389 continue;
388 list_for_each_entry (vma, &av->head, anon_vma_node) { 390 list_for_each_entry(vmac, &av->head, same_anon_vma) {
391 vma = vmac->vma;
389 if (!page_mapped_in_vma(page, vma)) 392 if (!page_mapped_in_vma(page, vma))
390 continue; 393 continue;
391 if (vma->vm_mm == tsk->mm) 394 if (vma->vm_mm == tsk->mm)
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1be7b67..5b7f2002e54b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -121,6 +121,77 @@ static int __init init_zero_pfn(void)
121} 121}
122core_initcall(init_zero_pfn); 122core_initcall(init_zero_pfn);
123 123
124
125#if defined(SPLIT_RSS_COUNTING)
126
127void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
128{
129 int i;
130
131 for (i = 0; i < NR_MM_COUNTERS; i++) {
132 if (task->rss_stat.count[i]) {
133 add_mm_counter(mm, i, task->rss_stat.count[i]);
134 task->rss_stat.count[i] = 0;
135 }
136 }
137 task->rss_stat.events = 0;
138}
139
140static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
141{
142 struct task_struct *task = current;
143
144 if (likely(task->mm == mm))
145 task->rss_stat.count[member] += val;
146 else
147 add_mm_counter(mm, member, val);
148}
149#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
150#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
151
152/* sync counter once per 64 page faults */
153#define TASK_RSS_EVENTS_THRESH (64)
154static void check_sync_rss_stat(struct task_struct *task)
155{
156 if (unlikely(task != current))
157 return;
158 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
159 __sync_task_rss_stat(task, task->mm);
160}
161
162unsigned long get_mm_counter(struct mm_struct *mm, int member)
163{
164 long val = 0;
165
166 /*
167	 * Don't use task->mm here, to avoid having to use task_get_mm().
168	 * The caller must guarantee that task->mm is still valid.
169 */
170 val = atomic_long_read(&mm->rss_stat.count[member]);
171 /*
172	 * The counter is updated asynchronously and may temporarily go negative,
173	 * which is never what callers expect, so clamp such values to zero.
174 */
175 if (val < 0)
176 return 0;
177 return (unsigned long)val;
178}
179
180void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
181{
182 __sync_task_rss_stat(task, mm);
183}
184#else
185
186#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
187#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
188
189static void check_sync_rss_stat(struct task_struct *task)
190{
191}
192
193#endif
194
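
The SPLIT_RSS_COUNTING scheme above keeps small per-task deltas and folds them into the shared mm counters only once per TASK_RSS_EVENTS_THRESH faults, which is why readers may observe slightly stale or even transiently negative totals and clamp them to zero. A userspace model of the same idea (__thread and GCC __sync builtins stand in for the kernel's per-task state and atomics; illustrative only):

#include <stdio.h>

#define NR_COUNTERS     3       /* file pages, anon pages, swap entries */
#define EVENTS_THRESH   64

static long shared_count[NR_COUNTERS];          /* the mm_struct counters */
static __thread long local_count[NR_COUNTERS];  /* per-task rss_stat      */
static __thread int  local_events;

static void flush_local(void)
{
        for (int i = 0; i < NR_COUNTERS; i++) {
                __sync_fetch_and_add(&shared_count[i], local_count[i]);
                local_count[i] = 0;
        }
        local_events = 0;
}

static void count_fast(int member, long val)
{
        local_count[member] += val;             /* no atomics on the fast path */
        if (++local_events > EVENTS_THRESH)
                flush_local();
}

static unsigned long read_counter(int member)
{
        long val = __sync_fetch_and_add(&shared_count[member], 0);

        return val < 0 ? 0 : (unsigned long)val;  /* may lag; never negative */
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                count_fast(1, +1);                      /* 100 anon "faults" */
        printf("visible now: %lu\n", read_counter(1));  /* 65: one flush done */
        flush_local();                                  /* e.g. at task exit */
        printf("after sync:  %lu\n", read_counter(1));  /* 100 */
        return 0;
}
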
124/* 195/*
125 * If a p?d_bad entry is found while walking page tables, report 196 * If a p?d_bad entry is found while walking page tables, report
126 * the error, before resetting entry to p?d_none. Usually (but 197 * the error, before resetting entry to p?d_none. Usually (but
@@ -300,7 +371,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
300 * Hide vma from rmap and truncate_pagecache before freeing 371 * Hide vma from rmap and truncate_pagecache before freeing
301 * pgtables 372 * pgtables
302 */ 373 */
303 anon_vma_unlink(vma); 374 unlink_anon_vmas(vma);
304 unlink_file_vma(vma); 375 unlink_file_vma(vma);
305 376
306 if (is_vm_hugetlb_page(vma)) { 377 if (is_vm_hugetlb_page(vma)) {
@@ -314,7 +385,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
314 && !is_vm_hugetlb_page(next)) { 385 && !is_vm_hugetlb_page(next)) {
315 vma = next; 386 vma = next;
316 next = vma->vm_next; 387 next = vma->vm_next;
317 anon_vma_unlink(vma); 388 unlink_anon_vmas(vma);
318 unlink_file_vma(vma); 389 unlink_file_vma(vma);
319 } 390 }
320 free_pgd_range(tlb, addr, vma->vm_end, 391 free_pgd_range(tlb, addr, vma->vm_end,
@@ -376,12 +447,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
376 return 0; 447 return 0;
377} 448}
378 449
379static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) 450static inline void init_rss_vec(int *rss)
380{ 451{
381 if (file_rss) 452 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
382 add_mm_counter(mm, file_rss, file_rss); 453}
383 if (anon_rss) 454
384 add_mm_counter(mm, anon_rss, anon_rss); 455static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
456{
457 int i;
458
459 if (current->mm == mm)
460 sync_mm_rss(current, mm);
461 for (i = 0; i < NR_MM_COUNTERS; i++)
462 if (rss[i])
463 add_mm_counter(mm, i, rss[i]);
385} 464}
386 465
387/* 466/*
@@ -430,12 +509,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
430 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 509 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
431 current->comm, 510 current->comm,
432 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 511 (long long)pte_val(pte), (long long)pmd_val(*pmd));
433 if (page) { 512 if (page)
434 printk(KERN_ALERT 513 dump_page(page);
435 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
436 page, (void *)page->flags, page_count(page),
437 page_mapcount(page), page->mapping, page->index);
438 }
439 printk(KERN_ALERT 514 printk(KERN_ALERT
440 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 515 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
441 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 516 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -597,7 +672,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
597 &src_mm->mmlist); 672 &src_mm->mmlist);
598 spin_unlock(&mmlist_lock); 673 spin_unlock(&mmlist_lock);
599 } 674 }
600 if (is_write_migration_entry(entry) && 675 if (likely(!non_swap_entry(entry)))
676 rss[MM_SWAPENTS]++;
677 else if (is_write_migration_entry(entry) &&
601 is_cow_mapping(vm_flags)) { 678 is_cow_mapping(vm_flags)) {
602 /* 679 /*
603 * COW mappings require pages in both parent 680 * COW mappings require pages in both parent
@@ -632,7 +709,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
632 if (page) { 709 if (page) {
633 get_page(page); 710 get_page(page);
634 page_dup_rmap(page); 711 page_dup_rmap(page);
635 rss[PageAnon(page)]++; 712 if (PageAnon(page))
713 rss[MM_ANONPAGES]++;
714 else
715 rss[MM_FILEPAGES]++;
636 } 716 }
637 717
638out_set_pte: 718out_set_pte:
@@ -648,11 +728,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
648 pte_t *src_pte, *dst_pte; 728 pte_t *src_pte, *dst_pte;
649 spinlock_t *src_ptl, *dst_ptl; 729 spinlock_t *src_ptl, *dst_ptl;
650 int progress = 0; 730 int progress = 0;
651 int rss[2]; 731 int rss[NR_MM_COUNTERS];
652 swp_entry_t entry = (swp_entry_t){0}; 732 swp_entry_t entry = (swp_entry_t){0};
653 733
654again: 734again:
655 rss[1] = rss[0] = 0; 735 init_rss_vec(rss);
736
656 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 737 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
657 if (!dst_pte) 738 if (!dst_pte)
658 return -ENOMEM; 739 return -ENOMEM;
@@ -688,7 +769,7 @@ again:
688 arch_leave_lazy_mmu_mode(); 769 arch_leave_lazy_mmu_mode();
689 spin_unlock(src_ptl); 770 spin_unlock(src_ptl);
690 pte_unmap_nested(orig_src_pte); 771 pte_unmap_nested(orig_src_pte);
691 add_mm_rss(dst_mm, rss[0], rss[1]); 772 add_mm_rss_vec(dst_mm, rss);
692 pte_unmap_unlock(orig_dst_pte, dst_ptl); 773 pte_unmap_unlock(orig_dst_pte, dst_ptl);
693 cond_resched(); 774 cond_resched();
694 775
@@ -816,8 +897,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
816 struct mm_struct *mm = tlb->mm; 897 struct mm_struct *mm = tlb->mm;
817 pte_t *pte; 898 pte_t *pte;
818 spinlock_t *ptl; 899 spinlock_t *ptl;
819 int file_rss = 0; 900 int rss[NR_MM_COUNTERS];
820 int anon_rss = 0; 901
902 init_rss_vec(rss);
821 903
822 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 904 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
823 arch_enter_lazy_mmu_mode(); 905 arch_enter_lazy_mmu_mode();
@@ -863,14 +945,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
863 set_pte_at(mm, addr, pte, 945 set_pte_at(mm, addr, pte,
864 pgoff_to_pte(page->index)); 946 pgoff_to_pte(page->index));
865 if (PageAnon(page)) 947 if (PageAnon(page))
866 anon_rss--; 948 rss[MM_ANONPAGES]--;
867 else { 949 else {
868 if (pte_dirty(ptent)) 950 if (pte_dirty(ptent))
869 set_page_dirty(page); 951 set_page_dirty(page);
870 if (pte_young(ptent) && 952 if (pte_young(ptent) &&
871 likely(!VM_SequentialReadHint(vma))) 953 likely(!VM_SequentialReadHint(vma)))
872 mark_page_accessed(page); 954 mark_page_accessed(page);
873 file_rss--; 955 rss[MM_FILEPAGES]--;
874 } 956 }
875 page_remove_rmap(page); 957 page_remove_rmap(page);
876 if (unlikely(page_mapcount(page) < 0)) 958 if (unlikely(page_mapcount(page) < 0))
@@ -887,13 +969,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
887 if (pte_file(ptent)) { 969 if (pte_file(ptent)) {
888 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 970 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
889 print_bad_pte(vma, addr, ptent, NULL); 971 print_bad_pte(vma, addr, ptent, NULL);
890 } else if 972 } else {
891 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) 973 swp_entry_t entry = pte_to_swp_entry(ptent);
892 print_bad_pte(vma, addr, ptent, NULL); 974
975 if (!non_swap_entry(entry))
976 rss[MM_SWAPENTS]--;
977 if (unlikely(!free_swap_and_cache(entry)))
978 print_bad_pte(vma, addr, ptent, NULL);
979 }
893 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 980 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
894 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 981 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
895 982
896 add_mm_rss(mm, file_rss, anon_rss); 983 add_mm_rss_vec(mm, rss);
897 arch_leave_lazy_mmu_mode(); 984 arch_leave_lazy_mmu_mode();
898 pte_unmap_unlock(pte - 1, ptl); 985 pte_unmap_unlock(pte - 1, ptl);
899 986
@@ -1527,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1527 1614
1528 /* Ok, finally just insert the thing.. */ 1615 /* Ok, finally just insert the thing.. */
1529 get_page(page); 1616 get_page(page);
1530 inc_mm_counter(mm, file_rss); 1617 inc_mm_counter_fast(mm, MM_FILEPAGES);
1531 page_add_file_rmap(page); 1618 page_add_file_rmap(page);
1532 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1619 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1533 1620
@@ -1593,7 +1680,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1593 /* Ok, finally just insert the thing.. */ 1680 /* Ok, finally just insert the thing.. */
1594 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1681 entry = pte_mkspecial(pfn_pte(pfn, prot));
1595 set_pte_at(mm, addr, pte, entry); 1682 set_pte_at(mm, addr, pte, entry);
1596 update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ 1683 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1597 1684
1598 retval = 0; 1685 retval = 0;
1599out_unlock: 1686out_unlock:
@@ -2044,6 +2131,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2044 page_cache_release(old_page); 2131 page_cache_release(old_page);
2045 } 2132 }
2046 reuse = reuse_swap_page(old_page); 2133 reuse = reuse_swap_page(old_page);
2134 if (reuse)
2135 /*
2136 * The page is all ours. Move it to our anon_vma so
2137 * the rmap code will not search our parent or siblings.
2138 * Protected against the rmap code by the page lock.
2139 */
2140 page_move_anon_rmap(old_page, vma, address);
2047 unlock_page(old_page); 2141 unlock_page(old_page);
2048 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2142 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2049 (VM_WRITE|VM_SHARED))) { 2143 (VM_WRITE|VM_SHARED))) {
@@ -2116,7 +2210,7 @@ reuse:
2116 entry = pte_mkyoung(orig_pte); 2210 entry = pte_mkyoung(orig_pte);
2117 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2211 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2118 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2212 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2119 update_mmu_cache(vma, address, entry); 2213 update_mmu_cache(vma, address, page_table);
2120 ret |= VM_FAULT_WRITE; 2214 ret |= VM_FAULT_WRITE;
2121 goto unlock; 2215 goto unlock;
2122 } 2216 }
@@ -2163,11 +2257,11 @@ gotten:
2163 if (likely(pte_same(*page_table, orig_pte))) { 2257 if (likely(pte_same(*page_table, orig_pte))) {
2164 if (old_page) { 2258 if (old_page) {
2165 if (!PageAnon(old_page)) { 2259 if (!PageAnon(old_page)) {
2166 dec_mm_counter(mm, file_rss); 2260 dec_mm_counter_fast(mm, MM_FILEPAGES);
2167 inc_mm_counter(mm, anon_rss); 2261 inc_mm_counter_fast(mm, MM_ANONPAGES);
2168 } 2262 }
2169 } else 2263 } else
2170 inc_mm_counter(mm, anon_rss); 2264 inc_mm_counter_fast(mm, MM_ANONPAGES);
2171 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2265 flush_cache_page(vma, address, pte_pfn(orig_pte));
2172 entry = mk_pte(new_page, vma->vm_page_prot); 2266 entry = mk_pte(new_page, vma->vm_page_prot);
2173 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2267 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2185,7 +2279,7 @@ gotten:
2185 * new page to be mapped directly into the secondary page table. 2279 * new page to be mapped directly into the secondary page table.
2186 */ 2280 */
2187 set_pte_at_notify(mm, address, page_table, entry); 2281 set_pte_at_notify(mm, address, page_table, entry);
2188 update_mmu_cache(vma, address, entry); 2282 update_mmu_cache(vma, address, page_table);
2189 if (old_page) { 2283 if (old_page) {
2190 /* 2284 /*
2191 * Only after switching the pte to the new page may 2285 * Only after switching the pte to the new page may
@@ -2604,7 +2698,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2604 * discarded at swap_free(). 2698 * discarded at swap_free().
2605 */ 2699 */
2606 2700
2607 inc_mm_counter(mm, anon_rss); 2701 inc_mm_counter_fast(mm, MM_ANONPAGES);
2702 dec_mm_counter_fast(mm, MM_SWAPENTS);
2608 pte = mk_pte(page, vma->vm_page_prot); 2703 pte = mk_pte(page, vma->vm_page_prot);
2609 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2704 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2610 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2705 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2629,7 +2724,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2629 } 2724 }
2630 2725
2631 /* No need to invalidate - it was non-present before */ 2726 /* No need to invalidate - it was non-present before */
2632 update_mmu_cache(vma, address, pte); 2727 update_mmu_cache(vma, address, page_table);
2633unlock: 2728unlock:
2634 pte_unmap_unlock(page_table, ptl); 2729 pte_unmap_unlock(page_table, ptl);
2635out: 2730out:
@@ -2688,13 +2783,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2688 if (!pte_none(*page_table)) 2783 if (!pte_none(*page_table))
2689 goto release; 2784 goto release;
2690 2785
2691 inc_mm_counter(mm, anon_rss); 2786 inc_mm_counter_fast(mm, MM_ANONPAGES);
2692 page_add_new_anon_rmap(page, vma, address); 2787 page_add_new_anon_rmap(page, vma, address);
2693setpte: 2788setpte:
2694 set_pte_at(mm, address, page_table, entry); 2789 set_pte_at(mm, address, page_table, entry);
2695 2790
2696 /* No need to invalidate - it was non-present before */ 2791 /* No need to invalidate - it was non-present before */
2697 update_mmu_cache(vma, address, entry); 2792 update_mmu_cache(vma, address, page_table);
2698unlock: 2793unlock:
2699 pte_unmap_unlock(page_table, ptl); 2794 pte_unmap_unlock(page_table, ptl);
2700 return 0; 2795 return 0;
@@ -2842,10 +2937,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2842 if (flags & FAULT_FLAG_WRITE) 2937 if (flags & FAULT_FLAG_WRITE)
2843 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2938 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2844 if (anon) { 2939 if (anon) {
2845 inc_mm_counter(mm, anon_rss); 2940 inc_mm_counter_fast(mm, MM_ANONPAGES);
2846 page_add_new_anon_rmap(page, vma, address); 2941 page_add_new_anon_rmap(page, vma, address);
2847 } else { 2942 } else {
2848 inc_mm_counter(mm, file_rss); 2943 inc_mm_counter_fast(mm, MM_FILEPAGES);
2849 page_add_file_rmap(page); 2944 page_add_file_rmap(page);
2850 if (flags & FAULT_FLAG_WRITE) { 2945 if (flags & FAULT_FLAG_WRITE) {
2851 dirty_page = page; 2946 dirty_page = page;
@@ -2855,7 +2950,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2855 set_pte_at(mm, address, page_table, entry); 2950 set_pte_at(mm, address, page_table, entry);
2856 2951
2857 /* no need to invalidate: a not-present page won't be cached */ 2952 /* no need to invalidate: a not-present page won't be cached */
2858 update_mmu_cache(vma, address, entry); 2953 update_mmu_cache(vma, address, page_table);
2859 } else { 2954 } else {
2860 if (charged) 2955 if (charged)
2861 mem_cgroup_uncharge_page(page); 2956 mem_cgroup_uncharge_page(page);
@@ -2992,7 +3087,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2992 } 3087 }
2993 entry = pte_mkyoung(entry); 3088 entry = pte_mkyoung(entry);
2994 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3089 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
2995 update_mmu_cache(vma, address, entry); 3090 update_mmu_cache(vma, address, pte);
2996 } else { 3091 } else {
2997 /* 3092 /*
2998 * This is needed only for protection faults but the arch code 3093 * This is needed only for protection faults but the arch code
@@ -3023,6 +3118,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3023 3118
3024 count_vm_event(PGFAULT); 3119 count_vm_event(PGFAULT);
3025 3120
3121 /* do counter updates before entering really critical section. */
3122 check_sync_rss_stat(current);
3123
3026 if (unlikely(is_vm_hugetlb_page(vma))) 3124 if (unlikely(is_vm_hugetlb_page(vma)))
3027 return hugetlb_fault(mm, vma, address, flags); 3125 return hugetlb_fault(mm, vma, address, flags);
3028 3126
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 030ce8a5bb0e..be211a582930 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -28,6 +28,7 @@
28#include <linux/pfn.h> 28#include <linux/pfn.h>
29#include <linux/suspend.h> 29#include <linux/suspend.h>
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h>
31 32
32#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
33 34
@@ -523,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size)
523 BUG_ON(ret); 524 BUG_ON(ret);
524 } 525 }
525 526
527 /* create new memmap entry */
528 firmware_map_add_hotplug(start, start + size, "System RAM");
529
526 goto out; 530 goto out;
527 531
528error: 532error:
@@ -684,9 +688,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
684 if (page_count(page)) 688 if (page_count(page))
685 not_managed++; 689 not_managed++;
686#ifdef CONFIG_DEBUG_VM 690#ifdef CONFIG_DEBUG_VM
687 printk(KERN_INFO "removing from LRU failed" 691 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
688 " %lx/%d/%lx\n", 692 pfn);
689 pfn, page_count(page), page->flags); 693 dump_page(page);
690#endif 694#endif
691 } 695 }
692 } 696 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3cec080faa23..643f66e10187 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -563,24 +563,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
563} 563}
564 564
565/* Step 2: apply policy to a range and do splits. */ 565/* Step 2: apply policy to a range and do splits. */
566static int mbind_range(struct vm_area_struct *vma, unsigned long start, 566static int mbind_range(struct mm_struct *mm, unsigned long start,
567 unsigned long end, struct mempolicy *new) 567 unsigned long end, struct mempolicy *new_pol)
568{ 568{
569 struct vm_area_struct *next; 569 struct vm_area_struct *next;
570 int err; 570 struct vm_area_struct *prev;
571 struct vm_area_struct *vma;
572 int err = 0;
573 pgoff_t pgoff;
574 unsigned long vmstart;
575 unsigned long vmend;
571 576
572 err = 0; 577 vma = find_vma_prev(mm, start, &prev);
573 for (; vma && vma->vm_start < end; vma = next) { 578 if (!vma || vma->vm_start > start)
579 return -EFAULT;
580
581 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
574 next = vma->vm_next; 582 next = vma->vm_next;
575 if (vma->vm_start < start) 583 vmstart = max(start, vma->vm_start);
576 err = split_vma(vma->vm_mm, vma, start, 1); 584 vmend = min(end, vma->vm_end);
577 if (!err && vma->vm_end > end) 585
578 err = split_vma(vma->vm_mm, vma, end, 0); 586 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
579 if (!err) 587 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
580 err = policy_vma(vma, new); 588 vma->anon_vma, vma->vm_file, pgoff, new_pol);
589 if (prev) {
590 vma = prev;
591 next = vma->vm_next;
592 continue;
593 }
594 if (vma->vm_start != vmstart) {
595 err = split_vma(vma->vm_mm, vma, vmstart, 1);
596 if (err)
597 goto out;
598 }
599 if (vma->vm_end != vmend) {
600 err = split_vma(vma->vm_mm, vma, vmend, 0);
601 if (err)
602 goto out;
603 }
604 err = policy_vma(vma, new_pol);
581 if (err) 605 if (err)
582 break; 606 goto out;
583 } 607 }
608
609 out:
584 return err; 610 return err;
585} 611}
586 612
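
The rewritten mbind_range() clamps each VMA that overlaps [start, end), tries vma_merge() with its predecessor first, and only calls split_vma() when the clamped boundaries do not coincide with the VMA's own. The userspace sketch below models just the clamp-and-split decision over a sorted array of ranges; the merge step, the policy objects and all of the names (plan_splits, struct range) are invented for illustration.

#include <stdio.h>

struct range { unsigned long start, end; };   /* half-open [start, end) */

static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }
static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

/* Report, for each range overlapping [start, end), where splits would be needed.
 * Assumes the ranges are sorted and non-overlapping, like a VMA list. */
static void plan_splits(struct range *r, int n, unsigned long start, unsigned long end)
{
        for (int i = 0; i < n && r[i].start < end; i++) {
                if (r[i].end <= start)
                        continue;                        /* no overlap yet */
                unsigned long vmstart = max_ul(start, r[i].start);
                unsigned long vmend   = min_ul(end, r[i].end);

                if (r[i].start != vmstart)
                        printf("range %d: split at %#lx (front)\n", i, vmstart);
                if (r[i].end != vmend)
                        printf("range %d: split at %#lx (back)\n", i, vmend);
                printf("range %d: apply policy to [%#lx, %#lx)\n", i, vmstart, vmend);
        }
}

int main(void)
{
        struct range vmas[] = { { 0x1000, 0x4000 }, { 0x4000, 0x8000 }, { 0x9000, 0xc000 } };

        plan_splits(vmas, 3, 0x2000, 0xa000);
        return 0;
}
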
@@ -862,36 +888,36 @@ int do_migrate_pages(struct mm_struct *mm,
862 if (err) 888 if (err)
863 goto out; 889 goto out;
864 890
865/* 891 /*
866 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 892 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
867 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 893 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
868 * bit in 'tmp', and return that <source, dest> pair for migration. 894 * bit in 'tmp', and return that <source, dest> pair for migration.
869 * The pair of nodemasks 'to' and 'from' define the map. 895 * The pair of nodemasks 'to' and 'from' define the map.
870 * 896 *
871 * If no pair of bits is found that way, fallback to picking some 897 * If no pair of bits is found that way, fallback to picking some
872 * pair of 'source' and 'dest' bits that are not the same. If the 898 * pair of 'source' and 'dest' bits that are not the same. If the
873 * 'source' and 'dest' bits are the same, this represents a node 899 * 'source' and 'dest' bits are the same, this represents a node
874 * that will be migrating to itself, so no pages need move. 900 * that will be migrating to itself, so no pages need move.
875 * 901 *
876 * If no bits are left in 'tmp', or if all remaining bits left 902 * If no bits are left in 'tmp', or if all remaining bits left
877 * in 'tmp' correspond to the same bit in 'to', return false 903 * in 'tmp' correspond to the same bit in 'to', return false
878 * (nothing left to migrate). 904 * (nothing left to migrate).
879 * 905 *
880 * This lets us pick a pair of nodes to migrate between, such that 906 * This lets us pick a pair of nodes to migrate between, such that
881 * if possible the dest node is not already occupied by some other 907 * if possible the dest node is not already occupied by some other
882 * source node, minimizing the risk of overloading the memory on a 908 * source node, minimizing the risk of overloading the memory on a
883 * node that would happen if we migrated incoming memory to a node 909 * node that would happen if we migrated incoming memory to a node
 884	 * before migrating outgoing memory from that same node.             910	 * before migrating outgoing memory from that same node.
885 * 911 *
886 * A single scan of tmp is sufficient. As we go, we remember the 912 * A single scan of tmp is sufficient. As we go, we remember the
887 * most recent <s, d> pair that moved (s != d). If we find a pair 913 * most recent <s, d> pair that moved (s != d). If we find a pair
888 * that not only moved, but what's better, moved to an empty slot 914 * that not only moved, but what's better, moved to an empty slot
 889	 * (d is not set in tmp), then we break out early, with that pair.   915	 * (d is not set in tmp), then we break out early, with that pair.
 890	 * Otherwise when we finish scanning from_tmp, we at least have the   916	 * Otherwise when we finish scanning from_tmp, we at least have the
891 * most recent <s, d> pair that moved. If we get all the way through 917 * most recent <s, d> pair that moved. If we get all the way through
892 * the scan of tmp without finding any node that moved, much less 918 * the scan of tmp without finding any node that moved, much less
893 * moved to an empty node, then there is nothing left worth migrating. 919 * moved to an empty node, then there is nothing left worth migrating.
894 */ 920 */
895 921
896 tmp = *from_nodes; 922 tmp = *from_nodes;
897 while (!nodes_empty(tmp)) { 923 while (!nodes_empty(tmp)) {
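
The block comment above describes how do_migrate_pages() picks its next <source, dest> pair: a single scan over the remaining source nodes, remembering the most recent pair that actually moves, and stopping early when the destination is not itself still a source. Here is a small userspace rendition of that scan over plain bitmasks; dest_of[] stands in for the nodemask remapping the kernel does with node_remap(), and MAX_NODES and the helper names are made up.

#include <stdio.h>

#define MAX_NODES 8

/*
 * One pass of the pair-picking described above: remember the most recent
 * <s, d> pair that really moves (s != d), and stop early if we find a
 * destination d that is not itself still waiting to be drained.
 * 'tmp' is the set of source nodes with pages left to migrate.
 */
static int pick_pair(unsigned int tmp, const int dest_of[MAX_NODES], int *src, int *dst)
{
        int s, found = 0;

        for (s = 0; s < MAX_NODES; s++) {
                int d;

                if (!(tmp & (1u << s)))
                        continue;
                d = dest_of[s];
                if (s == d)
                        continue;                /* would migrate onto itself */
                *src = s;
                *dst = d;
                found = 1;
                if (!(tmp & (1u << d)))
                        break;                   /* d is an "empty slot": best case */
        }
        return found;                            /* 0 means nothing left worth migrating */
}

int main(void)
{
        int dest_of[MAX_NODES] = { 2, 3, 2, 3, 0, 0, 0, 0 };
        unsigned int tmp = 0x0f;                 /* nodes 0-3 still have pages to move */
        int s, d;

        while (pick_pair(tmp, dest_of, &s, &d)) {
                printf("migrate node %d -> node %d\n", s, d);
                tmp &= ~(1u << s);               /* source node is now drained */
        }
        return 0;
}
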
@@ -1047,7 +1073,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1047 if (!IS_ERR(vma)) { 1073 if (!IS_ERR(vma)) {
1048 int nr_failed = 0; 1074 int nr_failed = 0;
1049 1075
1050 err = mbind_range(vma, start, end, new); 1076 err = mbind_range(mm, start, end, new);
1051 1077
1052 if (!list_empty(&pagelist)) 1078 if (!list_empty(&pagelist))
1053 nr_failed = migrate_pages(&pagelist, new_vma_page, 1079 nr_failed = migrate_pages(&pagelist, new_vma_page,
diff --git a/mm/migrate.c b/mm/migrate.c
index 880bd592d38e..88000b89fc9a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -134,7 +134,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
134 page_add_file_rmap(new); 134 page_add_file_rmap(new);
135 135
136 /* No need to invalidate - it was non-present before */ 136 /* No need to invalidate - it was non-present before */
137 update_mmu_cache(vma, addr, pte); 137 update_mmu_cache(vma, addr, ptep);
138unlock: 138unlock:
139 pte_unmap_unlock(ptep, ptl); 139 pte_unmap_unlock(ptep, ptl);
140out: 140out:
@@ -275,8 +275,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
275 */ 275 */
276static void migrate_page_copy(struct page *newpage, struct page *page) 276static void migrate_page_copy(struct page *newpage, struct page *page)
277{ 277{
278 int anon;
279
280 copy_highpage(newpage, page); 278 copy_highpage(newpage, page);
281 279
282 if (PageError(page)) 280 if (PageError(page))
@@ -313,8 +311,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
313 ClearPageSwapCache(page); 311 ClearPageSwapCache(page);
314 ClearPagePrivate(page); 312 ClearPagePrivate(page);
315 set_page_private(page, 0); 313 set_page_private(page, 0);
316 /* page->mapping contains a flag for PageAnon() */
317 anon = PageAnon(page);
318 page->mapping = NULL; 314 page->mapping = NULL;
319 315
320 /* 316 /*
diff --git a/mm/mlock.c b/mm/mlock.c
index 2b8335a89400..8f4e2dfceec1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -25,7 +25,7 @@ int can_do_mlock(void)
25{ 25{
26 if (capable(CAP_IPC_LOCK)) 26 if (capable(CAP_IPC_LOCK))
27 return 1; 27 return 1;
28 if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) 28 if (rlimit(RLIMIT_MEMLOCK) != 0)
29 return 1; 29 return 1;
30 return 0; 30 return 0;
31} 31}
@@ -487,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
487 locked = len >> PAGE_SHIFT; 487 locked = len >> PAGE_SHIFT;
488 locked += current->mm->locked_vm; 488 locked += current->mm->locked_vm;
489 489
490 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 490 lock_limit = rlimit(RLIMIT_MEMLOCK);
491 lock_limit >>= PAGE_SHIFT; 491 lock_limit >>= PAGE_SHIFT;
492 492
493 /* check against resource limits */ 493 /* check against resource limits */
@@ -550,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags)
550 550
551 down_write(&current->mm->mmap_sem); 551 down_write(&current->mm->mmap_sem);
552 552
553 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 553 lock_limit = rlimit(RLIMIT_MEMLOCK);
554 lock_limit >>= PAGE_SHIFT; 554 lock_limit >>= PAGE_SHIFT;
555 555
556 ret = -ENOMEM; 556 ret = -ENOMEM;
@@ -584,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user)
584 int allowed = 0; 584 int allowed = 0;
585 585
586 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 586 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
587 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 587 lock_limit = rlimit(RLIMIT_MEMLOCK);
588 if (lock_limit == RLIM_INFINITY) 588 if (lock_limit == RLIM_INFINITY)
589 allowed = 1; 589 allowed = 1;
590 lock_limit >>= PAGE_SHIFT; 590 lock_limit >>= PAGE_SHIFT;
@@ -618,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
618 618
619 down_write(&mm->mmap_sem); 619 down_write(&mm->mmap_sem);
620 620
621 lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 621 lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
622 vm = mm->total_vm + pgsz; 622 vm = mm->total_vm + pgsz;
623 if (lim < vm) 623 if (lim < vm)
624 goto out; 624 goto out;
625 625
626 lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 626 lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
627 vm = mm->locked_vm + pgsz; 627 vm = mm->locked_vm + pgsz;
628 if (lim < vm) 628 if (lim < vm)
629 goto out; 629 goto out;
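
These call sites now go through the rlimit() helper rather than dereferencing current->signal->rlim[...] by hand (with ACCESS_ONCE() where the limit can change under the reader). The equivalent check is easy to reproduce from userspace with the POSIX getrlimit() API; the sketch below performs the locked-pages test in the same shape, though the CAP_IPC_LOCK escape hatch is not modelled.

#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>

/* Roughly the test mlock()/mmap(MAP_LOCKED) perform: will locking 'len' more
 * bytes, on top of 'already_locked' bytes, exceed RLIMIT_MEMLOCK? */
static int would_exceed_memlock(size_t already_locked, size_t len)
{
        struct rlimit rl;
        long page_size = sysconf(_SC_PAGESIZE);
        unsigned long locked_pages, limit_pages;

        if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0)
                return -1;
        if (rl.rlim_cur == RLIM_INFINITY)
                return 0;

        locked_pages = (already_locked + len + page_size - 1) / page_size;
        limit_pages  = rl.rlim_cur / page_size;
        return locked_pages > limit_pages;
}

int main(void)
{
        int r = would_exceed_memlock(0, 1 << 20);

        printf("locking 1 MiB %s RLIMIT_MEMLOCK\n",
               r > 0 ? "would exceed" : "fits within");
        return 0;
}
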
diff --git a/mm/mmap.c b/mm/mmap.c
index ee2298936fe6..75557c639ad4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -265,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 265	 * segment grow beyond its set limit in the case where the limit is   265	 * segment grow beyond its set limit in the case where the limit is
266 * not page aligned -Ram Gupta 266 * not page aligned -Ram Gupta
267 */ 267 */
268 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 268 rlim = rlimit(RLIMIT_DATA);
269 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 269 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
270 (mm->end_data - mm->start_data) > rlim) 270 (mm->end_data - mm->start_data) > rlim)
271 goto out; 271 goto out;
@@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
437{ 437{
438 __vma_link_list(mm, vma, prev, rb_parent); 438 __vma_link_list(mm, vma, prev, rb_parent);
439 __vma_link_rb(mm, vma, rb_link, rb_parent); 439 __vma_link_rb(mm, vma, rb_link, rb_parent);
440 __anon_vma_link(vma);
441} 440}
442 441
443static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 442static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
499 * are necessary. The "insert" vma (if any) is to be inserted 498 * are necessary. The "insert" vma (if any) is to be inserted
500 * before we drop the necessary locks. 499 * before we drop the necessary locks.
501 */ 500 */
502void vma_adjust(struct vm_area_struct *vma, unsigned long start, 501int vma_adjust(struct vm_area_struct *vma, unsigned long start,
503 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 502 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
504{ 503{
505 struct mm_struct *mm = vma->vm_mm; 504 struct mm_struct *mm = vma->vm_mm;
@@ -542,6 +541,26 @@ again: remove_next = 1 + (end > next->vm_end);
542 } 541 }
543 } 542 }
544 543
544 /*
545 * When changing only vma->vm_end, we don't really need anon_vma lock.
546 */
547 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
548 anon_vma = vma->anon_vma;
549 if (anon_vma) {
550 /*
551 * Easily overlooked: when mprotect shifts the boundary,
552 * make sure the expanding vma has anon_vma set if the
553 * shrinking vma had, to cover any anon pages imported.
554 */
555 if (importer && !importer->anon_vma) {
556 /* Block reverse map lookups until things are set up. */
557 if (anon_vma_clone(importer, vma)) {
558 return -ENOMEM;
559 }
560 importer->anon_vma = anon_vma;
561 }
562 }
563
545 if (file) { 564 if (file) {
546 mapping = file->f_mapping; 565 mapping = file->f_mapping;
547 if (!(vma->vm_flags & VM_NONLINEAR)) 566 if (!(vma->vm_flags & VM_NONLINEAR))
@@ -567,25 +586,6 @@ again: remove_next = 1 + (end > next->vm_end);
567 } 586 }
568 } 587 }
569 588
570 /*
571 * When changing only vma->vm_end, we don't really need
572 * anon_vma lock.
573 */
574 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
575 anon_vma = vma->anon_vma;
576 if (anon_vma) {
577 spin_lock(&anon_vma->lock);
578 /*
579 * Easily overlooked: when mprotect shifts the boundary,
580 * make sure the expanding vma has anon_vma set if the
581 * shrinking vma had, to cover any anon pages imported.
582 */
583 if (importer && !importer->anon_vma) {
584 importer->anon_vma = anon_vma;
585 __anon_vma_link(importer);
586 }
587 }
588
589 if (root) { 589 if (root) {
590 flush_dcache_mmap_lock(mapping); 590 flush_dcache_mmap_lock(mapping);
591 vma_prio_tree_remove(vma, root); 591 vma_prio_tree_remove(vma, root);
@@ -616,8 +616,6 @@ again: remove_next = 1 + (end > next->vm_end);
616 __vma_unlink(mm, next, vma); 616 __vma_unlink(mm, next, vma);
617 if (file) 617 if (file)
618 __remove_shared_vm_struct(next, file, mapping); 618 __remove_shared_vm_struct(next, file, mapping);
619 if (next->anon_vma)
620 __anon_vma_merge(vma, next);
621 } else if (insert) { 619 } else if (insert) {
622 /* 620 /*
623 * split_vma has split insert from vma, and needs 621 * split_vma has split insert from vma, and needs
@@ -627,8 +625,6 @@ again: remove_next = 1 + (end > next->vm_end);
627 __insert_vm_struct(mm, insert); 625 __insert_vm_struct(mm, insert);
628 } 626 }
629 627
630 if (anon_vma)
631 spin_unlock(&anon_vma->lock);
632 if (mapping) 628 if (mapping)
633 spin_unlock(&mapping->i_mmap_lock); 629 spin_unlock(&mapping->i_mmap_lock);
634 630
@@ -638,6 +634,8 @@ again: remove_next = 1 + (end > next->vm_end);
638 if (next->vm_flags & VM_EXECUTABLE) 634 if (next->vm_flags & VM_EXECUTABLE)
639 removed_exe_file_vma(mm); 635 removed_exe_file_vma(mm);
640 } 636 }
637 if (next->anon_vma)
638 anon_vma_merge(vma, next);
641 mm->map_count--; 639 mm->map_count--;
642 mpol_put(vma_policy(next)); 640 mpol_put(vma_policy(next));
643 kmem_cache_free(vm_area_cachep, next); 641 kmem_cache_free(vm_area_cachep, next);
@@ -653,6 +651,8 @@ again: remove_next = 1 + (end > next->vm_end);
653 } 651 }
654 652
655 validate_mm(mm); 653 validate_mm(mm);
654
655 return 0;
656} 656}
657 657
658/* 658/*
@@ -759,6 +759,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
759{ 759{
760 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 760 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
761 struct vm_area_struct *area, *next; 761 struct vm_area_struct *area, *next;
762 int err;
762 763
763 /* 764 /*
764 * We later require that vma->vm_flags == vm_flags, 765 * We later require that vma->vm_flags == vm_flags,
@@ -792,11 +793,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
792 is_mergeable_anon_vma(prev->anon_vma, 793 is_mergeable_anon_vma(prev->anon_vma,
793 next->anon_vma)) { 794 next->anon_vma)) {
794 /* cases 1, 6 */ 795 /* cases 1, 6 */
795 vma_adjust(prev, prev->vm_start, 796 err = vma_adjust(prev, prev->vm_start,
796 next->vm_end, prev->vm_pgoff, NULL); 797 next->vm_end, prev->vm_pgoff, NULL);
797 } else /* cases 2, 5, 7 */ 798 } else /* cases 2, 5, 7 */
798 vma_adjust(prev, prev->vm_start, 799 err = vma_adjust(prev, prev->vm_start,
799 end, prev->vm_pgoff, NULL); 800 end, prev->vm_pgoff, NULL);
801 if (err)
802 return NULL;
800 return prev; 803 return prev;
801 } 804 }
802 805
@@ -808,11 +811,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
808 can_vma_merge_before(next, vm_flags, 811 can_vma_merge_before(next, vm_flags,
809 anon_vma, file, pgoff+pglen)) { 812 anon_vma, file, pgoff+pglen)) {
810 if (prev && addr < prev->vm_end) /* case 4 */ 813 if (prev && addr < prev->vm_end) /* case 4 */
811 vma_adjust(prev, prev->vm_start, 814 err = vma_adjust(prev, prev->vm_start,
812 addr, prev->vm_pgoff, NULL); 815 addr, prev->vm_pgoff, NULL);
813 else /* cases 3, 8 */ 816 else /* cases 3, 8 */
814 vma_adjust(area, addr, next->vm_end, 817 err = vma_adjust(area, addr, next->vm_end,
815 next->vm_pgoff - pglen, NULL); 818 next->vm_pgoff - pglen, NULL);
819 if (err)
820 return NULL;
816 return area; 821 return area;
817 } 822 }
818 823
@@ -967,7 +972,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
967 unsigned long locked, lock_limit; 972 unsigned long locked, lock_limit;
968 locked = len >> PAGE_SHIFT; 973 locked = len >> PAGE_SHIFT;
969 locked += mm->locked_vm; 974 locked += mm->locked_vm;
970 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 975 lock_limit = rlimit(RLIMIT_MEMLOCK);
971 lock_limit >>= PAGE_SHIFT; 976 lock_limit >>= PAGE_SHIFT;
972 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 977 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
973 return -EAGAIN; 978 return -EAGAIN;
@@ -1083,6 +1088,30 @@ out:
1083 return retval; 1088 return retval;
1084} 1089}
1085 1090
1091#ifdef __ARCH_WANT_SYS_OLD_MMAP
1092struct mmap_arg_struct {
1093 unsigned long addr;
1094 unsigned long len;
1095 unsigned long prot;
1096 unsigned long flags;
1097 unsigned long fd;
1098 unsigned long offset;
1099};
1100
1101SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1102{
1103 struct mmap_arg_struct a;
1104
1105 if (copy_from_user(&a, arg, sizeof(a)))
1106 return -EFAULT;
1107 if (a.offset & ~PAGE_MASK)
1108 return -EINVAL;
1109
1110 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1111 a.offset >> PAGE_SHIFT);
1112}
1113#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1114
1086/* 1115/*
1087 * Some shared mappings will want the pages marked read-only           1116 * Some shared mappings will want the pages marked read-only
1088 * to track write events. If so, we'll downgrade vm_page_prot 1117 * to track write events. If so, we'll downgrade vm_page_prot
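
Both mmap.c and, further down, nommu.c gain the same __ARCH_WANT_SYS_OLD_MMAP wrapper: the legacy one-pointer mmap syscall copies a packed argument block in, rejects byte offsets that are not page aligned, and hands the offset to sys_mmap_pgoff() in page units. Below is a userspace sketch of that unpack, validate and convert step; struct mmap_args and unpack_old_mmap() are mock-ups, and no syscall is actually issued.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

struct mmap_args {                       /* userspace mock of mmap_arg_struct */
        unsigned long addr, len, prot, flags, fd, offset;
};

/* Mirror the old_mmap checks: copy the block, insist on a page-aligned byte
 * offset, and convert it to page units for the modern entry point. */
static long unpack_old_mmap(const void *user_block, struct mmap_args *a,
                            unsigned long *pgoff)
{
        unsigned long page_size = (unsigned long)sysconf(_SC_PAGESIZE);

        memcpy(a, user_block, sizeof(*a));        /* copy_from_user() analogue */
        if (a->offset & (page_size - 1))
                return -EINVAL;                   /* offset not page aligned */
        *pgoff = a->offset / page_size;           /* the >> PAGE_SHIFT step */
        return 0;
}

int main(void)
{
        struct mmap_args in = { .len = 8192, .offset = 4096 }, out;
        unsigned long pgoff;

        if (unpack_old_mmap(&in, &out, &pgoff) == 0)
                printf("len=%lu bytes, pgoff=%lu pages\n", out.len, pgoff);
        return 0;
}
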
@@ -1205,6 +1234,7 @@ munmap_back:
1205 vma->vm_flags = vm_flags; 1234 vma->vm_flags = vm_flags;
1206 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1235 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1207 vma->vm_pgoff = pgoff; 1236 vma->vm_pgoff = pgoff;
1237 INIT_LIST_HEAD(&vma->anon_vma_chain);
1208 1238
1209 if (file) { 1239 if (file) {
1210 error = -EINVAL; 1240 error = -EINVAL;
@@ -1265,13 +1295,8 @@ out:
1265 mm->total_vm += len >> PAGE_SHIFT; 1295 mm->total_vm += len >> PAGE_SHIFT;
1266 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1296 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1267 if (vm_flags & VM_LOCKED) { 1297 if (vm_flags & VM_LOCKED) {
1268 /* 1298 if (!mlock_vma_pages_range(vma, addr, addr + len))
1269 * makes pages present; downgrades, drops, reacquires mmap_sem 1299 mm->locked_vm += (len >> PAGE_SHIFT);
1270 */
1271 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1272 if (nr_pages < 0)
1273 return nr_pages; /* vma gone! */
1274 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1275 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1300 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1276 make_pages_present(addr, addr + len); 1301 make_pages_present(addr, addr + len);
1277 return addr; 1302 return addr;
@@ -1599,7 +1624,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1599 return -ENOMEM; 1624 return -ENOMEM;
1600 1625
1601 /* Stack limit test */ 1626 /* Stack limit test */
1602 if (size > rlim[RLIMIT_STACK].rlim_cur) 1627 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
1603 return -ENOMEM; 1628 return -ENOMEM;
1604 1629
1605 /* mlock limit tests */ 1630 /* mlock limit tests */
@@ -1607,7 +1632,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1607 unsigned long locked; 1632 unsigned long locked;
1608 unsigned long limit; 1633 unsigned long limit;
1609 locked = mm->locked_vm + grow; 1634 locked = mm->locked_vm + grow;
1610 limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 1635 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
1636 limit >>= PAGE_SHIFT;
1611 if (locked > limit && !capable(CAP_IPC_LOCK)) 1637 if (locked > limit && !capable(CAP_IPC_LOCK))
1612 return -ENOMEM; 1638 return -ENOMEM;
1613 } 1639 }
@@ -1754,8 +1780,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
1754 if (!prev || expand_stack(prev, addr)) 1780 if (!prev || expand_stack(prev, addr))
1755 return NULL; 1781 return NULL;
1756 if (prev->vm_flags & VM_LOCKED) { 1782 if (prev->vm_flags & VM_LOCKED) {
1757 if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) 1783 mlock_vma_pages_range(prev, addr, prev->vm_end);
1758 return NULL; /* vma gone! */
1759 } 1784 }
1760 return prev; 1785 return prev;
1761} 1786}
@@ -1783,8 +1808,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1783 if (expand_stack(vma, addr)) 1808 if (expand_stack(vma, addr))
1784 return NULL; 1809 return NULL;
1785 if (vma->vm_flags & VM_LOCKED) { 1810 if (vma->vm_flags & VM_LOCKED) {
1786 if (mlock_vma_pages_range(vma, addr, start) < 0) 1811 mlock_vma_pages_range(vma, addr, start);
1787 return NULL; /* vma gone! */
1788 } 1812 }
1789 return vma; 1813 return vma;
1790} 1814}
@@ -1871,6 +1895,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1871{ 1895{
1872 struct mempolicy *pol; 1896 struct mempolicy *pol;
1873 struct vm_area_struct *new; 1897 struct vm_area_struct *new;
1898 int err = -ENOMEM;
1874 1899
1875 if (is_vm_hugetlb_page(vma) && (addr & 1900 if (is_vm_hugetlb_page(vma) && (addr &
1876 ~(huge_page_mask(hstate_vma(vma))))) 1901 ~(huge_page_mask(hstate_vma(vma)))))
@@ -1878,11 +1903,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1878 1903
1879 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 1904 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1880 if (!new) 1905 if (!new)
1881 return -ENOMEM; 1906 goto out_err;
1882 1907
1883 /* most fields are the same, copy all, and then fixup */ 1908 /* most fields are the same, copy all, and then fixup */
1884 *new = *vma; 1909 *new = *vma;
1885 1910
1911 INIT_LIST_HEAD(&new->anon_vma_chain);
1912
1886 if (new_below) 1913 if (new_below)
1887 new->vm_end = addr; 1914 new->vm_end = addr;
1888 else { 1915 else {
@@ -1892,11 +1919,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1892 1919
1893 pol = mpol_dup(vma_policy(vma)); 1920 pol = mpol_dup(vma_policy(vma));
1894 if (IS_ERR(pol)) { 1921 if (IS_ERR(pol)) {
1895 kmem_cache_free(vm_area_cachep, new); 1922 err = PTR_ERR(pol);
1896 return PTR_ERR(pol); 1923 goto out_free_vma;
1897 } 1924 }
1898 vma_set_policy(new, pol); 1925 vma_set_policy(new, pol);
1899 1926
1927 if (anon_vma_clone(new, vma))
1928 goto out_free_mpol;
1929
1900 if (new->vm_file) { 1930 if (new->vm_file) {
1901 get_file(new->vm_file); 1931 get_file(new->vm_file);
1902 if (vma->vm_flags & VM_EXECUTABLE) 1932 if (vma->vm_flags & VM_EXECUTABLE)
@@ -1907,12 +1937,28 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1907 new->vm_ops->open(new); 1937 new->vm_ops->open(new);
1908 1938
1909 if (new_below) 1939 if (new_below)
1910 vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 1940 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
1911 ((addr - new->vm_start) >> PAGE_SHIFT), new); 1941 ((addr - new->vm_start) >> PAGE_SHIFT), new);
1912 else 1942 else
1913 vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 1943 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1914 1944
1915 return 0; 1945 /* Success. */
1946 if (!err)
1947 return 0;
1948
1949 /* Clean everything up if vma_adjust failed. */
1950 new->vm_ops->close(new);
1951 if (new->vm_file) {
1952 if (vma->vm_flags & VM_EXECUTABLE)
1953 removed_exe_file_vma(mm);
1954 fput(new->vm_file);
1955 }
1956 out_free_mpol:
1957 mpol_put(pol);
1958 out_free_vma:
1959 kmem_cache_free(vm_area_cachep, new);
1960 out_err:
1961 return err;
1916} 1962}
1917 1963
1918/* 1964/*
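
With vma_adjust() now able to fail, __split_vma() has to unwind the partially built VMA copy, and the hunk above does so with the usual kernel idiom: goto labels that release resources in the reverse order they were acquired. A generic userspace illustration of that unwinding pattern follows; the resources are plain malloc() buffers rather than mempolicies or anon_vma chains, and all names are invented.

#include <stdio.h>
#include <stdlib.h>

/* Acquire three resources in order; on any failure, release only what was
 * already acquired, in reverse order, the same shape as __split_vma()'s
 * out_free_mpol/out_free_vma/out_err labels. */
static int setup_object(void)
{
        int err = -1;
        char *vma_copy, *policy, *chain;

        vma_copy = malloc(128);
        if (!vma_copy)
                goto out_err;

        policy = malloc(64);
        if (!policy)
                goto out_free_vma;

        chain = malloc(32);
        if (!chain)
                goto out_free_policy;

        /* ... a final step that can also fail would go here ... */

        /* Freed on success only so the sketch does not leak; real code keeps them. */
        free(chain);
        free(policy);
        free(vma_copy);
        return 0;

out_free_policy:
        free(policy);
out_free_vma:
        free(vma_copy);
out_err:
        return err;
}

int main(void)
{
        printf("setup_object() -> %d\n", setup_object());
        return 0;
}
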
@@ -2074,7 +2120,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2074 unsigned long locked, lock_limit; 2120 unsigned long locked, lock_limit;
2075 locked = len >> PAGE_SHIFT; 2121 locked = len >> PAGE_SHIFT;
2076 locked += mm->locked_vm; 2122 locked += mm->locked_vm;
2077 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2123 lock_limit = rlimit(RLIMIT_MEMLOCK);
2078 lock_limit >>= PAGE_SHIFT; 2124 lock_limit >>= PAGE_SHIFT;
2079 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 2125 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2080 return -EAGAIN; 2126 return -EAGAIN;
@@ -2122,6 +2168,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2122 return -ENOMEM; 2168 return -ENOMEM;
2123 } 2169 }
2124 2170
2171 INIT_LIST_HEAD(&vma->anon_vma_chain);
2125 vma->vm_mm = mm; 2172 vma->vm_mm = mm;
2126 vma->vm_start = addr; 2173 vma->vm_start = addr;
2127 vma->vm_end = addr + len; 2174 vma->vm_end = addr + len;
@@ -2258,10 +2305,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2258 if (new_vma) { 2305 if (new_vma) {
2259 *new_vma = *vma; 2306 *new_vma = *vma;
2260 pol = mpol_dup(vma_policy(vma)); 2307 pol = mpol_dup(vma_policy(vma));
2261 if (IS_ERR(pol)) { 2308 if (IS_ERR(pol))
2262 kmem_cache_free(vm_area_cachep, new_vma); 2309 goto out_free_vma;
2263 return NULL; 2310 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2264 } 2311 if (anon_vma_clone(new_vma, vma))
2312 goto out_free_mempol;
2265 vma_set_policy(new_vma, pol); 2313 vma_set_policy(new_vma, pol);
2266 new_vma->vm_start = addr; 2314 new_vma->vm_start = addr;
2267 new_vma->vm_end = addr + len; 2315 new_vma->vm_end = addr + len;
@@ -2277,6 +2325,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2277 } 2325 }
2278 } 2326 }
2279 return new_vma; 2327 return new_vma;
2328
2329 out_free_mempol:
2330 mpol_put(pol);
2331 out_free_vma:
2332 kmem_cache_free(vm_area_cachep, new_vma);
2333 return NULL;
2280} 2334}
2281 2335
2282/* 2336/*
@@ -2288,7 +2342,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2288 unsigned long cur = mm->total_vm; /* pages */ 2342 unsigned long cur = mm->total_vm; /* pages */
2289 unsigned long lim; 2343 unsigned long lim;
2290 2344
2291 lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 2345 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2292 2346
2293 if (cur + npages > lim) 2347 if (cur + npages > lim)
2294 return 0; 2348 return 0;
@@ -2354,6 +2408,7 @@ int install_special_mapping(struct mm_struct *mm,
2354 if (unlikely(vma == NULL)) 2408 if (unlikely(vma == NULL))
2355 return -ENOMEM; 2409 return -ENOMEM;
2356 2410
2411 INIT_LIST_HEAD(&vma->anon_vma_chain);
2357 vma->vm_mm = mm; 2412 vma->vm_mm = mm;
2358 vma->vm_start = addr; 2413 vma->vm_start = addr;
2359 vma->vm_end = addr + len; 2414 vma->vm_end = addr + len;
@@ -2454,6 +2509,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2454int mm_take_all_locks(struct mm_struct *mm) 2509int mm_take_all_locks(struct mm_struct *mm)
2455{ 2510{
2456 struct vm_area_struct *vma; 2511 struct vm_area_struct *vma;
2512 struct anon_vma_chain *avc;
2457 int ret = -EINTR; 2513 int ret = -EINTR;
2458 2514
2459 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2515 BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2471,7 +2527,8 @@ int mm_take_all_locks(struct mm_struct *mm)
2471 if (signal_pending(current)) 2527 if (signal_pending(current))
2472 goto out_unlock; 2528 goto out_unlock;
2473 if (vma->anon_vma) 2529 if (vma->anon_vma)
2474 vm_lock_anon_vma(mm, vma->anon_vma); 2530 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2531 vm_lock_anon_vma(mm, avc->anon_vma);
2475 } 2532 }
2476 2533
2477 ret = 0; 2534 ret = 0;
@@ -2526,13 +2583,15 @@ static void vm_unlock_mapping(struct address_space *mapping)
2526void mm_drop_all_locks(struct mm_struct *mm) 2583void mm_drop_all_locks(struct mm_struct *mm)
2527{ 2584{
2528 struct vm_area_struct *vma; 2585 struct vm_area_struct *vma;
2586 struct anon_vma_chain *avc;
2529 2587
2530 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2588 BUG_ON(down_read_trylock(&mm->mmap_sem));
2531 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 2589 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2532 2590
2533 for (vma = mm->mmap; vma; vma = vma->vm_next) { 2591 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2534 if (vma->anon_vma) 2592 if (vma->anon_vma)
2535 vm_unlock_anon_vma(vma->anon_vma); 2593 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2594 vm_unlock_anon_vma(avc->anon_vma);
2536 if (vma->vm_file && vma->vm_file->f_mapping) 2595 if (vma->vm_file && vma->vm_file->f_mapping)
2537 vm_unlock_mapping(vma->vm_file->f_mapping); 2596 vm_unlock_mapping(vma->vm_file->f_mapping);
2538 } 2597 }
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index ded9081f4021..0777654147c9 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmu_context.h> 7#include <linux/mmu_context.h>
8#include <linux/module.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
9 10
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm)
37 if (active_mm != mm) 38 if (active_mm != mm)
38 mmdrop(active_mm); 39 mmdrop(active_mm);
39} 40}
41EXPORT_SYMBOL_GPL(use_mm);
40 42
41/* 43/*
42 * unuse_mm 44 * unuse_mm
@@ -56,3 +58,4 @@ void unuse_mm(struct mm_struct *mm)
56 enter_lazy_tlb(mm, tsk); 58 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk); 59 task_unlock(tsk);
58} 60}
61EXPORT_SYMBOL_GPL(unuse_mm);
diff --git a/mm/mremap.c b/mm/mremap.c
index 845190898d59..e9c75efce609 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -285,7 +285,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
285 if (vma->vm_flags & VM_LOCKED) { 285 if (vma->vm_flags & VM_LOCKED) {
286 unsigned long locked, lock_limit; 286 unsigned long locked, lock_limit;
287 locked = mm->locked_vm << PAGE_SHIFT; 287 locked = mm->locked_vm << PAGE_SHIFT;
288 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 288 lock_limit = rlimit(RLIMIT_MEMLOCK);
289 locked += new_len - old_len; 289 locked += new_len - old_len;
290 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 290 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
291 goto Eagain; 291 goto Eagain;
@@ -460,8 +460,11 @@ unsigned long do_mremap(unsigned long addr,
460 if (vma_expandable(vma, new_len - old_len)) { 460 if (vma_expandable(vma, new_len - old_len)) {
461 int pages = (new_len - old_len) >> PAGE_SHIFT; 461 int pages = (new_len - old_len) >> PAGE_SHIFT;
462 462
463 vma_adjust(vma, vma->vm_start, 463 if (vma_adjust(vma, vma->vm_start, addr + new_len,
464 addr + new_len, vma->vm_pgoff, NULL); 464 vma->vm_pgoff, NULL)) {
465 ret = -ENOMEM;
466 goto out;
467 }
465 468
466 mm->total_vm += pages; 469 mm->total_vm += pages;
467 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 470 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index 48a2ecfaf059..605ace8982a8 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -146,7 +146,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
146 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 146 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
147 147
148 for (i = 0; i < nr_pages; i++) { 148 for (i = 0; i < nr_pages; i++) {
149 vma = find_vma(mm, start); 149 vma = find_extend_vma(mm, start);
150 if (!vma) 150 if (!vma)
151 goto finish_or_fault; 151 goto finish_or_fault;
152 152
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(find_vma);
764 */ 764 */
765struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) 765struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
766{ 766{
767 return find_vma(mm, addr); 767 return find_vma(mm, addr & PAGE_MASK);
768} 768}
769 769
770/* 770/*
@@ -1209,7 +1209,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1209 region->vm_flags = vm_flags; 1209 region->vm_flags = vm_flags;
1210 region->vm_pgoff = pgoff; 1210 region->vm_pgoff = pgoff;
1211 1211
1212 INIT_LIST_HEAD(&vma->anon_vma_node); 1212 INIT_LIST_HEAD(&vma->anon_vma_chain);
1213 vma->vm_flags = vm_flags; 1213 vma->vm_flags = vm_flags;
1214 vma->vm_pgoff = pgoff; 1214 vma->vm_pgoff = pgoff;
1215 1215
@@ -1428,6 +1428,30 @@ out:
1428 return retval; 1428 return retval;
1429} 1429}
1430 1430
1431#ifdef __ARCH_WANT_SYS_OLD_MMAP
1432struct mmap_arg_struct {
1433 unsigned long addr;
1434 unsigned long len;
1435 unsigned long prot;
1436 unsigned long flags;
1437 unsigned long fd;
1438 unsigned long offset;
1439};
1440
1441SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1442{
1443 struct mmap_arg_struct a;
1444
1445 if (copy_from_user(&a, arg, sizeof(a)))
1446 return -EFAULT;
1447 if (a.offset & ~PAGE_MASK)
1448 return -EINVAL;
1449
1450 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1451 a.offset >> PAGE_SHIFT);
1452}
1453#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1454
1431/* 1455/*
1432 * split a vma into two pieces at address 'addr', a new vma is allocated either 1456 * split a vma into two pieces at address 'addr', a new vma is allocated either
1433 * for the first part or the tail. 1457 * for the first part or the tail.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 237050478f28..9b223af6a147 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -401,8 +401,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
401 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 401 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
402 task_pid_nr(p), p->comm, 402 task_pid_nr(p), p->comm,
403 K(p->mm->total_vm), 403 K(p->mm->total_vm),
404 K(get_mm_counter(p->mm, anon_rss)), 404 K(get_mm_counter(p->mm, MM_ANONPAGES)),
405 K(get_mm_counter(p->mm, file_rss))); 405 K(get_mm_counter(p->mm, MM_FILEPAGES)));
406 task_unlock(p); 406 task_unlock(p);
407 407
408 /* 408 /*
@@ -473,6 +473,8 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
473 unsigned long points = 0; 473 unsigned long points = 0;
474 struct task_struct *p; 474 struct task_struct *p;
475 475
476 if (sysctl_panic_on_oom == 2)
477 panic("out of memory(memcg). panic_on_oom is selected.\n");
476 read_lock(&tasklist_lock); 478 read_lock(&tasklist_lock);
477retry: 479retry:
478 p = select_bad_process(&points, mem); 480 p = select_bad_process(&points, mem);
@@ -601,13 +603,6 @@ void pagefault_out_of_memory(void)
601 /* Got some memory back in the last second. */ 603 /* Got some memory back in the last second. */
602 return; 604 return;
603 605
604 /*
605 * If this is from memcg, oom-killer is already invoked.
606 * and not worth to go system-wide-oom.
607 */
608 if (mem_cgroup_oom_called(current))
609 goto rest_and_return;
610
611 if (sysctl_panic_on_oom) 606 if (sysctl_panic_on_oom)
612 panic("out of memory from page fault. panic_on_oom is selected.\n"); 607 panic("out of memory from page fault. panic_on_oom is selected.\n");
613 608
@@ -619,7 +614,6 @@ void pagefault_out_of_memory(void)
619 * Give "p" a good chance of killing itself before we 614 * Give "p" a good chance of killing itself before we
620 * retry to allocate memory. 615 * retry to allocate memory.
621 */ 616 */
622rest_and_return:
623 if (!test_thread_flag(TIF_MEMDIE)) 617 if (!test_thread_flag(TIF_MEMDIE))
624 schedule_timeout_uninterruptible(1); 618 schedule_timeout_uninterruptible(1);
625} 619}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8deb9d0fd5b1..d03c946d5566 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -50,6 +50,7 @@
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h> 51#include <linux/memory.h>
52#include <trace/events/kmem.h> 52#include <trace/events/kmem.h>
53#include <linux/ftrace_event.h>
53 54
54#include <asm/tlbflush.h> 55#include <asm/tlbflush.h>
55#include <asm/div64.h> 56#include <asm/div64.h>
@@ -76,6 +77,31 @@ unsigned long totalreserve_pages __read_mostly;
76int percpu_pagelist_fraction; 77int percpu_pagelist_fraction;
77gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 78gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
78 79
80#ifdef CONFIG_PM_SLEEP
81/*
82 * The following functions are used by the suspend/hibernate code to temporarily
83 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
84 * while devices are suspended. To avoid races with the suspend/hibernate code,
85 * they should always be called with pm_mutex held (gfp_allowed_mask also should
86 * only be modified with pm_mutex held, unless the suspend/hibernate code is
87 * guaranteed not to run in parallel with that modification).
88 */
89void set_gfp_allowed_mask(gfp_t mask)
90{
91 WARN_ON(!mutex_is_locked(&pm_mutex));
92 gfp_allowed_mask = mask;
93}
94
95gfp_t clear_gfp_allowed_mask(gfp_t mask)
96{
97 gfp_t ret = gfp_allowed_mask;
98
99 WARN_ON(!mutex_is_locked(&pm_mutex));
100 gfp_allowed_mask &= ~mask;
101 return ret;
102}
103#endif /* CONFIG_PM_SLEEP */
104
79#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 105#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
80int pageblock_order __read_mostly; 106int pageblock_order __read_mostly;
81#endif 107#endif
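
set_gfp_allowed_mask() and clear_gfp_allowed_mask() let the suspend path temporarily drop the I/O-related bits from the global allocation mask and restore the saved value on resume, always serialized by pm_mutex. The sketch below reproduces that save, clear and restore shape in userspace; the ALLOW_* flag names are invented, and the lock is taken inside the helpers for brevity, whereas the kernel instead asserts that the caller already holds pm_mutex.

#include <pthread.h>
#include <stdio.h>

#define ALLOW_IO  0x1u                   /* invented stand-ins for GFP bits */
#define ALLOW_FS  0x2u

static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int allowed_mask = ALLOW_IO | ALLOW_FS;

/* Clear some capability bits and hand back the previous mask so the caller
 * can restore it later: the clear_gfp_allowed_mask() shape. */
static unsigned int clear_allowed(unsigned int mask)
{
        unsigned int old;

        pthread_mutex_lock(&pm_lock);
        old = allowed_mask;
        allowed_mask &= ~mask;
        pthread_mutex_unlock(&pm_lock);
        return old;
}

static void set_allowed(unsigned int mask)       /* the set_gfp_allowed_mask() shape */
{
        pthread_mutex_lock(&pm_lock);
        allowed_mask = mask;
        pthread_mutex_unlock(&pm_lock);
}

int main(void)
{
        unsigned int saved = clear_allowed(ALLOW_IO | ALLOW_FS);  /* suspend enter */

        printf("during suspend: mask=%#x\n", allowed_mask);
        set_allowed(saved);                                       /* suspend exit */
        printf("after resume:   mask=%#x\n", allowed_mask);
        return 0;
}
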
@@ -263,10 +289,7 @@ static void bad_page(struct page *page)
263 289
264 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 290 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
265 current->comm, page_to_pfn(page)); 291 current->comm, page_to_pfn(page));
266 printk(KERN_ALERT 292 dump_page(page);
267 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
268 page, (void *)page->flags, page_count(page),
269 page_mapcount(page), page->mapping, page->index);
270 293
271 dump_stack(); 294 dump_stack();
272out: 295out:
@@ -530,7 +553,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
530 int batch_free = 0; 553 int batch_free = 0;
531 554
532 spin_lock(&zone->lock); 555 spin_lock(&zone->lock);
533 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 556 zone->all_unreclaimable = 0;
534 zone->pages_scanned = 0; 557 zone->pages_scanned = 0;
535 558
536 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 559 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -568,7 +591,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
568 int migratetype) 591 int migratetype)
569{ 592{
570 spin_lock(&zone->lock); 593 spin_lock(&zone->lock);
571 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 594 zone->all_unreclaimable = 0;
572 zone->pages_scanned = 0; 595 zone->pages_scanned = 0;
573 596
574 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 597 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
@@ -583,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
583 int bad = 0; 606 int bad = 0;
584 int wasMlocked = __TestClearPageMlocked(page); 607 int wasMlocked = __TestClearPageMlocked(page);
585 608
609 trace_mm_page_free_direct(page, order);
586 kmemcheck_free_shadow(page, order); 610 kmemcheck_free_shadow(page, order);
587 611
588 for (i = 0 ; i < (1 << order) ; ++i) 612 for (i = 0 ; i < (1 << order) ; ++i)
@@ -1009,10 +1033,10 @@ static void drain_pages(unsigned int cpu)
1009 struct per_cpu_pageset *pset; 1033 struct per_cpu_pageset *pset;
1010 struct per_cpu_pages *pcp; 1034 struct per_cpu_pages *pcp;
1011 1035
1012 pset = zone_pcp(zone, cpu); 1036 local_irq_save(flags);
1037 pset = per_cpu_ptr(zone->pageset, cpu);
1013 1038
1014 pcp = &pset->pcp; 1039 pcp = &pset->pcp;
1015 local_irq_save(flags);
1016 free_pcppages_bulk(zone, pcp->count, pcp); 1040 free_pcppages_bulk(zone, pcp->count, pcp);
1017 pcp->count = 0; 1041 pcp->count = 0;
1018 local_irq_restore(flags); 1042 local_irq_restore(flags);
@@ -1073,8 +1097,9 @@ void mark_free_pages(struct zone *zone)
1073 1097
1074/* 1098/*
1075 * Free a 0-order page 1099 * Free a 0-order page
1100 * cold == 1 ? free a cold page : free a hot page
1076 */ 1101 */
1077static void free_hot_cold_page(struct page *page, int cold) 1102void free_hot_cold_page(struct page *page, int cold)
1078{ 1103{
1079 struct zone *zone = page_zone(page); 1104 struct zone *zone = page_zone(page);
1080 struct per_cpu_pages *pcp; 1105 struct per_cpu_pages *pcp;
@@ -1082,6 +1107,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1082 int migratetype; 1107 int migratetype;
1083 int wasMlocked = __TestClearPageMlocked(page); 1108 int wasMlocked = __TestClearPageMlocked(page);
1084 1109
1110 trace_mm_page_free_direct(page, 0);
1085 kmemcheck_free_shadow(page, 0); 1111 kmemcheck_free_shadow(page, 0);
1086 1112
1087 if (PageAnon(page)) 1113 if (PageAnon(page))
@@ -1096,7 +1122,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1096 arch_free_page(page, 0); 1122 arch_free_page(page, 0);
1097 kernel_map_pages(page, 1, 0); 1123 kernel_map_pages(page, 1, 0);
1098 1124
1099 pcp = &zone_pcp(zone, get_cpu())->pcp;
1100 migratetype = get_pageblock_migratetype(page); 1125 migratetype = get_pageblock_migratetype(page);
1101 set_page_private(page, migratetype); 1126 set_page_private(page, migratetype);
1102 local_irq_save(flags); 1127 local_irq_save(flags);
@@ -1119,6 +1144,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1119 migratetype = MIGRATE_MOVABLE; 1144 migratetype = MIGRATE_MOVABLE;
1120 } 1145 }
1121 1146
1147 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1122 if (cold) 1148 if (cold)
1123 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1149 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1124 else 1150 else
@@ -1131,15 +1157,8 @@ static void free_hot_cold_page(struct page *page, int cold)
1131 1157
1132out: 1158out:
1133 local_irq_restore(flags); 1159 local_irq_restore(flags);
1134 put_cpu();
1135} 1160}
1136 1161
1137void free_hot_page(struct page *page)
1138{
1139 trace_mm_page_free_direct(page, 0);
1140 free_hot_cold_page(page, 0);
1141}
1142
1143/* 1162/*
1144 * split_page takes a non-compound higher-order page, and splits it into 1163 * split_page takes a non-compound higher-order page, and splits it into
1145 * n (1<<order) sub-pages: page[0..n] 1164 * n (1<<order) sub-pages: page[0..n]
@@ -1181,17 +1200,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1181 unsigned long flags; 1200 unsigned long flags;
1182 struct page *page; 1201 struct page *page;
1183 int cold = !!(gfp_flags & __GFP_COLD); 1202 int cold = !!(gfp_flags & __GFP_COLD);
1184 int cpu;
1185 1203
1186again: 1204again:
1187 cpu = get_cpu();
1188 if (likely(order == 0)) { 1205 if (likely(order == 0)) {
1189 struct per_cpu_pages *pcp; 1206 struct per_cpu_pages *pcp;
1190 struct list_head *list; 1207 struct list_head *list;
1191 1208
1192 pcp = &zone_pcp(zone, cpu)->pcp;
1193 list = &pcp->lists[migratetype];
1194 local_irq_save(flags); 1209 local_irq_save(flags);
1210 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1211 list = &pcp->lists[migratetype];
1195 if (list_empty(list)) { 1212 if (list_empty(list)) {
1196 pcp->count += rmqueue_bulk(zone, 0, 1213 pcp->count += rmqueue_bulk(zone, 0,
1197 pcp->batch, list, 1214 pcp->batch, list,
@@ -1232,7 +1249,6 @@ again:
1232 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1249 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1233 zone_statistics(preferred_zone, zone); 1250 zone_statistics(preferred_zone, zone);
1234 local_irq_restore(flags); 1251 local_irq_restore(flags);
1235 put_cpu();
1236 1252
1237 VM_BUG_ON(bad_range(zone, page)); 1253 VM_BUG_ON(bad_range(zone, page));
1238 if (prep_new_page(page, order, gfp_flags)) 1254 if (prep_new_page(page, order, gfp_flags))
@@ -1241,7 +1257,6 @@ again:
1241 1257
1242failed: 1258failed:
1243 local_irq_restore(flags); 1259 local_irq_restore(flags);
1244 put_cpu();
1245 return NULL; 1260 return NULL;
1246} 1261}
1247 1262
@@ -2013,9 +2028,8 @@ void __pagevec_free(struct pagevec *pvec)
2013void __free_pages(struct page *page, unsigned int order) 2028void __free_pages(struct page *page, unsigned int order)
2014{ 2029{
2015 if (put_page_testzero(page)) { 2030 if (put_page_testzero(page)) {
2016 trace_mm_page_free_direct(page, order);
2017 if (order == 0) 2031 if (order == 0)
2018 free_hot_page(page); 2032 free_hot_cold_page(page, 0);
2019 else 2033 else
2020 __free_pages_ok(page, order); 2034 __free_pages_ok(page, order);
2021 } 2035 }
@@ -2180,7 +2194,7 @@ void show_free_areas(void)
2180 for_each_online_cpu(cpu) { 2194 for_each_online_cpu(cpu) {
2181 struct per_cpu_pageset *pageset; 2195 struct per_cpu_pageset *pageset;
2182 2196
2183 pageset = zone_pcp(zone, cpu); 2197 pageset = per_cpu_ptr(zone->pageset, cpu);
2184 2198
2185 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2199 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2186 cpu, pageset->pcp.high, 2200 cpu, pageset->pcp.high,
@@ -2271,7 +2285,7 @@ void show_free_areas(void)
2271 K(zone_page_state(zone, NR_BOUNCE)), 2285 K(zone_page_state(zone, NR_BOUNCE)),
2272 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2286 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2273 zone->pages_scanned, 2287 zone->pages_scanned,
2274 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2288 (zone->all_unreclaimable ? "yes" : "no")
2275 ); 2289 );
2276 printk("lowmem_reserve[]:"); 2290 printk("lowmem_reserve[]:");
2277 for (i = 0; i < MAX_NR_ZONES; i++) 2291 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -2745,10 +2759,29 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2745 2759
2746#endif /* CONFIG_NUMA */ 2760#endif /* CONFIG_NUMA */
2747 2761
2762/*
2763 * Boot pageset table. One per cpu which is going to be used for all
2764 * zones and all nodes. The parameters will be set in such a way
2765 * that an item put on a list will immediately be handed over to
2766 * the buddy list. This is safe since pageset manipulation is done
2767 * with interrupts disabled.
2768 *
2769 * The boot_pagesets must be kept even after bootup is complete for
2770 * unused processors and/or zones. They do play a role for bootstrapping
2771 * hotplugged processors.
2772 *
2773 * zoneinfo_show() and maybe other functions do
2774 * not check if the processor is online before following the pageset pointer.
2775 * Other parts of the kernel may not check if the zone is available.
2776 */
2777static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2778static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2779
2748/* return value is int, just for stop_machine() */                      2780/* return value is int, just for stop_machine() */
2749static int __build_all_zonelists(void *dummy) 2781static int __build_all_zonelists(void *dummy)
2750{ 2782{
2751 int nid; 2783 int nid;
2784 int cpu;
2752 2785
2753#ifdef CONFIG_NUMA 2786#ifdef CONFIG_NUMA
2754 memset(node_load, 0, sizeof(node_load)); 2787 memset(node_load, 0, sizeof(node_load));
@@ -2759,6 +2792,23 @@ static int __build_all_zonelists(void *dummy)
2759 build_zonelists(pgdat); 2792 build_zonelists(pgdat);
2760 build_zonelist_cache(pgdat); 2793 build_zonelist_cache(pgdat);
2761 } 2794 }
2795
2796 /*
2797 * Initialize the boot_pagesets that are going to be used
2798 * for bootstrapping processors. The real pagesets for
2799 * each zone will be allocated later when the per cpu
2800 * allocator is available.
2801 *
2802 * boot_pagesets are used also for bootstrapping offline
2803 * cpus if the system is already booted because the pagesets
2804 * are needed to initialize allocators on a specific cpu too.
2805 * F.e. the percpu allocator needs the page allocator which
2806 * needs the percpu allocator in order to allocate its pagesets
2807 * (a chicken-egg dilemma).
2808 */
2809 for_each_possible_cpu(cpu)
2810 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
2811
2762 return 0; 2812 return 0;
2763} 2813}
2764 2814
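
The comments above spell out why a static boot_pageset has to exist: the dynamic per-cpu allocator needs the page allocator, which needs pagesets, which would need the per-cpu allocator, so every zone initially points at the static set and is switched to alloc_percpu() storage in setup_per_cpu_pageset(). Below is a userspace caricature of that bootstrap handoff, with plain arrays indexed by CPU standing in for real per-cpu variables and every name invented for the example.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct pageset { int batch; };

static struct pageset boot_pageset[NR_CPUS];     /* static: usable before any allocator */

struct zone {
        const char *name;
        struct pageset *pageset;                 /* points at boot or dynamic storage */
};

static void zone_pcp_init(struct zone *z)        /* early boot: no allocator yet */
{
        z->pageset = boot_pageset;
}

static int setup_per_cpu_pageset(struct zone *z) /* later: real storage is available */
{
        struct pageset *p = calloc(NR_CPUS, sizeof(*p));

        if (!p)
                return -1;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                p[cpu].batch = 31;               /* arbitrary batch size for the sketch */
        z->pageset = p;                          /* boot_pageset is no longer used */
        return 0;
}

int main(void)
{
        struct zone normal = { .name = "Normal" };

        zone_pcp_init(&normal);
        printf("%s zone: pageset=%p (boot)\n", normal.name, (void *)normal.pageset);
        if (setup_per_cpu_pageset(&normal) == 0)
                printf("%s zone: pageset=%p (per-cpu)\n", normal.name, (void *)normal.pageset);
        if (normal.pageset != boot_pageset)
                free(normal.pageset);
        return 0;
}
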
@@ -3096,121 +3146,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3096 pcp->batch = PAGE_SHIFT * 8; 3146 pcp->batch = PAGE_SHIFT * 8;
3097} 3147}
3098 3148
3099
3100#ifdef CONFIG_NUMA
3101/*
3102 * Boot pageset table. One per cpu which is going to be used for all
3103 * zones and all nodes. The parameters will be set in such a way
3104 * that an item put on a list will immediately be handed over to
3105 * the buddy list. This is safe since pageset manipulation is done
3106 * with interrupts disabled.
3107 *
3108 * Some NUMA counter updates may also be caught by the boot pagesets.
3109 *
3110 * The boot_pagesets must be kept even after bootup is complete for
3111 * unused processors and/or zones. They do play a role for bootstrapping
3112 * hotplugged processors.
3113 *
3114 * zoneinfo_show() and maybe other functions do
3115 * not check if the processor is online before following the pageset pointer.
3116 * Other parts of the kernel may not check if the zone is available.
3117 */
3118static struct per_cpu_pageset boot_pageset[NR_CPUS];
3119
3120/* 3149/*
3121 * Dynamically allocate memory for the 3150 * Allocate per cpu pagesets and initialize them.
3122 * per cpu pageset array in struct zone. 3151 * Before this call only boot pagesets were available.
 3152 * Boot pagesets will no longer be used by this processor
3153 * after setup_per_cpu_pageset().
3123 */ 3154 */
3124static int __cpuinit process_zones(int cpu) 3155void __init setup_per_cpu_pageset(void)
3125{ 3156{
3126 struct zone *zone, *dzone; 3157 struct zone *zone;
3127 int node = cpu_to_node(cpu); 3158 int cpu;
3128
3129 node_set_state(node, N_CPU); /* this node has a cpu */
3130 3159
3131 for_each_populated_zone(zone) { 3160 for_each_populated_zone(zone) {
3132 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 3161 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3133 GFP_KERNEL, node);
3134 if (!zone_pcp(zone, cpu))
3135 goto bad;
3136 3162
3137 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 3163 for_each_possible_cpu(cpu) {
3164 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3138 3165
3139 if (percpu_pagelist_fraction) 3166 setup_pageset(pcp, zone_batchsize(zone));
3140 setup_pagelist_highmark(zone_pcp(zone, cpu),
3141 (zone->present_pages / percpu_pagelist_fraction));
3142 }
3143 3167
3144 return 0; 3168 if (percpu_pagelist_fraction)
3145bad: 3169 setup_pagelist_highmark(pcp,
3146 for_each_zone(dzone) { 3170 (zone->present_pages /
3147 if (!populated_zone(dzone)) 3171 percpu_pagelist_fraction));
3148 continue; 3172 }
3149 if (dzone == zone)
3150 break;
3151 kfree(zone_pcp(dzone, cpu));
3152 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
3153 }
3154 return -ENOMEM;
3155}
3156
3157static inline void free_zone_pagesets(int cpu)
3158{
3159 struct zone *zone;
3160
3161 for_each_zone(zone) {
3162 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
3163
3164 /* Free per_cpu_pageset if it is slab allocated */
3165 if (pset != &boot_pageset[cpu])
3166 kfree(pset);
3167 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3168 }
3169}
3170
3171static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
3172 unsigned long action,
3173 void *hcpu)
3174{
3175 int cpu = (long)hcpu;
3176 int ret = NOTIFY_OK;
3177
3178 switch (action) {
3179 case CPU_UP_PREPARE:
3180 case CPU_UP_PREPARE_FROZEN:
3181 if (process_zones(cpu))
3182 ret = NOTIFY_BAD;
3183 break;
3184 case CPU_UP_CANCELED:
3185 case CPU_UP_CANCELED_FROZEN:
3186 case CPU_DEAD:
3187 case CPU_DEAD_FROZEN:
3188 free_zone_pagesets(cpu);
3189 break;
3190 default:
3191 break;
3192 } 3173 }
3193 return ret;
3194} 3174}
3195 3175
3196static struct notifier_block __cpuinitdata pageset_notifier =
3197 { &pageset_cpuup_callback, NULL, 0 };
3198
3199void __init setup_per_cpu_pageset(void)
3200{
3201 int err;
3202
3203 /* Initialize per_cpu_pageset for cpu 0.
3204 * A cpuup callback will do this for every cpu
3205 * as it comes online
3206 */
3207 err = process_zones(smp_processor_id());
3208 BUG_ON(err);
3209 register_cpu_notifier(&pageset_notifier);
3210}
3211
3212#endif
3213
3214static noinline __init_refok 3176static noinline __init_refok
3215int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3177int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3216{ 3178{
@@ -3260,11 +3222,11 @@ static int __zone_pcp_update(void *data)
3260 int cpu; 3222 int cpu;
3261 unsigned long batch = zone_batchsize(zone), flags; 3223 unsigned long batch = zone_batchsize(zone), flags;
3262 3224
3263 for (cpu = 0; cpu < NR_CPUS; cpu++) { 3225 for_each_possible_cpu(cpu) {
3264 struct per_cpu_pageset *pset; 3226 struct per_cpu_pageset *pset;
3265 struct per_cpu_pages *pcp; 3227 struct per_cpu_pages *pcp;
3266 3228
3267 pset = zone_pcp(zone, cpu); 3229 pset = per_cpu_ptr(zone->pageset, cpu);
3268 pcp = &pset->pcp; 3230 pcp = &pset->pcp;
3269 3231
3270 local_irq_save(flags); 3232 local_irq_save(flags);
@@ -3282,21 +3244,17 @@ void zone_pcp_update(struct zone *zone)
3282 3244
3283static __meminit void zone_pcp_init(struct zone *zone) 3245static __meminit void zone_pcp_init(struct zone *zone)
3284{ 3246{
3285 int cpu; 3247 /*
3286 unsigned long batch = zone_batchsize(zone); 3248 * per cpu subsystem is not up at this point. The following code
3249 * relies on the ability of the linker to provide the
3250 * offset of a (static) per cpu variable into the per cpu area.
3251 */
3252 zone->pageset = &boot_pageset;
3287 3253
3288 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3289#ifdef CONFIG_NUMA
3290 /* Early boot. Slab allocator not functional yet */
3291 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3292 setup_pageset(&boot_pageset[cpu],0);
3293#else
3294 setup_pageset(zone_pcp(zone,cpu), batch);
3295#endif
3296 }
3297 if (zone->present_pages) 3254 if (zone->present_pages)
3298 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 3255 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3299 zone->name, zone->present_pages, batch); 3256 zone->name, zone->present_pages,
3257 zone_batchsize(zone));
3300} 3258}
3301 3259
3302__meminit int init_currently_empty_zone(struct zone *zone, 3260__meminit int init_currently_empty_zone(struct zone *zone,
@@ -3435,6 +3393,61 @@ void __init free_bootmem_with_active_regions(int nid,
3435 } 3393 }
3436} 3394}
3437 3395
3396int __init add_from_early_node_map(struct range *range, int az,
3397 int nr_range, int nid)
3398{
3399 int i;
3400 u64 start, end;
3401
3402 /* need to go over early_node_map to find out good range for node */
3403 for_each_active_range_index_in_nid(i, nid) {
3404 start = early_node_map[i].start_pfn;
3405 end = early_node_map[i].end_pfn;
3406 nr_range = add_range(range, az, nr_range, start, end);
3407 }
3408 return nr_range;
3409}
3410
3411#ifdef CONFIG_NO_BOOTMEM
3412void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3413 u64 goal, u64 limit)
3414{
3415 int i;
3416 void *ptr;
3417
3418 /* need to go over early_node_map to find out good range for node */
3419 for_each_active_range_index_in_nid(i, nid) {
3420 u64 addr;
3421 u64 ei_start, ei_last;
3422
3423 ei_last = early_node_map[i].end_pfn;
3424 ei_last <<= PAGE_SHIFT;
3425 ei_start = early_node_map[i].start_pfn;
3426 ei_start <<= PAGE_SHIFT;
3427 addr = find_early_area(ei_start, ei_last,
3428 goal, limit, size, align);
3429
3430 if (addr == -1ULL)
3431 continue;
3432
3433#if 0
3434 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3435 nid,
3436 ei_start, ei_last, goal, limit, size,
3437 align, addr);
3438#endif
3439
3440 ptr = phys_to_virt(addr);
3441 memset(ptr, 0, size);
3442 reserve_early_without_check(addr, addr + size, "BOOTMEM");
3443 return ptr;
3444 }
3445
3446 return NULL;
3447}
3448#endif
3449
3450
3438void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3451void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3439{ 3452{
3440 int i; 3453 int i;
@@ -4377,8 +4390,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4377 for (i = 0; i < MAX_NR_ZONES; i++) { 4390 for (i = 0; i < MAX_NR_ZONES; i++) {
4378 if (i == ZONE_MOVABLE) 4391 if (i == ZONE_MOVABLE)
4379 continue; 4392 continue;
4380 printk(" %-8s %0#10lx -> %0#10lx\n", 4393 printk(" %-8s ", zone_names[i]);
4381 zone_names[i], 4394 if (arch_zone_lowest_possible_pfn[i] ==
4395 arch_zone_highest_possible_pfn[i])
4396 printk("empty\n");
4397 else
4398 printk("%0#10lx -> %0#10lx\n",
4382 arch_zone_lowest_possible_pfn[i], 4399 arch_zone_lowest_possible_pfn[i],
4383 arch_zone_highest_possible_pfn[i]); 4400 arch_zone_highest_possible_pfn[i]);
4384 } 4401 }
@@ -4467,7 +4484,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4467} 4484}
4468 4485
4469#ifndef CONFIG_NEED_MULTIPLE_NODES 4486#ifndef CONFIG_NEED_MULTIPLE_NODES
4470struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; 4487struct pglist_data __refdata contig_page_data = {
4488#ifndef CONFIG_NO_BOOTMEM
4489 .bdata = &bootmem_node_data[0]
4490#endif
4491 };
4471EXPORT_SYMBOL(contig_page_data); 4492EXPORT_SYMBOL(contig_page_data);
4472#endif 4493#endif
4473 4494
@@ -4810,10 +4831,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4810 if (!write || (ret == -EINVAL)) 4831 if (!write || (ret == -EINVAL))
4811 return ret; 4832 return ret;
4812 for_each_populated_zone(zone) { 4833 for_each_populated_zone(zone) {
4813 for_each_online_cpu(cpu) { 4834 for_each_possible_cpu(cpu) {
4814 unsigned long high; 4835 unsigned long high;
4815 high = zone->present_pages / percpu_pagelist_fraction; 4836 high = zone->present_pages / percpu_pagelist_fraction;
4816 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4837 setup_pagelist_highmark(
4838 per_cpu_ptr(zone->pageset, cpu), high);
4817 } 4839 }
4818 } 4840 }
4819 return 0; 4841 return 0;
@@ -5159,3 +5181,80 @@ bool is_free_buddy_page(struct page *page)
5159 return order < MAX_ORDER; 5181 return order < MAX_ORDER;
5160} 5182}
5161#endif 5183#endif
5184
5185static struct trace_print_flags pageflag_names[] = {
5186 {1UL << PG_locked, "locked" },
5187 {1UL << PG_error, "error" },
5188 {1UL << PG_referenced, "referenced" },
5189 {1UL << PG_uptodate, "uptodate" },
5190 {1UL << PG_dirty, "dirty" },
5191 {1UL << PG_lru, "lru" },
5192 {1UL << PG_active, "active" },
5193 {1UL << PG_slab, "slab" },
5194 {1UL << PG_owner_priv_1, "owner_priv_1" },
5195 {1UL << PG_arch_1, "arch_1" },
5196 {1UL << PG_reserved, "reserved" },
5197 {1UL << PG_private, "private" },
5198 {1UL << PG_private_2, "private_2" },
5199 {1UL << PG_writeback, "writeback" },
5200#ifdef CONFIG_PAGEFLAGS_EXTENDED
5201 {1UL << PG_head, "head" },
5202 {1UL << PG_tail, "tail" },
5203#else
5204 {1UL << PG_compound, "compound" },
5205#endif
5206 {1UL << PG_swapcache, "swapcache" },
5207 {1UL << PG_mappedtodisk, "mappedtodisk" },
5208 {1UL << PG_reclaim, "reclaim" },
5209 {1UL << PG_buddy, "buddy" },
5210 {1UL << PG_swapbacked, "swapbacked" },
5211 {1UL << PG_unevictable, "unevictable" },
5212#ifdef CONFIG_MMU
5213 {1UL << PG_mlocked, "mlocked" },
5214#endif
5215#ifdef CONFIG_ARCH_USES_PG_UNCACHED
5216 {1UL << PG_uncached, "uncached" },
5217#endif
5218#ifdef CONFIG_MEMORY_FAILURE
5219 {1UL << PG_hwpoison, "hwpoison" },
5220#endif
5221 {-1UL, NULL },
5222};
5223
5224static void dump_page_flags(unsigned long flags)
5225{
5226 const char *delim = "";
5227 unsigned long mask;
5228 int i;
5229
5230 printk(KERN_ALERT "page flags: %#lx(", flags);
5231
5232 /* remove zone id */
5233 flags &= (1UL << NR_PAGEFLAGS) - 1;
5234
5235 for (i = 0; pageflag_names[i].name && flags; i++) {
5236
5237 mask = pageflag_names[i].mask;
5238 if ((flags & mask) != mask)
5239 continue;
5240
5241 flags &= ~mask;
5242 printk("%s%s", delim, pageflag_names[i].name);
5243 delim = "|";
5244 }
5245
5246 /* check for left over flags */
5247 if (flags)
5248 printk("%s%#lx", delim, flags);
5249
5250 printk(")\n");
5251}
5252
5253void dump_page(struct page *page)
5254{
5255 printk(KERN_ALERT
5256 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5257 page, page_count(page), page_mapcount(page),
5258 page->mapping, page->index);
5259 dump_page_flags(page->flags);
5260}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3d535d594826..3dd88539a0e6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -335,6 +335,37 @@ not_enough_page:
335} 335}
336 336
337/** 337/**
338 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
339 * @ent: swap entry to be cmpxchged
340 * @old: old id
341 * @new: new id
342 *
343 * Returns the old id on success, 0 on failure.
344 * (There is no mem_cgroup using 0 as its id)
345 */
346unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
347 unsigned short old, unsigned short new)
348{
349 int type = swp_type(ent);
350 unsigned long offset = swp_offset(ent);
351 unsigned long idx = offset / SC_PER_PAGE;
352 unsigned long pos = offset & SC_POS_MASK;
353 struct swap_cgroup_ctrl *ctrl;
354 struct page *mappage;
355 struct swap_cgroup *sc;
356
357 ctrl = &swap_cgroup_ctrl[type];
358
359 mappage = ctrl->map[idx];
360 sc = page_address(mappage);
361 sc += pos;
362 if (cmpxchg(&sc->id, old, new) == old)
363 return old;
364 else
365 return 0;
366}
367
368/**
338 * swap_cgroup_record - record mem_cgroup for this swp_entry. 369 * swap_cgroup_record - record mem_cgroup for this swp_entry.
339 * @ent: swap entry to be recorded into 370 * @ent: swap entry to be recorded into
340 * @mem: mem_cgroup to be recorded 371 * @mem: mem_cgroup to be recorded
@@ -358,8 +389,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
358 mappage = ctrl->map[idx]; 389 mappage = ctrl->map[idx];
359 sc = page_address(mappage); 390 sc = page_address(mappage);
360 sc += pos; 391 sc += pos;
361 old = sc->id; 392 old = xchg(&sc->id, id);
362 sc->id = id;
363 393
364 return old; 394 return old;
365} 395}
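Replacing the plain read-then-write in swap_cgroup_record() with xchg(), and adding swap_cgroup_cmpxchg(), makes the id update atomic with respect to concurrent updaters. A rough user-space analogue using C11 atomics (toy names, not the kernel helpers; id 0 is reserved as "no owner" just as in the comment above):

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for struct swap_cgroup's id field. */
static _Atomic unsigned short id;

/* Record a new id, returning the old one (cf. swap_cgroup_record()). */
static unsigned short record(unsigned short new)
{
	return atomic_exchange(&id, new);
}

/* Replace old with new only if old still matches (cf. swap_cgroup_cmpxchg()).
 * Returns the old id on success, 0 on failure. */
static unsigned short cmpxchg_id(unsigned short old, unsigned short new)
{
	unsigned short expected = old;

	if (atomic_compare_exchange_strong(&id, &expected, new))
		return old;
	return 0;
}

int main(void)
{
	record(3);
	printf("cmpxchg 3->5: %u\n", cmpxchg_id(3, 5));	/* succeeds, prints 3 */
	printf("cmpxchg 3->7: %u\n", cmpxchg_id(3, 7));	/* fails, prints 0 */
	return 0;
}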
diff --git a/mm/percpu.c b/mm/percpu.c
index 083e7c91e5f6..768419d44ad7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,13 +80,15 @@
80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
81#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
82#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
83 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ 83 (void __percpu *)((unsigned long)(addr) - \
84 + (unsigned long)__per_cpu_start) 84 (unsigned long)pcpu_base_addr + \
85 (unsigned long)__per_cpu_start)
85#endif 86#endif
86#ifndef __pcpu_ptr_to_addr 87#ifndef __pcpu_ptr_to_addr
87#define __pcpu_ptr_to_addr(ptr) \ 88#define __pcpu_ptr_to_addr(ptr) \
88 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ 89 (void __force *)((unsigned long)(ptr) + \
89 - (unsigned long)__per_cpu_start) 90 (unsigned long)pcpu_base_addr - \
91 (unsigned long)__per_cpu_start)
90#endif 92#endif
91 93
92struct pcpu_chunk { 94struct pcpu_chunk {
@@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
913 int rs, re; 915 int rs, re;
914 916
915 /* quick path, check whether it's empty already */ 917 /* quick path, check whether it's empty already */
916 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 918 rs = page_start;
917 if (rs == page_start && re == page_end) 919 pcpu_next_unpop(chunk, &rs, &re, page_end);
918 return; 920 if (rs == page_start && re == page_end)
919 break; 921 return;
920 }
921 922
922 /* immutable chunks can't be depopulated */ 923 /* immutable chunks can't be depopulated */
923 WARN_ON(chunk->immutable); 924 WARN_ON(chunk->immutable);
@@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
968 int rs, re, rc; 969 int rs, re, rc;
969 970
970 /* quick path, check whether all pages are already there */ 971 /* quick path, check whether all pages are already there */
971 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { 972 rs = page_start;
972 if (rs == page_start && re == page_end) 973 pcpu_next_pop(chunk, &rs, &re, page_end);
973 goto clear; 974 if (rs == page_start && re == page_end)
974 break; 975 goto clear;
975 }
976 976
977 /* need to allocate and map pages, this chunk can't be immutable */ 977 /* need to allocate and map pages, this chunk can't be immutable */
978 WARN_ON(chunk->immutable); 978 WARN_ON(chunk->immutable);
@@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
1067 * RETURNS: 1067 * RETURNS:
1068 * Percpu pointer to the allocated area on success, NULL on failure. 1068 * Percpu pointer to the allocated area on success, NULL on failure.
1069 */ 1069 */
1070static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1070static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
1071{ 1071{
1072 static int warn_limit = 10; 1072 static int warn_limit = 10;
1073 struct pcpu_chunk *chunk; 1073 struct pcpu_chunk *chunk;
@@ -1196,7 +1196,7 @@ fail_unlock_mutex:
1196 * RETURNS: 1196 * RETURNS:
1197 * Percpu pointer to the allocated area on success, NULL on failure. 1197 * Percpu pointer to the allocated area on success, NULL on failure.
1198 */ 1198 */
1199void *__alloc_percpu(size_t size, size_t align) 1199void __percpu *__alloc_percpu(size_t size, size_t align)
1200{ 1200{
1201 return pcpu_alloc(size, align, false); 1201 return pcpu_alloc(size, align, false);
1202} 1202}
@@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
1217 * RETURNS: 1217 * RETURNS:
1218 * Percpu pointer to the allocated area on success, NULL on failure. 1218 * Percpu pointer to the allocated area on success, NULL on failure.
1219 */ 1219 */
1220void *__alloc_reserved_percpu(size_t size, size_t align) 1220void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1221{ 1221{
1222 return pcpu_alloc(size, align, true); 1222 return pcpu_alloc(size, align, true);
1223} 1223}
@@ -1269,7 +1269,7 @@ static void pcpu_reclaim(struct work_struct *work)
1269 * CONTEXT: 1269 * CONTEXT:
1270 * Can be called from atomic context. 1270 * Can be called from atomic context.
1271 */ 1271 */
1272void free_percpu(void *ptr) 1272void free_percpu(void __percpu *ptr)
1273{ 1273{
1274 void *addr; 1274 void *addr;
1275 struct pcpu_chunk *chunk; 1275 struct pcpu_chunk *chunk;
diff --git a/mm/readahead.c b/mm/readahead.c
index 033bc135a41f..337b20e946f6 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -501,6 +501,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
501 if (!ra->ra_pages) 501 if (!ra->ra_pages)
502 return; 502 return;
503 503
504 /* be dumb */
505 if (filp->f_mode & FMODE_RANDOM) {
506 force_page_cache_readahead(mapping, filp, offset, req_size);
507 return;
508 }
509
504 /* do read-ahead */ 510 /* do read-ahead */
505 ondemand_readahead(mapping, ra, filp, false, offset, req_size); 511 ondemand_readahead(mapping, ra, filp, false, offset, req_size);
506} 512}
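FMODE_RANDOM is the per-file hint set when an application declares random access, typically through posix_fadvise(POSIX_FADV_RANDOM) (handled by the accompanying mm/fadvise.c change in this series). A minimal caller that would take the new force_page_cache_readahead() path looks roughly like this; the file path is only an example:

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int err;
	int fd = open("/tmp/datafile", O_RDONLY);	/* example path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Declare random access: the kernel skips ondemand readahead for
	 * this file and only reads the pages actually requested. */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
	if (err)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	/* ... pread() at scattered offsets ... */
	close(fd);
	return 0;
}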
diff --git a/mm/rmap.c b/mm/rmap.c
index 278cd277bdec..fcd593c9c997 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,7 @@
62#include "internal.h" 62#include "internal.h"
63 63
64static struct kmem_cache *anon_vma_cachep; 64static struct kmem_cache *anon_vma_cachep;
65static struct kmem_cache *anon_vma_chain_cachep;
65 66
66static inline struct anon_vma *anon_vma_alloc(void) 67static inline struct anon_vma *anon_vma_alloc(void)
67{ 68{
@@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma)
73 kmem_cache_free(anon_vma_cachep, anon_vma); 74 kmem_cache_free(anon_vma_cachep, anon_vma);
74} 75}
75 76
77static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
78{
79 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
80}
81
82void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
83{
84 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
85}
86
76/** 87/**
77 * anon_vma_prepare - attach an anon_vma to a memory region 88 * anon_vma_prepare - attach an anon_vma to a memory region
78 * @vma: the memory region in question 89 * @vma: the memory region in question
@@ -103,18 +114,23 @@ void anon_vma_free(struct anon_vma *anon_vma)
103int anon_vma_prepare(struct vm_area_struct *vma) 114int anon_vma_prepare(struct vm_area_struct *vma)
104{ 115{
105 struct anon_vma *anon_vma = vma->anon_vma; 116 struct anon_vma *anon_vma = vma->anon_vma;
117 struct anon_vma_chain *avc;
106 118
107 might_sleep(); 119 might_sleep();
108 if (unlikely(!anon_vma)) { 120 if (unlikely(!anon_vma)) {
109 struct mm_struct *mm = vma->vm_mm; 121 struct mm_struct *mm = vma->vm_mm;
110 struct anon_vma *allocated; 122 struct anon_vma *allocated;
111 123
124 avc = anon_vma_chain_alloc();
125 if (!avc)
126 goto out_enomem;
127
112 anon_vma = find_mergeable_anon_vma(vma); 128 anon_vma = find_mergeable_anon_vma(vma);
113 allocated = NULL; 129 allocated = NULL;
114 if (!anon_vma) { 130 if (!anon_vma) {
115 anon_vma = anon_vma_alloc(); 131 anon_vma = anon_vma_alloc();
116 if (unlikely(!anon_vma)) 132 if (unlikely(!anon_vma))
117 return -ENOMEM; 133 goto out_enomem_free_avc;
118 allocated = anon_vma; 134 allocated = anon_vma;
119 } 135 }
120 spin_lock(&anon_vma->lock); 136 spin_lock(&anon_vma->lock);
@@ -123,53 +139,113 @@ int anon_vma_prepare(struct vm_area_struct *vma)
123 spin_lock(&mm->page_table_lock); 139 spin_lock(&mm->page_table_lock);
124 if (likely(!vma->anon_vma)) { 140 if (likely(!vma->anon_vma)) {
125 vma->anon_vma = anon_vma; 141 vma->anon_vma = anon_vma;
126 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 142 avc->anon_vma = anon_vma;
143 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head);
127 allocated = NULL; 146 allocated = NULL;
128 } 147 }
129 spin_unlock(&mm->page_table_lock); 148 spin_unlock(&mm->page_table_lock);
130 149
131 spin_unlock(&anon_vma->lock); 150 spin_unlock(&anon_vma->lock);
132 if (unlikely(allocated)) 151 if (unlikely(allocated)) {
133 anon_vma_free(allocated); 152 anon_vma_free(allocated);
153 anon_vma_chain_free(avc);
154 }
134 } 155 }
135 return 0; 156 return 0;
157
158 out_enomem_free_avc:
159 anon_vma_chain_free(avc);
160 out_enomem:
161 return -ENOMEM;
136} 162}
137 163
138void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) 164static void anon_vma_chain_link(struct vm_area_struct *vma,
165 struct anon_vma_chain *avc,
166 struct anon_vma *anon_vma)
139{ 167{
140 BUG_ON(vma->anon_vma != next->anon_vma); 168 avc->vma = vma;
141 list_del(&next->anon_vma_node); 169 avc->anon_vma = anon_vma;
170 list_add(&avc->same_vma, &vma->anon_vma_chain);
171
172 spin_lock(&anon_vma->lock);
173 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
174 spin_unlock(&anon_vma->lock);
142} 175}
143 176
144void __anon_vma_link(struct vm_area_struct *vma) 177/*
178 * Attach the anon_vmas from src to dst.
179 * Returns 0 on success, -ENOMEM on failure.
180 */
181int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
145{ 182{
146 struct anon_vma *anon_vma = vma->anon_vma; 183 struct anon_vma_chain *avc, *pavc;
147 184
148 if (anon_vma) 185 list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
149 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 186 avc = anon_vma_chain_alloc();
187 if (!avc)
188 goto enomem_failure;
189 anon_vma_chain_link(dst, avc, pavc->anon_vma);
190 }
191 return 0;
192
193 enomem_failure:
194 unlink_anon_vmas(dst);
195 return -ENOMEM;
150} 196}
151 197
152void anon_vma_link(struct vm_area_struct *vma) 198/*
199 * Attach vma to its own anon_vma, as well as to the anon_vmas that
200 * the corresponding VMA in the parent process is attached to.
201 * Returns 0 on success, non-zero on failure.
202 */
203int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
153{ 204{
154 struct anon_vma *anon_vma = vma->anon_vma; 205 struct anon_vma_chain *avc;
206 struct anon_vma *anon_vma;
155 207
156 if (anon_vma) { 208 /* Don't bother if the parent process has no anon_vma here. */
157 spin_lock(&anon_vma->lock); 209 if (!pvma->anon_vma)
158 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 210 return 0;
159 spin_unlock(&anon_vma->lock); 211
160 } 212 /*
213 * First, attach the new VMA to the parent VMA's anon_vmas,
214 * so rmap can find non-COWed pages in child processes.
215 */
216 if (anon_vma_clone(vma, pvma))
217 return -ENOMEM;
218
219 /* Then add our own anon_vma. */
220 anon_vma = anon_vma_alloc();
221 if (!anon_vma)
222 goto out_error;
223 avc = anon_vma_chain_alloc();
224 if (!avc)
225 goto out_error_free_anon_vma;
226 anon_vma_chain_link(vma, avc, anon_vma);
227 /* Mark this anon_vma as the one where our new (COWed) pages go. */
228 vma->anon_vma = anon_vma;
229
230 return 0;
231
232 out_error_free_anon_vma:
233 anon_vma_free(anon_vma);
234 out_error:
235 return -ENOMEM;
161} 236}
162 237
163void anon_vma_unlink(struct vm_area_struct *vma) 238static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
164{ 239{
165 struct anon_vma *anon_vma = vma->anon_vma; 240 struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
166 int empty; 241 int empty;
167 242
243 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
168 if (!anon_vma) 244 if (!anon_vma)
169 return; 245 return;
170 246
171 spin_lock(&anon_vma->lock); 247 spin_lock(&anon_vma->lock);
172 list_del(&vma->anon_vma_node); 248 list_del(&anon_vma_chain->same_anon_vma);
173 249
174 /* We must garbage collect the anon_vma if it's empty */ 250 /* We must garbage collect the anon_vma if it's empty */
175 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); 251 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
@@ -179,6 +255,18 @@ void anon_vma_unlink(struct vm_area_struct *vma)
179 anon_vma_free(anon_vma); 255 anon_vma_free(anon_vma);
180} 256}
181 257
258void unlink_anon_vmas(struct vm_area_struct *vma)
259{
260 struct anon_vma_chain *avc, *next;
261
262 /* Unlink each anon_vma chained to the VMA. */
263 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
264 anon_vma_unlink(avc);
265 list_del(&avc->same_vma);
266 anon_vma_chain_free(avc);
267 }
268}
269
182static void anon_vma_ctor(void *data) 270static void anon_vma_ctor(void *data)
183{ 271{
184 struct anon_vma *anon_vma = data; 272 struct anon_vma *anon_vma = data;
@@ -192,6 +280,7 @@ void __init anon_vma_init(void)
192{ 280{
193 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 281 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
194 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 282 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
283 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
195} 284}
196 285
197/* 286/*
@@ -396,7 +485,7 @@ static int page_referenced_anon(struct page *page,
396{ 485{
397 unsigned int mapcount; 486 unsigned int mapcount;
398 struct anon_vma *anon_vma; 487 struct anon_vma *anon_vma;
399 struct vm_area_struct *vma; 488 struct anon_vma_chain *avc;
400 int referenced = 0; 489 int referenced = 0;
401 490
402 anon_vma = page_lock_anon_vma(page); 491 anon_vma = page_lock_anon_vma(page);
@@ -404,7 +493,8 @@ static int page_referenced_anon(struct page *page,
404 return referenced; 493 return referenced;
405 494
406 mapcount = page_mapcount(page); 495 mapcount = page_mapcount(page);
407 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 496 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
497 struct vm_area_struct *vma = avc->vma;
408 unsigned long address = vma_address(page, vma); 498 unsigned long address = vma_address(page, vma);
409 if (address == -EFAULT) 499 if (address == -EFAULT)
410 continue; 500 continue;
@@ -511,9 +601,6 @@ int page_referenced(struct page *page,
511 int referenced = 0; 601 int referenced = 0;
512 int we_locked = 0; 602 int we_locked = 0;
513 603
514 if (TestClearPageReferenced(page))
515 referenced++;
516
517 *vm_flags = 0; 604 *vm_flags = 0;
518 if (page_mapped(page) && page_rmapping(page)) { 605 if (page_mapped(page) && page_rmapping(page)) {
519 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 606 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
@@ -614,6 +701,30 @@ int page_mkclean(struct page *page)
614EXPORT_SYMBOL_GPL(page_mkclean); 701EXPORT_SYMBOL_GPL(page_mkclean);
615 702
616/** 703/**
704 * page_move_anon_rmap - move a page to our anon_vma
705 * @page: the page to move to our anon_vma
706 * @vma: the vma the page belongs to
707 * @address: the user virtual address mapped
708 *
709 * When a page belongs exclusively to one process after a COW event,
710 * that page can be moved into the anon_vma that belongs to just that
711 * process, so the rmap code will not search the parent or sibling
712 * processes.
713 */
714void page_move_anon_rmap(struct page *page,
715 struct vm_area_struct *vma, unsigned long address)
716{
717 struct anon_vma *anon_vma = vma->anon_vma;
718
719 VM_BUG_ON(!PageLocked(page));
720 VM_BUG_ON(!anon_vma);
721 VM_BUG_ON(page->index != linear_page_index(vma, address));
722
723 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
724 page->mapping = (struct address_space *) anon_vma;
725}
726
727/**
617 * __page_set_anon_rmap - setup new anonymous rmap 728 * __page_set_anon_rmap - setup new anonymous rmap
618 * @page: the page to add the mapping to 729 * @page: the page to add the mapping to
619 * @vma: the vm area in which the mapping is added 730 * @vma: the vm area in which the mapping is added
@@ -652,9 +763,6 @@ static void __page_check_anon_rmap(struct page *page,
652 * are initially only visible via the pagetables, and the pte is locked 763 * are initially only visible via the pagetables, and the pte is locked
653 * over the call to page_add_new_anon_rmap. 764 * over the call to page_add_new_anon_rmap.
654 */ 765 */
655 struct anon_vma *anon_vma = vma->anon_vma;
656 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
657 BUG_ON(page->mapping != (struct address_space *)anon_vma);
658 BUG_ON(page->index != linear_page_index(vma, address)); 766 BUG_ON(page->index != linear_page_index(vma, address));
659#endif 767#endif
660} 768}
@@ -815,9 +923,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
815 923
816 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 924 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
817 if (PageAnon(page)) 925 if (PageAnon(page))
818 dec_mm_counter(mm, anon_rss); 926 dec_mm_counter(mm, MM_ANONPAGES);
819 else 927 else
820 dec_mm_counter(mm, file_rss); 928 dec_mm_counter(mm, MM_FILEPAGES);
821 set_pte_at(mm, address, pte, 929 set_pte_at(mm, address, pte,
822 swp_entry_to_pte(make_hwpoison_entry(page))); 930 swp_entry_to_pte(make_hwpoison_entry(page)));
823 } else if (PageAnon(page)) { 931 } else if (PageAnon(page)) {
@@ -839,7 +947,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
839 list_add(&mm->mmlist, &init_mm.mmlist); 947 list_add(&mm->mmlist, &init_mm.mmlist);
840 spin_unlock(&mmlist_lock); 948 spin_unlock(&mmlist_lock);
841 } 949 }
842 dec_mm_counter(mm, anon_rss); 950 dec_mm_counter(mm, MM_ANONPAGES);
951 inc_mm_counter(mm, MM_SWAPENTS);
843 } else if (PAGE_MIGRATION) { 952 } else if (PAGE_MIGRATION) {
844 /* 953 /*
845 * Store the pfn of the page in a special migration 954 * Store the pfn of the page in a special migration
@@ -857,7 +966,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
857 entry = make_migration_entry(page, pte_write(pteval)); 966 entry = make_migration_entry(page, pte_write(pteval));
858 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 967 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
859 } else 968 } else
860 dec_mm_counter(mm, file_rss); 969 dec_mm_counter(mm, MM_FILEPAGES);
861 970
862 page_remove_rmap(page); 971 page_remove_rmap(page);
863 page_cache_release(page); 972 page_cache_release(page);
@@ -996,7 +1105,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
996 1105
997 page_remove_rmap(page); 1106 page_remove_rmap(page);
998 page_cache_release(page); 1107 page_cache_release(page);
999 dec_mm_counter(mm, file_rss); 1108 dec_mm_counter(mm, MM_FILEPAGES);
1000 (*mapcount)--; 1109 (*mapcount)--;
1001 } 1110 }
1002 pte_unmap_unlock(pte - 1, ptl); 1111 pte_unmap_unlock(pte - 1, ptl);
@@ -1024,14 +1133,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1024static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1133static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1025{ 1134{
1026 struct anon_vma *anon_vma; 1135 struct anon_vma *anon_vma;
1027 struct vm_area_struct *vma; 1136 struct anon_vma_chain *avc;
1028 int ret = SWAP_AGAIN; 1137 int ret = SWAP_AGAIN;
1029 1138
1030 anon_vma = page_lock_anon_vma(page); 1139 anon_vma = page_lock_anon_vma(page);
1031 if (!anon_vma) 1140 if (!anon_vma)
1032 return ret; 1141 return ret;
1033 1142
1034 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1143 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1144 struct vm_area_struct *vma = avc->vma;
1035 unsigned long address = vma_address(page, vma); 1145 unsigned long address = vma_address(page, vma);
1036 if (address == -EFAULT) 1146 if (address == -EFAULT)
1037 continue; 1147 continue;
@@ -1222,7 +1332,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1222 struct vm_area_struct *, unsigned long, void *), void *arg) 1332 struct vm_area_struct *, unsigned long, void *), void *arg)
1223{ 1333{
1224 struct anon_vma *anon_vma; 1334 struct anon_vma *anon_vma;
1225 struct vm_area_struct *vma; 1335 struct anon_vma_chain *avc;
1226 int ret = SWAP_AGAIN; 1336 int ret = SWAP_AGAIN;
1227 1337
1228 /* 1338 /*
@@ -1237,7 +1347,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1237 if (!anon_vma) 1347 if (!anon_vma)
1238 return ret; 1348 return ret;
1239 spin_lock(&anon_vma->lock); 1349 spin_lock(&anon_vma->lock);
1240 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1350 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1351 struct vm_area_struct *vma = avc->vma;
1241 unsigned long address = vma_address(page, vma); 1352 unsigned long address = vma_address(page, vma);
1242 if (address == -EFAULT) 1353 if (address == -EFAULT)
1243 continue; 1354 continue;
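The new anon_vma_chain turns the old one-to-many vma list into a many-to-many relation: each chain entry joins one vma to one anon_vma and sits on two lists, vma->anon_vma_chain (same_vma) and anon_vma->head (same_anon_vma). A toy illustration of that join-table linkage, using plain singly linked lists and invented struct layouts rather than the kernel's list_head:

#include <stdio.h>
#include <stdlib.h>

struct avc;

struct vma {
	const char *name;
	struct avc *chain;	/* entries attached to this vma (same_vma) */
};

struct anon_vma {
	const char *name;
	struct avc *head;	/* entries attached to this anon_vma (same_anon_vma) */
};

/* One anon_vma_chain entry: joins exactly one vma to one anon_vma. */
struct avc {
	struct vma *vma;
	struct anon_vma *anon_vma;
	struct avc *next_same_vma;
	struct avc *next_same_anon_vma;
};

static void link_vma(struct vma *vma, struct anon_vma *av)
{
	struct avc *avc = malloc(sizeof(*avc));

	if (!avc)
		exit(1);
	avc->vma = vma;
	avc->anon_vma = av;
	avc->next_same_vma = vma->chain;
	vma->chain = avc;
	avc->next_same_anon_vma = av->head;
	av->head = avc;
}

int main(void)
{
	struct anon_vma parent_av = { "parent anon_vma", NULL };
	struct anon_vma child_av  = { "child anon_vma",  NULL };
	struct vma child_vma = { "child vma", NULL };
	struct avc *avc;

	/* anon_vma_fork(): first attach to the parent's anon_vma,
	 * then to a fresh anon_vma of our own for new COWed pages. */
	link_vma(&child_vma, &parent_av);
	link_vma(&child_vma, &child_av);

	/* An rmap walk over one anon_vma reaches every vma chained to it. */
	for (avc = parent_av.head; avc; avc = avc->next_same_anon_vma)
		printf("%s reachable from %s\n", avc->vma->name, avc->anon_vma->name);
	return 0;
}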
diff --git a/mm/slab.c b/mm/slab.c
index 7451bdacaf18..a9f325b28bed 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to,
935 935
936 from->avail -= nr; 936 from->avail -= nr;
937 to->avail += nr; 937 to->avail += nr;
938 to->touched = 1;
939 return nr; 938 return nr;
940} 939}
941 940
@@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
983 982
984 if (limit > 1) 983 if (limit > 1)
985 limit = 12; 984 limit = 12;
986 ac_ptr = kmalloc_node(memsize, gfp, node); 985 ac_ptr = kzalloc_node(memsize, gfp, node);
987 if (ac_ptr) { 986 if (ac_ptr) {
988 for_each_node(i) { 987 for_each_node(i) {
989 if (i == node || !node_online(i)) { 988 if (i == node || !node_online(i))
990 ac_ptr[i] = NULL;
991 continue; 989 continue;
992 }
993 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); 990 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
994 if (!ac_ptr[i]) { 991 if (!ac_ptr[i]) {
995 for (i--; i >= 0; i--) 992 for (i--; i >= 0; i--)
@@ -2963,8 +2960,10 @@ retry:
2963 spin_lock(&l3->list_lock); 2960 spin_lock(&l3->list_lock);
2964 2961
2965 /* See if we can refill from the shared array */ 2962 /* See if we can refill from the shared array */
2966 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) 2963 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
2964 l3->shared->touched = 1;
2967 goto alloc_done; 2965 goto alloc_done;
2966 }
2968 2967
2969 while (batchcount > 0) { 2968 while (batchcount > 0) {
2970 struct list_head *entry; 2969 struct list_head *entry;
@@ -3101,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3101 if (cachep == &cache_cache) 3100 if (cachep == &cache_cache)
3102 return false; 3101 return false;
3103 3102
3104 return should_failslab(obj_size(cachep), flags); 3103 return should_failslab(obj_size(cachep), flags, cachep->flags);
3105} 3104}
3106 3105
3107static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3106static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
diff --git a/mm/slub.c b/mm/slub.c
index 8d71aaf888d7..b364844a1068 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -151,7 +151,8 @@
151 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
152 */ 152 */
153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) 154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
155 SLAB_FAILSLAB)
155 156
156#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
157 SLAB_CACHE_DMA | SLAB_NOTRACK) 158 SLAB_CACHE_DMA | SLAB_NOTRACK)
@@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
217 218
218#endif 219#endif
219 220
220static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) 221static inline void stat(struct kmem_cache *s, enum stat_item si)
221{ 222{
222#ifdef CONFIG_SLUB_STATS 223#ifdef CONFIG_SLUB_STATS
223 c->stat[si]++; 224 __this_cpu_inc(s->cpu_slab->stat[si]);
224#endif 225#endif
225} 226}
226 227
@@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
242#endif 243#endif
243} 244}
244 245
245static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
246{
247#ifdef CONFIG_SMP
248 return s->cpu_slab[cpu];
249#else
250 return &s->cpu_slab;
251#endif
252}
253
254/* Verify that a pointer has an address that is valid within a slab page */ 246/* Verify that a pointer has an address that is valid within a slab page */
255static inline int check_valid_pointer(struct kmem_cache *s, 247static inline int check_valid_pointer(struct kmem_cache *s,
256 struct page *page, const void *object) 248 struct page *page, const void *object)
@@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s,
269 return 1; 261 return 1;
270} 262}
271 263
272/*
273 * Slow version of get and set free pointer.
274 *
275 * This version requires touching the cache lines of kmem_cache which
276 * we avoid to do in the fast alloc free paths. There we obtain the offset
277 * from the page struct.
278 */
279static inline void *get_freepointer(struct kmem_cache *s, void *object) 264static inline void *get_freepointer(struct kmem_cache *s, void *object)
280{ 265{
281 return *(void **)(object + s->offset); 266 return *(void **)(object + s->offset);
@@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str)
1020 case 't': 1005 case 't':
1021 slub_debug |= SLAB_TRACE; 1006 slub_debug |= SLAB_TRACE;
1022 break; 1007 break;
1008 case 'a':
1009 slub_debug |= SLAB_FAILSLAB;
1010 break;
1023 default: 1011 default:
1024 printk(KERN_ERR "slub_debug option '%c' " 1012 printk(KERN_ERR "slub_debug option '%c' "
1025 "unknown. skipped\n", *str); 1013 "unknown. skipped\n", *str);
@@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1124 if (!page) 1112 if (!page)
1125 return NULL; 1113 return NULL;
1126 1114
1127 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1115 stat(s, ORDER_FALLBACK);
1128 } 1116 }
1129 1117
1130 if (kmemcheck_enabled 1118 if (kmemcheck_enabled
@@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1422static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1410static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1423{ 1411{
1424 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1412 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1425 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1426 1413
1427 __ClearPageSlubFrozen(page); 1414 __ClearPageSlubFrozen(page);
1428 if (page->inuse) { 1415 if (page->inuse) {
1429 1416
1430 if (page->freelist) { 1417 if (page->freelist) {
1431 add_partial(n, page, tail); 1418 add_partial(n, page, tail);
1432 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1419 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1433 } else { 1420 } else {
1434 stat(c, DEACTIVATE_FULL); 1421 stat(s, DEACTIVATE_FULL);
1435 if (SLABDEBUG && PageSlubDebug(page) && 1422 if (SLABDEBUG && PageSlubDebug(page) &&
1436 (s->flags & SLAB_STORE_USER)) 1423 (s->flags & SLAB_STORE_USER))
1437 add_full(n, page); 1424 add_full(n, page);
1438 } 1425 }
1439 slab_unlock(page); 1426 slab_unlock(page);
1440 } else { 1427 } else {
1441 stat(c, DEACTIVATE_EMPTY); 1428 stat(s, DEACTIVATE_EMPTY);
1442 if (n->nr_partial < s->min_partial) { 1429 if (n->nr_partial < s->min_partial) {
1443 /* 1430 /*
1444 * Adding an empty slab to the partial slabs in order 1431 * Adding an empty slab to the partial slabs in order
@@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1454 slab_unlock(page); 1441 slab_unlock(page);
1455 } else { 1442 } else {
1456 slab_unlock(page); 1443 slab_unlock(page);
1457 stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); 1444 stat(s, FREE_SLAB);
1458 discard_slab(s, page); 1445 discard_slab(s, page);
1459 } 1446 }
1460 } 1447 }
@@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1469 int tail = 1; 1456 int tail = 1;
1470 1457
1471 if (page->freelist) 1458 if (page->freelist)
1472 stat(c, DEACTIVATE_REMOTE_FREES); 1459 stat(s, DEACTIVATE_REMOTE_FREES);
1473 /* 1460 /*
1474 * Merge cpu freelist into slab freelist. Typically we get here 1461 * Merge cpu freelist into slab freelist. Typically we get here
1475 * because both freelists are empty. So this is unlikely 1462 * because both freelists are empty. So this is unlikely
@@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1482 1469
1483 /* Retrieve object from cpu_freelist */ 1470 /* Retrieve object from cpu_freelist */
1484 object = c->freelist; 1471 object = c->freelist;
1485 c->freelist = c->freelist[c->offset]; 1472 c->freelist = get_freepointer(s, c->freelist);
1486 1473
1487 /* And put onto the regular freelist */ 1474 /* And put onto the regular freelist */
1488 object[c->offset] = page->freelist; 1475 set_freepointer(s, object, page->freelist);
1489 page->freelist = object; 1476 page->freelist = object;
1490 page->inuse--; 1477 page->inuse--;
1491 } 1478 }
@@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1495 1482
1496static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1483static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1497{ 1484{
1498 stat(c, CPUSLAB_FLUSH); 1485 stat(s, CPUSLAB_FLUSH);
1499 slab_lock(c->page); 1486 slab_lock(c->page);
1500 deactivate_slab(s, c); 1487 deactivate_slab(s, c);
1501} 1488}
@@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1507 */ 1494 */
1508static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1495static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1509{ 1496{
1510 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1497 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1511 1498
1512 if (likely(c && c->page)) 1499 if (likely(c && c->page))
1513 flush_slab(s, c); 1500 flush_slab(s, c);
@@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1635 if (unlikely(!node_match(c, node))) 1622 if (unlikely(!node_match(c, node)))
1636 goto another_slab; 1623 goto another_slab;
1637 1624
1638 stat(c, ALLOC_REFILL); 1625 stat(s, ALLOC_REFILL);
1639 1626
1640load_freelist: 1627load_freelist:
1641 object = c->page->freelist; 1628 object = c->page->freelist;
@@ -1644,13 +1631,13 @@ load_freelist:
1644 if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) 1631 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1645 goto debug; 1632 goto debug;
1646 1633
1647 c->freelist = object[c->offset]; 1634 c->freelist = get_freepointer(s, object);
1648 c->page->inuse = c->page->objects; 1635 c->page->inuse = c->page->objects;
1649 c->page->freelist = NULL; 1636 c->page->freelist = NULL;
1650 c->node = page_to_nid(c->page); 1637 c->node = page_to_nid(c->page);
1651unlock_out: 1638unlock_out:
1652 slab_unlock(c->page); 1639 slab_unlock(c->page);
1653 stat(c, ALLOC_SLOWPATH); 1640 stat(s, ALLOC_SLOWPATH);
1654 return object; 1641 return object;
1655 1642
1656another_slab: 1643another_slab:
@@ -1660,7 +1647,7 @@ new_slab:
1660 new = get_partial(s, gfpflags, node); 1647 new = get_partial(s, gfpflags, node);
1661 if (new) { 1648 if (new) {
1662 c->page = new; 1649 c->page = new;
1663 stat(c, ALLOC_FROM_PARTIAL); 1650 stat(s, ALLOC_FROM_PARTIAL);
1664 goto load_freelist; 1651 goto load_freelist;
1665 } 1652 }
1666 1653
@@ -1673,8 +1660,8 @@ new_slab:
1673 local_irq_disable(); 1660 local_irq_disable();
1674 1661
1675 if (new) { 1662 if (new) {
1676 c = get_cpu_slab(s, smp_processor_id()); 1663 c = __this_cpu_ptr(s->cpu_slab);
1677 stat(c, ALLOC_SLAB); 1664 stat(s, ALLOC_SLAB);
1678 if (c->page) 1665 if (c->page)
1679 flush_slab(s, c); 1666 flush_slab(s, c);
1680 slab_lock(new); 1667 slab_lock(new);
@@ -1690,7 +1677,7 @@ debug:
1690 goto another_slab; 1677 goto another_slab;
1691 1678
1692 c->page->inuse++; 1679 c->page->inuse++;
1693 c->page->freelist = object[c->offset]; 1680 c->page->freelist = get_freepointer(s, object);
1694 c->node = -1; 1681 c->node = -1;
1695 goto unlock_out; 1682 goto unlock_out;
1696} 1683}
@@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1711 void **object; 1698 void **object;
1712 struct kmem_cache_cpu *c; 1699 struct kmem_cache_cpu *c;
1713 unsigned long flags; 1700 unsigned long flags;
1714 unsigned int objsize;
1715 1701
1716 gfpflags &= gfp_allowed_mask; 1702 gfpflags &= gfp_allowed_mask;
1717 1703
1718 lockdep_trace_alloc(gfpflags); 1704 lockdep_trace_alloc(gfpflags);
1719 might_sleep_if(gfpflags & __GFP_WAIT); 1705 might_sleep_if(gfpflags & __GFP_WAIT);
1720 1706
1721 if (should_failslab(s->objsize, gfpflags)) 1707 if (should_failslab(s->objsize, gfpflags, s->flags))
1722 return NULL; 1708 return NULL;
1723 1709
1724 local_irq_save(flags); 1710 local_irq_save(flags);
1725 c = get_cpu_slab(s, smp_processor_id()); 1711 c = __this_cpu_ptr(s->cpu_slab);
1726 objsize = c->objsize; 1712 object = c->freelist;
1727 if (unlikely(!c->freelist || !node_match(c, node))) 1713 if (unlikely(!object || !node_match(c, node)))
1728 1714
1729 object = __slab_alloc(s, gfpflags, node, addr, c); 1715 object = __slab_alloc(s, gfpflags, node, addr, c);
1730 1716
1731 else { 1717 else {
1732 object = c->freelist; 1718 c->freelist = get_freepointer(s, object);
1733 c->freelist = object[c->offset]; 1719 stat(s, ALLOC_FASTPATH);
1734 stat(c, ALLOC_FASTPATH);
1735 } 1720 }
1736 local_irq_restore(flags); 1721 local_irq_restore(flags);
1737 1722
1738 if (unlikely(gfpflags & __GFP_ZERO) && object) 1723 if (unlikely(gfpflags & __GFP_ZERO) && object)
1739 memset(object, 0, objsize); 1724 memset(object, 0, s->objsize);
1740 1725
1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); 1726 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize);
1742 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); 1727 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1743 1728
1744 return object; 1729 return object;
1745} 1730}
@@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1794 * handling required then we can return immediately. 1779 * handling required then we can return immediately.
1795 */ 1780 */
1796static void __slab_free(struct kmem_cache *s, struct page *page, 1781static void __slab_free(struct kmem_cache *s, struct page *page,
1797 void *x, unsigned long addr, unsigned int offset) 1782 void *x, unsigned long addr)
1798{ 1783{
1799 void *prior; 1784 void *prior;
1800 void **object = (void *)x; 1785 void **object = (void *)x;
1801 struct kmem_cache_cpu *c;
1802 1786
1803 c = get_cpu_slab(s, raw_smp_processor_id()); 1787 stat(s, FREE_SLOWPATH);
1804 stat(c, FREE_SLOWPATH);
1805 slab_lock(page); 1788 slab_lock(page);
1806 1789
1807 if (unlikely(SLABDEBUG && PageSlubDebug(page))) 1790 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1808 goto debug; 1791 goto debug;
1809 1792
1810checks_ok: 1793checks_ok:
1811 prior = object[offset] = page->freelist; 1794 prior = page->freelist;
1795 set_freepointer(s, object, prior);
1812 page->freelist = object; 1796 page->freelist = object;
1813 page->inuse--; 1797 page->inuse--;
1814 1798
1815 if (unlikely(PageSlubFrozen(page))) { 1799 if (unlikely(PageSlubFrozen(page))) {
1816 stat(c, FREE_FROZEN); 1800 stat(s, FREE_FROZEN);
1817 goto out_unlock; 1801 goto out_unlock;
1818 } 1802 }
1819 1803
@@ -1826,7 +1810,7 @@ checks_ok:
1826 */ 1810 */
1827 if (unlikely(!prior)) { 1811 if (unlikely(!prior)) {
1828 add_partial(get_node(s, page_to_nid(page)), page, 1); 1812 add_partial(get_node(s, page_to_nid(page)), page, 1);
1829 stat(c, FREE_ADD_PARTIAL); 1813 stat(s, FREE_ADD_PARTIAL);
1830 } 1814 }
1831 1815
1832out_unlock: 1816out_unlock:
@@ -1839,10 +1823,10 @@ slab_empty:
1839 * Slab still on the partial list. 1823 * Slab still on the partial list.
1840 */ 1824 */
1841 remove_partial(s, page); 1825 remove_partial(s, page);
1842 stat(c, FREE_REMOVE_PARTIAL); 1826 stat(s, FREE_REMOVE_PARTIAL);
1843 } 1827 }
1844 slab_unlock(page); 1828 slab_unlock(page);
1845 stat(c, FREE_SLAB); 1829 stat(s, FREE_SLAB);
1846 discard_slab(s, page); 1830 discard_slab(s, page);
1847 return; 1831 return;
1848 1832
@@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s,
1872 1856
1873 kmemleak_free_recursive(x, s->flags); 1857 kmemleak_free_recursive(x, s->flags);
1874 local_irq_save(flags); 1858 local_irq_save(flags);
1875 c = get_cpu_slab(s, smp_processor_id()); 1859 c = __this_cpu_ptr(s->cpu_slab);
1876 kmemcheck_slab_free(s, object, c->objsize); 1860 kmemcheck_slab_free(s, object, s->objsize);
1877 debug_check_no_locks_freed(object, c->objsize); 1861 debug_check_no_locks_freed(object, s->objsize);
1878 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1862 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1879 debug_check_no_obj_freed(object, c->objsize); 1863 debug_check_no_obj_freed(object, s->objsize);
1880 if (likely(page == c->page && c->node >= 0)) { 1864 if (likely(page == c->page && c->node >= 0)) {
1881 object[c->offset] = c->freelist; 1865 set_freepointer(s, object, c->freelist);
1882 c->freelist = object; 1866 c->freelist = object;
1883 stat(c, FREE_FASTPATH); 1867 stat(s, FREE_FASTPATH);
1884 } else 1868 } else
1885 __slab_free(s, page, x, addr, c->offset); 1869 __slab_free(s, page, x, addr);
1886 1870
1887 local_irq_restore(flags); 1871 local_irq_restore(flags);
1888} 1872}
@@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags,
2069 return ALIGN(align, sizeof(void *)); 2053 return ALIGN(align, sizeof(void *));
2070} 2054}
2071 2055
2072static void init_kmem_cache_cpu(struct kmem_cache *s,
2073 struct kmem_cache_cpu *c)
2074{
2075 c->page = NULL;
2076 c->freelist = NULL;
2077 c->node = 0;
2078 c->offset = s->offset / sizeof(void *);
2079 c->objsize = s->objsize;
2080#ifdef CONFIG_SLUB_STATS
2081 memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
2082#endif
2083}
2084
2085static void 2056static void
2086init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2057init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2087{ 2058{
@@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2095#endif 2066#endif
2096} 2067}
2097 2068
2098#ifdef CONFIG_SMP 2069static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
2099/*
2100 * Per cpu array for per cpu structures.
2101 *
2102 * The per cpu array places all kmem_cache_cpu structures from one processor
2103 * close together meaning that it becomes possible that multiple per cpu
2104 * structures are contained in one cacheline. This may be particularly
2105 * beneficial for the kmalloc caches.
2106 *
2107 * A desktop system typically has around 60-80 slabs. With 100 here we are
2108 * likely able to get per cpu structures for all caches from the array defined
2109 * here. We must be able to cover all kmalloc caches during bootstrap.
2110 *
2111 * If the per cpu array is exhausted then fall back to kmalloc
2112 * of individual cachelines. No sharing is possible then.
2113 */
2114#define NR_KMEM_CACHE_CPU 100
2115
2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2117 kmem_cache_cpu);
2118
2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
2121
2122static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
2123 int cpu, gfp_t flags)
2124{
2125 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
2126
2127 if (c)
2128 per_cpu(kmem_cache_cpu_free, cpu) =
2129 (void *)c->freelist;
2130 else {
2131 /* Table overflow: So allocate ourselves */
2132 c = kmalloc_node(
2133 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
2134 flags, cpu_to_node(cpu));
2135 if (!c)
2136 return NULL;
2137 }
2138
2139 init_kmem_cache_cpu(s, c);
2140 return c;
2141}
2142
2143static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
2144{
2145 if (c < per_cpu(kmem_cache_cpu, cpu) ||
2146 c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2147 kfree(c);
2148 return;
2149 }
2150 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
2151 per_cpu(kmem_cache_cpu_free, cpu) = c;
2152}
2153
2154static void free_kmem_cache_cpus(struct kmem_cache *s)
2155{
2156 int cpu;
2157
2158 for_each_online_cpu(cpu) {
2159 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2160
2161 if (c) {
2162 s->cpu_slab[cpu] = NULL;
2163 free_kmem_cache_cpu(c, cpu);
2164 }
2165 }
2166}
2167
2168static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2169{
2170 int cpu;
2171
2172 for_each_online_cpu(cpu) {
2173 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2174
2175 if (c)
2176 continue;
2177
2178 c = alloc_kmem_cache_cpu(s, cpu, flags);
2179 if (!c) {
2180 free_kmem_cache_cpus(s);
2181 return 0;
2182 }
2183 s->cpu_slab[cpu] = c;
2184 }
2185 return 1;
2186}
2187
2188/*
2189 * Initialize the per cpu array.
2190 */
2191static void init_alloc_cpu_cpu(int cpu)
2192{
2193 int i;
2194 2070
2195 if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) 2071static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2196 return;
2197
2198 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2199 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2200
2201 cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
2202}
2203
2204static void __init init_alloc_cpu(void)
2205{ 2072{
2206 int cpu; 2073 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
2207 2074 /*
2208 for_each_online_cpu(cpu) 2075 * Boot time creation of the kmalloc array. Use static per cpu data
2209 init_alloc_cpu_cpu(cpu); 2076 * since the per cpu allocator is not available yet.
2210 } 2077 */
2078 s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
2079 else
2080 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2211 2081
2212#else 2082 if (!s->cpu_slab)
2213static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} 2083 return 0;
2214static inline void init_alloc_cpu(void) {}
2215 2084
2216static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2217{
2218 init_kmem_cache_cpu(s, &s->cpu_slab);
2219 return 1; 2085 return 1;
2220} 2086}
2221#endif
2222 2087
2223#ifdef CONFIG_NUMA 2088#ifdef CONFIG_NUMA
2224/* 2089/*
@@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2287 int node; 2152 int node;
2288 int local_node; 2153 int local_node;
2289 2154
2290 if (slab_state >= UP) 2155 if (slab_state >= UP && (s < kmalloc_caches ||
2156 s > kmalloc_caches + KMALLOC_CACHES))
2291 local_node = page_to_nid(virt_to_page(s)); 2157 local_node = page_to_nid(virt_to_page(s));
2292 else 2158 else
2293 local_node = 0; 2159 local_node = 0;
@@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2502 2368
2503 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2369 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2504 return 1; 2370 return 1;
2371
2505 free_kmem_cache_nodes(s); 2372 free_kmem_cache_nodes(s);
2506error: 2373error:
2507 if (flags & SLAB_PANIC) 2374 if (flags & SLAB_PANIC)
@@ -2609,9 +2476,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2609 int node; 2476 int node;
2610 2477
2611 flush_all(s); 2478 flush_all(s);
2612 2479 free_percpu(s->cpu_slab);
2613 /* Attempt to free all objects */ 2480 /* Attempt to free all objects */
2614 free_kmem_cache_cpus(s);
2615 for_each_node_state(node, N_NORMAL_MEMORY) { 2481 for_each_node_state(node, N_NORMAL_MEMORY) {
2616 struct kmem_cache_node *n = get_node(s, node); 2482 struct kmem_cache_node *n = get_node(s, node);
2617 2483
@@ -2651,7 +2517,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2651 * Kmalloc subsystem 2517 * Kmalloc subsystem
2652 *******************************************************************/ 2518 *******************************************************************/
2653 2519
2654struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; 2520struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
2655EXPORT_SYMBOL(kmalloc_caches); 2521EXPORT_SYMBOL(kmalloc_caches);
2656 2522
2657static int __init setup_slub_min_order(char *str) 2523static int __init setup_slub_min_order(char *str)
@@ -2741,6 +2607,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2741 char *text; 2607 char *text;
2742 size_t realsize; 2608 size_t realsize;
2743 unsigned long slabflags; 2609 unsigned long slabflags;
2610 int i;
2744 2611
2745 s = kmalloc_caches_dma[index]; 2612 s = kmalloc_caches_dma[index];
2746 if (s) 2613 if (s)
@@ -2760,7 +2627,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2760 realsize = kmalloc_caches[index].objsize; 2627 realsize = kmalloc_caches[index].objsize;
2761 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", 2628 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2762 (unsigned int)realsize); 2629 (unsigned int)realsize);
2763 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2630
2631 s = NULL;
2632 for (i = 0; i < KMALLOC_CACHES; i++)
2633 if (!kmalloc_caches[i].size)
2634 break;
2635
2636 BUG_ON(i >= KMALLOC_CACHES);
2637 s = kmalloc_caches + i;
2764 2638
2765 /* 2639 /*
2766 * Must defer sysfs creation to a workqueue because we don't know 2640 * Must defer sysfs creation to a workqueue because we don't know
@@ -2772,9 +2646,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2772 if (slab_state >= SYSFS) 2646 if (slab_state >= SYSFS)
2773 slabflags |= __SYSFS_ADD_DEFERRED; 2647 slabflags |= __SYSFS_ADD_DEFERRED;
2774 2648
2775 if (!s || !text || !kmem_cache_open(s, flags, text, 2649 if (!text || !kmem_cache_open(s, flags, text,
2776 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { 2650 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2777 kfree(s); 2651 s->size = 0;
2778 kfree(text); 2652 kfree(text);
2779 goto unlock_out; 2653 goto unlock_out;
2780 } 2654 }
@@ -3086,7 +2960,7 @@ static void slab_mem_offline_callback(void *arg)
3086 /* 2960 /*
3087 * if n->nr_slabs > 0, slabs still exist on the node 2961 * if n->nr_slabs > 0, slabs still exist on the node
3088 * that is going down. We were unable to free them, 2962 * that is going down. We were unable to free them,
3089 * and offline_pages() function shoudn't call this 2963 * and offline_pages() function shouldn't call this
3090 * callback. So, we must fail. 2964 * callback. So, we must fail.
3091 */ 2965 */
3092 BUG_ON(slabs_node(s, offline_node)); 2966 BUG_ON(slabs_node(s, offline_node));
@@ -3176,8 +3050,6 @@ void __init kmem_cache_init(void)
3176 int i; 3050 int i;
3177 int caches = 0; 3051 int caches = 0;
3178 3052
3179 init_alloc_cpu();
3180
3181#ifdef CONFIG_NUMA 3053#ifdef CONFIG_NUMA
3182 /* 3054 /*
3183 * Must first have the slab cache available for the allocations of the 3055 * Must first have the slab cache available for the allocations of the
@@ -3261,8 +3133,10 @@ void __init kmem_cache_init(void)
3261 3133
3262#ifdef CONFIG_SMP 3134#ifdef CONFIG_SMP
3263 register_cpu_notifier(&slab_notifier); 3135 register_cpu_notifier(&slab_notifier);
3264 kmem_size = offsetof(struct kmem_cache, cpu_slab) + 3136#endif
3265 nr_cpu_ids * sizeof(struct kmem_cache_cpu *); 3137#ifdef CONFIG_NUMA
3138 kmem_size = offsetof(struct kmem_cache, node) +
3139 nr_node_ids * sizeof(struct kmem_cache_node *);
3266#else 3140#else
3267 kmem_size = sizeof(struct kmem_cache); 3141 kmem_size = sizeof(struct kmem_cache);
3268#endif 3142#endif
@@ -3351,22 +3225,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3351 down_write(&slub_lock); 3225 down_write(&slub_lock);
3352 s = find_mergeable(size, align, flags, name, ctor); 3226 s = find_mergeable(size, align, flags, name, ctor);
3353 if (s) { 3227 if (s) {
3354 int cpu;
3355
3356 s->refcount++; 3228 s->refcount++;
3357 /* 3229 /*
3358 * Adjust the object sizes so that we clear 3230 * Adjust the object sizes so that we clear
3359 * the complete object on kzalloc. 3231 * the complete object on kzalloc.
3360 */ 3232 */
3361 s->objsize = max(s->objsize, (int)size); 3233 s->objsize = max(s->objsize, (int)size);
3362
3363 /*
3364 * And then we need to update the object size in the
3365 * per cpu structures
3366 */
3367 for_each_online_cpu(cpu)
3368 get_cpu_slab(s, cpu)->objsize = s->objsize;
3369
3370 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3234 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3371 up_write(&slub_lock); 3235 up_write(&slub_lock);
3372 3236
@@ -3420,29 +3284,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3420 unsigned long flags; 3284 unsigned long flags;
3421 3285
3422 switch (action) { 3286 switch (action) {
3423 case CPU_UP_PREPARE:
3424 case CPU_UP_PREPARE_FROZEN:
3425 init_alloc_cpu_cpu(cpu);
3426 down_read(&slub_lock);
3427 list_for_each_entry(s, &slab_caches, list)
3428 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3429 GFP_KERNEL);
3430 up_read(&slub_lock);
3431 break;
3432
3433 case CPU_UP_CANCELED: 3287 case CPU_UP_CANCELED:
3434 case CPU_UP_CANCELED_FROZEN: 3288 case CPU_UP_CANCELED_FROZEN:
3435 case CPU_DEAD: 3289 case CPU_DEAD:
3436 case CPU_DEAD_FROZEN: 3290 case CPU_DEAD_FROZEN:
3437 down_read(&slub_lock); 3291 down_read(&slub_lock);
3438 list_for_each_entry(s, &slab_caches, list) { 3292 list_for_each_entry(s, &slab_caches, list) {
3439 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3440
3441 local_irq_save(flags); 3293 local_irq_save(flags);
3442 __flush_cpu_slab(s, cpu); 3294 __flush_cpu_slab(s, cpu);
3443 local_irq_restore(flags); 3295 local_irq_restore(flags);
3444 free_kmem_cache_cpu(c, cpu);
3445 s->cpu_slab[cpu] = NULL;
3446 } 3296 }
3447 up_read(&slub_lock); 3297 up_read(&slub_lock);
3448 break; 3298 break;
@@ -3928,7 +3778,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3928 int cpu; 3778 int cpu;
3929 3779
3930 for_each_possible_cpu(cpu) { 3780 for_each_possible_cpu(cpu) {
3931 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3781 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3932 3782
3933 if (!c || c->node < 0) 3783 if (!c || c->node < 0)
3934 continue; 3784 continue;
@@ -4171,6 +4021,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4171} 4021}
4172SLAB_ATTR(trace); 4022SLAB_ATTR(trace);
4173 4023
4024#ifdef CONFIG_FAILSLAB
4025static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4026{
4027 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4028}
4029
4030static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4031 size_t length)
4032{
4033 s->flags &= ~SLAB_FAILSLAB;
4034 if (buf[0] == '1')
4035 s->flags |= SLAB_FAILSLAB;
4036 return length;
4037}
4038SLAB_ATTR(failslab);
4039#endif
4040
4174static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4041static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4175{ 4042{
4176 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4043 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4353,7 +4220,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4353 return -ENOMEM; 4220 return -ENOMEM;
4354 4221
4355 for_each_online_cpu(cpu) { 4222 for_each_online_cpu(cpu) {
4356 unsigned x = get_cpu_slab(s, cpu)->stat[si]; 4223 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4357 4224
4358 data[cpu] = x; 4225 data[cpu] = x;
4359 sum += x; 4226 sum += x;
@@ -4376,7 +4243,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si)
4376 int cpu; 4243 int cpu;
4377 4244
4378 for_each_online_cpu(cpu) 4245 for_each_online_cpu(cpu)
4379 get_cpu_slab(s, cpu)->stat[si] = 0; 4246 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4380} 4247}
4381 4248
4382#define STAT_ATTR(si, text) \ 4249#define STAT_ATTR(si, text) \
@@ -4467,6 +4334,10 @@ static struct attribute *slab_attrs[] = {
4467 &deactivate_remote_frees_attr.attr, 4334 &deactivate_remote_frees_attr.attr,
4468 &order_fallback_attr.attr, 4335 &order_fallback_attr.attr,
4469#endif 4336#endif
4337#ifdef CONFIG_FAILSLAB
4338 &failslab_attr.attr,
4339#endif
4340
4470 NULL 4341 NULL
4471}; 4342};
4472 4343
@@ -4519,7 +4390,7 @@ static void kmem_cache_release(struct kobject *kobj)
4519 kfree(s); 4390 kfree(s);
4520} 4391}
4521 4392
4522static struct sysfs_ops slab_sysfs_ops = { 4393static const struct sysfs_ops slab_sysfs_ops = {
4523 .show = slab_attr_show, 4394 .show = slab_attr_show,
4524 .store = slab_attr_store, 4395 .store = slab_attr_store,
4525}; 4396};
@@ -4538,7 +4409,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
4538 return 0; 4409 return 0;
4539} 4410}
4540 4411
4541static struct kset_uevent_ops slab_uevent_ops = { 4412static const struct kset_uevent_ops slab_uevent_ops = {
4542 .filter = uevent_filter, 4413 .filter = uevent_filter,
4543}; 4414};
4544 4415
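The slub.c hunks above replace the old get_cpu_slab() helper with per_cpu_ptr(s->cpu_slab, cpu): per-CPU slab state now lives in a dynamically allocated percpu area, and statistics are gathered by walking the online CPUs. A userspace sketch of that aggregation pattern, with a plain array indexed by CPU standing in for the kernel's percpu accessor (all names here are illustrative):

#include <stdio.h>

#define NCPUS 4                                 /* stand-in for the online CPU set */

struct demo_cpu_slab { unsigned stat[2]; };     /* stand-in for kmem_cache_cpu */

static struct demo_cpu_slab cpu_slab[NCPUS];    /* stands in for per_cpu_ptr(s->cpu_slab, cpu) */

/* mirror show_stat(): sum one counter across CPUs, keeping the per-CPU values */
static unsigned long sum_stat(int si, unsigned per_cpu_out[NCPUS])
{
        unsigned long sum = 0;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++) {
                unsigned x = cpu_slab[cpu].stat[si];

                per_cpu_out[cpu] = x;
                sum += x;
        }
        return sum;
}

/* mirror clear_stat(): reset one counter on every CPU */
static void clear_stat(int si)
{
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
                cpu_slab[cpu].stat[si] = 0;
}

int main(void)
{
        unsigned per_cpu[NCPUS];

        cpu_slab[0].stat[0] = 3;
        cpu_slab[2].stat[0] = 5;
        printf("total: %lu\n", sum_stat(0, per_cpu));       /* 8 */
        clear_stat(0);
        printf("after clear: %lu\n", sum_stat(0, per_cpu)); /* 0 */
        return 0;
}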
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d9714bdcb4a3..392b9bb5bc01 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,9 +40,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 40 unsigned long align,
41 unsigned long goal) 41 unsigned long goal)
42{ 42{
43 return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); 43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
44} 44}
45 45
46static void *vmemmap_buf;
47static void *vmemmap_buf_end;
46 48
47void * __meminit vmemmap_alloc_block(unsigned long size, int node) 49void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 50{
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
64 __pa(MAX_DMA_ADDRESS)); 66 __pa(MAX_DMA_ADDRESS));
65} 67}
66 68
69/* callers must request the same size throughout the early boot stage */
70void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
71{
72 void *ptr;
73
74 if (!vmemmap_buf)
75 return vmemmap_alloc_block(size, node);
76
77	/* carve the allocation out of the preallocated buffer */
78 ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
79 if (ptr + size > vmemmap_buf_end)
80 return vmemmap_alloc_block(size, node);
81
82 vmemmap_buf = ptr + size;
83
84 return ptr;
85}
86
67void __meminit vmemmap_verify(pte_t *pte, int node, 87void __meminit vmemmap_verify(pte_t *pte, int node,
68 unsigned long start, unsigned long end) 88 unsigned long start, unsigned long end)
69{ 89{
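vmemmap_alloc_block_buf() above is a bump allocator: it rounds the current buffer cursor up to the requested size, hands that chunk out, advances the cursor, and falls back to the normal allocator once the preallocated buffer is exhausted. A minimal userspace sketch of the same cursor arithmetic, assuming a static backing array and malloc() as the fallback path (both are stand-ins, not the kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

static char backing[4096];
static void *buf = backing;                        /* stands in for vmemmap_buf */
static void *buf_end = backing + sizeof(backing);  /* stands in for vmemmap_buf_end */

/* hand out size-aligned chunks from the buffer; fall back to malloc when it runs out */
static void *alloc_block_buf(size_t size)
{
        void *ptr;

        if (!buf)
                return malloc(size);                    /* fallback path */

        ptr = (void *)ALIGN_UP((uintptr_t)buf, size);   /* size assumed power of two, like PAGE_SIZE */
        if ((char *)ptr + size > (char *)buf_end)
                return malloc(size);                    /* buffer exhausted */

        buf = (char *)ptr + size;                       /* bump the cursor */
        return ptr;
}

int main(void)
{
        void *a = alloc_block_buf(64);
        void *b = alloc_block_buf(64);

        printf("consecutive blocks are adjacent: %d\n",
               (char *)b - (char *)a == 64);
        return 0;
}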
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
80 pte_t *pte = pte_offset_kernel(pmd, addr); 100 pte_t *pte = pte_offset_kernel(pmd, addr);
81 if (pte_none(*pte)) { 101 if (pte_none(*pte)) {
82 pte_t entry; 102 pte_t entry;
83 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 103 void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
84 if (!p) 104 if (!p)
85 return NULL; 105 return NULL;
86 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 106 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
163 183
164 return map; 184 return map;
165} 185}
186
187void __init sparse_mem_maps_populate_node(struct page **map_map,
188 unsigned long pnum_begin,
189 unsigned long pnum_end,
190 unsigned long map_count, int nodeid)
191{
192 unsigned long pnum;
193 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
194 void *vmemmap_buf_start;
195
196 size = ALIGN(size, PMD_SIZE);
197 vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
198 PMD_SIZE, __pa(MAX_DMA_ADDRESS));
199
200 if (vmemmap_buf_start) {
201 vmemmap_buf = vmemmap_buf_start;
202 vmemmap_buf_end = vmemmap_buf_start + size * map_count;
203 }
204
205 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
206 struct mem_section *ms;
207
208 if (!present_section_nr(pnum))
209 continue;
210
211 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
212 if (map_map[pnum])
213 continue;
214 ms = __nr_to_section(pnum);
215		printk(KERN_ERR "%s: sparsemem memory map backing failed; "
216			"some memory will not be available.\n", __func__);
217 ms->section_mem_map = 0;
218 }
219
220 if (vmemmap_buf_start) {
221		/* free whatever is left of the buffer */
222#ifdef CONFIG_NO_BOOTMEM
223 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
224 if (vmemmap_buf_start < vmemmap_buf) {
225 char name[15];
226
227 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
228 reserve_early_without_check(__pa(vmemmap_buf_start),
229 __pa(vmemmap_buf), name);
230 }
231#else
232 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
233#endif
234 vmemmap_buf = NULL;
235 vmemmap_buf_end = NULL;
236 }
237}
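sparse_mem_maps_populate_node() above tries a single node-wide allocation sized for every present section, carves it into per-section maps, and only falls back to allocating each section on its own if the batch allocation fails. A userspace sketch of that carve-or-fallback structure, assuming a small present[] table and malloc()/calloc() in place of the boot allocators:

#include <stdio.h>
#include <stdlib.h>

#define NSECTIONS 8
#define SECTION_BYTES 64    /* stand-in for sizeof(struct page) * PAGES_PER_SECTION */

static int present[NSECTIONS] = { 1, 0, 1, 1, 0, 0, 1, 0 };

/* per-section fallback allocator (stand-in for sparse_mem_map_populate()) */
static void *alloc_one_section(int pnum)
{
        (void)pnum;
        return calloc(1, SECTION_BYTES);
}

static void populate_node(void *map_map[NSECTIONS], int map_count)
{
        char *map = malloc((size_t)SECTION_BYTES * map_count);  /* one batch for the node */
        int pnum;

        if (map) {
                for (pnum = 0; pnum < NSECTIONS; pnum++) {
                        if (!present[pnum])
                                continue;
                        map_map[pnum] = map;            /* carve the batch section by section */
                        map += SECTION_BYTES;
                }
                return;
        }

        for (pnum = 0; pnum < NSECTIONS; pnum++) {      /* fallback: allocate each section alone */
                if (!present[pnum])
                        continue;
                map_map[pnum] = alloc_one_section(pnum);
                if (!map_map[pnum])
                        fprintf(stderr, "section %d: backing failed\n", pnum);
        }
}

int main(void)
{
        void *map_map[NSECTIONS] = { 0 };
        int pnum, map_count = 0;

        for (pnum = 0; pnum < NSECTIONS; pnum++)
                map_count += present[pnum];
        populate_node(map_map, map_count);
        printf("section 2 map sits %ld bytes after section 0's\n",
               (long)((char *)map_map[2] - (char *)map_map[0])); /* 64 when the batch path wins */
        return 0;
}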
diff --git a/mm/sparse.c b/mm/sparse.c
index 6ce4aab69e99..22896d589133 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void)
271 271
272#ifdef CONFIG_MEMORY_HOTREMOVE 272#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init 273static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 274sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
275 unsigned long count)
275{ 276{
276 unsigned long section_nr; 277 unsigned long section_nr;
277 278
@@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
286 * this problem. 287 * this problem.
287 */ 288 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 289 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr); 290 return alloc_bootmem_section(usemap_size() * count, section_nr);
290} 291}
291 292
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 293static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
329} 330}
330#else 331#else
331static unsigned long * __init 332static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 333sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
334 unsigned long count)
333{ 335{
334 return NULL; 336 return NULL;
335} 337}
@@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 341}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 342#endif /* CONFIG_MEMORY_HOTREMOVE */
341 343
342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 344static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
345 unsigned long pnum_begin,
346 unsigned long pnum_end,
347 unsigned long usemap_count, int nodeid)
343{ 348{
344 unsigned long *usemap; 349 void *usemap;
345 struct mem_section *ms = __nr_to_section(pnum); 350 unsigned long pnum;
346 int nid = sparse_early_nid(ms); 351 int size = usemap_size();
347
348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
349 if (usemap)
350 return usemap;
351 352
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 353 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
354 usemap_count);
353 if (usemap) { 355 if (usemap) {
354 check_usemap_section_nr(nid, usemap); 356 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
355 return usemap; 357 if (!present_section_nr(pnum))
358 continue;
359 usemap_map[pnum] = usemap;
360 usemap += size;
361 }
362 return;
356 } 363 }
357 364
358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 365 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
359 nid = 0; 366 if (usemap) {
367 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
368 if (!present_section_nr(pnum))
369 continue;
370 usemap_map[pnum] = usemap;
371 usemap += size;
372 check_usemap_section_nr(nodeid, usemap_map[pnum]);
373 }
374 return;
375 }
360 376
361 printk(KERN_WARNING "%s: allocation failed\n", __func__); 377 printk(KERN_WARNING "%s: allocation failed\n", __func__);
362 return NULL;
363} 378}
364 379
365#ifndef CONFIG_SPARSEMEM_VMEMMAP 380#ifndef CONFIG_SPARSEMEM_VMEMMAP
@@ -375,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
375 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 390 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
376 return map; 391 return map;
377} 392}
393void __init sparse_mem_maps_populate_node(struct page **map_map,
394 unsigned long pnum_begin,
395 unsigned long pnum_end,
396 unsigned long map_count, int nodeid)
397{
398 void *map;
399 unsigned long pnum;
400 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
401
402 map = alloc_remap(nodeid, size * map_count);
403 if (map) {
404 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
405 if (!present_section_nr(pnum))
406 continue;
407 map_map[pnum] = map;
408 map += size;
409 }
410 return;
411 }
412
413 size = PAGE_ALIGN(size);
414 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
415 if (map) {
416 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
417 if (!present_section_nr(pnum))
418 continue;
419 map_map[pnum] = map;
420 map += size;
421 }
422 return;
423 }
424
425 /* fallback */
426 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
427 struct mem_section *ms;
428
429 if (!present_section_nr(pnum))
430 continue;
431 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
432 if (map_map[pnum])
433 continue;
434 ms = __nr_to_section(pnum);
435 printk(KERN_ERR "%s: sparsemem memory map backing failed "
436 "some memory will not be available.\n", __func__);
437 ms->section_mem_map = 0;
438 }
439}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 440#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 441
442#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
443static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
444 unsigned long pnum_begin,
445 unsigned long pnum_end,
446 unsigned long map_count, int nodeid)
447{
448 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
449 map_count, nodeid);
450}
451#else
380static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 452static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 453{
382 struct page *map; 454 struct page *map;
@@ -392,10 +464,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
392 ms->section_mem_map = 0; 464 ms->section_mem_map = 0;
393 return NULL; 465 return NULL;
394} 466}
467#endif
395 468
396void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) 469void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
397{ 470{
398} 471}
472
399/* 473/*
400 * Allocate the accumulated non-linear sections, allocate a mem_map 474 * Allocate the accumulated non-linear sections, allocate a mem_map
401 * for each and record the physical to section mapping. 475 * for each and record the physical to section mapping.
@@ -407,6 +481,14 @@ void __init sparse_init(void)
407 unsigned long *usemap; 481 unsigned long *usemap;
408 unsigned long **usemap_map; 482 unsigned long **usemap_map;
409 int size; 483 int size;
484 int nodeid_begin = 0;
485 unsigned long pnum_begin = 0;
486 unsigned long usemap_count;
487#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
488 unsigned long map_count;
489 int size2;
490 struct page **map_map;
491#endif
410 492
411 /* 493 /*
412 * map is using big page (aka 2M in x86 64 bit) 494 * map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +507,81 @@ void __init sparse_init(void)
425 panic("can not allocate usemap_map\n"); 507 panic("can not allocate usemap_map\n");
426 508
427 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 509 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
510 struct mem_section *ms;
511
428 if (!present_section_nr(pnum)) 512 if (!present_section_nr(pnum))
429 continue; 513 continue;
430 usemap_map[pnum] = sparse_early_usemap_alloc(pnum); 514 ms = __nr_to_section(pnum);
515 nodeid_begin = sparse_early_nid(ms);
516 pnum_begin = pnum;
517 break;
431 } 518 }
519 usemap_count = 1;
520 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
521 struct mem_section *ms;
522 int nodeid;
523
524 if (!present_section_nr(pnum))
525 continue;
526 ms = __nr_to_section(pnum);
527 nodeid = sparse_early_nid(ms);
528 if (nodeid == nodeid_begin) {
529 usemap_count++;
530 continue;
531 }
532		/* ok, we need to take care of sections from pnum_begin to pnum - 1 */
533 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
534 usemap_count, nodeid_begin);
535		/* a new node starts here: reset the count */
536 nodeid_begin = nodeid;
537 pnum_begin = pnum;
538 usemap_count = 1;
539 }
540 /* ok, last chunk */
541 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
542 usemap_count, nodeid_begin);
543
544#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
545 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
546 map_map = alloc_bootmem(size2);
547 if (!map_map)
548 panic("can not allocate map_map\n");
549
550 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
551 struct mem_section *ms;
552
553 if (!present_section_nr(pnum))
554 continue;
555 ms = __nr_to_section(pnum);
556 nodeid_begin = sparse_early_nid(ms);
557 pnum_begin = pnum;
558 break;
559 }
560 map_count = 1;
561 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
562 struct mem_section *ms;
563 int nodeid;
564
565 if (!present_section_nr(pnum))
566 continue;
567 ms = __nr_to_section(pnum);
568 nodeid = sparse_early_nid(ms);
569 if (nodeid == nodeid_begin) {
570 map_count++;
571 continue;
572 }
573		/* ok, we need to take care of sections from pnum_begin to pnum - 1 */
574 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
575 map_count, nodeid_begin);
576		/* a new node starts here: reset the count */
577 nodeid_begin = nodeid;
578 pnum_begin = pnum;
579 map_count = 1;
580 }
581 /* ok, last chunk */
582 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
583 map_count, nodeid_begin);
584#endif
432 585
433 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 586 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
434 if (!present_section_nr(pnum)) 587 if (!present_section_nr(pnum))
@@ -438,7 +591,11 @@ void __init sparse_init(void)
438 if (!usemap) 591 if (!usemap)
439 continue; 592 continue;
440 593
594#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
595 map = map_map[pnum];
596#else
441 map = sparse_early_mem_map_alloc(pnum); 597 map = sparse_early_mem_map_alloc(pnum);
598#endif
442 if (!map) 599 if (!map)
443 continue; 600 continue;
444 601
@@ -448,6 +605,9 @@ void __init sparse_init(void)
448 605
449 vmemmap_populate_print_last(); 606 vmemmap_populate_print_last();
450 607
608#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
609 free_bootmem(__pa(map_map), size2);
610#endif
451 free_bootmem(__pa(usemap_map), size); 611 free_bootmem(__pa(usemap_map), size);
452} 612}
453 613
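sparse_init() above now walks the present sections twice with the same pattern: seed nodeid_begin/pnum_begin from the first present section, extend the current run while the node id stays the same, flush a batched allocation whenever the node changes, and flush once more for the final run. A compact sketch of that run-length grouping over a flat array, assuming an illustrative section_nid[] table and a flush_run() callback in place of the kernel's batched allocators:

#include <stdio.h>

#define NSECTIONS 10

/* -1 marks a non-present section; otherwise the owning node id */
static int section_nid[NSECTIONS] = { -1, 0, 0, -1, 0, 1, 1, -1, 2, 2 };

/* stand-in for sparse_early_usemaps_alloc_node()/sparse_early_mem_maps_alloc_node() */
static void flush_run(int begin, int end, int count, int nid)
{
        printf("node %d: %d present section(s) in [%d, %d)\n", nid, count, begin, end);
}

int main(void)
{
        int pnum, nid_begin = -1, pnum_begin = 0, count;

        for (pnum = 0; pnum < NSECTIONS; pnum++) {      /* find the first present section */
                if (section_nid[pnum] < 0)
                        continue;
                nid_begin = section_nid[pnum];
                pnum_begin = pnum;
                break;
        }
        if (nid_begin < 0)
                return 0;

        count = 1;
        for (pnum = pnum_begin + 1; pnum < NSECTIONS; pnum++) {
                if (section_nid[pnum] < 0)
                        continue;
                if (section_nid[pnum] == nid_begin) {   /* same node: extend the run */
                        count++;
                        continue;
                }
                flush_run(pnum_begin, pnum, count, nid_begin);
                nid_begin = section_nid[pnum];          /* a new node starts here */
                pnum_begin = pnum;
                count = 1;
        }
        flush_run(pnum_begin, NSECTIONS, count, nid_begin);  /* last run */
        return 0;
}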
diff --git a/mm/swap.c b/mm/swap.c
index 308e57d8d7ed..9036b89813ac 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -55,7 +55,7 @@ static void __page_cache_release(struct page *page)
55 del_page_from_lru(zone, page); 55 del_page_from_lru(zone, page);
56 spin_unlock_irqrestore(&zone->lru_lock, flags); 56 spin_unlock_irqrestore(&zone->lru_lock, flags);
57 } 57 }
58 free_hot_page(page); 58 free_hot_cold_page(page, 0);
59} 59}
60 60
61static void put_compound_page(struct page *page) 61static void put_compound_page(struct page *page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c0585b16418..6cd0a8f90dc7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
723 return p != NULL; 723 return p != NULL;
724} 724}
725 725
726#ifdef CONFIG_CGROUP_MEM_RES_CTLR
727/**
728 * mem_cgroup_count_swap_user - count the user of a swap entry
729 * @ent: the swap entry to be checked
730 * @pagep: the pointer for the swap cache page of the entry to be stored
731 *
 732 * Returns the number of users of the swap entry. The count is valid only
733 * for swaps of anonymous pages.
 734 * If the entry is found in the swap cache, the page is stored in *pagep
 735 * with its refcount incremented.
736 */
737int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
738{
739 struct page *page;
740 struct swap_info_struct *p;
741 int count = 0;
742
743 page = find_get_page(&swapper_space, ent.val);
744 if (page)
745 count += page_mapcount(page);
746 p = swap_info_get(ent);
747 if (p) {
748 count += swap_count(p->swap_map[swp_offset(ent)]);
749 spin_unlock(&swap_lock);
750 }
751
752 *pagep = page;
753 return count;
754}
755#endif
756
726#ifdef CONFIG_HIBERNATION 757#ifdef CONFIG_HIBERNATION
727/* 758/*
728 * Find the swap type that corresponds to given device (if any). 759 * Find the swap type that corresponds to given device (if any).
@@ -840,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
840 goto out; 871 goto out;
841 } 872 }
842 873
843 inc_mm_counter(vma->vm_mm, anon_rss); 874 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
875 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
844 get_page(page); 876 get_page(page);
845 set_pte_at(vma->vm_mm, addr, pte, 877 set_pte_at(vma->vm_mm, addr, pte,
846 pte_mkold(mk_pte(page, vma->vm_page_prot))); 878 pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -1759,11 +1791,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1759 unsigned int type; 1791 unsigned int type;
1760 int i, prev; 1792 int i, prev;
1761 int error; 1793 int error;
1762 union swap_header *swap_header = NULL; 1794 union swap_header *swap_header;
1763 unsigned int nr_good_pages = 0; 1795 unsigned int nr_good_pages;
1764 int nr_extents = 0; 1796 int nr_extents = 0;
1765 sector_t span; 1797 sector_t span;
1766 unsigned long maxpages = 1; 1798 unsigned long maxpages;
1767 unsigned long swapfilepages; 1799 unsigned long swapfilepages;
1768 unsigned char *swap_map = NULL; 1800 unsigned char *swap_map = NULL;
1769 struct page *page = NULL; 1801 struct page *page = NULL;
@@ -1922,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1922 * swap pte. 1954 * swap pte.
1923 */ 1955 */
1924 maxpages = swp_offset(pte_to_swp_entry( 1956 maxpages = swp_offset(pte_to_swp_entry(
1925 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; 1957 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1926 if (maxpages > swap_header->info.last_page) 1958 if (maxpages > swap_header->info.last_page) {
1927 maxpages = swap_header->info.last_page; 1959 maxpages = swap_header->info.last_page + 1;
1960 /* p->max is an unsigned int: don't overflow it */
1961 if ((unsigned int)maxpages == 0)
1962 maxpages = UINT_MAX;
1963 }
1928 p->highest_bit = maxpages - 1; 1964 p->highest_bit = maxpages - 1;
1929 1965
1930 error = -EINVAL; 1966 error = -EINVAL;
@@ -1948,23 +1984,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1948 } 1984 }
1949 1985
1950 memset(swap_map, 0, maxpages); 1986 memset(swap_map, 0, maxpages);
1987 nr_good_pages = maxpages - 1; /* omit header page */
1988
1951 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1989 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1952 int page_nr = swap_header->info.badpages[i]; 1990 unsigned int page_nr = swap_header->info.badpages[i];
1953 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { 1991 if (page_nr == 0 || page_nr > swap_header->info.last_page) {
1954 error = -EINVAL; 1992 error = -EINVAL;
1955 goto bad_swap; 1993 goto bad_swap;
1956 } 1994 }
1957 swap_map[page_nr] = SWAP_MAP_BAD; 1995 if (page_nr < maxpages) {
1996 swap_map[page_nr] = SWAP_MAP_BAD;
1997 nr_good_pages--;
1998 }
1958 } 1999 }
1959 2000
1960 error = swap_cgroup_swapon(type, maxpages); 2001 error = swap_cgroup_swapon(type, maxpages);
1961 if (error) 2002 if (error)
1962 goto bad_swap; 2003 goto bad_swap;
1963 2004
1964 nr_good_pages = swap_header->info.last_page -
1965 swap_header->info.nr_badpages -
1966 1 /* header page */;
1967
1968 if (nr_good_pages) { 2005 if (nr_good_pages) {
1969 swap_map[0] = SWAP_MAP_BAD; 2006 swap_map[0] = SWAP_MAP_BAD;
1970 p->max = maxpages; 2007 p->max = maxpages;
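The swapon() changes above derive nr_good_pages from maxpages up front and then subtract each bad page that actually falls inside the usable range, instead of trusting the header's nr_badpages count; bad page numbers of zero or beyond last_page are rejected outright, and maxpages is clamped so it cannot wrap the unsigned int p->max. A small sketch of that validation arithmetic, with the swap header reduced to a couple of scalars (all values here are made up for illustration):

#include <stdio.h>
#include <limits.h>

#define SWAP_MAP_BAD 0x3f

int main(void)
{
        /* simplified swap header: last usable page and a few bad page numbers */
        unsigned long last_page = 1000;
        unsigned int badpages[] = { 5, 700, 2000 };  /* 2000 exceeds last_page and is rejected */
        int nr_badpages = 3;

        unsigned long maxpages = 1024;      /* stand-in for the pte-encodable limit + 1 */
        unsigned char swap_map[1024] = { 0 };
        unsigned int nr_good_pages;
        int i;

        if (maxpages > last_page) {
                maxpages = last_page + 1;
                if ((unsigned int)maxpages == 0)    /* p->max is an unsigned int: avoid wrap */
                        maxpages = UINT_MAX;
        }

        nr_good_pages = maxpages - 1;       /* omit the header page */
        for (i = 0; i < nr_badpages; i++) {
                unsigned int page_nr = badpages[i];

                if (page_nr == 0 || page_nr > last_page) {
                        printf("bad page %u out of range: -EINVAL\n", page_nr);
                        continue;           /* the kernel aborts swapon here */
                }
                if (page_nr < maxpages) {
                        swap_map[page_nr] = SWAP_MAP_BAD;
                        nr_good_pages--;
                }
        }
        printf("maxpages=%lu nr_good_pages=%u\n", maxpages, nr_good_pages);
        return 0;
}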
@@ -2155,7 +2192,11 @@ void swap_shmem_alloc(swp_entry_t entry)
2155} 2192}
2156 2193
2157/* 2194/*
2158 * increase reference count of swap entry by 1. 2195 * Increase reference count of swap entry by 1.
2196 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
2197 * but could not be atomically allocated. Returns 0, just as if it succeeded,
2198 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 2199 * might occur if a page table entry has been corrupted.
2159 */ 2200 */
2160int swap_duplicate(swp_entry_t entry) 2201int swap_duplicate(swp_entry_t entry)
2161{ 2202{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c26986c85ce0..79c809895fba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -262,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
262 return ret; 262 return ret;
263} 263}
264 264
265/* Called without lock on whether page is mapped, so answer is unstable */
266static inline int page_mapping_inuse(struct page *page)
267{
268 struct address_space *mapping;
269
270 /* Page is in somebody's page tables. */
271 if (page_mapped(page))
272 return 1;
273
274 /* Be more reluctant to reclaim swapcache than pagecache */
275 if (PageSwapCache(page))
276 return 1;
277
278 mapping = page_mapping(page);
279 if (!mapping)
280 return 0;
281
282 /* File is mmap'd by somebody? */
283 return mapping_mapped(mapping);
284}
285
286static inline int is_page_cache_freeable(struct page *page) 265static inline int is_page_cache_freeable(struct page *page)
287{ 266{
288 /* 267 /*
@@ -579,6 +558,65 @@ redo:
579 put_page(page); /* drop ref from isolate */ 558 put_page(page); /* drop ref from isolate */
580} 559}
581 560
561enum page_references {
562 PAGEREF_RECLAIM,
563 PAGEREF_RECLAIM_CLEAN,
564 PAGEREF_KEEP,
565 PAGEREF_ACTIVATE,
566};
567
568static enum page_references page_check_references(struct page *page,
569 struct scan_control *sc)
570{
571 int referenced_ptes, referenced_page;
572 unsigned long vm_flags;
573
574 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
575 referenced_page = TestClearPageReferenced(page);
576
577 /* Lumpy reclaim - ignore references */
578 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
579 return PAGEREF_RECLAIM;
580
581 /*
582 * Mlock lost the isolation race with us. Let try_to_unmap()
583 * move the page to the unevictable list.
584 */
585 if (vm_flags & VM_LOCKED)
586 return PAGEREF_RECLAIM;
587
588 if (referenced_ptes) {
589 if (PageAnon(page))
590 return PAGEREF_ACTIVATE;
591 /*
592 * All mapped pages start out with page table
593 * references from the instantiating fault, so we need
594 * to look twice if a mapped file page is used more
595 * than once.
596 *
597 * Mark it and spare it for another trip around the
598 * inactive list. Another page table reference will
599 * lead to its activation.
600 *
601 * Note: the mark is set for activated pages as well
602 * so that recently deactivated but used pages are
603 * quickly recovered.
604 */
605 SetPageReferenced(page);
606
607 if (referenced_page)
608 return PAGEREF_ACTIVATE;
609
610 return PAGEREF_KEEP;
611 }
612
613 /* Reclaim if clean, defer dirty pages to writeback */
614 if (referenced_page)
615 return PAGEREF_RECLAIM_CLEAN;
616
617 return PAGEREF_RECLAIM;
618}
619
582/* 620/*
583 * shrink_page_list() returns the number of reclaimed pages 621 * shrink_page_list() returns the number of reclaimed pages
584 */ 622 */
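page_check_references() above folds the old ad-hoc reference checks into one decision function with four outcomes: activate, keep, reclaim, or reclaim only if clean. A userspace sketch of that decision tree, with the page state reduced to a handful of flags; every name here is a stand-in for the kernel's page flags and rmap walk:

#include <stdio.h>

enum page_references {
        PAGEREF_RECLAIM,
        PAGEREF_RECLAIM_CLEAN,
        PAGEREF_KEEP,
        PAGEREF_ACTIVATE,
};

/* reduced page state: pte references found by the rmap walk, the software
 * Referenced bit, whether the page is anonymous, and VM_LOCKED */
struct demo_page {
        int referenced_ptes;
        int referenced_flag;
        int anon;
        int vm_locked;
};

static enum page_references check_references(struct demo_page *p, int lumpy)
{
        int referenced_page = p->referenced_flag;

        p->referenced_flag = 0;                  /* TestClearPageReferenced() */

        if (lumpy)
                return PAGEREF_RECLAIM;          /* lumpy reclaim ignores references */
        if (p->vm_locked)
                return PAGEREF_RECLAIM;          /* let try_to_unmap() move it to unevictable */

        if (p->referenced_ptes) {
                if (p->anon)
                        return PAGEREF_ACTIVATE;
                p->referenced_flag = 1;          /* SetPageReferenced(): remember this trip */
                if (referenced_page)
                        return PAGEREF_ACTIVATE; /* used on two trips: activate */
                return PAGEREF_KEEP;             /* first trip: spare it once */
        }

        if (referenced_page)
                return PAGEREF_RECLAIM_CLEAN;    /* reclaim only if no writeback is needed */
        return PAGEREF_RECLAIM;
}

int main(void)
{
        struct demo_page file_page = { .referenced_ptes = 1, .anon = 0 };

        printf("first pass:  %d\n", check_references(&file_page, 0));  /* PAGEREF_KEEP */
        file_page.referenced_ptes = 1;
        printf("second pass: %d\n", check_references(&file_page, 0));  /* PAGEREF_ACTIVATE */
        return 0;
}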
@@ -590,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
590 struct pagevec freed_pvec; 628 struct pagevec freed_pvec;
591 int pgactivate = 0; 629 int pgactivate = 0;
592 unsigned long nr_reclaimed = 0; 630 unsigned long nr_reclaimed = 0;
593 unsigned long vm_flags;
594 631
595 cond_resched(); 632 cond_resched();
596 633
597 pagevec_init(&freed_pvec, 1); 634 pagevec_init(&freed_pvec, 1);
598 while (!list_empty(page_list)) { 635 while (!list_empty(page_list)) {
636 enum page_references references;
599 struct address_space *mapping; 637 struct address_space *mapping;
600 struct page *page; 638 struct page *page;
601 int may_enter_fs; 639 int may_enter_fs;
602 int referenced;
603 640
604 cond_resched(); 641 cond_resched();
605 642
@@ -641,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
641 goto keep_locked; 678 goto keep_locked;
642 } 679 }
643 680
644 referenced = page_referenced(page, 1, 681 references = page_check_references(page, sc);
645 sc->mem_cgroup, &vm_flags); 682 switch (references) {
646 /* 683 case PAGEREF_ACTIVATE:
647 * In active use or really unfreeable? Activate it.
648 * If page which have PG_mlocked lost isoltation race,
649 * try_to_unmap moves it to unevictable list
650 */
651 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
652 referenced && page_mapping_inuse(page)
653 && !(vm_flags & VM_LOCKED))
654 goto activate_locked; 684 goto activate_locked;
685 case PAGEREF_KEEP:
686 goto keep_locked;
687 case PAGEREF_RECLAIM:
688 case PAGEREF_RECLAIM_CLEAN:
689 ; /* try to reclaim the page below */
690 }
655 691
656 /* 692 /*
657 * Anonymous process memory has backing store? 693 * Anonymous process memory has backing store?
@@ -685,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
685 } 721 }
686 722
687 if (PageDirty(page)) { 723 if (PageDirty(page)) {
688 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) 724 if (references == PAGEREF_RECLAIM_CLEAN)
689 goto keep_locked; 725 goto keep_locked;
690 if (!may_enter_fs) 726 if (!may_enter_fs)
691 goto keep_locked; 727 goto keep_locked;
@@ -1350,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1350 continue; 1386 continue;
1351 } 1387 }
1352 1388
1353 /* page_referenced clears PageReferenced */ 1389 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1354 if (page_mapping_inuse(page) &&
1355 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1356 nr_rotated++; 1390 nr_rotated++;
1357 /* 1391 /*
1358 * Identify referenced, file-backed active pages and 1392 * Identify referenced, file-backed active pages and
@@ -1501,6 +1535,13 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1501 unsigned long ap, fp; 1535 unsigned long ap, fp;
1502 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1536 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1503 1537
1538 /* If we have no swap space, do not bother scanning anon pages. */
1539 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1540 percent[0] = 0;
1541 percent[1] = 100;
1542 return;
1543 }
1544
1504 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1545 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1505 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1546 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1506 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1547 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1598,22 +1639,20 @@ static void shrink_zone(int priority, struct zone *zone,
1598 unsigned long nr_reclaimed = sc->nr_reclaimed; 1639 unsigned long nr_reclaimed = sc->nr_reclaimed;
1599 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1640 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1600 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1641 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1601 int noswap = 0;
1602 1642
1603 /* If we have no swap space, do not bother scanning anon pages. */ 1643 get_scan_ratio(zone, sc, percent);
1604 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1605 noswap = 1;
1606 percent[0] = 0;
1607 percent[1] = 100;
1608 } else
1609 get_scan_ratio(zone, sc, percent);
1610 1644
1611 for_each_evictable_lru(l) { 1645 for_each_evictable_lru(l) {
1612 int file = is_file_lru(l); 1646 int file = is_file_lru(l);
1613 unsigned long scan; 1647 unsigned long scan;
1614 1648
1649 if (percent[file] == 0) {
1650 nr[l] = 0;
1651 continue;
1652 }
1653
1615 scan = zone_nr_lru_pages(zone, sc, l); 1654 scan = zone_nr_lru_pages(zone, sc, l);
1616 if (priority || noswap) { 1655 if (priority) {
1617 scan >>= priority; 1656 scan >>= priority;
1618 scan = (scan * percent[file]) / 100; 1657 scan = (scan * percent[file]) / 100;
1619 } 1658 }
@@ -1694,8 +1733,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1694 continue; 1733 continue;
1695 note_zone_scanning_priority(zone, priority); 1734 note_zone_scanning_priority(zone, priority);
1696 1735
1697 if (zone_is_all_unreclaimable(zone) && 1736 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1698 priority != DEF_PRIORITY)
1699 continue; /* Let kswapd poll it */ 1737 continue; /* Let kswapd poll it */
1700 sc->all_unreclaimable = 0; 1738 sc->all_unreclaimable = 0;
1701 } else { 1739 } else {
@@ -1922,7 +1960,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1922 if (!populated_zone(zone)) 1960 if (!populated_zone(zone))
1923 continue; 1961 continue;
1924 1962
1925 if (zone_is_all_unreclaimable(zone)) 1963 if (zone->all_unreclaimable)
1926 continue; 1964 continue;
1927 1965
1928 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 1966 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
@@ -2012,8 +2050,7 @@ loop_again:
2012 if (!populated_zone(zone)) 2050 if (!populated_zone(zone))
2013 continue; 2051 continue;
2014 2052
2015 if (zone_is_all_unreclaimable(zone) && 2053 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2016 priority != DEF_PRIORITY)
2017 continue; 2054 continue;
2018 2055
2019 /* 2056 /*
@@ -2056,13 +2093,9 @@ loop_again:
2056 if (!populated_zone(zone)) 2093 if (!populated_zone(zone))
2057 continue; 2094 continue;
2058 2095
2059 if (zone_is_all_unreclaimable(zone) && 2096 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2060 priority != DEF_PRIORITY)
2061 continue; 2097 continue;
2062 2098
2063 if (!zone_watermark_ok(zone, order,
2064 high_wmark_pages(zone), end_zone, 0))
2065 all_zones_ok = 0;
2066 temp_priority[i] = priority; 2099 temp_priority[i] = priority;
2067 sc.nr_scanned = 0; 2100 sc.nr_scanned = 0;
2068 note_zone_scanning_priority(zone, priority); 2101 note_zone_scanning_priority(zone, priority);
@@ -2087,12 +2120,11 @@ loop_again:
2087 lru_pages); 2120 lru_pages);
2088 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2121 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2089 total_scanned += sc.nr_scanned; 2122 total_scanned += sc.nr_scanned;
2090 if (zone_is_all_unreclaimable(zone)) 2123 if (zone->all_unreclaimable)
2091 continue; 2124 continue;
2092 if (nr_slab == 0 && zone->pages_scanned >= 2125 if (nr_slab == 0 &&
2093 (zone_reclaimable_pages(zone) * 6)) 2126 zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
2094 zone_set_flag(zone, 2127 zone->all_unreclaimable = 1;
2095 ZONE_ALL_UNRECLAIMABLE);
2096 /* 2128 /*
2097 * If we've done a decent amount of scanning and 2129 * If we've done a decent amount of scanning and
2098 * the reclaim ratio is low, start doing writepage 2130 * the reclaim ratio is low, start doing writepage
@@ -2102,13 +2134,18 @@ loop_again:
2102 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2134 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2103 sc.may_writepage = 1; 2135 sc.may_writepage = 1;
2104 2136
2105 /* 2137 if (!zone_watermark_ok(zone, order,
2106 * We are still under min water mark. it mean we have 2138 high_wmark_pages(zone), end_zone, 0)) {
2107 * GFP_ATOMIC allocation failure risk. Hurry up! 2139 all_zones_ok = 0;
2108 */ 2140 /*
2109 if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), 2141 * We are still under min water mark. This
2110 end_zone, 0)) 2142 * means that we have a GFP_ATOMIC allocation
2111 has_under_min_watermark_zone = 1; 2143 * failure risk. Hurry up!
2144 */
2145 if (!zone_watermark_ok(zone, order,
2146 min_wmark_pages(zone), end_zone, 0))
2147 has_under_min_watermark_zone = 1;
2148 }
2112 2149
2113 } 2150 }
2114 if (all_zones_ok) 2151 if (all_zones_ok)
@@ -2550,6 +2587,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2550 * and RECLAIM_SWAP. 2587 * and RECLAIM_SWAP.
2551 */ 2588 */
2552 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 2589 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
2590 lockdep_set_current_reclaim_state(gfp_mask);
2553 reclaim_state.reclaimed_slab = 0; 2591 reclaim_state.reclaimed_slab = 0;
2554 p->reclaim_state = &reclaim_state; 2592 p->reclaim_state = &reclaim_state;
2555 2593
@@ -2593,6 +2631,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2593 2631
2594 p->reclaim_state = NULL; 2632 p->reclaim_state = NULL;
2595 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2633 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2634 lockdep_clear_current_reclaim_state();
2596 return sc.nr_reclaimed >= nr_pages; 2635 return sc.nr_reclaimed >= nr_pages;
2597} 2636}
2598 2637
@@ -2615,7 +2654,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2615 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 2654 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
2616 return ZONE_RECLAIM_FULL; 2655 return ZONE_RECLAIM_FULL;
2617 2656
2618 if (zone_is_all_unreclaimable(zone)) 2657 if (zone->all_unreclaimable)
2619 return ZONE_RECLAIM_FULL; 2658 return ZONE_RECLAIM_FULL;
2620 2659
2621 /* 2660 /*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6051fbab67ba..7f760cbc73f3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void)
139 threshold = calculate_threshold(zone); 139 threshold = calculate_threshold(zone);
140 140
141 for_each_online_cpu(cpu) 141 for_each_online_cpu(cpu)
142 zone_pcp(zone, cpu)->stat_threshold = threshold; 142 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
143 = threshold;
143 } 144 }
144} 145}
145 146
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void)
149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 150void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150 int delta) 151 int delta)
151{ 152{
152 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 153 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
154
153 s8 *p = pcp->vm_stat_diff + item; 155 s8 *p = pcp->vm_stat_diff + item;
154 long x; 156 long x;
155 157
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
202 */ 204 */
203void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 205void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
204{ 206{
205 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 207 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
206 s8 *p = pcp->vm_stat_diff + item; 208 s8 *p = pcp->vm_stat_diff + item;
207 209
208 (*p)++; 210 (*p)++;
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
223 225
224void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 226void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
225{ 227{
226 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 228 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
227 s8 *p = pcp->vm_stat_diff + item; 229 s8 *p = pcp->vm_stat_diff + item;
228 230
229 (*p)--; 231 (*p)--;
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
300 for_each_populated_zone(zone) { 302 for_each_populated_zone(zone) {
301 struct per_cpu_pageset *p; 303 struct per_cpu_pageset *p;
302 304
303 p = zone_pcp(zone, cpu); 305 p = per_cpu_ptr(zone->pageset, cpu);
304 306
305 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 307 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
306 if (p->vm_stat_diff[i]) { 308 if (p->vm_stat_diff[i]) {
@@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
741 for_each_online_cpu(i) { 743 for_each_online_cpu(i) {
742 struct per_cpu_pageset *pageset; 744 struct per_cpu_pageset *pageset;
743 745
744 pageset = zone_pcp(zone, i); 746 pageset = per_cpu_ptr(zone->pageset, i);
745 seq_printf(m, 747 seq_printf(m,
746 "\n cpu: %i" 748 "\n cpu: %i"
747 "\n count: %i" 749 "\n count: %i"
@@ -761,7 +763,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
761 "\n prev_priority: %i" 763 "\n prev_priority: %i"
762 "\n start_pfn: %lu" 764 "\n start_pfn: %lu"
763 "\n inactive_ratio: %u", 765 "\n inactive_ratio: %u",
764 zone_is_all_unreclaimable(zone), 766 zone->all_unreclaimable,
765 zone->prev_priority, 767 zone->prev_priority,
766 zone->zone_start_pfn, 768 zone->zone_start_pfn,
767 zone->inactive_ratio); 769 zone->inactive_ratio);
@@ -906,6 +908,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
906 case CPU_ONLINE: 908 case CPU_ONLINE:
907 case CPU_ONLINE_FROZEN: 909 case CPU_ONLINE_FROZEN:
908 start_cpu_timer(cpu); 910 start_cpu_timer(cpu);
911 node_set_state(cpu_to_node(cpu), N_CPU);
909 break; 912 break;
910 case CPU_DOWN_PREPARE: 913 case CPU_DOWN_PREPARE:
911 case CPU_DOWN_PREPARE_FROZEN: 914 case CPU_DOWN_PREPARE_FROZEN: