Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                 4
-rw-r--r--  mm/Makefile                6
-rw-r--r--  mm/backing-dev.c          37
-rw-r--r--  mm/bootmem.c             198
-rw-r--r--  mm/bounce.c                1
-rw-r--r--  mm/fadvise.c              10
-rw-r--r--  mm/failslab.c             19
-rw-r--r--  mm/filemap.c               6
-rw-r--r--  mm/filemap_xip.c           3
-rw-r--r--  mm/fremap.c                2
-rw-r--r--  mm/highmem.c               2
-rw-r--r--  mm/hugetlb.c              11
-rw-r--r--  mm/kmemleak.c              1
-rw-r--r--  mm/ksm.c                  26
-rw-r--r--  mm/memcontrol.c         1427
-rw-r--r--  mm/memory-failure.c        6
-rw-r--r--  mm/memory.c              181
-rw-r--r--  mm/memory_hotplug.c       10
-rw-r--r--  mm/mempolicy.c           165
-rw-r--r--  mm/migrate.c               7
-rw-r--r--  mm/mincore.c               2
-rw-r--r--  mm/mlock.c                12
-rw-r--r--  mm/mmap.c                262
-rw-r--r--  mm/mmu_context.c           4
-rw-r--r--  mm/mmu_notifier.c          1
-rw-r--r--  mm/mprotect.c              1
-rw-r--r--  mm/mremap.c               10
-rw-r--r--  mm/nommu.c                35
-rw-r--r--  mm/oom_kill.c             15
-rw-r--r--  mm/page_alloc.c          401
-rw-r--r--  mm/page_cgroup.c          42
-rw-r--r--  mm/page_io.c               1
-rw-r--r--  mm/pagewalk.c             47
-rw-r--r--  mm/percpu.c               62
-rw-r--r--  mm/percpu_up.c            30
-rw-r--r--  mm/quicklist.c             1
-rw-r--r--  mm/readahead.c             7
-rw-r--r--  mm/rmap.c                211
-rw-r--r--  mm/slab.c                 26
-rw-r--r--  mm/slub.c                346
-rw-r--r--  mm/sparse-vmemmap.c       77
-rw-r--r--  mm/sparse.c              197
-rw-r--r--  mm/swap.c                  3
-rw-r--r--  mm/swap_state.c            1
-rw-r--r--  mm/swapfile.c             71
-rw-r--r--  mm/truncate.c              1
-rw-r--r--  mm/util.c                 21
-rw-r--r--  mm/vmscan.c              156
-rw-r--r--  mm/vmstat.c               18
49 files changed, 3110 insertions, 1073 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d34c2b971032..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
 config SPARSEMEM_VMEMMAP_ENABLE
 	bool
 
+config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+	def_bool y
+	depends on SPARSEMEM && X86_64
+
 config SPARSEMEM_VMEMMAP
 	bool "Sparse Memory virtual memmap"
 	depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
diff --git a/mm/Makefile b/mm/Makefile
index 7a68d2ab5560..6c2a73a54a43 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,11 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_SMP) += percpu.o
+ifdef CONFIG_SMP
+obj-y += percpu.o
+else
+obj-y += percpu_up.o
+endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca0347707..707d0dc6da0f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -11,6 +11,8 @@
 #include <linux/writeback.h>
 #include <linux/device.h>
 
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
 }
@@ -25,6 +27,11 @@ struct backing_dev_info default_backing_dev_info = {
 };
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
+struct backing_dev_info noop_backing_dev_info = {
+	.name	= "noop",
+};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
+
 static struct class *bdi_class;
 
 /*
@@ -227,6 +234,9 @@ static struct device_attribute bdi_dev_attrs[] = {
 static __init int bdi_class_init(void)
 {
 	bdi_class = class_create(THIS_MODULE, "bdi");
+	if (IS_ERR(bdi_class))
+		return PTR_ERR(bdi_class);
+
 	bdi_class->dev_attrs = bdi_dev_attrs;
 	bdi_debug_init();
 	return 0;
@@ -712,6 +722,33 @@ void bdi_destroy(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_destroy);
 
+/*
+ * For use from filesystems to quickly init and register a bdi associated
+ * with dirty writeback
+ */
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
+			   unsigned int cap)
+{
+	char tmp[32];
+	int err;
+
+	bdi->name = name;
+	bdi->capabilities = cap;
+	err = bdi_init(bdi);
+	if (err)
+		return err;
+
+	sprintf(tmp, "%.28s%s", name, "-%d");
+	err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+	if (err) {
+		bdi_destroy(bdi);
+		return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bdi_setup_and_register);
+
 static wait_queue_head_t congestion_wqh[2] = {
 	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..58c66cc5056a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -10,9 +10,11 @@
  */
 #include <linux/init.h>
 #include <linux/pfn.h>
+#include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/kmemleak.h>
+#include <linux/range.h>
 
 #include <asm/bug.h>
 #include <asm/io.h>
@@ -32,6 +34,7 @@ unsigned long max_pfn;
 unsigned long saved_max_pfn;
 #endif
 
+#ifndef CONFIG_NO_BOOTMEM
 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
 
 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +145,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 	min_low_pfn = start;
 	return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
 }
-
+#endif
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
@@ -167,6 +170,53 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
 	}
 }
 
+#ifdef CONFIG_NO_BOOTMEM
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+	int i;
+	unsigned long start_aligned, end_aligned;
+	int order = ilog2(BITS_PER_LONG);
+
+	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+	end_aligned = end & ~(BITS_PER_LONG - 1);
+
+	if (end_aligned <= start_aligned) {
+		for (i = start; i < end; i++)
+			__free_pages_bootmem(pfn_to_page(i), 0);
+
+		return;
+	}
+
+	for (i = start; i < start_aligned; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+
+	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+		__free_pages_bootmem(pfn_to_page(i), order);
+
+	for (i = end_aligned; i < end; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+}
+
+unsigned long __init free_all_memory_core_early(int nodeid)
+{
+	int i;
+	u64 start, end;
+	unsigned long count = 0;
+	struct range *range = NULL;
+	int nr_range;
+
+	nr_range = get_free_all_memory_range(&range, nodeid);
+
+	for (i = 0; i < nr_range; i++) {
+		start = range[i].start;
+		end = range[i].end;
+		count += end - start;
+		__free_pages_memory(start, end);
+	}
+
+	return count;
+}
+#else
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
 	int aligned;
@@ -227,6 +277,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 
 	return count;
 }
+#endif
 
 /**
  * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -237,7 +288,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 {
 	register_page_bootmem_info_node(pgdat);
+#ifdef CONFIG_NO_BOOTMEM
+	/* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+	return 0;
+#else
 	return free_all_bootmem_core(pgdat->bdata);
+#endif
 }
 
 /**
@@ -247,9 +303,27 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
  */
 unsigned long __init free_all_bootmem(void)
 {
-	return free_all_bootmem_core(NODE_DATA(0)->bdata);
+#ifdef CONFIG_NO_BOOTMEM
+	/*
+	 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+	 * because in some case like Node0 doesnt have RAM installed
+	 * low ram will be on Node1
+	 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
+	 * will be used instead of only Node0 related
+	 */
+	return free_all_memory_core_early(MAX_NUMNODES);
+#else
+	unsigned long total_pages = 0;
+	bootmem_data_t *bdata;
+
+	list_for_each_entry(bdata, &bdata_list, list)
+		total_pages += free_all_bootmem_core(bdata);
+
+	return total_pages;
+#endif
 }
 
+#ifndef CONFIG_NO_BOOTMEM
 static void __init __free(bootmem_data_t *bdata,
 			unsigned long sidx, unsigned long eidx)
 {
@@ -344,6 +418,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
 	}
 	BUG();
 }
+#endif
 
 /**
  * free_bootmem_node - mark a page range as usable
@@ -358,6 +433,9 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
 void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 			      unsigned long size)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	free_early(physaddr, physaddr + size);
+#else
 	unsigned long start, end;
 
 	kmemleak_free_part(__va(physaddr), size);
@@ -366,6 +444,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 	end = PFN_DOWN(physaddr + size);
 
 	mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
+#endif
 }
 
 /**
@@ -379,6 +458,9 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
  */
 void __init free_bootmem(unsigned long addr, unsigned long size)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	free_early(addr, addr + size);
+#else
 	unsigned long start, end;
 
 	kmemleak_free_part(__va(addr), size);
@@ -387,6 +469,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 	end = PFN_DOWN(addr + size);
 
 	mark_bootmem(start, end, 0, 0);
+#endif
 }
 
 /**
@@ -403,12 +486,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 				 unsigned long size, int flags)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	panic("no bootmem");
+	return 0;
+#else
 	unsigned long start, end;
 
 	start = PFN_DOWN(physaddr);
 	end = PFN_UP(physaddr + size);
 
 	return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
+#endif
 }
 
 /**
@@ -424,14 +512,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 int __init reserve_bootmem(unsigned long addr, unsigned long size,
 			    int flags)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	panic("no bootmem");
+	return 0;
+#else
 	unsigned long start, end;
 
 	start = PFN_DOWN(addr);
 	end = PFN_UP(addr + size);
 
 	return mark_bootmem(start, end, 1, flags);
+#endif
 }
 
+#ifndef CONFIG_NO_BOOTMEM
 static unsigned long __init align_idx(struct bootmem_data *bdata,
 				      unsigned long idx, unsigned long step)
 {
@@ -582,12 +676,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
 #endif
 	return NULL;
 }
+#endif
 
 static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
 					unsigned long limit)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	void *ptr;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc(size, GFP_NOWAIT);
+
+restart:
+
+	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+
+	if (ptr)
+		return ptr;
+
+	if (goal != 0) {
+		goal = 0;
+		goto restart;
+	}
+
+	return NULL;
+#else
 	bootmem_data_t *bdata;
 	void *region;
 
@@ -613,6 +728,7 @@ restart:
 	}
 
 	return NULL;
+#endif
 }
 
 /**
@@ -631,7 +747,13 @@ restart:
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
 					unsigned long goal)
 {
-	return ___alloc_bootmem_nopanic(size, align, goal, 0);
+	unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+	limit = -1UL;
+#endif
+
+	return ___alloc_bootmem_nopanic(size, align, goal, limit);
 }
 
 static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -665,9 +787,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 			      unsigned long goal)
 {
-	return ___alloc_bootmem(size, align, goal, 0);
+	unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+	limit = -1UL;
+#endif
+
+	return ___alloc_bootmem(size, align, goal, limit);
 }
 
+#ifndef CONFIG_NO_BOOTMEM
 static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
@@ -684,6 +813,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 
 	return ___alloc_bootmem(size, align, goal, limit);
 }
+#endif
 
 /**
  * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -706,7 +836,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
+#ifdef CONFIG_NO_BOOTMEM
+	return __alloc_memory_core_early(pgdat->node_id, size, align,
+					 goal, -1ULL);
+#else
 	return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+#endif
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+	unsigned long end_pfn;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	/* update goal according ...MAX_DMA32_PFN */
+	end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+	if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+	    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+		void *ptr;
+		unsigned long new_goal;
+
+		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+#ifdef CONFIG_NO_BOOTMEM
+		ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+						 new_goal, -1ULL);
+#else
+		ptr = alloc_bootmem_core(pgdat->bdata, size, align,
+						 new_goal, 0);
+#endif
+		if (ptr)
+			return ptr;
+	}
+#endif
+
+	return __alloc_bootmem_node(pgdat, size, align, goal);
+
 }
 
 #ifdef CONFIG_SPARSEMEM
@@ -720,6 +889,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 void * __init alloc_bootmem_section(unsigned long size,
 				    unsigned long section_nr)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	unsigned long pfn, goal, limit;
+
+	pfn = section_nr_to_pfn(section_nr);
+	goal = pfn << PAGE_SHIFT;
+	limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+
+	return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
+					 SMP_CACHE_BYTES, goal, limit);
+#else
 	bootmem_data_t *bdata;
 	unsigned long pfn, goal, limit;
 
@@ -729,6 +908,7 @@ void * __init alloc_bootmem_section(unsigned long size,
 	bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
 
 	return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+#endif
 }
 #endif
 
@@ -740,11 +920,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
+#ifdef CONFIG_NO_BOOTMEM
+	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+						 goal, -1ULL);
+#else
 	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return ptr;
 
 	ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+#endif
 	if (ptr)
 		return ptr;
 
@@ -795,6 +980,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
+#ifdef CONFIG_NO_BOOTMEM
+	return __alloc_memory_core_early(pgdat->node_id, size, align,
+				goal, ARCH_LOW_ADDRESS_LIMIT);
+#else
 	return ___alloc_bootmem_node(pgdat->bdata, size, align,
 				goal, ARCH_LOW_ADDRESS_LIMIT);
+#endif
 }
diff --git a/mm/bounce.c b/mm/bounce.c
index a2b76a588e34..13b6dad1eed2 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 		file->f_ra.ra_pages = bdi->ra_pages;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_RANDOM:
-		file->f_ra.ra_pages = 0;
+		spin_lock(&file->f_lock);
+		file->f_mode |= FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_SEQUENTIAL:
 		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_WILLNEED:
 		if (!mapping->a_ops->readpage) {
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..c5f88f240ddc 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,21 @@
 #include <linux/fault-inject.h>
-#include <linux/gfp.h>
+#include <linux/slab.h>
 
 static struct {
 	struct fault_attr attr;
 	u32 ignore_gfp_wait;
+	int cache_filter;
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 	struct dentry *ignore_gfp_wait_file;
+	struct dentry *cache_filter_file;
 #endif
 } failslab = {
 	.attr = FAULT_ATTR_INITIALIZER,
 	.ignore_gfp_wait = 1,
+	.cache_filter = 0,
 };
 
-bool should_failslab(size_t size, gfp_t gfpflags)
+bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
 {
 	if (gfpflags & __GFP_NOFAIL)
 		return false;
@@ -20,6 +23,9 @@ bool should_failslab(size_t size, gfp_t gfpflags)
 	if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
 		return false;
 
+	if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+		return false;
+
 	return should_fail(&failslab.attr, size);
 }
 
@@ -30,7 +36,6 @@ static int __init setup_failslab(char *str)
 __setup("failslab=", setup_failslab);
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
 static int __init failslab_debugfs_init(void)
 {
 	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
@@ -46,8 +51,14 @@ static int __init failslab_debugfs_init(void)
 	debugfs_create_bool("ignore-gfp-wait", mode, dir,
 				&failslab.ignore_gfp_wait);
 
-	if (!failslab.ignore_gfp_wait_file) {
+	failslab.cache_filter_file =
+		debugfs_create_bool("cache-filter", mode, dir,
+				&failslab.cache_filter);
+
+	if (!failslab.ignore_gfp_wait_file ||
+	    !failslab.cache_filter_file) {
 		err = -ENOMEM;
+		debugfs_remove(failslab.cache_filter_file);
 		debugfs_remove(failslab.ignore_gfp_wait_file);
 		cleanup_fault_attr_dentries(&failslab.attr);
 	}
diff --git a/mm/filemap.c b/mm/filemap.c
index 698ea80f2102..140ebda9640f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -10,13 +10,13 @@
  * the NFS filesystem used to do this differently, for example)
  */
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
 #include <linux/kernel_stat.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/mman.h>
@@ -1117,7 +1117,7 @@ readpage:
 		if (!PageUptodate(page)) {
 			if (page->mapping == NULL) {
 				/*
-				 * invalidate_inode_pages got it
+				 * invalidate_mapping_pages got it
 				 */
 				unlock_page(page);
 				page_cache_release(page);
@@ -1986,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count);
 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
 {
 	struct inode *inode = file->f_mapping->host;
-	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	unsigned long limit = rlimit(RLIMIT_FSIZE);
 
 	if (unlikely(*pos < 0))
 		return -EINVAL;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d71bb8..83364df74a33 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/seqlock.h>
 #include <linux/mutex.h>
+#include <linux/gfp.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
@@ -194,7 +195,7 @@ retry:
 		flush_cache_page(vma, address, pte_pfn(*pte));
 		pteval = ptep_clear_flush_notify(vma, address, pte);
 		page_remove_rmap(page);
-		dec_mm_counter(mm, file_rss);
+		dec_mm_counter(mm, MM_FILEPAGES);
 		BUG_ON(pte_dirty(pteval));
 		pte_unmap_unlock(pte, ptl);
 		page_cache_release(page);
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85abbb39..46f5dacf90a2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
-			dec_mm_counter(mm, file_rss);
+			dec_mm_counter(mm, MM_FILEPAGES);
 		}
 	} else {
 		if (!pte_file(pte))
diff --git a/mm/highmem.c b/mm/highmem.c
index 9c1e627f282e..bed8a8bfd01f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -220,7 +220,7 @@ EXPORT_SYMBOL(kmap_high);
  * @page: &struct page to pin
  *
  * Returns the page's current virtual memory address, or NULL if no mapping
- * exists. When and only when a non null address is returned then a
+ * exists. If and only if a non null address is returned then a
  * matching call to kunmap_high() is necessary.
  *
  * This can be called from any context.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d16fa6b8c2d..ffbdfc86aedf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2,7 +2,6 @@
  * Generic hugetlb support.
  * (C) William Irwin, April 2004
  */
-#include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -18,6 +17,7 @@
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
+#include <linux/slab.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -546,6 +546,7 @@ static void free_huge_page(struct page *page)
 
 	mapping = (struct address_space *) page_private(page);
 	set_page_private(page, 0);
+	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	INIT_LIST_HEAD(&page->lru);
 
@@ -2087,7 +2088,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 
 	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, ptep);
 	}
 }
 
@@ -2447,8 +2448,10 @@ retry:
 			spin_lock(&inode->i_lock);
 			inode->i_blocks += blocks_per_huge_page(h);
 			spin_unlock(&inode->i_lock);
-		} else
+		} else {
 			lock_page(page);
+			page->mapping = HUGETLB_POISON;
+		}
 	}
 
 	/*
@@ -2558,7 +2561,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = pte_mkyoung(entry);
 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
 						flags & FAULT_FLAG_WRITE))
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, ptep);
 
 out_page_table_lock:
 	spin_unlock(&mm->page_table_lock);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5b069e4f5e48..2c0d032ac898 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -72,7 +72,6 @@
 #include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/prio_tree.h>
-#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index 56a0da1f9979..956880f2ff49 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -365,7 +365,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 	do {
 		cond_resched();
 		page = follow_page(vma, addr, FOLL_GET);
-		if (!page)
+		if (IS_ERR_OR_NULL(page))
 			break;
 		if (PageKsm(page))
 			ret = handle_mm_fault(vma->vm_mm, vma, addr,
@@ -447,7 +447,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 		goto out;
 
 	page = follow_page(vma, addr, FOLL_GET);
-	if (!page)
+	if (IS_ERR_OR_NULL(page))
 		goto out;
 	if (PageAnon(page)) {
 		flush_anon_page(vma, page, addr);
@@ -751,7 +751,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 	 * page
 	 */
 	if (page_mapcount(page) + 1 + swapped != page_count(page)) {
-		set_pte_at_notify(mm, addr, ptep, entry);
+		set_pte_at(mm, addr, ptep, entry);
 		goto out_unlock;
 	}
 	entry = pte_wrprotect(entry);
@@ -1086,7 +1086,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 		cond_resched();
 		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
 		tree_page = get_mergeable_page(tree_rmap_item);
-		if (!tree_page)
+		if (IS_ERR_OR_NULL(tree_page))
 			return NULL;
 
 		/*
@@ -1294,7 +1294,7 @@ next_mm:
 			if (ksm_test_exit(mm))
 				break;
 			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
-			if (*page && PageAnon(*page)) {
+			if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
 				flush_anon_page(vma, *page, ksm_scan.address);
 				flush_dcache_page(*page);
 				rmap_item = get_next_rmap_item(slot,
@@ -1308,7 +1308,7 @@ next_mm:
 				up_read(&mm->mmap_sem);
 				return rmap_item;
 			}
-			if (*page)
+			if (!IS_ERR_OR_NULL(*page))
 				put_page(*page);
 			ksm_scan.address += PAGE_SIZE;
 			cond_resched();
@@ -1367,7 +1367,7 @@ next_mm:
 static void ksm_do_scan(unsigned int scan_npages)
 {
 	struct rmap_item *rmap_item;
-	struct page *page;
+	struct page *uninitialized_var(page);
 
 	while (scan_npages--) {
 		cond_resched();
@@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
 again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
 		spin_lock(&anon_vma->lock);
-		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
 				continue;
@@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
 		spin_lock(&anon_vma->lock);
-		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
 				continue;
@@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
 again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
 		spin_lock(&anon_vma->lock);
-		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
 				continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 954032b80bed..0f711c213d2e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@
  * Copyright 2007 OpenVZ SWsoft Inc
  * Author: Pavel Emelianov <xemul@openvz.org>
  *
+ * Memory thresholds
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +25,7 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
@@ -32,7 +37,10 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/spinlock.h>
+#include <linux/eventfd.h>
+#include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
@@ -55,7 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #define do_swap_account	(0)
 #endif
 
-#define SOFTLIMIT_EVENTS_THRESH (1000)
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. This counter
+ * is used for trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ *
+ * These values will be used as !((event) & ((1 <<(thresh)) - 1))
+ */
+#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
+#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
 
 /*
  * Statistics for memory cgroup.
@@ -69,62 +85,16 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
-	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+	MEM_CGROUP_EVENTS,	/* incremented at every pagein/pageout */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
 
 struct mem_cgroup_stat_cpu {
 	s64 count[MEM_CGROUP_STAT_NSTATS];
-} ____cacheline_aligned_in_smp;
-
-struct mem_cgroup_stat {
-	struct mem_cgroup_stat_cpu cpustat[0];
 };
 
-static inline void
-__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
-				enum mem_cgroup_stat_index idx)
-{
-	stat->count[idx] = 0;
-}
-
-static inline s64
-__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
-				enum mem_cgroup_stat_index idx)
-{
-	return stat->count[idx];
-}
-
-/*
- * For accounting under irq disable, no need for increment preempt count.
- */
-static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
-		enum mem_cgroup_stat_index idx, int val)
-{
-	stat->count[idx] += val;
-}
-
-static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
-		enum mem_cgroup_stat_index idx)
-{
-	int cpu;
-	s64 ret = 0;
-	for_each_possible_cpu(cpu)
-		ret += stat->cpustat[cpu].count[idx];
-	return ret;
-}
-
-static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
-{
-	s64 ret;
-
-	ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
-	ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
-	return ret;
-}
-
 /*
  * per-zone information in memory controller.
  */
@@ -174,6 +144,22 @@ struct mem_cgroup_tree {
 
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 
+struct mem_cgroup_threshold {
+	struct eventfd_ctx *eventfd;
+	u64 threshold;
+};
+
+struct mem_cgroup_threshold_ary {
+	/* An array index points to threshold just below usage. */
+	atomic_t current_threshold;
+	/* Size of entries[] */
+	unsigned int size;
+	/* Array of thresholds */
+	struct mem_cgroup_threshold entries[0];
+};
+
+static void mem_cgroup_threshold(struct mem_cgroup *mem);
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -217,7 +203,7 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	unsigned long last_oom_jiffies;
+	atomic_t oom_lock;
 	atomic_t refcnt;
 
 	unsigned int swappiness;
@@ -225,10 +211,48 @@ struct mem_cgroup {
 	/* set when res.limit == memsw.limit */
 	bool memsw_is_minimum;
 
+	/* protect arrays of thresholds */
+	struct mutex thresholds_lock;
+
+	/* thresholds for memory usage. RCU-protected */
+	struct mem_cgroup_threshold_ary *thresholds;
+
+	/* thresholds for mem+swap usage. RCU-protected */
+	struct mem_cgroup_threshold_ary *memsw_thresholds;
+
+	/*
+	 * Should we move charges of a task when a task is moved into this
+	 * mem_cgroup ? And what type of charges should we move ?
+	 */
+	unsigned long move_charge_at_immigrate;
+
 	/*
-	 * statistics. This must be placed at the end of memcg.
+	 * percpu counter.
 	 */
-	struct mem_cgroup_stat stat;
+	struct mem_cgroup_stat_cpu *stat;
+};
+
+/* Stuffs for move charges at task migration. */
+/*
+ * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
+ * left-shifted bitmap of these types.
+ */
+enum move_type {
+	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
+	NR_MOVE_TYPE,
+};
+
+/* "mc" and its members are protected by cgroup_mutex */
+static struct move_charge_struct {
+	struct mem_cgroup *from;
+	struct mem_cgroup *to;
+	unsigned long precharge;
+	unsigned long moved_charge;
+	unsigned long moved_swap;
+	struct task_struct *moving_task;	/* a task moving charges */
+	wait_queue_head_t waitq;		/* a waitq for other context */
+} mc = {
+	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
 /*
@@ -371,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 	spin_unlock(&mctz->lock);
 }
 
-static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
-{
-	bool ret = false;
-	int cpu;
-	s64 val;
-	struct mem_cgroup_stat_cpu *cpustat;
-
-	cpu = get_cpu();
-	cpustat = &mem->stat.cpustat[cpu];
-	val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
-	if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
-		__mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
-		ret = true;
-	}
-	put_cpu();
-	return ret;
-}
 
 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
 {
@@ -481,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 	return mz;
 }
 
+static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
+		enum mem_cgroup_stat_index idx)
+{
+	int cpu;
+	s64 val = 0;
+
+	for_each_possible_cpu(cpu)
+		val += per_cpu(mem->stat->count[idx], cpu);
+	return val;
+}
+
+static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+{
+	s64 ret;
+
+	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
+	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
+	return ret;
+}
+
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
 	int val = (charge) ? 1 : -1;
-	struct mem_cgroup_stat *stat = &mem->stat;
-	struct mem_cgroup_stat_cpu *cpustat;
-	int cpu = get_cpu();
-
-	cpustat = &stat->cpustat[cpu];
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
-	put_cpu();
+	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -499,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
 	int val = (charge) ? 1 : -1;
-	struct mem_cgroup_stat *stat = &mem->stat;
-	struct mem_cgroup_stat_cpu *cpustat;
-	int cpu = get_cpu();
 
-	cpustat = &stat->cpustat[cpu];
+	preempt_disable();
+
 	if (PageCgroupCache(pc))
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
 	else
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
 
 	if (charge)
-		__mem_cgroup_stat_add_safe(cpustat,
-				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
+		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
 	else
-		__mem_cgroup_stat_add_safe(cpustat,
-				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
-	put_cpu();
+		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+
+	preempt_enable();
 }
 
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -534,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 	return total;
 }
 
+static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
+{
+	s64 val;
+
+	val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
+
+	return !(val & ((1 << event_mask_shift) - 1));
+}
+
+/*
+ * Check events in order.
+ *
+ */
+static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
+{
+	/* threshold event is triggered in finer grain than soft limit */
+	if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
+		mem_cgroup_threshold(mem);
+		if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
+			mem_cgroup_update_tree(mem, page);
+	}
+}
+
 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 {
 	return container_of(cgroup_subsys_state(cont,
@@ -770,10 +811,12 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
 	 * hierarchy(even if use_hierarchy is disabled in "mem").
 	 */
+	rcu_read_lock();
 	if (mem->use_hierarchy)
 		ret = css_is_ancestor(&curr->css, &mem->css);
 	else
 		ret = (curr == mem);
+	rcu_read_unlock();
 	css_put(&curr->css);
 	return ret;
 }
@@ -1000,7 +1043,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 }
 
 /**
- * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode.
+ * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
  * @memcg: The memory cgroup that went over limit
  * @p: Task that is going to be killed
  *
@@ -1174,7 +1217,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 			}
 		}
 	}
-	if (!mem_cgroup_local_usage(&victim->stat)) {
+	if (!mem_cgroup_local_usage(victim)) {
 		/* this cgroup's local usage == 0 */
 		css_put(&victim->css);
 		continue;
@@ -1205,32 +1248,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	return total;
 }
 
-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
 {
-	bool ret = false;
-	struct mem_cgroup *mem;
-	struct mm_struct *mm;
+	int *val = (int *)data;
+	int x;
+	/*
+	 * Logically, we can stop scanning immediately when we find
+	 * a memcg is already locked. But condidering unlock ops and
+	 * creation/removal of memcg, scan-all is simple operation.
+	 */
+	x = atomic_inc_return(&mem->oom_lock);
+	*val = max(x, *val);
+	return 0;
+}
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+	int lock_count = 0;
 
-	rcu_read_lock();
-	mm = task->mm;
-	if (!mm)
-		mm = &init_mm;
-	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
-		ret = true;
-	rcu_read_unlock();
-	return ret;
+	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+
+	if (lock_count == 1)
+		return true;
+	return false;
 }
 
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
 {
-	mem->last_oom_jiffies = jiffies;
+	/*
+	 * When a new child is created while the hierarchy is under oom,
+	 * mem_cgroup_oom_lock() may not be called. We have to use
+	 * atomic_add_unless() here.
+	 */
+	atomic_add_unless(&mem->oom_lock, -1, 0);
 	return 0;
 }
 
-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
+{
+	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 {
-	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+	DEFINE_WAIT(wait);
+	bool locked;
+
+	/* At first, try to OOM lock hierarchy under mem.*/
+	mutex_lock(&memcg_oom_mutex);
+	locked = mem_cgroup_oom_lock(mem);
+	/*
+	 * Even if signal_pending(), we can't quit charge() loop without
+	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+	 * under OOM is always welcomed, use TASK_KILLABLE here.
+	 */
+	if (!locked)
+		prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (locked)
+		mem_cgroup_out_of_memory(mem, mask);
+	else {
+		schedule();
+		finish_wait(&memcg_oom_waitq, &wait);
+	}
+	mutex_lock(&memcg_oom_mutex);
+	mem_cgroup_oom_unlock(mem);
+	/*
+	 * Here, we use global waitq .....more fine grained waitq ?
+	 * Assume following hierarchy.
+	 * A/
+	 *   01
+	 *   02
+	 * assume OOM happens both in A and 01 at the same time. Tthey are
+	 * mutually exclusive by lock. (kill in 01 helps A.)
+	 * When we use per memcg waitq, we have to wake up waiters on A and 02
+	 * in addtion to waiters on 01. We use global waitq for avoiding mess.
+	 * It will not be a big problem.
+	 * (And a task may be moved to other groups while it's waiting for OOM.)
+	 */
+	wake_up_all(&memcg_oom_waitq);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		return false;
+	/* Give chance to dying process */
+	schedule_timeout(1);
+	return true;
 }
 
 /*
@@ -1240,9 +1353,6 @@ static void record_last_oom(struct mem_cgroup *mem)
1240void mem_cgroup_update_file_mapped(struct page *page, int val) 1353void mem_cgroup_update_file_mapped(struct page *page, int val)
1241{ 1354{
1242 struct mem_cgroup *mem; 1355 struct mem_cgroup *mem;
1243 struct mem_cgroup_stat *stat;
1244 struct mem_cgroup_stat_cpu *cpustat;
1245 int cpu;
1246 struct page_cgroup *pc; 1356 struct page_cgroup *pc;
1247 1357
1248 pc = lookup_page_cgroup(page); 1358 pc = lookup_page_cgroup(page);
@@ -1251,20 +1361,20 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
1251 1361
1252 lock_page_cgroup(pc); 1362 lock_page_cgroup(pc);
1253 mem = pc->mem_cgroup; 1363 mem = pc->mem_cgroup;
1254 if (!mem) 1364 if (!mem || !PageCgroupUsed(pc))
1255 goto done;
1256
1257 if (!PageCgroupUsed(pc))
1258 goto done; 1365 goto done;
1259 1366
1260 /* 1367 /*
1261 * Preemption is already disabled, we don't need get_cpu() 1368 * Preemption is already disabled. We can use __this_cpu_xxx
1262 */ 1369 */
1263 cpu = smp_processor_id(); 1370 if (val > 0) {
1264 stat = &mem->stat; 1371 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1265 cpustat = &stat->cpustat[cpu]; 1372 SetPageCgroupFileMapped(pc);
1373 } else {
1374 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1375 ClearPageCgroupFileMapped(pc);
1376 }
1266 1377
1267 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1268done: 1378done:
1269 unlock_page_cgroup(pc); 1379 unlock_page_cgroup(pc);
1270} 1380}
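
The hunk above switches the FILE_MAPPED statistic to the new per-cpu counters. A
rough userspace model of that scheme, with made-up names, is the following: each
writer bumps its own CPU's slot (the __this_cpu_inc()/__this_cpu_dec() analogue)
and readers sum all slots, accepting a slightly stale total.

#include <stdio.h>

#define NR_FAKE_CPUS 4
enum { STAT_FILE_MAPPED, NR_STATS };

static long stat_count[NR_FAKE_CPUS][NR_STATS];	/* mem->stat->count[] analogue */

static void this_cpu_add(int cpu, int idx, long val)
{
	stat_count[cpu][idx] += val;		/* no locking: each CPU owns its slot */
}

static long read_stat(int idx)
{
	long sum = 0;

	for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		sum += stat_count[cpu][idx];	/* like mem_cgroup_read_stat() */
	return sum;
}

int main(void)
{
	this_cpu_add(0, STAT_FILE_MAPPED, 1);
	this_cpu_add(1, STAT_FILE_MAPPED, 1);
	this_cpu_add(0, STAT_FILE_MAPPED, -1);
	printf("FILE_MAPPED = %ld\n", read_stat(STAT_FILE_MAPPED));
	return 0;
}
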
@@ -1401,19 +1511,21 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1401 * oom-killer can be invoked. 1511 * oom-killer can be invoked.
1402 */ 1512 */
1403static int __mem_cgroup_try_charge(struct mm_struct *mm, 1513static int __mem_cgroup_try_charge(struct mm_struct *mm,
1404 gfp_t gfp_mask, struct mem_cgroup **memcg, 1514 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1405 bool oom, struct page *page)
1406{ 1515{
1407 struct mem_cgroup *mem, *mem_over_limit; 1516 struct mem_cgroup *mem, *mem_over_limit;
1408 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1517 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1409 struct res_counter *fail_res; 1518 struct res_counter *fail_res;
1410 int csize = CHARGE_SIZE; 1519 int csize = CHARGE_SIZE;
1411 1520
1412 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1521 /*
1413 /* Don't account this! */ 1522 * Unlike the global VM's OOM kill, we are not under a system-wide
1414 *memcg = NULL; 1523 * memory shortage here. So, in addition to MEMDIE tasks, allow a
1415 return 0; 1524 * dying process to go ahead.
1416 } 1525 */
1526 if (unlikely(test_thread_flag(TIF_MEMDIE)
1527 || fatal_signal_pending(current)))
1528 goto bypass;
1417 1529
1418 /* 1530 /*
1419 * We always charge the cgroup the mm_struct belongs to. 1531 * We always charge the cgroup the mm_struct belongs to.
@@ -1440,7 +1552,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1440 unsigned long flags = 0; 1552 unsigned long flags = 0;
1441 1553
1442 if (consume_stock(mem)) 1554 if (consume_stock(mem))
1443 goto charged; 1555 goto done;
1444 1556
1445 ret = res_counter_charge(&mem->res, csize, &fail_res); 1557 ret = res_counter_charge(&mem->res, csize, &fail_res);
1446 if (likely(!ret)) { 1558 if (likely(!ret)) {
@@ -1483,28 +1595,70 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1483 if (mem_cgroup_check_under_limit(mem_over_limit)) 1595 if (mem_cgroup_check_under_limit(mem_over_limit))
1484 continue; 1596 continue;
1485 1597
1598 /* try to avoid oom while someone is moving charge */
1599 if (mc.moving_task && current != mc.moving_task) {
1600 struct mem_cgroup *from, *to;
1601 bool do_continue = false;
1602 /*
1603 * There is a small race that "from" or "to" can be
1604 * freed by rmdir, so we use css_tryget().
1605 */
1606 rcu_read_lock();
1607 from = mc.from;
1608 to = mc.to;
1609 if (from && css_tryget(&from->css)) {
1610 if (mem_over_limit->use_hierarchy)
1611 do_continue = css_is_ancestor(
1612 &from->css,
1613 &mem_over_limit->css);
1614 else
1615 do_continue = (from == mem_over_limit);
1616 css_put(&from->css);
1617 }
1618 if (!do_continue && to && css_tryget(&to->css)) {
1619 if (mem_over_limit->use_hierarchy)
1620 do_continue = css_is_ancestor(
1621 &to->css,
1622 &mem_over_limit->css);
1623 else
1624 do_continue = (to == mem_over_limit);
1625 css_put(&to->css);
1626 }
1627 rcu_read_unlock();
1628 if (do_continue) {
1629 DEFINE_WAIT(wait);
1630 prepare_to_wait(&mc.waitq, &wait,
1631 TASK_INTERRUPTIBLE);
1632 /* moving charge context might have finished. */
1633 if (mc.moving_task)
1634 schedule();
1635 finish_wait(&mc.waitq, &wait);
1636 continue;
1637 }
1638 }
1639
1486 if (!nr_retries--) { 1640 if (!nr_retries--) {
1487 if (oom) { 1641 if (!oom)
1488 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1642 goto nomem;
1489 record_last_oom(mem_over_limit); 1643 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1644 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1645 continue;
1490 } 1646 }
1491 goto nomem; 1647 /* When we reach here, the current task is dying. */
1648 css_put(&mem->css);
1649 goto bypass;
1492 } 1650 }
1493 } 1651 }
1494 if (csize > PAGE_SIZE) 1652 if (csize > PAGE_SIZE)
1495 refill_stock(mem, csize - PAGE_SIZE); 1653 refill_stock(mem, csize - PAGE_SIZE);
1496charged:
1497 /*
1498 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1499 * if they exceeds softlimit.
1500 */
1501 if (mem_cgroup_soft_limit_check(mem))
1502 mem_cgroup_update_tree(mem, page);
1503done: 1654done:
1504 return 0; 1655 return 0;
1505nomem: 1656nomem:
1506 css_put(&mem->css); 1657 css_put(&mem->css);
1507 return -ENOMEM; 1658 return -ENOMEM;
1659bypass:
1660 *memcg = NULL;
1661 return 0;
1508} 1662}
1509 1663
1510/* 1664/*
@@ -1512,14 +1666,23 @@ nomem:
1512 * This function is for that and do uncharge, put css's refcnt. 1666 * This function is for that and do uncharge, put css's refcnt.
1513 * gotten by try_charge(). 1667 * gotten by try_charge().
1514 */ 1668 */
1515static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 1669static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1670 unsigned long count)
1516{ 1671{
1517 if (!mem_cgroup_is_root(mem)) { 1672 if (!mem_cgroup_is_root(mem)) {
1518 res_counter_uncharge(&mem->res, PAGE_SIZE); 1673 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1519 if (do_swap_account) 1674 if (do_swap_account)
1520 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1675 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1676 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1677 WARN_ON_ONCE(count > INT_MAX);
1678 __css_put(&mem->css, (int)count);
1521 } 1679 }
1522 css_put(&mem->css); 1680 /* we don't need css_put for root */
1681}
1682
1683static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1684{
1685 __mem_cgroup_cancel_charge(mem, 1);
1523} 1686}
1524 1687
1525/* 1688/*
@@ -1615,6 +1778,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1615 mem_cgroup_charge_statistics(mem, pc, true); 1778 mem_cgroup_charge_statistics(mem, pc, true);
1616 1779
1617 unlock_page_cgroup(pc); 1780 unlock_page_cgroup(pc);
1781 /*
1782	 * "charge_statistics" updated the event counter, so check it now and
1783	 * insert the ancestors (and their ancestors) into the soft-limit RB-tree
1784	 * if they exceed the soft limit.
1785 */
1786 memcg_check_events(mem, pc->page);
1618} 1787}
1619 1788
1620/** 1789/**
@@ -1622,61 +1791,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1622 * @pc: page_cgroup of the page. 1791 * @pc: page_cgroup of the page.
1623 * @from: mem_cgroup which the page is moved from. 1792 * @from: mem_cgroup which the page is moved from.
1624 * @to: mem_cgroup which the page is moved to. @from != @to. 1793 * @to: mem_cgroup which the page is moved to. @from != @to.
1794 * @uncharge: whether we should call uncharge and css_put against @from.
1625 * 1795 *
1626 * The caller must confirm following. 1796 * The caller must confirm following.
1627 * - page is not on LRU (isolate_page() is useful.) 1797 * - page is not on LRU (isolate_page() is useful.)
1628 * - the pc is locked, used, and ->mem_cgroup points to @from. 1798 * - the pc is locked, used, and ->mem_cgroup points to @from.
1629 * 1799 *
1630 * This function does "uncharge" from old cgroup but doesn't do "charge" to 1800 * This function doesn't do "charge" or css_get on the new cgroup; that should
1631 * new cgroup. It should be done by a caller. 1801 * be done by the caller (__mem_cgroup_try_charge() is useful). If @uncharge is
1802 * true, this function does "uncharge" from the old cgroup; if @uncharge is
1803 * false, the caller must do the "uncharge" itself.
1632 */ 1804 */
1633 1805
1634static void __mem_cgroup_move_account(struct page_cgroup *pc, 1806static void __mem_cgroup_move_account(struct page_cgroup *pc,
1635 struct mem_cgroup *from, struct mem_cgroup *to) 1807 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1636{ 1808{
1637 struct page *page;
1638 int cpu;
1639 struct mem_cgroup_stat *stat;
1640 struct mem_cgroup_stat_cpu *cpustat;
1641
1642 VM_BUG_ON(from == to); 1809 VM_BUG_ON(from == to);
1643 VM_BUG_ON(PageLRU(pc->page)); 1810 VM_BUG_ON(PageLRU(pc->page));
1644 VM_BUG_ON(!PageCgroupLocked(pc)); 1811 VM_BUG_ON(!PageCgroupLocked(pc));
1645 VM_BUG_ON(!PageCgroupUsed(pc)); 1812 VM_BUG_ON(!PageCgroupUsed(pc));
1646 VM_BUG_ON(pc->mem_cgroup != from); 1813 VM_BUG_ON(pc->mem_cgroup != from);
1647 1814
1648 if (!mem_cgroup_is_root(from)) 1815 if (PageCgroupFileMapped(pc)) {
1649 res_counter_uncharge(&from->res, PAGE_SIZE); 1816 /* Update mapped_file data for mem_cgroup */
1650 mem_cgroup_charge_statistics(from, pc, false); 1817 preempt_disable();
1651 1818 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1652 page = pc->page; 1819 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1653 if (page_mapped(page) && !PageAnon(page)) { 1820 preempt_enable();
1654 cpu = smp_processor_id();
1655 /* Update mapped_file data for mem_cgroup "from" */
1656 stat = &from->stat;
1657 cpustat = &stat->cpustat[cpu];
1658 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1659 -1);
1660
1661 /* Update mapped_file data for mem_cgroup "to" */
1662 stat = &to->stat;
1663 cpustat = &stat->cpustat[cpu];
1664 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1665 1);
1666 } 1821 }
1822 mem_cgroup_charge_statistics(from, pc, false);
1823 if (uncharge)
1824 /* This is not "cancel", but cancel_charge does all we need. */
1825 mem_cgroup_cancel_charge(from);
1667 1826
1668 if (do_swap_account && !mem_cgroup_is_root(from)) 1827 /* caller should have done css_get */
1669 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1670 css_put(&from->css);
1671
1672 css_get(&to->css);
1673 pc->mem_cgroup = to; 1828 pc->mem_cgroup = to;
1674 mem_cgroup_charge_statistics(to, pc, true); 1829 mem_cgroup_charge_statistics(to, pc, true);
1675 /* 1830 /*
1676 * We charge against "to" which may not have any tasks. Then, "to" 1831 * We charge against "to" which may not have any tasks. Then, "to"
1677 * can be under rmdir(). But in current implementation, caller of 1832 * can be under rmdir(). But in current implementation, caller of
1678 * this function is just force_empty() and it's garanteed that 1833 * this function is just force_empty() and move charge, so it's
1679 * "to" is never removed. So, we don't check rmdir status here. 1834 * guaranteed that "to" is never removed. So, we don't check rmdir
1835 * status here.
1680 */ 1836 */
1681} 1837}
1682 1838
@@ -1685,15 +1841,20 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1685 * __mem_cgroup_move_account() 1841 * __mem_cgroup_move_account()
1686 */ 1842 */
1687static int mem_cgroup_move_account(struct page_cgroup *pc, 1843static int mem_cgroup_move_account(struct page_cgroup *pc,
1688 struct mem_cgroup *from, struct mem_cgroup *to) 1844 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1689{ 1845{
1690 int ret = -EINVAL; 1846 int ret = -EINVAL;
1691 lock_page_cgroup(pc); 1847 lock_page_cgroup(pc);
1692 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 1848 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1693 __mem_cgroup_move_account(pc, from, to); 1849 __mem_cgroup_move_account(pc, from, to, uncharge);
1694 ret = 0; 1850 ret = 0;
1695 } 1851 }
1696 unlock_page_cgroup(pc); 1852 unlock_page_cgroup(pc);
1853 /*
1854 * check events
1855 */
1856 memcg_check_events(to, pc->page);
1857 memcg_check_events(from, pc->page);
1697 return ret; 1858 return ret;
1698} 1859}
1699 1860
@@ -1722,15 +1883,13 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1722 goto put; 1883 goto put;
1723 1884
1724 parent = mem_cgroup_from_cont(pcg); 1885 parent = mem_cgroup_from_cont(pcg);
1725 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); 1886 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1726 if (ret || !parent) 1887 if (ret || !parent)
1727 goto put_back; 1888 goto put_back;
1728 1889
1729 ret = mem_cgroup_move_account(pc, child, parent); 1890 ret = mem_cgroup_move_account(pc, child, parent, true);
1730 if (!ret) 1891 if (ret)
1731 css_put(&parent->css); /* drop extra refcnt by try_charge() */ 1892 mem_cgroup_cancel_charge(parent);
1732 else
1733 mem_cgroup_cancel_charge(parent); /* does css_put */
1734put_back: 1893put_back:
1735 putback_lru_page(page); 1894 putback_lru_page(page);
1736put: 1895put:
@@ -1760,7 +1919,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1760 prefetchw(pc); 1919 prefetchw(pc);
1761 1920
1762 mem = memcg; 1921 mem = memcg;
1763 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); 1922 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1764 if (ret || !mem) 1923 if (ret || !mem)
1765 return ret; 1924 return ret;
1766 1925
@@ -1880,14 +2039,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1880 if (!mem) 2039 if (!mem)
1881 goto charge_cur_mm; 2040 goto charge_cur_mm;
1882 *ptr = mem; 2041 *ptr = mem;
1883 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); 2042 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1884 /* drop extra refcnt from tryget */ 2043 /* drop extra refcnt from tryget */
1885 css_put(&mem->css); 2044 css_put(&mem->css);
1886 return ret; 2045 return ret;
1887charge_cur_mm: 2046charge_cur_mm:
1888 if (unlikely(!mm)) 2047 if (unlikely(!mm))
1889 mm = &init_mm; 2048 mm = &init_mm;
1890 return __mem_cgroup_try_charge(mm, mask, ptr, true, page); 2049 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1891} 2050}
1892 2051
1893static void 2052static void
@@ -2064,8 +2223,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2064 mz = page_cgroup_zoneinfo(pc); 2223 mz = page_cgroup_zoneinfo(pc);
2065 unlock_page_cgroup(pc); 2224 unlock_page_cgroup(pc);
2066 2225
2067 if (mem_cgroup_soft_limit_check(mem)) 2226 memcg_check_events(mem, page);
2068 mem_cgroup_update_tree(mem, page);
2069 /* at swapout, this memcg will be accessed to record to swap */ 2227 /* at swapout, this memcg will be accessed to record to swap */
2070 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2228 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2071 css_put(&mem->css); 2229 css_put(&mem->css);
@@ -2156,7 +2314,9 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2156 2314
2157 /* record memcg information */ 2315 /* record memcg information */
2158 if (do_swap_account && swapout && memcg) { 2316 if (do_swap_account && swapout && memcg) {
2317 rcu_read_lock();
2159 swap_cgroup_record(ent, css_id(&memcg->css)); 2318 swap_cgroup_record(ent, css_id(&memcg->css));
2319 rcu_read_unlock();
2160 mem_cgroup_get(memcg); 2320 mem_cgroup_get(memcg);
2161 } 2321 }
2162 if (swapout && memcg) 2322 if (swapout && memcg)
@@ -2192,6 +2352,66 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
2192 } 2352 }
2193 rcu_read_unlock(); 2353 rcu_read_unlock();
2194} 2354}
2355
2356/**
2357 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2358 * @entry: swap entry to be moved
2359 * @from: mem_cgroup which the entry is moved from
2360 * @to: mem_cgroup which the entry is moved to
2361 * @need_fixup: whether we should fixup res_counters and refcounts.
2362 *
2363 * It succeeds only when the swap_cgroup's record for this entry is the same
2364 * as the mem_cgroup's id of @from.
2365 *
2366 * Returns 0 on success, -EINVAL on failure.
2367 *
2368 * The caller must have charged @to, IOW, called res_counter_charge() on
2369 * both res and memsw, and called css_get().
2370 */
2371static int mem_cgroup_move_swap_account(swp_entry_t entry,
2372 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2373{
2374 unsigned short old_id, new_id;
2375
2376 rcu_read_lock();
2377 old_id = css_id(&from->css);
2378 new_id = css_id(&to->css);
2379 rcu_read_unlock();
2380
2381 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2382 mem_cgroup_swap_statistics(from, false);
2383 mem_cgroup_swap_statistics(to, true);
2384 /*
2385 * This function is only called from task migration context now.
2386 * It postpones res_counter and refcount handling till the end
2387 * of task migration(mem_cgroup_clear_mc()) for performance
2388 * improvement. But we cannot postpone mem_cgroup_get(to)
2389 * because if the process that has been moved to @to does
2390 * swap-in, the refcount of @to might be decreased to 0.
2391 */
2392 mem_cgroup_get(to);
2393 if (need_fixup) {
2394 if (!mem_cgroup_is_root(from))
2395 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2396 mem_cgroup_put(from);
2397 /*
2398 * we charged both to->res and to->memsw, so we should
2399 * uncharge to->res.
2400 */
2401 if (!mem_cgroup_is_root(to))
2402 res_counter_uncharge(&to->res, PAGE_SIZE);
2403 css_put(&to->css);
2404 }
2405 return 0;
2406 }
2407 return -EINVAL;
2408}
2409#else
2410static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2411 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2412{
2413 return -EINVAL;
2414}
2195#endif 2415#endif
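
A small sketch of the compare-and-exchange move performed above, under the
assumption that the swap_cgroup map is simply an array of 16-bit ids indexed by
swap entry; it is illustrative only, not the kernel data structure.

#include <stdio.h>

#define NR_FAKE_SWAP_ENTRIES 1024
static unsigned short swap_record[NR_FAKE_SWAP_ENTRIES];	/* swap_cgroup map */

static unsigned short swap_cgroup_cmpxchg_model(unsigned long ent,
						unsigned short old_id,
						unsigned short new_id)
{
	unsigned short cur = swap_record[ent];

	if (cur == old_id)
		swap_record[ent] = new_id;
	return cur;				/* caller checks it saw old_id */
}

static int move_swap_account_model(unsigned long ent,
				   unsigned short from_id, unsigned short to_id)
{
	if (swap_cgroup_cmpxchg_model(ent, from_id, to_id) == from_id)
		return 0;			/* record moved to the new cgroup */
	return -1;				/* record no longer names "from" */
}

int main(void)
{
	swap_record[42] = 3;
	printf("move: %d\n", move_swap_account_model(42, 3, 7));	/* succeeds */
	printf("move: %d\n", move_swap_account_model(42, 3, 7));	/* fails: id is now 7 */
	return 0;
}
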
2196 2416
2197/* 2417/*
@@ -2215,12 +2435,11 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2215 } 2435 }
2216 unlock_page_cgroup(pc); 2436 unlock_page_cgroup(pc);
2217 2437
2438 *ptr = mem;
2218 if (mem) { 2439 if (mem) {
2219 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 2440 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2220 page);
2221 css_put(&mem->css); 2441 css_put(&mem->css);
2222 } 2442 }
2223 *ptr = mem;
2224 return ret; 2443 return ret;
2225} 2444}
2226 2445
@@ -2545,7 +2764,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2545 pc = list_entry(list->prev, struct page_cgroup, lru); 2764 pc = list_entry(list->prev, struct page_cgroup, lru);
2546 if (busy == pc) { 2765 if (busy == pc) {
2547 list_move(&pc->lru, list); 2766 list_move(&pc->lru, list);
2548 busy = 0; 2767 busy = NULL;
2549 spin_unlock_irqrestore(&zone->lru_lock, flags); 2768 spin_unlock_irqrestore(&zone->lru_lock, flags);
2550 continue; 2769 continue;
2551 } 2770 }
@@ -2704,7 +2923,7 @@ static int
2704mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2923mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2705{ 2924{
2706 struct mem_cgroup_idx_data *d = data; 2925 struct mem_cgroup_idx_data *d = data;
2707 d->val += mem_cgroup_read_stat(&mem->stat, d->idx); 2926 d->val += mem_cgroup_read_stat(mem, d->idx);
2708 return 0; 2927 return 0;
2709} 2928}
2710 2929
@@ -2719,40 +2938,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2719 *val = d.val; 2938 *val = d.val;
2720} 2939}
2721 2940
2941static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
2942{
2943 u64 idx_val, val;
2944
2945 if (!mem_cgroup_is_root(mem)) {
2946 if (!swap)
2947 return res_counter_read_u64(&mem->res, RES_USAGE);
2948 else
2949 return res_counter_read_u64(&mem->memsw, RES_USAGE);
2950 }
2951
2952 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
2953 val = idx_val;
2954 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
2955 val += idx_val;
2956
2957 if (swap) {
2958 mem_cgroup_get_recursive_idx_stat(mem,
2959 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2960 val += idx_val;
2961 }
2962
2963 return val << PAGE_SHIFT;
2964}
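
A simplified model of the usage computation above: for the root cgroup the page
statistics (summed recursively over the hierarchy in the real code) are converted
to bytes, otherwise the res_counter value is returned directly. Types and names
below are placeholders for the sketch.

#include <stdint.h>
#include <stdio.h>

#define FAKE_PAGE_SHIFT 12

struct fake_memcg {
	int is_root;
	uint64_t res_usage_bytes;	/* res_counter_read_u64(&mem->res, RES_USAGE) */
	uint64_t memsw_usage_bytes;	/* res_counter_read_u64(&mem->memsw, RES_USAGE) */
	uint64_t cache_pages;		/* MEM_CGROUP_STAT_CACHE, recursive sum */
	uint64_t rss_pages;		/* MEM_CGROUP_STAT_RSS */
	uint64_t swapout_pages;		/* MEM_CGROUP_STAT_SWAPOUT */
};

static uint64_t fake_usage(const struct fake_memcg *m, int swap)
{
	uint64_t pages;

	if (!m->is_root)
		return swap ? m->memsw_usage_bytes : m->res_usage_bytes;

	pages = m->cache_pages + m->rss_pages;
	if (swap)
		pages += m->swapout_pages;
	return pages << FAKE_PAGE_SHIFT;	/* pages -> bytes */
}

int main(void)
{
	struct fake_memcg root = { .is_root = 1, .cache_pages = 10, .rss_pages = 5 };

	printf("root usage = %llu bytes\n", (unsigned long long)fake_usage(&root, 0));
	return 0;
}
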
2965
2722static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2966static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2723{ 2967{
2724 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2968 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2725 u64 idx_val, val; 2969 u64 val;
2726 int type, name; 2970 int type, name;
2727 2971
2728 type = MEMFILE_TYPE(cft->private); 2972 type = MEMFILE_TYPE(cft->private);
2729 name = MEMFILE_ATTR(cft->private); 2973 name = MEMFILE_ATTR(cft->private);
2730 switch (type) { 2974 switch (type) {
2731 case _MEM: 2975 case _MEM:
2732 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2976 if (name == RES_USAGE)
2733 mem_cgroup_get_recursive_idx_stat(mem, 2977 val = mem_cgroup_usage(mem, false);
2734 MEM_CGROUP_STAT_CACHE, &idx_val); 2978 else
2735 val = idx_val;
2736 mem_cgroup_get_recursive_idx_stat(mem,
2737 MEM_CGROUP_STAT_RSS, &idx_val);
2738 val += idx_val;
2739 val <<= PAGE_SHIFT;
2740 } else
2741 val = res_counter_read_u64(&mem->res, name); 2979 val = res_counter_read_u64(&mem->res, name);
2742 break; 2980 break;
2743 case _MEMSWAP: 2981 case _MEMSWAP:
2744 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2982 if (name == RES_USAGE)
2745 mem_cgroup_get_recursive_idx_stat(mem, 2983 val = mem_cgroup_usage(mem, true);
2746 MEM_CGROUP_STAT_CACHE, &idx_val); 2984 else
2747 val = idx_val;
2748 mem_cgroup_get_recursive_idx_stat(mem,
2749 MEM_CGROUP_STAT_RSS, &idx_val);
2750 val += idx_val;
2751 mem_cgroup_get_recursive_idx_stat(mem,
2752 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2753 val += idx_val;
2754 val <<= PAGE_SHIFT;
2755 } else
2756 val = res_counter_read_u64(&mem->memsw, name); 2985 val = res_counter_read_u64(&mem->memsw, name);
2757 break; 2986 break;
2758 default: 2987 default:
@@ -2865,6 +3094,39 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2865 return 0; 3094 return 0;
2866} 3095}
2867 3096
3097static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3098 struct cftype *cft)
3099{
3100 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3101}
3102
3103#ifdef CONFIG_MMU
3104static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3105 struct cftype *cft, u64 val)
3106{
3107 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3108
3109 if (val >= (1 << NR_MOVE_TYPE))
3110 return -EINVAL;
3111 /*
3112	 * We check this value several times, both in can_attach() and
3113	 * attach(), so we need the cgroup lock to prevent this value from
3114	 * becoming inconsistent.
3115 */
3116 cgroup_lock();
3117 mem->move_charge_at_immigrate = val;
3118 cgroup_unlock();
3119
3120 return 0;
3121}
3122#else
3123static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3124 struct cftype *cft, u64 val)
3125{
3126 return -ENOSYS;
3127}
3128#endif
3129
2868 3130
2869/* For read statistics */ 3131/* For read statistics */
2870enum { 3132enum {
@@ -2910,18 +3172,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2910 s64 val; 3172 s64 val;
2911 3173
2912 /* per cpu stat */ 3174 /* per cpu stat */
2913 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); 3175 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
2914 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3176 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2915 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 3177 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
2916 s->stat[MCS_RSS] += val * PAGE_SIZE; 3178 s->stat[MCS_RSS] += val * PAGE_SIZE;
2917 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); 3179 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
2918 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3180 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
2919 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 3181 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
2920 s->stat[MCS_PGPGIN] += val; 3182 s->stat[MCS_PGPGIN] += val;
2921 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3183 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2922 s->stat[MCS_PGPGOUT] += val; 3184 s->stat[MCS_PGPGOUT] += val;
2923 if (do_swap_account) { 3185 if (do_swap_account) {
2924 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); 3186 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
2925 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3187 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2926 } 3188 }
2927 3189
@@ -3049,12 +3311,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3049 return 0; 3311 return 0;
3050} 3312}
3051 3313
3314static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3315{
3316 struct mem_cgroup_threshold_ary *t;
3317 u64 usage;
3318 int i;
3319
3320 rcu_read_lock();
3321 if (!swap)
3322 t = rcu_dereference(memcg->thresholds);
3323 else
3324 t = rcu_dereference(memcg->memsw_thresholds);
3325
3326 if (!t)
3327 goto unlock;
3328
3329 usage = mem_cgroup_usage(memcg, swap);
3330
3331 /*
3332	 * current_threshold points to the threshold just below usage.
3333	 * If that is not the case, a threshold was crossed after the last
3334	 * call of __mem_cgroup_threshold().
3335 */
3336 i = atomic_read(&t->current_threshold);
3337
3338 /*
3339	 * Iterate backward over the array of thresholds starting from
3340	 * current_threshold and check if a threshold is crossed.
3341	 * If none of the thresholds below usage is crossed, we read
3342	 * only one element of the array here.
3343 */
3344 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3345 eventfd_signal(t->entries[i].eventfd, 1);
3346
3347 /* i = current_threshold + 1 */
3348 i++;
3349
3350 /*
3351	 * Iterate forward over the array of thresholds starting from
3352	 * current_threshold+1 and check if a threshold is crossed.
3353	 * If none of the thresholds above usage is crossed, we read
3354	 * only one element of the array here.
3355 */
3356 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3357 eventfd_signal(t->entries[i].eventfd, 1);
3358
3359 /* Update current_threshold */
3360 atomic_set(&t->current_threshold, i - 1);
3361unlock:
3362 rcu_read_unlock();
3363}
3364
3365static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3366{
3367 __mem_cgroup_threshold(memcg, false);
3368 if (do_swap_account)
3369 __mem_cgroup_threshold(memcg, true);
3370}
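
A userspace sketch of the threshold scan implemented above: the array is kept
sorted, current_threshold indexes the largest threshold at or below the last seen
usage, and each check walks down and then up from it, signalling every threshold
that usage crossed. Here printf stands in for eventfd_signal() and the data is
hard-coded for the demo.

#include <stdint.h>
#include <stdio.h>

struct threshold { uint64_t value; int eventfd_id; };

static struct threshold entries[] = {
	{ 100, 1 }, { 200, 2 }, { 400, 3 }, { 800, 4 },	/* sorted ascending */
};
static int nr_entries = 4;
static int current_threshold = -1;	/* index of largest threshold <= usage */

static void check_thresholds(uint64_t usage)
{
	int i = current_threshold;

	/* walk down: thresholds we dropped below since the last check */
	for (; i >= 0 && entries[i].value > usage; i--)
		printf("signal eventfd %d (fell below %llu)\n",
		       entries[i].eventfd_id, (unsigned long long)entries[i].value);

	/* walk up: thresholds we rose above since the last check */
	for (i++; i < nr_entries && entries[i].value <= usage; i++)
		printf("signal eventfd %d (rose above %llu)\n",
		       entries[i].eventfd_id, (unsigned long long)entries[i].value);

	current_threshold = i - 1;
}

int main(void)
{
	check_thresholds(250);	/* signals thresholds 100 and 200 */
	check_thresholds(150);	/* signals threshold 200 again, crossed downward */
	return 0;
}
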
3371
3372static int compare_thresholds(const void *a, const void *b)
3373{
3374 const struct mem_cgroup_threshold *_a = a;
3375 const struct mem_cgroup_threshold *_b = b;
3376
3377 return _a->threshold - _b->threshold;
3378}
3379
3380static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
3381 struct eventfd_ctx *eventfd, const char *args)
3382{
3383 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3384 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3385 int type = MEMFILE_TYPE(cft->private);
3386 u64 threshold, usage;
3387 int size;
3388 int i, ret;
3389
3390 ret = res_counter_memparse_write_strategy(args, &threshold);
3391 if (ret)
3392 return ret;
3393
3394 mutex_lock(&memcg->thresholds_lock);
3395 if (type == _MEM)
3396 thresholds = memcg->thresholds;
3397 else if (type == _MEMSWAP)
3398 thresholds = memcg->memsw_thresholds;
3399 else
3400 BUG();
3401
3402 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3403
3404 /* Check if a threshold crossed before adding a new one */
3405 if (thresholds)
3406 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3407
3408 if (thresholds)
3409 size = thresholds->size + 1;
3410 else
3411 size = 1;
3412
3413 /* Allocate memory for new array of thresholds */
3414 thresholds_new = kmalloc(sizeof(*thresholds_new) +
3415 size * sizeof(struct mem_cgroup_threshold),
3416 GFP_KERNEL);
3417 if (!thresholds_new) {
3418 ret = -ENOMEM;
3419 goto unlock;
3420 }
3421 thresholds_new->size = size;
3422
3423 /* Copy thresholds (if any) to new array */
3424 if (thresholds)
3425 memcpy(thresholds_new->entries, thresholds->entries,
3426 thresholds->size *
3427 sizeof(struct mem_cgroup_threshold));
3428 /* Add new threshold */
3429 thresholds_new->entries[size - 1].eventfd = eventfd;
3430 thresholds_new->entries[size - 1].threshold = threshold;
3431
3432 /* Sort thresholds. Registering of new threshold isn't time-critical */
3433 sort(thresholds_new->entries, size,
3434 sizeof(struct mem_cgroup_threshold),
3435 compare_thresholds, NULL);
3436
3437 /* Find current threshold */
3438 atomic_set(&thresholds_new->current_threshold, -1);
3439 for (i = 0; i < size; i++) {
3440 if (thresholds_new->entries[i].threshold < usage) {
3441 /*
3442 * thresholds_new->current_threshold will not be used
3443 * until rcu_assign_pointer(), so it's safe to increment
3444 * it here.
3445 */
3446 atomic_inc(&thresholds_new->current_threshold);
3447 }
3448 }
3449
3450 if (type == _MEM)
3451 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3452 else
3453 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3454
3455 /* To be sure that nobody uses thresholds before freeing it */
3456 synchronize_rcu();
3457
3458 kfree(thresholds);
3459unlock:
3460 mutex_unlock(&memcg->thresholds_lock);
3461
3462 return ret;
3463}
3464
3465static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
3466 struct eventfd_ctx *eventfd)
3467{
3468 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3469 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3470 int type = MEMFILE_TYPE(cft->private);
3471 u64 usage;
3472 int size = 0;
3473 int i, j, ret;
3474
3475 mutex_lock(&memcg->thresholds_lock);
3476 if (type == _MEM)
3477 thresholds = memcg->thresholds;
3478 else if (type == _MEMSWAP)
3479 thresholds = memcg->memsw_thresholds;
3480 else
3481 BUG();
3482
3483 /*
3484	 * Something went wrong if we are trying to unregister a threshold
3485	 * when we don't have any thresholds
3486 */
3487 BUG_ON(!thresholds);
3488
3489 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3490
3491 /* Check if a threshold crossed before removing */
3492 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3493
3494 /* Calculate new number of threshold */
3495 for (i = 0; i < thresholds->size; i++) {
3496 if (thresholds->entries[i].eventfd != eventfd)
3497 size++;
3498 }
3499
3500 /* Set thresholds array to NULL if we don't have thresholds */
3501 if (!size) {
3502 thresholds_new = NULL;
3503 goto assign;
3504 }
3505
3506 /* Allocate memory for new array of thresholds */
3507 thresholds_new = kmalloc(sizeof(*thresholds_new) +
3508 size * sizeof(struct mem_cgroup_threshold),
3509 GFP_KERNEL);
3510 if (!thresholds_new) {
3511 ret = -ENOMEM;
3512 goto unlock;
3513 }
3514 thresholds_new->size = size;
3515
3516 /* Copy thresholds and find current threshold */
3517 atomic_set(&thresholds_new->current_threshold, -1);
3518 for (i = 0, j = 0; i < thresholds->size; i++) {
3519 if (thresholds->entries[i].eventfd == eventfd)
3520 continue;
3521
3522 thresholds_new->entries[j] = thresholds->entries[i];
3523 if (thresholds_new->entries[j].threshold < usage) {
3524 /*
3525 * thresholds_new->current_threshold will not be used
3526 * until rcu_assign_pointer(), so it's safe to increment
3527 * it here.
3528 */
3529 atomic_inc(&thresholds_new->current_threshold);
3530 }
3531 j++;
3532 }
3533
3534assign:
3535 if (type == _MEM)
3536 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3537 else
3538 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3539
3540 /* To be sure that nobody uses thresholds before freeing it */
3541 synchronize_rcu();
3542
3543 kfree(thresholds);
3544unlock:
3545 mutex_unlock(&memcg->thresholds_lock);
3546
3547 return ret;
3548}
3052 3549
3053static struct cftype mem_cgroup_files[] = { 3550static struct cftype mem_cgroup_files[] = {
3054 { 3551 {
3055 .name = "usage_in_bytes", 3552 .name = "usage_in_bytes",
3056 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3553 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3057 .read_u64 = mem_cgroup_read, 3554 .read_u64 = mem_cgroup_read,
3555 .register_event = mem_cgroup_register_event,
3556 .unregister_event = mem_cgroup_unregister_event,
3058 }, 3557 },
3059 { 3558 {
3060 .name = "max_usage_in_bytes", 3559 .name = "max_usage_in_bytes",
@@ -3098,6 +3597,11 @@ static struct cftype mem_cgroup_files[] = {
3098 .read_u64 = mem_cgroup_swappiness_read, 3597 .read_u64 = mem_cgroup_swappiness_read,
3099 .write_u64 = mem_cgroup_swappiness_write, 3598 .write_u64 = mem_cgroup_swappiness_write,
3100 }, 3599 },
3600 {
3601 .name = "move_charge_at_immigrate",
3602 .read_u64 = mem_cgroup_move_charge_read,
3603 .write_u64 = mem_cgroup_move_charge_write,
3604 },
3101}; 3605};
3102 3606
3103#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3607#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3106,6 +3610,8 @@ static struct cftype memsw_cgroup_files[] = {
3106 .name = "memsw.usage_in_bytes", 3610 .name = "memsw.usage_in_bytes",
3107 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3611 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3108 .read_u64 = mem_cgroup_read, 3612 .read_u64 = mem_cgroup_read,
3613 .register_event = mem_cgroup_register_event,
3614 .unregister_event = mem_cgroup_unregister_event,
3109 }, 3615 },
3110 { 3616 {
3111 .name = "memsw.max_usage_in_bytes", 3617 .name = "memsw.max_usage_in_bytes",
@@ -3180,24 +3686,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3180 kfree(mem->info.nodeinfo[node]); 3686 kfree(mem->info.nodeinfo[node]);
3181} 3687}
3182 3688
3183static int mem_cgroup_size(void)
3184{
3185 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
3186 return sizeof(struct mem_cgroup) + cpustat_size;
3187}
3188
3189static struct mem_cgroup *mem_cgroup_alloc(void) 3689static struct mem_cgroup *mem_cgroup_alloc(void)
3190{ 3690{
3191 struct mem_cgroup *mem; 3691 struct mem_cgroup *mem;
3192 int size = mem_cgroup_size(); 3692 int size = sizeof(struct mem_cgroup);
3193 3693
3694 /* Can be very big if MAX_NUMNODES is very big */
3194 if (size < PAGE_SIZE) 3695 if (size < PAGE_SIZE)
3195 mem = kmalloc(size, GFP_KERNEL); 3696 mem = kmalloc(size, GFP_KERNEL);
3196 else 3697 else
3197 mem = vmalloc(size); 3698 mem = vmalloc(size);
3198 3699
3199 if (mem) 3700 if (!mem)
3200 memset(mem, 0, size); 3701 return NULL;
3702
3703 memset(mem, 0, size);
3704 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3705 if (!mem->stat) {
3706 if (size < PAGE_SIZE)
3707 kfree(mem);
3708 else
3709 vfree(mem);
3710 mem = NULL;
3711 }
3201 return mem; 3712 return mem;
3202} 3713}
3203 3714
@@ -3222,7 +3733,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
3222 for_each_node_state(node, N_POSSIBLE) 3733 for_each_node_state(node, N_POSSIBLE)
3223 free_mem_cgroup_per_zone_info(mem, node); 3734 free_mem_cgroup_per_zone_info(mem, node);
3224 3735
3225 if (mem_cgroup_size() < PAGE_SIZE) 3736 free_percpu(mem->stat);
3737 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3226 kfree(mem); 3738 kfree(mem);
3227 else 3739 else
3228 vfree(mem); 3740 vfree(mem);
@@ -3233,9 +3745,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem)
3233 atomic_inc(&mem->refcnt); 3745 atomic_inc(&mem->refcnt);
3234} 3746}
3235 3747
3236static void mem_cgroup_put(struct mem_cgroup *mem) 3748static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
3237{ 3749{
3238 if (atomic_dec_and_test(&mem->refcnt)) { 3750 if (atomic_sub_and_test(count, &mem->refcnt)) {
3239 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3751 struct mem_cgroup *parent = parent_mem_cgroup(mem);
3240 __mem_cgroup_free(mem); 3752 __mem_cgroup_free(mem);
3241 if (parent) 3753 if (parent)
@@ -3243,6 +3755,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem)
3243 } 3755 }
3244} 3756}
3245 3757
3758static void mem_cgroup_put(struct mem_cgroup *mem)
3759{
3760 __mem_cgroup_put(mem, 1);
3761}
3762
3246/* 3763/*
3247 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 3764 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
3248 */ 3765 */
@@ -3319,7 +3836,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3319 INIT_WORK(&stock->work, drain_local_stock); 3836 INIT_WORK(&stock->work, drain_local_stock);
3320 } 3837 }
3321 hotcpu_notifier(memcg_stock_cpu_callback, 0); 3838 hotcpu_notifier(memcg_stock_cpu_callback, 0);
3322
3323 } else { 3839 } else {
3324 parent = mem_cgroup_from_cont(cont->parent); 3840 parent = mem_cgroup_from_cont(cont->parent);
3325 mem->use_hierarchy = parent->use_hierarchy; 3841 mem->use_hierarchy = parent->use_hierarchy;
@@ -3345,6 +3861,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3345 if (parent) 3861 if (parent)
3346 mem->swappiness = get_swappiness(parent); 3862 mem->swappiness = get_swappiness(parent);
3347 atomic_set(&mem->refcnt, 1); 3863 atomic_set(&mem->refcnt, 1);
3864 mem->move_charge_at_immigrate = 0;
3865 mutex_init(&mem->thresholds_lock);
3348 return &mem->css; 3866 return &mem->css;
3349free_out: 3867free_out:
3350 __mem_cgroup_free(mem); 3868 __mem_cgroup_free(mem);
@@ -3381,17 +3899,450 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
3381 return ret; 3899 return ret;
3382} 3900}
3383 3901
3902#ifdef CONFIG_MMU
3903/* Handlers for move charge at task migration. */
3904#define PRECHARGE_COUNT_AT_ONCE 256
3905static int mem_cgroup_do_precharge(unsigned long count)
3906{
3907 int ret = 0;
3908 int batch_count = PRECHARGE_COUNT_AT_ONCE;
3909 struct mem_cgroup *mem = mc.to;
3910
3911 if (mem_cgroup_is_root(mem)) {
3912 mc.precharge += count;
3913 /* we don't need css_get for root */
3914 return ret;
3915 }
3916 /* try to charge at once */
3917 if (count > 1) {
3918 struct res_counter *dummy;
3919 /*
3920 * "mem" cannot be under rmdir() because we've already checked
3921 * by cgroup_lock_live_cgroup() that it is not removed and we
3922 * are still under the same cgroup_mutex. So we can postpone
3923 * css_get().
3924 */
3925 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
3926 goto one_by_one;
3927 if (do_swap_account && res_counter_charge(&mem->memsw,
3928 PAGE_SIZE * count, &dummy)) {
3929 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
3930 goto one_by_one;
3931 }
3932 mc.precharge += count;
3933 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
3934 WARN_ON_ONCE(count > INT_MAX);
3935 __css_get(&mem->css, (int)count);
3936 return ret;
3937 }
3938one_by_one:
3939 /* fall back to one by one charge */
3940 while (count--) {
3941 if (signal_pending(current)) {
3942 ret = -EINTR;
3943 break;
3944 }
3945 if (!batch_count--) {
3946 batch_count = PRECHARGE_COUNT_AT_ONCE;
3947 cond_resched();
3948 }
3949 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
3950 if (ret || !mem)
3951 /* mem_cgroup_clear_mc() will do uncharge later */
3952 return -ENOMEM;
3953 mc.precharge++;
3954 }
3955 return ret;
3956}
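
A sketch of the precharge strategy above, assuming a plain counter in place of the
res_counter: reserve the whole batch in one step when the limit allows it,
otherwise fall back to charging page by page until the limit is hit.

#include <stdio.h>

static long limit = 1000, usage;

static int charge(long pages)		/* res_counter_charge() analogue */
{
	if (usage + pages > limit)
		return -1;
	usage += pages;
	return 0;
}

static long precharge(long count)
{
	long precharged = 0;

	if (count > 1 && charge(count) == 0)
		return count;		/* fast path: whole batch at once */

	while (count--) {		/* one_by_one fallback */
		if (charge(1))
			break;
		precharged++;
	}
	return precharged;
}

int main(void)
{
	usage = 900;
	printf("precharged %ld of 256 pages\n", precharge(256));
	return 0;
}
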
3957
3958/**
3959 * is_target_pte_for_mc - check whether a pte is a valid target for move charge
3960 * @vma: the vma the pte to be checked belongs to
3961 * @addr: the address corresponding to the pte to be checked
3962 * @ptent: the pte to be checked
3963 * @target: the pointer where the target page or swap entry is stored (can be NULL)
3964 *
3965 * Returns
3966 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
3967 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
3968 *   move charge. If @target is not NULL, the page is stored in target->page
3969 *   with an extra refcount taken (callers should handle it).
3970 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
3971 *   target for charge migration. If @target is not NULL, the entry is stored
3972 * in target->ent.
3973 *
3974 * Called with pte lock held.
3975 */
3976union mc_target {
3977 struct page *page;
3978 swp_entry_t ent;
3979};
3980
3981enum mc_target_type {
3982 MC_TARGET_NONE, /* not used */
3983 MC_TARGET_PAGE,
3984 MC_TARGET_SWAP,
3985};
3986
3987static int is_target_pte_for_mc(struct vm_area_struct *vma,
3988 unsigned long addr, pte_t ptent, union mc_target *target)
3989{
3990 struct page *page = NULL;
3991 struct page_cgroup *pc;
3992 int ret = 0;
3993 swp_entry_t ent = { .val = 0 };
3994 int usage_count = 0;
3995 bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
3996 &mc.to->move_charge_at_immigrate);
3997
3998 if (!pte_present(ptent)) {
3999		/* TODO: handle swap of shmem/tmpfs */
4000 if (pte_none(ptent) || pte_file(ptent))
4001 return 0;
4002 else if (is_swap_pte(ptent)) {
4003 ent = pte_to_swp_entry(ptent);
4004 if (!move_anon || non_swap_entry(ent))
4005 return 0;
4006 usage_count = mem_cgroup_count_swap_user(ent, &page);
4007 }
4008 } else {
4009 page = vm_normal_page(vma, addr, ptent);
4010 if (!page || !page_mapped(page))
4011 return 0;
4012 /*
4013 * TODO: We don't move charges of file(including shmem/tmpfs)
4014 * pages for now.
4015 */
4016 if (!move_anon || !PageAnon(page))
4017 return 0;
4018 if (!get_page_unless_zero(page))
4019 return 0;
4020 usage_count = page_mapcount(page);
4021 }
4022 if (usage_count > 1) {
4023 /*
4024 * TODO: We don't move charges of shared(used by multiple
4025 * processes) pages for now.
4026 */
4027 if (page)
4028 put_page(page);
4029 return 0;
4030 }
4031 if (page) {
4032 pc = lookup_page_cgroup(page);
4033 /*
4034		 * Do only a loose check without the page_cgroup lock;
4035		 * mem_cgroup_move_account() checks whether the pc is valid under
4036		 * the lock.
4037 */
4038 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4039 ret = MC_TARGET_PAGE;
4040 if (target)
4041 target->page = page;
4042 }
4043 if (!ret || !target)
4044 put_page(page);
4045 }
4046	/* fall through to also check the swap entry */
4047 if (ent.val && do_swap_account && !ret) {
4048 unsigned short id;
4049 rcu_read_lock();
4050 id = css_id(&mc.from->css);
4051 rcu_read_unlock();
4052 if (id == lookup_swap_cgroup(ent)) {
4053 ret = MC_TARGET_SWAP;
4054 if (target)
4055 target->ent = ent;
4056 }
4057 }
4058 return ret;
4059}
4060
4061static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4062 unsigned long addr, unsigned long end,
4063 struct mm_walk *walk)
4064{
4065 struct vm_area_struct *vma = walk->private;
4066 pte_t *pte;
4067 spinlock_t *ptl;
4068
4069 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4070 for (; addr != end; pte++, addr += PAGE_SIZE)
4071 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4072 mc.precharge++; /* increment precharge temporarily */
4073 pte_unmap_unlock(pte - 1, ptl);
4074 cond_resched();
4075
4076 return 0;
4077}
4078
4079static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4080{
4081 unsigned long precharge;
4082 struct vm_area_struct *vma;
4083
4084 down_read(&mm->mmap_sem);
4085 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4086 struct mm_walk mem_cgroup_count_precharge_walk = {
4087 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4088 .mm = mm,
4089 .private = vma,
4090 };
4091 if (is_vm_hugetlb_page(vma))
4092 continue;
4093 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4094 if (vma->vm_flags & VM_SHARED)
4095 continue;
4096 walk_page_range(vma->vm_start, vma->vm_end,
4097 &mem_cgroup_count_precharge_walk);
4098 }
4099 up_read(&mm->mmap_sem);
4100
4101 precharge = mc.precharge;
4102 mc.precharge = 0;
4103
4104 return precharge;
4105}
4106
4107static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4108{
4109 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
4110}
4111
4112static void mem_cgroup_clear_mc(void)
4113{
4114 /* we must uncharge all the leftover precharges from mc.to */
4115 if (mc.precharge) {
4116 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4117 mc.precharge = 0;
4118 }
4119 /*
4120 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4121 * we must uncharge here.
4122 */
4123 if (mc.moved_charge) {
4124 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4125 mc.moved_charge = 0;
4126 }
4127 /* we must fixup refcnts and charges */
4128 if (mc.moved_swap) {
4129 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4130 /* uncharge swap account from the old cgroup */
4131 if (!mem_cgroup_is_root(mc.from))
4132 res_counter_uncharge(&mc.from->memsw,
4133 PAGE_SIZE * mc.moved_swap);
4134 __mem_cgroup_put(mc.from, mc.moved_swap);
4135
4136 if (!mem_cgroup_is_root(mc.to)) {
4137 /*
4138 * we charged both to->res and to->memsw, so we should
4139 * uncharge to->res.
4140 */
4141 res_counter_uncharge(&mc.to->res,
4142 PAGE_SIZE * mc.moved_swap);
4143 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4144 __css_put(&mc.to->css, mc.moved_swap);
4145 }
4146 /* we've already done mem_cgroup_get(mc.to) */
4147
4148 mc.moved_swap = 0;
4149 }
4150 mc.from = NULL;
4151 mc.to = NULL;
4152 mc.moving_task = NULL;
4153 wake_up_all(&mc.waitq);
4154}
4155
4156static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4157 struct cgroup *cgroup,
4158 struct task_struct *p,
4159 bool threadgroup)
4160{
4161 int ret = 0;
4162 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4163
4164 if (mem->move_charge_at_immigrate) {
4165 struct mm_struct *mm;
4166 struct mem_cgroup *from = mem_cgroup_from_task(p);
4167
4168 VM_BUG_ON(from == mem);
4169
4170 mm = get_task_mm(p);
4171 if (!mm)
4172 return 0;
4173		/* We move charges only when we move the owner of the mm */
4174 if (mm->owner == p) {
4175 VM_BUG_ON(mc.from);
4176 VM_BUG_ON(mc.to);
4177 VM_BUG_ON(mc.precharge);
4178 VM_BUG_ON(mc.moved_charge);
4179 VM_BUG_ON(mc.moved_swap);
4180 VM_BUG_ON(mc.moving_task);
4181 mc.from = from;
4182 mc.to = mem;
4183 mc.precharge = 0;
4184 mc.moved_charge = 0;
4185 mc.moved_swap = 0;
4186 mc.moving_task = current;
4187
4188 ret = mem_cgroup_precharge_mc(mm);
4189 if (ret)
4190 mem_cgroup_clear_mc();
4191 }
4192 mmput(mm);
4193 }
4194 return ret;
4195}
4196
4197static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4198 struct cgroup *cgroup,
4199 struct task_struct *p,
4200 bool threadgroup)
4201{
4202 mem_cgroup_clear_mc();
4203}
4204
4205static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4206 unsigned long addr, unsigned long end,
4207 struct mm_walk *walk)
4208{
4209 int ret = 0;
4210 struct vm_area_struct *vma = walk->private;
4211 pte_t *pte;
4212 spinlock_t *ptl;
4213
4214retry:
4215 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4216 for (; addr != end; addr += PAGE_SIZE) {
4217 pte_t ptent = *(pte++);
4218 union mc_target target;
4219 int type;
4220 struct page *page;
4221 struct page_cgroup *pc;
4222 swp_entry_t ent;
4223
4224 if (!mc.precharge)
4225 break;
4226
4227 type = is_target_pte_for_mc(vma, addr, ptent, &target);
4228 switch (type) {
4229 case MC_TARGET_PAGE:
4230 page = target.page;
4231 if (isolate_lru_page(page))
4232 goto put;
4233 pc = lookup_page_cgroup(page);
4234 if (!mem_cgroup_move_account(pc,
4235 mc.from, mc.to, false)) {
4236 mc.precharge--;
4237 /* we uncharge from mc.from later. */
4238 mc.moved_charge++;
4239 }
4240 putback_lru_page(page);
4241put: /* is_target_pte_for_mc() gets the page */
4242 put_page(page);
4243 break;
4244 case MC_TARGET_SWAP:
4245 ent = target.ent;
4246 if (!mem_cgroup_move_swap_account(ent,
4247 mc.from, mc.to, false)) {
4248 mc.precharge--;
4249 /* we fixup refcnts and charges later. */
4250 mc.moved_swap++;
4251 }
4252 break;
4253 default:
4254 break;
4255 }
4256 }
4257 pte_unmap_unlock(pte - 1, ptl);
4258 cond_resched();
4259
4260 if (addr != end) {
4261 /*
4262 * We have consumed all precharges we got in can_attach().
4263		 * We try to charge one by one, but we don't do any additional
4264		 * charges to mc.to if we have failed to charge once in the
4265		 * attach() phase.
4266 */
4267 ret = mem_cgroup_do_precharge(1);
4268 if (!ret)
4269 goto retry;
4270 }
4271
4272 return ret;
4273}
4274
4275static void mem_cgroup_move_charge(struct mm_struct *mm)
4276{
4277 struct vm_area_struct *vma;
4278
4279 lru_add_drain_all();
4280 down_read(&mm->mmap_sem);
4281 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4282 int ret;
4283 struct mm_walk mem_cgroup_move_charge_walk = {
4284 .pmd_entry = mem_cgroup_move_charge_pte_range,
4285 .mm = mm,
4286 .private = vma,
4287 };
4288 if (is_vm_hugetlb_page(vma))
4289 continue;
4290 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4291 if (vma->vm_flags & VM_SHARED)
4292 continue;
4293 ret = walk_page_range(vma->vm_start, vma->vm_end,
4294 &mem_cgroup_move_charge_walk);
4295 if (ret)
4296 /*
4297			 * This means we have consumed all precharges and failed to
4298			 * do an additional charge. Just abandon here.
4299 */
4300 break;
4301 }
4302 up_read(&mm->mmap_sem);
4303}
4304
3384static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4305static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3385 struct cgroup *cont, 4306 struct cgroup *cont,
3386 struct cgroup *old_cont, 4307 struct cgroup *old_cont,
3387 struct task_struct *p, 4308 struct task_struct *p,
3388 bool threadgroup) 4309 bool threadgroup)
3389{ 4310{
3390 /* 4311 struct mm_struct *mm;
3391 * FIXME: It's better to move charges of this process from old 4312
3392 * memcg to new memcg. But it's just on TODO-List now. 4313 if (!mc.to)
3393 */ 4314 /* no need to move charge */
4315 return;
4316
4317 mm = get_task_mm(p);
4318 if (mm) {
4319 mem_cgroup_move_charge(mm);
4320 mmput(mm);
4321 }
4322 mem_cgroup_clear_mc();
4323}
4324#else /* !CONFIG_MMU */
4325static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4326 struct cgroup *cgroup,
4327 struct task_struct *p,
4328 bool threadgroup)
4329{
4330 return 0;
4331}
4332static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4333 struct cgroup *cgroup,
4334 struct task_struct *p,
4335 bool threadgroup)
4336{
3394} 4337}
4338static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4339 struct cgroup *cont,
4340 struct cgroup *old_cont,
4341 struct task_struct *p,
4342 bool threadgroup)
4343{
4344}
4345#endif
3395 4346
3396struct cgroup_subsys mem_cgroup_subsys = { 4347struct cgroup_subsys mem_cgroup_subsys = {
3397 .name = "memory", 4348 .name = "memory",
@@ -3400,6 +4351,8 @@ struct cgroup_subsys mem_cgroup_subsys = {
3400 .pre_destroy = mem_cgroup_pre_destroy, 4351 .pre_destroy = mem_cgroup_pre_destroy,
3401 .destroy = mem_cgroup_destroy, 4352 .destroy = mem_cgroup_destroy,
3402 .populate = mem_cgroup_populate, 4353 .populate = mem_cgroup_populate,
4354 .can_attach = mem_cgroup_can_attach,
4355 .cancel_attach = mem_cgroup_cancel_attach,
3403 .attach = mem_cgroup_move_task, 4356 .attach = mem_cgroup_move_task,
3404 .early_init = 0, 4357 .early_init = 0,
3405 .use_id = 1, 4358 .use_id = 1,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 17299fd4577c..620b0b461593 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -44,6 +44,7 @@
44#include <linux/migrate.h> 44#include <linux/migrate.h>
45#include <linux/page-isolation.h> 45#include <linux/page-isolation.h>
46#include <linux/suspend.h> 46#include <linux/suspend.h>
47#include <linux/slab.h>
47#include "internal.h" 48#include "internal.h"
48 49
49int sysctl_memory_failure_early_kill __read_mostly = 0; 50int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -383,9 +384,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
383 if (av == NULL) /* Not actually mapped anymore */ 384 if (av == NULL) /* Not actually mapped anymore */
384 goto out; 385 goto out;
385 for_each_process (tsk) { 386 for_each_process (tsk) {
387 struct anon_vma_chain *vmac;
388
386 if (!task_early_kill(tsk)) 389 if (!task_early_kill(tsk))
387 continue; 390 continue;
388 list_for_each_entry (vma, &av->head, anon_vma_node) { 391 list_for_each_entry(vmac, &av->head, same_anon_vma) {
392 vma = vmac->vma;
389 if (!page_mapped_in_vma(page, vma)) 393 if (!page_mapped_in_vma(page, vma))
390 continue; 394 continue;
391 if (vma->vm_mm == tsk->mm) 395 if (vma->vm_mm == tsk->mm)
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1be7b67..833952d8b74d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -56,6 +56,7 @@
56#include <linux/kallsyms.h> 56#include <linux/kallsyms.h>
57#include <linux/swapops.h> 57#include <linux/swapops.h>
58#include <linux/elf.h> 58#include <linux/elf.h>
59#include <linux/gfp.h>
59 60
60#include <asm/io.h> 61#include <asm/io.h>
61#include <asm/pgalloc.h> 62#include <asm/pgalloc.h>
@@ -121,6 +122,77 @@ static int __init init_zero_pfn(void)
121} 122}
122core_initcall(init_zero_pfn); 123core_initcall(init_zero_pfn);
123 124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0;
136 }
137 }
138 task->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153/* sync counter once per 64 page faults */
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167 /*
168	 * Don't use task->mm here, to avoid having to call get_task_mm().
169	 * The caller must guarantee that task->mm is valid.
170 */
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172 /*
173	 * The counter is updated asynchronously and may temporarily go negative,
174	 * but a negative value is never what callers expect, so clamp it to 0.
175 */
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184}
185#else
186
187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
189
190static void check_sync_rss_stat(struct task_struct *task)
191{
192}
193
194#endif
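
A single-threaded userspace model of the SPLIT_RSS_COUNTING scheme introduced
above: each task batches counter updates privately and folds them into the shared
mm counters only every TASK_RSS_EVENTS_THRESH events. Struct and function names
below are illustrative, not the kernel ones.

#include <stdio.h>

enum { MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS, NR_MM_COUNTERS };
#define TASK_RSS_EVENTS_THRESH 64

struct fake_mm   { long count[NR_MM_COUNTERS]; };		/* shared, "expensive" */
struct fake_task { long count[NR_MM_COUNTERS]; int events; };	/* private, cheap */

static void sync_rss(struct fake_task *t, struct fake_mm *mm)
{
	for (int i = 0; i < NR_MM_COUNTERS; i++) {
		mm->count[i] += t->count[i];
		t->count[i] = 0;
	}
	t->events = 0;
}

static void add_mm_counter_fast(struct fake_task *t, struct fake_mm *mm,
				int member, long val)
{
	t->count[member] += val;			/* cheap, task-local update */
	if (++t->events > TASK_RSS_EVENTS_THRESH)
		sync_rss(t, mm);			/* occasional flush */
}

int main(void)
{
	struct fake_mm mm = { { 0 } };
	struct fake_task task = { { 0 }, 0 };

	for (int i = 0; i < 100; i++)
		add_mm_counter_fast(&task, &mm, MM_ANONPAGES, 1);
	printf("mm sees %ld anon pages, %ld still pending in the task\n",
	       mm.count[MM_ANONPAGES], task.count[MM_ANONPAGES]);
	return 0;
}
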
195
124/* 196/*
125 * If a p?d_bad entry is found while walking page tables, report 197 * If a p?d_bad entry is found while walking page tables, report
126 * the error, before resetting entry to p?d_none. Usually (but 198 * the error, before resetting entry to p?d_none. Usually (but
@@ -300,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
300 * Hide vma from rmap and truncate_pagecache before freeing 372 * Hide vma from rmap and truncate_pagecache before freeing
301 * pgtables 373 * pgtables
302 */ 374 */
303 anon_vma_unlink(vma); 375 unlink_anon_vmas(vma);
304 unlink_file_vma(vma); 376 unlink_file_vma(vma);
305 377
306 if (is_vm_hugetlb_page(vma)) { 378 if (is_vm_hugetlb_page(vma)) {
@@ -314,7 +386,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
314 && !is_vm_hugetlb_page(next)) { 386 && !is_vm_hugetlb_page(next)) {
315 vma = next; 387 vma = next;
316 next = vma->vm_next; 388 next = vma->vm_next;
317 anon_vma_unlink(vma); 389 unlink_anon_vmas(vma);
318 unlink_file_vma(vma); 390 unlink_file_vma(vma);
319 } 391 }
320 free_pgd_range(tlb, addr, vma->vm_end, 392 free_pgd_range(tlb, addr, vma->vm_end,
@@ -376,12 +448,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
376 return 0; 448 return 0;
377} 449}
378 450
379static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) 451static inline void init_rss_vec(int *rss)
380{ 452{
381 if (file_rss) 453 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
382 add_mm_counter(mm, file_rss, file_rss); 454}
383 if (anon_rss) 455
384 add_mm_counter(mm, anon_rss, anon_rss); 456static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
457{
458 int i;
459
460 if (current->mm == mm)
461 sync_mm_rss(current, mm);
462 for (i = 0; i < NR_MM_COUNTERS; i++)
463 if (rss[i])
464 add_mm_counter(mm, i, rss[i]);
385} 465}
386 466
387/* 467/*
@@ -430,12 +510,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
430 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 510 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
431 current->comm, 511 current->comm,
432 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 512 (long long)pte_val(pte), (long long)pmd_val(*pmd));
433 if (page) { 513 if (page)
434 printk(KERN_ALERT 514 dump_page(page);
435 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
436 page, (void *)page->flags, page_count(page),
437 page_mapcount(page), page->mapping, page->index);
438 }
439 printk(KERN_ALERT 515 printk(KERN_ALERT
440 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 516 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
441 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 517 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -597,7 +673,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
597 &src_mm->mmlist); 673 &src_mm->mmlist);
598 spin_unlock(&mmlist_lock); 674 spin_unlock(&mmlist_lock);
599 } 675 }
600 if (is_write_migration_entry(entry) && 676 if (likely(!non_swap_entry(entry)))
677 rss[MM_SWAPENTS]++;
678 else if (is_write_migration_entry(entry) &&
601 is_cow_mapping(vm_flags)) { 679 is_cow_mapping(vm_flags)) {
602 /* 680 /*
603 * COW mappings require pages in both parent 681 * COW mappings require pages in both parent
@@ -632,7 +710,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
632 if (page) { 710 if (page) {
633 get_page(page); 711 get_page(page);
634 page_dup_rmap(page); 712 page_dup_rmap(page);
635 rss[PageAnon(page)]++; 713 if (PageAnon(page))
714 rss[MM_ANONPAGES]++;
715 else
716 rss[MM_FILEPAGES]++;
636 } 717 }
637 718
638out_set_pte: 719out_set_pte:
@@ -648,11 +729,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
648 pte_t *src_pte, *dst_pte; 729 pte_t *src_pte, *dst_pte;
649 spinlock_t *src_ptl, *dst_ptl; 730 spinlock_t *src_ptl, *dst_ptl;
650 int progress = 0; 731 int progress = 0;
651 int rss[2]; 732 int rss[NR_MM_COUNTERS];
652 swp_entry_t entry = (swp_entry_t){0}; 733 swp_entry_t entry = (swp_entry_t){0};
653 734
654again: 735again:
655 rss[1] = rss[0] = 0; 736 init_rss_vec(rss);
737
656 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 738 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
657 if (!dst_pte) 739 if (!dst_pte)
658 return -ENOMEM; 740 return -ENOMEM;
@@ -688,7 +770,7 @@ again:
688 arch_leave_lazy_mmu_mode(); 770 arch_leave_lazy_mmu_mode();
689 spin_unlock(src_ptl); 771 spin_unlock(src_ptl);
690 pte_unmap_nested(orig_src_pte); 772 pte_unmap_nested(orig_src_pte);
691 add_mm_rss(dst_mm, rss[0], rss[1]); 773 add_mm_rss_vec(dst_mm, rss);
692 pte_unmap_unlock(orig_dst_pte, dst_ptl); 774 pte_unmap_unlock(orig_dst_pte, dst_ptl);
693 cond_resched(); 775 cond_resched();
694 776
@@ -816,8 +898,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
816 struct mm_struct *mm = tlb->mm; 898 struct mm_struct *mm = tlb->mm;
817 pte_t *pte; 899 pte_t *pte;
818 spinlock_t *ptl; 900 spinlock_t *ptl;
819 int file_rss = 0; 901 int rss[NR_MM_COUNTERS];
820 int anon_rss = 0; 902
903 init_rss_vec(rss);
821 904
822 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 905 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
823 arch_enter_lazy_mmu_mode(); 906 arch_enter_lazy_mmu_mode();
@@ -863,14 +946,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
863 set_pte_at(mm, addr, pte, 946 set_pte_at(mm, addr, pte,
864 pgoff_to_pte(page->index)); 947 pgoff_to_pte(page->index));
865 if (PageAnon(page)) 948 if (PageAnon(page))
866 anon_rss--; 949 rss[MM_ANONPAGES]--;
867 else { 950 else {
868 if (pte_dirty(ptent)) 951 if (pte_dirty(ptent))
869 set_page_dirty(page); 952 set_page_dirty(page);
870 if (pte_young(ptent) && 953 if (pte_young(ptent) &&
871 likely(!VM_SequentialReadHint(vma))) 954 likely(!VM_SequentialReadHint(vma)))
872 mark_page_accessed(page); 955 mark_page_accessed(page);
873 file_rss--; 956 rss[MM_FILEPAGES]--;
874 } 957 }
875 page_remove_rmap(page); 958 page_remove_rmap(page);
876 if (unlikely(page_mapcount(page) < 0)) 959 if (unlikely(page_mapcount(page) < 0))
@@ -887,13 +970,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
887 if (pte_file(ptent)) { 970 if (pte_file(ptent)) {
888 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 971 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
889 print_bad_pte(vma, addr, ptent, NULL); 972 print_bad_pte(vma, addr, ptent, NULL);
890 } else if 973 } else {
891 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) 974 swp_entry_t entry = pte_to_swp_entry(ptent);
892 print_bad_pte(vma, addr, ptent, NULL); 975
976 if (!non_swap_entry(entry))
977 rss[MM_SWAPENTS]--;
978 if (unlikely(!free_swap_and_cache(entry)))
979 print_bad_pte(vma, addr, ptent, NULL);
980 }
893 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 981 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
894 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 982 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
895 983
896 add_mm_rss(mm, file_rss, anon_rss); 984 add_mm_rss_vec(mm, rss);
897 arch_leave_lazy_mmu_mode(); 985 arch_leave_lazy_mmu_mode();
898 pte_unmap_unlock(pte - 1, ptl); 986 pte_unmap_unlock(pte - 1, ptl);
899 987
@@ -1527,7 +1615,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1527 1615
1528 /* Ok, finally just insert the thing.. */ 1616 /* Ok, finally just insert the thing.. */
1529 get_page(page); 1617 get_page(page);
1530 inc_mm_counter(mm, file_rss); 1618 inc_mm_counter_fast(mm, MM_FILEPAGES);
1531 page_add_file_rmap(page); 1619 page_add_file_rmap(page);
1532 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1620 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1533 1621
@@ -1593,7 +1681,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1593 /* Ok, finally just insert the thing.. */ 1681 /* Ok, finally just insert the thing.. */
1594 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1682 entry = pte_mkspecial(pfn_pte(pfn, prot));
1595 set_pte_at(mm, addr, pte, entry); 1683 set_pte_at(mm, addr, pte, entry);
1596 update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ 1684 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1597 1685
1598 retval = 0; 1686 retval = 0;
1599out_unlock: 1687out_unlock:
@@ -2044,6 +2132,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2044 page_cache_release(old_page); 2132 page_cache_release(old_page);
2045 } 2133 }
2046 reuse = reuse_swap_page(old_page); 2134 reuse = reuse_swap_page(old_page);
2135 if (reuse)
2136 /*
2137 * The page is all ours. Move it to our anon_vma so
2138 * the rmap code will not search our parent or siblings.
2139 * Protected against the rmap code by the page lock.
2140 */
2141 page_move_anon_rmap(old_page, vma, address);
2047 unlock_page(old_page); 2142 unlock_page(old_page);
2048 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2143 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2049 (VM_WRITE|VM_SHARED))) { 2144 (VM_WRITE|VM_SHARED))) {
@@ -2116,7 +2211,7 @@ reuse:
2116 entry = pte_mkyoung(orig_pte); 2211 entry = pte_mkyoung(orig_pte);
2117 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2212 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2118 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2213 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2119 update_mmu_cache(vma, address, entry); 2214 update_mmu_cache(vma, address, page_table);
2120 ret |= VM_FAULT_WRITE; 2215 ret |= VM_FAULT_WRITE;
2121 goto unlock; 2216 goto unlock;
2122 } 2217 }
@@ -2163,11 +2258,11 @@ gotten:
2163 if (likely(pte_same(*page_table, orig_pte))) { 2258 if (likely(pte_same(*page_table, orig_pte))) {
2164 if (old_page) { 2259 if (old_page) {
2165 if (!PageAnon(old_page)) { 2260 if (!PageAnon(old_page)) {
2166 dec_mm_counter(mm, file_rss); 2261 dec_mm_counter_fast(mm, MM_FILEPAGES);
2167 inc_mm_counter(mm, anon_rss); 2262 inc_mm_counter_fast(mm, MM_ANONPAGES);
2168 } 2263 }
2169 } else 2264 } else
2170 inc_mm_counter(mm, anon_rss); 2265 inc_mm_counter_fast(mm, MM_ANONPAGES);
2171 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2266 flush_cache_page(vma, address, pte_pfn(orig_pte));
2172 entry = mk_pte(new_page, vma->vm_page_prot); 2267 entry = mk_pte(new_page, vma->vm_page_prot);
2173 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2268 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2185,7 +2280,7 @@ gotten:
2185 * new page to be mapped directly into the secondary page table. 2280 * new page to be mapped directly into the secondary page table.
2186 */ 2281 */
2187 set_pte_at_notify(mm, address, page_table, entry); 2282 set_pte_at_notify(mm, address, page_table, entry);
2188 update_mmu_cache(vma, address, entry); 2283 update_mmu_cache(vma, address, page_table);
2189 if (old_page) { 2284 if (old_page) {
2190 /* 2285 /*
2191 * Only after switching the pte to the new page may 2286 * Only after switching the pte to the new page may
@@ -2604,7 +2699,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2604 * discarded at swap_free(). 2699 * discarded at swap_free().
2605 */ 2700 */
2606 2701
2607 inc_mm_counter(mm, anon_rss); 2702 inc_mm_counter_fast(mm, MM_ANONPAGES);
2703 dec_mm_counter_fast(mm, MM_SWAPENTS);
2608 pte = mk_pte(page, vma->vm_page_prot); 2704 pte = mk_pte(page, vma->vm_page_prot);
2609 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2705 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2610 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2706 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2629,7 +2725,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2629 } 2725 }
2630 2726
2631 /* No need to invalidate - it was non-present before */ 2727 /* No need to invalidate - it was non-present before */
2632 update_mmu_cache(vma, address, pte); 2728 update_mmu_cache(vma, address, page_table);
2633unlock: 2729unlock:
2634 pte_unmap_unlock(page_table, ptl); 2730 pte_unmap_unlock(page_table, ptl);
2635out: 2731out:
@@ -2688,13 +2784,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2688 if (!pte_none(*page_table)) 2784 if (!pte_none(*page_table))
2689 goto release; 2785 goto release;
2690 2786
2691 inc_mm_counter(mm, anon_rss); 2787 inc_mm_counter_fast(mm, MM_ANONPAGES);
2692 page_add_new_anon_rmap(page, vma, address); 2788 page_add_new_anon_rmap(page, vma, address);
2693setpte: 2789setpte:
2694 set_pte_at(mm, address, page_table, entry); 2790 set_pte_at(mm, address, page_table, entry);
2695 2791
2696 /* No need to invalidate - it was non-present before */ 2792 /* No need to invalidate - it was non-present before */
2697 update_mmu_cache(vma, address, entry); 2793 update_mmu_cache(vma, address, page_table);
2698unlock: 2794unlock:
2699 pte_unmap_unlock(page_table, ptl); 2795 pte_unmap_unlock(page_table, ptl);
2700 return 0; 2796 return 0;
@@ -2842,10 +2938,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2842 if (flags & FAULT_FLAG_WRITE) 2938 if (flags & FAULT_FLAG_WRITE)
2843 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2939 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2844 if (anon) { 2940 if (anon) {
2845 inc_mm_counter(mm, anon_rss); 2941 inc_mm_counter_fast(mm, MM_ANONPAGES);
2846 page_add_new_anon_rmap(page, vma, address); 2942 page_add_new_anon_rmap(page, vma, address);
2847 } else { 2943 } else {
2848 inc_mm_counter(mm, file_rss); 2944 inc_mm_counter_fast(mm, MM_FILEPAGES);
2849 page_add_file_rmap(page); 2945 page_add_file_rmap(page);
2850 if (flags & FAULT_FLAG_WRITE) { 2946 if (flags & FAULT_FLAG_WRITE) {
2851 dirty_page = page; 2947 dirty_page = page;
@@ -2855,7 +2951,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2855 set_pte_at(mm, address, page_table, entry); 2951 set_pte_at(mm, address, page_table, entry);
2856 2952
2857 /* no need to invalidate: a not-present page won't be cached */ 2953 /* no need to invalidate: a not-present page won't be cached */
2858 update_mmu_cache(vma, address, entry); 2954 update_mmu_cache(vma, address, page_table);
2859 } else { 2955 } else {
2860 if (charged) 2956 if (charged)
2861 mem_cgroup_uncharge_page(page); 2957 mem_cgroup_uncharge_page(page);
@@ -2992,7 +3088,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2992 } 3088 }
2993 entry = pte_mkyoung(entry); 3089 entry = pte_mkyoung(entry);
2994 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3090 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
2995 update_mmu_cache(vma, address, entry); 3091 update_mmu_cache(vma, address, pte);
2996 } else { 3092 } else {
2997 /* 3093 /*
2998 * This is needed only for protection faults but the arch code 3094 * This is needed only for protection faults but the arch code
@@ -3023,6 +3119,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3023 3119
3024 count_vm_event(PGFAULT); 3120 count_vm_event(PGFAULT);
3025 3121
3122 /* do counter updates before entering really critical section. */
3123 check_sync_rss_stat(current);
3124
3026 if (unlikely(is_vm_hugetlb_page(vma))) 3125 if (unlikely(is_vm_hugetlb_page(vma)))
3027 return hugetlb_fault(mm, vma, address, flags); 3126 return hugetlb_fault(mm, vma, address, flags);
3028 3127
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 030ce8a5bb0e..be211a582930 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -28,6 +28,7 @@
28#include <linux/pfn.h> 28#include <linux/pfn.h>
29#include <linux/suspend.h> 29#include <linux/suspend.h>
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h>
31 32
32#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
33 34
@@ -523,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size)
523 BUG_ON(ret); 524 BUG_ON(ret);
524 } 525 }
525 526
527 /* create new memmap entry */
528 firmware_map_add_hotplug(start, start + size, "System RAM");
529
526 goto out; 530 goto out;
527 531
528error: 532error:
@@ -684,9 +688,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
684 if (page_count(page)) 688 if (page_count(page))
685 not_managed++; 689 not_managed++;
686#ifdef CONFIG_DEBUG_VM 690#ifdef CONFIG_DEBUG_VM
687 printk(KERN_INFO "removing from LRU failed" 691 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
688 " %lx/%d/%lx\n", 692 pfn);
689 pfn, page_count(page), page->flags); 693 dump_page(page);
690#endif 694#endif
691 } 695 }
692 } 696 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 290fb5bf0440..08f40a2f3fe0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,7 +73,6 @@
73#include <linux/sched.h> 73#include <linux/sched.h>
74#include <linux/nodemask.h> 74#include <linux/nodemask.h>
75#include <linux/cpuset.h> 75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h> 76#include <linux/slab.h>
78#include <linux/string.h> 77#include <linux/string.h>
79#include <linux/module.h> 78#include <linux/module.h>
@@ -563,24 +562,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
563} 562}
564 563
565/* Step 2: apply policy to a range and do splits. */ 564/* Step 2: apply policy to a range and do splits. */
566static int mbind_range(struct vm_area_struct *vma, unsigned long start, 565static int mbind_range(struct mm_struct *mm, unsigned long start,
567 unsigned long end, struct mempolicy *new) 566 unsigned long end, struct mempolicy *new_pol)
568{ 567{
569 struct vm_area_struct *next; 568 struct vm_area_struct *next;
570 int err; 569 struct vm_area_struct *prev;
570 struct vm_area_struct *vma;
571 int err = 0;
572 pgoff_t pgoff;
573 unsigned long vmstart;
574 unsigned long vmend;
571 575
572 err = 0; 576 vma = find_vma_prev(mm, start, &prev);
573 for (; vma && vma->vm_start < end; vma = next) { 577 if (!vma || vma->vm_start > start)
578 return -EFAULT;
579
580 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
574 next = vma->vm_next; 581 next = vma->vm_next;
575 if (vma->vm_start < start) 582 vmstart = max(start, vma->vm_start);
576 err = split_vma(vma->vm_mm, vma, start, 1); 583 vmend = min(end, vma->vm_end);
577 if (!err && vma->vm_end > end) 584
578 err = split_vma(vma->vm_mm, vma, end, 0); 585 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
579 if (!err) 586 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
580 err = policy_vma(vma, new); 587 vma->anon_vma, vma->vm_file, pgoff, new_pol);
588 if (prev) {
589 vma = prev;
590 next = vma->vm_next;
591 continue;
592 }
593 if (vma->vm_start != vmstart) {
594 err = split_vma(vma->vm_mm, vma, vmstart, 1);
595 if (err)
596 goto out;
597 }
598 if (vma->vm_end != vmend) {
599 err = split_vma(vma->vm_mm, vma, vmend, 0);
600 if (err)
601 goto out;
602 }
603 err = policy_vma(vma, new_pol);
581 if (err) 604 if (err)
582 break; 605 goto out;
583 } 606 }
607
608 out:
584 return err; 609 return err;
585} 610}
586 611
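The rewritten mbind_range() above walks each vma overlapping [start, end), clamps the request to [vmstart, vmend), first tries vma_merge() with the predecessor, and otherwise splits the vma at whichever clamped boundary falls inside it before stamping the new policy. The toy program below shows only that clamp-and-split bookkeeping on a plain interval list; types and names are illustrative, and merging, error handling and freeing are left out.

#include <stdio.h>
#include <stdlib.h>

struct range {                          /* stand-in for a vma: [start, end) */
    unsigned long start, end;
    int policy;
    struct range *next;
};

/* Split one range at addr and return the new tail half. */
static struct range *split_at(struct range *r, unsigned long addr)
{
    struct range *tail = malloc(sizeof(*tail));

    *tail = *r;                         /* copy, then shrink the original */
    tail->start = addr;
    r->end = addr;
    r->next = tail;
    return tail;
}

static void apply_policy(struct range *head, unsigned long start,
                         unsigned long end, int policy)
{
    struct range *r;

    for (r = head; r && r->start < end; r = r->next) {
        unsigned long s = r->start > start ? r->start : start;  /* vmstart */
        unsigned long e = r->end < end ? r->end : end;          /* vmend */

        if (s >= e)
            continue;                   /* no overlap with [start, end) */
        if (r->start < s)
            r = split_at(r, s);         /* keep the untouched head part */
        if (r->end > e)
            split_at(r, e);             /* keep the untouched tail part */
        r->policy = policy;
    }
}

int main(void)
{
    struct range *head = malloc(sizeof(*head)), *r;

    *head = (struct range){ .start = 0, .end = 100, .policy = 0 };
    apply_policy(head, 30, 70, 1);
    for (r = head; r; r = r->next)
        printf("[%lu,%lu) policy=%d\n", r->start, r->end, r->policy);
    return 0;                           /* prints [0,30) 0, [30,70) 1, [70,100) 0 */
}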
@@ -780,9 +805,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
780 805
781 err = 0; 806 err = 0;
782 if (nmask) { 807 if (nmask) {
783 task_lock(current); 808 if (mpol_store_user_nodemask(pol)) {
784 get_policy_nodemask(pol, nmask); 809 *nmask = pol->w.user_nodemask;
785 task_unlock(current); 810 } else {
811 task_lock(current);
812 get_policy_nodemask(pol, nmask);
813 task_unlock(current);
814 }
786 } 815 }
787 816
788 out: 817 out:
@@ -862,36 +891,36 @@ int do_migrate_pages(struct mm_struct *mm,
862 if (err) 891 if (err)
863 goto out; 892 goto out;
864 893
865/* 894 /*
866 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 895 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
867 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 896 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
868 * bit in 'tmp', and return that <source, dest> pair for migration. 897 * bit in 'tmp', and return that <source, dest> pair for migration.
869 * The pair of nodemasks 'to' and 'from' define the map. 898 * The pair of nodemasks 'to' and 'from' define the map.
870 * 899 *
871 * If no pair of bits is found that way, fallback to picking some 900 * If no pair of bits is found that way, fallback to picking some
872 * pair of 'source' and 'dest' bits that are not the same. If the 901 * pair of 'source' and 'dest' bits that are not the same. If the
873 * 'source' and 'dest' bits are the same, this represents a node 902 * 'source' and 'dest' bits are the same, this represents a node
874 * that will be migrating to itself, so no pages need move. 903 * that will be migrating to itself, so no pages need move.
875 * 904 *
876 * If no bits are left in 'tmp', or if all remaining bits left 905 * If no bits are left in 'tmp', or if all remaining bits left
877 * in 'tmp' correspond to the same bit in 'to', return false 906 * in 'tmp' correspond to the same bit in 'to', return false
878 * (nothing left to migrate). 907 * (nothing left to migrate).
879 * 908 *
880 * This lets us pick a pair of nodes to migrate between, such that 909 * This lets us pick a pair of nodes to migrate between, such that
881 * if possible the dest node is not already occupied by some other 910 * if possible the dest node is not already occupied by some other
882 * source node, minimizing the risk of overloading the memory on a 911 * source node, minimizing the risk of overloading the memory on a
883 * node that would happen if we migrated incoming memory to a node 912 * node that would happen if we migrated incoming memory to a node
884 * before migrating outgoing memory source that same node. 913 * before migrating outgoing memory source that same node.
885 * 914 *
886 * A single scan of tmp is sufficient. As we go, we remember the 915 * A single scan of tmp is sufficient. As we go, we remember the
887 * most recent <s, d> pair that moved (s != d). If we find a pair 916 * most recent <s, d> pair that moved (s != d). If we find a pair
888 * that not only moved, but what's better, moved to an empty slot 917 * that not only moved, but what's better, moved to an empty slot
889 * (d is not set in tmp), then we break out then, with that pair. 918 * (d is not set in tmp), then we break out then, with that pair.
890 * Otherwise when we finish scanning from_tmp, we at least have the 919 * Otherwise when we finish scanning from_tmp, we at least have the
891 * most recent <s, d> pair that moved. If we get all the way through 920 * most recent <s, d> pair that moved. If we get all the way through
892 * the scan of tmp without finding any node that moved, much less 921 * the scan of tmp without finding any node that moved, much less
893 * moved to an empty node, then there is nothing left worth migrating. 922 * moved to an empty node, then there is nothing left worth migrating.
894 */ 923 */
895 924
896 tmp = *from_nodes; 925 tmp = *from_nodes;
897 while (!nodes_empty(tmp)) { 926 while (!nodes_empty(tmp)) {
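To make the scan described in the comment above concrete: the code repeatedly picks a <source, dest> pair from the remaining source nodes, preferring a dest that is not itself still pending as a source, and stops once only self-migrations remain. Below is a stand-alone sketch with plain bitmasks in place of nodemasks; remap_node() is a simplified stand-in for the kernel's ordinal remapping, and the 8-node limit and all names are illustrative.

#include <stdio.h>

#define MAX_NODES 8

/* Map the i-th set bit of 'from' to the i-th set bit of 'to', wrapping. */
static int remap_node(int s, unsigned from, unsigned to)
{
    int i, ord = 0, n = 0;

    for (i = 0; i < s; i++)
        if (from & (1u << i))
            ord++;                      /* ordinal of s within 'from' */
    for (i = 0; i < MAX_NODES; i++)
        if (to & (1u << i))
            n++;                        /* number of destination nodes */
    if (n == 0)
        return s;
    ord %= n;
    for (i = 0; i < MAX_NODES; i++)
        if ((to & (1u << i)) && ord-- == 0)
            return i;
    return s;
}

int main(void)
{
    unsigned from = 0x07, to = 0x18;    /* migrate nodes {0,1,2} to {3,4} */
    unsigned tmp = from;

    while (tmp) {
        int s, d, source = -1, dest = -1;

        for (s = 0; s < MAX_NODES; s++) {
            if (!(tmp & (1u << s)))
                continue;
            d = remap_node(s, from, to);
            if (s == d)
                continue;               /* would migrate to itself: skip */
            source = s;
            dest = d;
            if (!(tmp & (1u << d)))
                break;                  /* dest is not also a source: best pick */
        }
        if (source == -1)
            break;                      /* nothing left worth migrating */
        printf("migrate pages: node %d -> node %d\n", source, dest);
        tmp &= ~(1u << source);         /* this source has been handled */
    }
    return 0;
}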
@@ -1047,7 +1076,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1047 if (!IS_ERR(vma)) { 1076 if (!IS_ERR(vma)) {
1048 int nr_failed = 0; 1077 int nr_failed = 0;
1049 1078
1050 err = mbind_range(vma, start, end, new); 1079 err = mbind_range(mm, start, end, new);
1051 1080
1052 if (!list_empty(&pagelist)) 1081 if (!list_empty(&pagelist))
1053 nr_failed = migrate_pages(&pagelist, new_vma_page, 1082 nr_failed = migrate_pages(&pagelist, new_vma_page,
@@ -1730,10 +1759,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
1730 1759
1731 if (!new) 1760 if (!new)
1732 return ERR_PTR(-ENOMEM); 1761 return ERR_PTR(-ENOMEM);
1762 rcu_read_lock();
1733 if (current_cpuset_is_being_rebound()) { 1763 if (current_cpuset_is_being_rebound()) {
1734 nodemask_t mems = cpuset_mems_allowed(current); 1764 nodemask_t mems = cpuset_mems_allowed(current);
1735 mpol_rebind_policy(old, &mems); 1765 mpol_rebind_policy(old, &mems);
1736 } 1766 }
1767 rcu_read_unlock();
1737 *new = *old; 1768 *new = *old;
1738 atomic_set(&new->refcnt, 1); 1769 atomic_set(&new->refcnt, 1);
1739 return new; 1770 return new;
@@ -2167,8 +2198,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2167 char *rest = nodelist; 2198 char *rest = nodelist;
2168 while (isdigit(*rest)) 2199 while (isdigit(*rest))
2169 rest++; 2200 rest++;
2170 if (!*rest) 2201 if (*rest)
2171 err = 0; 2202 goto out;
2172 } 2203 }
2173 break; 2204 break;
2174 case MPOL_INTERLEAVE: 2205 case MPOL_INTERLEAVE:
@@ -2177,7 +2208,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2177 */ 2208 */
2178 if (!nodelist) 2209 if (!nodelist)
2179 nodes = node_states[N_HIGH_MEMORY]; 2210 nodes = node_states[N_HIGH_MEMORY];
2180 err = 0;
2181 break; 2211 break;
2182 case MPOL_LOCAL: 2212 case MPOL_LOCAL:
2183 /* 2213 /*
@@ -2187,11 +2217,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2187 goto out; 2217 goto out;
2188 mode = MPOL_PREFERRED; 2218 mode = MPOL_PREFERRED;
2189 break; 2219 break;
2190 2220 case MPOL_DEFAULT:
2191 /* 2221 /*
2192 * case MPOL_BIND: mpol_new() enforces non-empty nodemask. 2222 * Insist on an empty nodelist
2193 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. 2223 */
2194 */ 2224 if (!nodelist)
2225 err = 0;
2226 goto out;
2227 case MPOL_BIND:
2228 /*
2229 * Insist on a nodelist
2230 */
2231 if (!nodelist)
2232 goto out;
2195 } 2233 }
2196 2234
2197 mode_flags = 0; 2235 mode_flags = 0;
@@ -2205,13 +2243,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2205 else if (!strcmp(flags, "relative")) 2243 else if (!strcmp(flags, "relative"))
2206 mode_flags |= MPOL_F_RELATIVE_NODES; 2244 mode_flags |= MPOL_F_RELATIVE_NODES;
2207 else 2245 else
2208 err = 1; 2246 goto out;
2209 } 2247 }
2210 2248
2211 new = mpol_new(mode, mode_flags, &nodes); 2249 new = mpol_new(mode, mode_flags, &nodes);
2212 if (IS_ERR(new)) 2250 if (IS_ERR(new))
2213 err = 1; 2251 goto out;
2214 else { 2252
2253 {
2215 int ret; 2254 int ret;
2216 NODEMASK_SCRATCH(scratch); 2255 NODEMASK_SCRATCH(scratch);
2217 if (scratch) { 2256 if (scratch) {
@@ -2222,13 +2261,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2222 ret = -ENOMEM; 2261 ret = -ENOMEM;
2223 NODEMASK_SCRATCH_FREE(scratch); 2262 NODEMASK_SCRATCH_FREE(scratch);
2224 if (ret) { 2263 if (ret) {
2225 err = 1;
2226 mpol_put(new); 2264 mpol_put(new);
2227 } else if (no_context) { 2265 goto out;
2228 /* save for contextualization */
2229 new->w.user_nodemask = nodes;
2230 } 2266 }
2231 } 2267 }
2268 err = 0;
2269 if (no_context) {
2270 /* save for contextualization */
2271 new->w.user_nodemask = nodes;
2272 }
2232 2273
2233out: 2274out:
2234 /* Restore string for error message */ 2275 /* Restore string for error message */
diff --git a/mm/migrate.c b/mm/migrate.c
index 880bd592d38e..d3f3f7f81075 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/gfp.h>
35 36
36#include "internal.h" 37#include "internal.h"
37 38
@@ -134,7 +135,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
134 page_add_file_rmap(new); 135 page_add_file_rmap(new);
135 136
136 /* No need to invalidate - it was non-present before */ 137 /* No need to invalidate - it was non-present before */
137 update_mmu_cache(vma, addr, pte); 138 update_mmu_cache(vma, addr, ptep);
138unlock: 139unlock:
139 pte_unmap_unlock(ptep, ptl); 140 pte_unmap_unlock(ptep, ptl);
140out: 141out:
@@ -275,8 +276,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
275 */ 276 */
276static void migrate_page_copy(struct page *newpage, struct page *page) 277static void migrate_page_copy(struct page *newpage, struct page *page)
277{ 278{
278 int anon;
279
280 copy_highpage(newpage, page); 279 copy_highpage(newpage, page);
281 280
282 if (PageError(page)) 281 if (PageError(page))
@@ -313,8 +312,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
313 ClearPageSwapCache(page); 312 ClearPageSwapCache(page);
314 ClearPagePrivate(page); 313 ClearPagePrivate(page);
315 set_page_private(page, 0); 314 set_page_private(page, 0);
316 /* page->mapping contains a flag for PageAnon() */
317 anon = PageAnon(page);
318 page->mapping = NULL; 315 page->mapping = NULL;
319 316
320 /* 317 /*
diff --git a/mm/mincore.c b/mm/mincore.c
index 7a3436ef39eb..f77433c20279 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -7,8 +7,8 @@
7/* 7/*
8 * The mincore() system call. 8 * The mincore() system call.
9 */ 9 */
10#include <linux/slab.h>
11#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/gfp.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/mman.h> 13#include <linux/mman.h>
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
diff --git a/mm/mlock.c b/mm/mlock.c
index 2b8335a89400..8f4e2dfceec1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -25,7 +25,7 @@ int can_do_mlock(void)
25{ 25{
26 if (capable(CAP_IPC_LOCK)) 26 if (capable(CAP_IPC_LOCK))
27 return 1; 27 return 1;
28 if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) 28 if (rlimit(RLIMIT_MEMLOCK) != 0)
29 return 1; 29 return 1;
30 return 0; 30 return 0;
31} 31}
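Here and in mmap.c and mremap.c below, open-coded reads of current->signal->rlim[...].rlim_cur are replaced by the rlimit() accessor while the checks themselves stay the same: compare against the soft RLIMIT_MEMLOCK limit. For reference, that is the per-process soft limit userspace can query with getrlimit(); a small runnable example (the 4 KiB page size in the report is an assumption):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;

    if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
        perror("getrlimit");
        return 1;
    }
    if (rl.rlim_cur == RLIM_INFINITY)
        printf("RLIMIT_MEMLOCK soft limit: unlimited\n");
    else
        printf("RLIMIT_MEMLOCK soft limit: %llu bytes (%llu pages of 4 KiB)\n",
               (unsigned long long)rl.rlim_cur,
               (unsigned long long)rl.rlim_cur / 4096);
    return 0;
}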
@@ -487,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
487 locked = len >> PAGE_SHIFT; 487 locked = len >> PAGE_SHIFT;
488 locked += current->mm->locked_vm; 488 locked += current->mm->locked_vm;
489 489
490 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 490 lock_limit = rlimit(RLIMIT_MEMLOCK);
491 lock_limit >>= PAGE_SHIFT; 491 lock_limit >>= PAGE_SHIFT;
492 492
493 /* check against resource limits */ 493 /* check against resource limits */
@@ -550,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags)
550 550
551 down_write(&current->mm->mmap_sem); 551 down_write(&current->mm->mmap_sem);
552 552
553 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 553 lock_limit = rlimit(RLIMIT_MEMLOCK);
554 lock_limit >>= PAGE_SHIFT; 554 lock_limit >>= PAGE_SHIFT;
555 555
556 ret = -ENOMEM; 556 ret = -ENOMEM;
@@ -584,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user)
584 int allowed = 0; 584 int allowed = 0;
585 585
586 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 586 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
587 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 587 lock_limit = rlimit(RLIMIT_MEMLOCK);
588 if (lock_limit == RLIM_INFINITY) 588 if (lock_limit == RLIM_INFINITY)
589 allowed = 1; 589 allowed = 1;
590 lock_limit >>= PAGE_SHIFT; 590 lock_limit >>= PAGE_SHIFT;
@@ -618,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
618 618
619 down_write(&mm->mmap_sem); 619 down_write(&mm->mmap_sem);
620 620
621 lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 621 lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
622 vm = mm->total_vm + pgsz; 622 vm = mm->total_vm + pgsz;
623 if (lim < vm) 623 if (lim < vm)
624 goto out; 624 goto out;
625 625
626 lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 626 lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
627 vm = mm->locked_vm + pgsz; 627 vm = mm->locked_vm + pgsz;
628 if (lim < vm) 628 if (lim < vm)
629 goto out; 629 goto out;
diff --git a/mm/mmap.c b/mm/mmap.c
index ee2298936fe6..456ec6f27889 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -265,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
265 * segment grow beyond its set limit in the case where the limit is 265 * segment grow beyond its set limit in the case where the limit is
266 * not page aligned -Ram Gupta 266 * not page aligned -Ram Gupta
267 */ 267 */
268 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 268 rlim = rlimit(RLIMIT_DATA);
269 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 269 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
270 (mm->end_data - mm->start_data) > rlim) 270 (mm->end_data - mm->start_data) > rlim)
271 goto out; 271 goto out;
@@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
437{ 437{
438 __vma_link_list(mm, vma, prev, rb_parent); 438 __vma_link_list(mm, vma, prev, rb_parent);
439 __vma_link_rb(mm, vma, rb_link, rb_parent); 439 __vma_link_rb(mm, vma, rb_link, rb_parent);
440 __anon_vma_link(vma);
441} 440}
442 441
443static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 442static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
499 * are necessary. The "insert" vma (if any) is to be inserted 498 * are necessary. The "insert" vma (if any) is to be inserted
500 * before we drop the necessary locks. 499 * before we drop the necessary locks.
501 */ 500 */
502void vma_adjust(struct vm_area_struct *vma, unsigned long start, 501int vma_adjust(struct vm_area_struct *vma, unsigned long start,
503 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 502 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
504{ 503{
505 struct mm_struct *mm = vma->vm_mm; 504 struct mm_struct *mm = vma->vm_mm;
@@ -508,11 +507,12 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
508 struct address_space *mapping = NULL; 507 struct address_space *mapping = NULL;
509 struct prio_tree_root *root = NULL; 508 struct prio_tree_root *root = NULL;
510 struct file *file = vma->vm_file; 509 struct file *file = vma->vm_file;
511 struct anon_vma *anon_vma = NULL;
512 long adjust_next = 0; 510 long adjust_next = 0;
513 int remove_next = 0; 511 int remove_next = 0;
514 512
515 if (next && !insert) { 513 if (next && !insert) {
514 struct vm_area_struct *exporter = NULL;
515
516 if (end >= next->vm_end) { 516 if (end >= next->vm_end) {
517 /* 517 /*
518 * vma expands, overlapping all the next, and 518 * vma expands, overlapping all the next, and
@@ -520,7 +520,7 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
520 */ 520 */
521again: remove_next = 1 + (end > next->vm_end); 521again: remove_next = 1 + (end > next->vm_end);
522 end = next->vm_end; 522 end = next->vm_end;
523 anon_vma = next->anon_vma; 523 exporter = next;
524 importer = vma; 524 importer = vma;
525 } else if (end > next->vm_start) { 525 } else if (end > next->vm_start) {
526 /* 526 /*
@@ -528,7 +528,7 @@ again: remove_next = 1 + (end > next->vm_end);
528 * mprotect case 5 shifting the boundary up. 528 * mprotect case 5 shifting the boundary up.
529 */ 529 */
530 adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 530 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
531 anon_vma = next->anon_vma; 531 exporter = next;
532 importer = vma; 532 importer = vma;
533 } else if (end < vma->vm_end) { 533 } else if (end < vma->vm_end) {
534 /* 534 /*
@@ -537,9 +537,20 @@ again: remove_next = 1 + (end > next->vm_end);
537 * mprotect case 4 shifting the boundary down. 537 * mprotect case 4 shifting the boundary down.
538 */ 538 */
539 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 539 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
540 anon_vma = next->anon_vma; 540 exporter = vma;
541 importer = next; 541 importer = next;
542 } 542 }
543
544 /*
545 * Easily overlooked: when mprotect shifts the boundary,
546 * make sure the expanding vma has anon_vma set if the
547 * shrinking vma had, to cover any anon pages imported.
548 */
549 if (exporter && exporter->anon_vma && !importer->anon_vma) {
550 if (anon_vma_clone(importer, exporter))
551 return -ENOMEM;
552 importer->anon_vma = exporter->anon_vma;
553 }
543 } 554 }
544 555
545 if (file) { 556 if (file) {
@@ -567,25 +578,6 @@ again: remove_next = 1 + (end > next->vm_end);
567 } 578 }
568 } 579 }
569 580
570 /*
571 * When changing only vma->vm_end, we don't really need
572 * anon_vma lock.
573 */
574 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
575 anon_vma = vma->anon_vma;
576 if (anon_vma) {
577 spin_lock(&anon_vma->lock);
578 /*
579 * Easily overlooked: when mprotect shifts the boundary,
580 * make sure the expanding vma has anon_vma set if the
581 * shrinking vma had, to cover any anon pages imported.
582 */
583 if (importer && !importer->anon_vma) {
584 importer->anon_vma = anon_vma;
585 __anon_vma_link(importer);
586 }
587 }
588
589 if (root) { 581 if (root) {
590 flush_dcache_mmap_lock(mapping); 582 flush_dcache_mmap_lock(mapping);
591 vma_prio_tree_remove(vma, root); 583 vma_prio_tree_remove(vma, root);
@@ -616,8 +608,6 @@ again: remove_next = 1 + (end > next->vm_end);
616 __vma_unlink(mm, next, vma); 608 __vma_unlink(mm, next, vma);
617 if (file) 609 if (file)
618 __remove_shared_vm_struct(next, file, mapping); 610 __remove_shared_vm_struct(next, file, mapping);
619 if (next->anon_vma)
620 __anon_vma_merge(vma, next);
621 } else if (insert) { 611 } else if (insert) {
622 /* 612 /*
623 * split_vma has split insert from vma, and needs 613 * split_vma has split insert from vma, and needs
@@ -627,8 +617,6 @@ again: remove_next = 1 + (end > next->vm_end);
627 __insert_vm_struct(mm, insert); 617 __insert_vm_struct(mm, insert);
628 } 618 }
629 619
630 if (anon_vma)
631 spin_unlock(&anon_vma->lock);
632 if (mapping) 620 if (mapping)
633 spin_unlock(&mapping->i_mmap_lock); 621 spin_unlock(&mapping->i_mmap_lock);
634 622
@@ -638,6 +626,8 @@ again: remove_next = 1 + (end > next->vm_end);
638 if (next->vm_flags & VM_EXECUTABLE) 626 if (next->vm_flags & VM_EXECUTABLE)
639 removed_exe_file_vma(mm); 627 removed_exe_file_vma(mm);
640 } 628 }
629 if (next->anon_vma)
630 anon_vma_merge(vma, next);
641 mm->map_count--; 631 mm->map_count--;
642 mpol_put(vma_policy(next)); 632 mpol_put(vma_policy(next));
643 kmem_cache_free(vm_area_cachep, next); 633 kmem_cache_free(vm_area_cachep, next);
@@ -653,6 +643,8 @@ again: remove_next = 1 + (end > next->vm_end);
653 } 643 }
654 644
655 validate_mm(mm); 645 validate_mm(mm);
646
647 return 0;
656} 648}
657 649
658/* 650/*
@@ -759,6 +751,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
759{ 751{
760 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 752 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
761 struct vm_area_struct *area, *next; 753 struct vm_area_struct *area, *next;
754 int err;
762 755
763 /* 756 /*
764 * We later require that vma->vm_flags == vm_flags, 757 * We later require that vma->vm_flags == vm_flags,
@@ -792,11 +785,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
792 is_mergeable_anon_vma(prev->anon_vma, 785 is_mergeable_anon_vma(prev->anon_vma,
793 next->anon_vma)) { 786 next->anon_vma)) {
794 /* cases 1, 6 */ 787 /* cases 1, 6 */
795 vma_adjust(prev, prev->vm_start, 788 err = vma_adjust(prev, prev->vm_start,
796 next->vm_end, prev->vm_pgoff, NULL); 789 next->vm_end, prev->vm_pgoff, NULL);
797 } else /* cases 2, 5, 7 */ 790 } else /* cases 2, 5, 7 */
798 vma_adjust(prev, prev->vm_start, 791 err = vma_adjust(prev, prev->vm_start,
799 end, prev->vm_pgoff, NULL); 792 end, prev->vm_pgoff, NULL);
793 if (err)
794 return NULL;
800 return prev; 795 return prev;
801 } 796 }
802 797
@@ -808,11 +803,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
808 can_vma_merge_before(next, vm_flags, 803 can_vma_merge_before(next, vm_flags,
809 anon_vma, file, pgoff+pglen)) { 804 anon_vma, file, pgoff+pglen)) {
810 if (prev && addr < prev->vm_end) /* case 4 */ 805 if (prev && addr < prev->vm_end) /* case 4 */
811 vma_adjust(prev, prev->vm_start, 806 err = vma_adjust(prev, prev->vm_start,
812 addr, prev->vm_pgoff, NULL); 807 addr, prev->vm_pgoff, NULL);
813 else /* cases 3, 8 */ 808 else /* cases 3, 8 */
814 vma_adjust(area, addr, next->vm_end, 809 err = vma_adjust(area, addr, next->vm_end,
815 next->vm_pgoff - pglen, NULL); 810 next->vm_pgoff - pglen, NULL);
811 if (err)
812 return NULL;
816 return area; 813 return area;
817 } 814 }
818 815
@@ -820,6 +817,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
820} 817}
821 818
822/* 819/*
820 * Rough compatibility check to quickly see if it's even worth looking
821 * at sharing an anon_vma.
822 *
823 * They need to have the same vm_file, and the flags can only differ
824 * in things that mprotect may change.
825 *
826 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
827 * we can merge the two vma's. For example, we refuse to merge a vma if
828 * there is a vm_ops->close() function, because that indicates that the
829 * driver is doing some kind of reference counting. But that doesn't
830 * really matter for the anon_vma sharing case.
831 */
832static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
833{
834 return a->vm_end == b->vm_start &&
835 mpol_equal(vma_policy(a), vma_policy(b)) &&
836 a->vm_file == b->vm_file &&
837 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
838 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
839}
840
841/*
842 * Do some basic sanity checking to see if we can re-use the anon_vma
843 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
844 * the same as 'old', the other will be the new one that is trying
845 * to share the anon_vma.
846 *
847 * NOTE! This runs with mm_sem held for reading, so it is possible that
848 * the anon_vma of 'old' is concurrently in the process of being set up
849 * by another page fault trying to merge _that_. But that's ok: if it
850 * is being set up, that automatically means that it will be a singleton
851 * acceptable for merging, so we can do all of this optimistically. But
852 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
853 *
854 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
855 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
856 * is to return an anon_vma that is "complex" due to having gone through
857 * a fork).
858 *
859 * We also make sure that the two vma's are compatible (adjacent,
860 * and with the same memory policies). That's all stable, even with just
861 * a read lock on the mm_sem.
862 */
863static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
864{
865 if (anon_vma_compatible(a, b)) {
866 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
867
868 if (anon_vma && list_is_singular(&old->anon_vma_chain))
869 return anon_vma;
870 }
871 return NULL;
872}
873
874/*
823 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 875 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
824 * neighbouring vmas for a suitable anon_vma, before it goes off 876 * neighbouring vmas for a suitable anon_vma, before it goes off
825 * to allocate a new anon_vma. It checks because a repetitive 877 * to allocate a new anon_vma. It checks because a repetitive
@@ -829,28 +881,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
829 */ 881 */
830struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 882struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
831{ 883{
884 struct anon_vma *anon_vma;
832 struct vm_area_struct *near; 885 struct vm_area_struct *near;
833 unsigned long vm_flags;
834 886
835 near = vma->vm_next; 887 near = vma->vm_next;
836 if (!near) 888 if (!near)
837 goto try_prev; 889 goto try_prev;
838 890
839 /* 891 anon_vma = reusable_anon_vma(near, vma, near);
840 * Since only mprotect tries to remerge vmas, match flags 892 if (anon_vma)
841 * which might be mprotected into each other later on. 893 return anon_vma;
842 * Neither mlock nor madvise tries to remerge at present,
843 * so leave their flags as obstructing a merge.
844 */
845 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
846 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
847
848 if (near->anon_vma && vma->vm_end == near->vm_start &&
849 mpol_equal(vma_policy(vma), vma_policy(near)) &&
850 can_vma_merge_before(near, vm_flags,
851 NULL, vma->vm_file, vma->vm_pgoff +
852 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
853 return near->anon_vma;
854try_prev: 894try_prev:
855 /* 895 /*
856 * It is potentially slow to have to call find_vma_prev here. 896 * It is potentially slow to have to call find_vma_prev here.
@@ -863,14 +903,9 @@ try_prev:
863 if (!near) 903 if (!near)
864 goto none; 904 goto none;
865 905
866 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); 906 anon_vma = reusable_anon_vma(near, near, vma);
867 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); 907 if (anon_vma)
868 908 return anon_vma;
869 if (near->anon_vma && near->vm_end == vma->vm_start &&
870 mpol_equal(vma_policy(near), vma_policy(vma)) &&
871 can_vma_merge_after(near, vm_flags,
872 NULL, vma->vm_file, vma->vm_pgoff))
873 return near->anon_vma;
874none: 909none:
875 /* 910 /*
876 * There's no absolute need to look only at touching neighbours: 911 * There's no absolute need to look only at touching neighbours:
@@ -967,7 +1002,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
967 unsigned long locked, lock_limit; 1002 unsigned long locked, lock_limit;
968 locked = len >> PAGE_SHIFT; 1003 locked = len >> PAGE_SHIFT;
969 locked += mm->locked_vm; 1004 locked += mm->locked_vm;
970 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 1005 lock_limit = rlimit(RLIMIT_MEMLOCK);
971 lock_limit >>= PAGE_SHIFT; 1006 lock_limit >>= PAGE_SHIFT;
972 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1007 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
973 return -EAGAIN; 1008 return -EAGAIN;
@@ -1083,6 +1118,30 @@ out:
1083 return retval; 1118 return retval;
1084} 1119}
1085 1120
1121#ifdef __ARCH_WANT_SYS_OLD_MMAP
1122struct mmap_arg_struct {
1123 unsigned long addr;
1124 unsigned long len;
1125 unsigned long prot;
1126 unsigned long flags;
1127 unsigned long fd;
1128 unsigned long offset;
1129};
1130
1131SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1132{
1133 struct mmap_arg_struct a;
1134
1135 if (copy_from_user(&a, arg, sizeof(a)))
1136 return -EFAULT;
1137 if (a.offset & ~PAGE_MASK)
1138 return -EINVAL;
1139
1140 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1141 a.offset >> PAGE_SHIFT);
1142}
1143#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1144
1086/* 1145/*
1087 * Some shared mappings will want the pages marked read-only 1146 * Some shared mappings will want the pages marked read-only
1088 * to track write events. If so, we'll downgrade vm_page_prot 1147 * to track write events. If so, we'll downgrade vm_page_prot
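The old_mmap() shim added in this hunk serves the legacy ABI in which userspace passes one pointer to a six-field struct; the kernel copies the struct in, rejects offsets that are not page aligned, converts the byte offset to a page index and forwards to the six-argument path. A self-contained sketch of just that repacking step, assuming 4 KiB pages and using illustrative names:

#include <errno.h>
#include <stdio.h>

#define PAGE_SHIFT 12                   /* assuming 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

struct mmap_args {                      /* mirrors mmap_arg_struct above */
    unsigned long addr, len, prot, flags, fd, offset;
};

/* Convert the legacy byte offset into a page offset, as the shim does. */
static int repack_offset(const struct mmap_args *a, unsigned long *pgoff)
{
    if (a->offset & ~PAGE_MASK)
        return -EINVAL;                 /* offset must be page aligned */
    *pgoff = a->offset >> PAGE_SHIFT;
    return 0;
}

int main(void)
{
    struct mmap_args a = { .offset = 3 * PAGE_SIZE };
    unsigned long pgoff;

    if (repack_offset(&a, &pgoff) == 0)
        printf("pgoff = %lu\n", pgoff); /* prints 3 */
    return 0;
}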
@@ -1205,6 +1264,7 @@ munmap_back:
1205 vma->vm_flags = vm_flags; 1264 vma->vm_flags = vm_flags;
1206 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1265 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1207 vma->vm_pgoff = pgoff; 1266 vma->vm_pgoff = pgoff;
1267 INIT_LIST_HEAD(&vma->anon_vma_chain);
1208 1268
1209 if (file) { 1269 if (file) {
1210 error = -EINVAL; 1270 error = -EINVAL;
@@ -1265,13 +1325,8 @@ out:
1265 mm->total_vm += len >> PAGE_SHIFT; 1325 mm->total_vm += len >> PAGE_SHIFT;
1266 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1326 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1267 if (vm_flags & VM_LOCKED) { 1327 if (vm_flags & VM_LOCKED) {
1268 /* 1328 if (!mlock_vma_pages_range(vma, addr, addr + len))
1269 * makes pages present; downgrades, drops, reacquires mmap_sem 1329 mm->locked_vm += (len >> PAGE_SHIFT);
1270 */
1271 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1272 if (nr_pages < 0)
1273 return nr_pages; /* vma gone! */
1274 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1275 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1330 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1276 make_pages_present(addr, addr + len); 1331 make_pages_present(addr, addr + len);
1277 return addr; 1332 return addr;
@@ -1599,7 +1654,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1599 return -ENOMEM; 1654 return -ENOMEM;
1600 1655
1601 /* Stack limit test */ 1656 /* Stack limit test */
1602 if (size > rlim[RLIMIT_STACK].rlim_cur) 1657 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
1603 return -ENOMEM; 1658 return -ENOMEM;
1604 1659
1605 /* mlock limit tests */ 1660 /* mlock limit tests */
@@ -1607,7 +1662,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1607 unsigned long locked; 1662 unsigned long locked;
1608 unsigned long limit; 1663 unsigned long limit;
1609 locked = mm->locked_vm + grow; 1664 locked = mm->locked_vm + grow;
1610 limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 1665 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
1666 limit >>= PAGE_SHIFT;
1611 if (locked > limit && !capable(CAP_IPC_LOCK)) 1667 if (locked > limit && !capable(CAP_IPC_LOCK))
1612 return -ENOMEM; 1668 return -ENOMEM;
1613 } 1669 }
@@ -1754,8 +1810,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
1754 if (!prev || expand_stack(prev, addr)) 1810 if (!prev || expand_stack(prev, addr))
1755 return NULL; 1811 return NULL;
1756 if (prev->vm_flags & VM_LOCKED) { 1812 if (prev->vm_flags & VM_LOCKED) {
1757 if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) 1813 mlock_vma_pages_range(prev, addr, prev->vm_end);
1758 return NULL; /* vma gone! */
1759 } 1814 }
1760 return prev; 1815 return prev;
1761} 1816}
@@ -1783,8 +1838,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1783 if (expand_stack(vma, addr)) 1838 if (expand_stack(vma, addr))
1784 return NULL; 1839 return NULL;
1785 if (vma->vm_flags & VM_LOCKED) { 1840 if (vma->vm_flags & VM_LOCKED) {
1786 if (mlock_vma_pages_range(vma, addr, start) < 0) 1841 mlock_vma_pages_range(vma, addr, start);
1787 return NULL; /* vma gone! */
1788 } 1842 }
1789 return vma; 1843 return vma;
1790} 1844}
@@ -1871,6 +1925,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1871{ 1925{
1872 struct mempolicy *pol; 1926 struct mempolicy *pol;
1873 struct vm_area_struct *new; 1927 struct vm_area_struct *new;
1928 int err = -ENOMEM;
1874 1929
1875 if (is_vm_hugetlb_page(vma) && (addr & 1930 if (is_vm_hugetlb_page(vma) && (addr &
1876 ~(huge_page_mask(hstate_vma(vma))))) 1931 ~(huge_page_mask(hstate_vma(vma)))))
@@ -1878,11 +1933,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1878 1933
1879 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 1934 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1880 if (!new) 1935 if (!new)
1881 return -ENOMEM; 1936 goto out_err;
1882 1937
1883 /* most fields are the same, copy all, and then fixup */ 1938 /* most fields are the same, copy all, and then fixup */
1884 *new = *vma; 1939 *new = *vma;
1885 1940
1941 INIT_LIST_HEAD(&new->anon_vma_chain);
1942
1886 if (new_below) 1943 if (new_below)
1887 new->vm_end = addr; 1944 new->vm_end = addr;
1888 else { 1945 else {
@@ -1892,11 +1949,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1892 1949
1893 pol = mpol_dup(vma_policy(vma)); 1950 pol = mpol_dup(vma_policy(vma));
1894 if (IS_ERR(pol)) { 1951 if (IS_ERR(pol)) {
1895 kmem_cache_free(vm_area_cachep, new); 1952 err = PTR_ERR(pol);
1896 return PTR_ERR(pol); 1953 goto out_free_vma;
1897 } 1954 }
1898 vma_set_policy(new, pol); 1955 vma_set_policy(new, pol);
1899 1956
1957 if (anon_vma_clone(new, vma))
1958 goto out_free_mpol;
1959
1900 if (new->vm_file) { 1960 if (new->vm_file) {
1901 get_file(new->vm_file); 1961 get_file(new->vm_file);
1902 if (vma->vm_flags & VM_EXECUTABLE) 1962 if (vma->vm_flags & VM_EXECUTABLE)
@@ -1907,12 +1967,29 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1907 new->vm_ops->open(new); 1967 new->vm_ops->open(new);
1908 1968
1909 if (new_below) 1969 if (new_below)
1910 vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 1970 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
1911 ((addr - new->vm_start) >> PAGE_SHIFT), new); 1971 ((addr - new->vm_start) >> PAGE_SHIFT), new);
1912 else 1972 else
1913 vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 1973 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1914 1974
1915 return 0; 1975 /* Success. */
1976 if (!err)
1977 return 0;
1978
1979 /* Clean everything up if vma_adjust failed. */
1980 if (new->vm_ops && new->vm_ops->close)
1981 new->vm_ops->close(new);
1982 if (new->vm_file) {
1983 if (vma->vm_flags & VM_EXECUTABLE)
1984 removed_exe_file_vma(mm);
1985 fput(new->vm_file);
1986 }
1987 out_free_mpol:
1988 mpol_put(pol);
1989 out_free_vma:
1990 kmem_cache_free(vm_area_cachep, new);
1991 out_err:
1992 return err;
1916} 1993}
1917 1994
1918/* 1995/*
@@ -2074,7 +2151,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2074 unsigned long locked, lock_limit; 2151 unsigned long locked, lock_limit;
2075 locked = len >> PAGE_SHIFT; 2152 locked = len >> PAGE_SHIFT;
2076 locked += mm->locked_vm; 2153 locked += mm->locked_vm;
2077 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2154 lock_limit = rlimit(RLIMIT_MEMLOCK);
2078 lock_limit >>= PAGE_SHIFT; 2155 lock_limit >>= PAGE_SHIFT;
2079 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 2156 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2080 return -EAGAIN; 2157 return -EAGAIN;
@@ -2122,6 +2199,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2122 return -ENOMEM; 2199 return -ENOMEM;
2123 } 2200 }
2124 2201
2202 INIT_LIST_HEAD(&vma->anon_vma_chain);
2125 vma->vm_mm = mm; 2203 vma->vm_mm = mm;
2126 vma->vm_start = addr; 2204 vma->vm_start = addr;
2127 vma->vm_end = addr + len; 2205 vma->vm_end = addr + len;
@@ -2258,10 +2336,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2258 if (new_vma) { 2336 if (new_vma) {
2259 *new_vma = *vma; 2337 *new_vma = *vma;
2260 pol = mpol_dup(vma_policy(vma)); 2338 pol = mpol_dup(vma_policy(vma));
2261 if (IS_ERR(pol)) { 2339 if (IS_ERR(pol))
2262 kmem_cache_free(vm_area_cachep, new_vma); 2340 goto out_free_vma;
2263 return NULL; 2341 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2264 } 2342 if (anon_vma_clone(new_vma, vma))
2343 goto out_free_mempol;
2265 vma_set_policy(new_vma, pol); 2344 vma_set_policy(new_vma, pol);
2266 new_vma->vm_start = addr; 2345 new_vma->vm_start = addr;
2267 new_vma->vm_end = addr + len; 2346 new_vma->vm_end = addr + len;
@@ -2277,6 +2356,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2277 } 2356 }
2278 } 2357 }
2279 return new_vma; 2358 return new_vma;
2359
2360 out_free_mempol:
2361 mpol_put(pol);
2362 out_free_vma:
2363 kmem_cache_free(vm_area_cachep, new_vma);
2364 return NULL;
2280} 2365}
2281 2366
2282/* 2367/*
@@ -2288,7 +2373,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2288 unsigned long cur = mm->total_vm; /* pages */ 2373 unsigned long cur = mm->total_vm; /* pages */
2289 unsigned long lim; 2374 unsigned long lim;
2290 2375
2291 lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 2376 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2292 2377
2293 if (cur + npages > lim) 2378 if (cur + npages > lim)
2294 return 0; 2379 return 0;
@@ -2354,6 +2439,7 @@ int install_special_mapping(struct mm_struct *mm,
2354 if (unlikely(vma == NULL)) 2439 if (unlikely(vma == NULL))
2355 return -ENOMEM; 2440 return -ENOMEM;
2356 2441
2442 INIT_LIST_HEAD(&vma->anon_vma_chain);
2357 vma->vm_mm = mm; 2443 vma->vm_mm = mm;
2358 vma->vm_start = addr; 2444 vma->vm_start = addr;
2359 vma->vm_end = addr + len; 2445 vma->vm_end = addr + len;
@@ -2454,6 +2540,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2454int mm_take_all_locks(struct mm_struct *mm) 2540int mm_take_all_locks(struct mm_struct *mm)
2455{ 2541{
2456 struct vm_area_struct *vma; 2542 struct vm_area_struct *vma;
2543 struct anon_vma_chain *avc;
2457 int ret = -EINTR; 2544 int ret = -EINTR;
2458 2545
2459 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2546 BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2471,7 +2558,8 @@ int mm_take_all_locks(struct mm_struct *mm)
2471 if (signal_pending(current)) 2558 if (signal_pending(current))
2472 goto out_unlock; 2559 goto out_unlock;
2473 if (vma->anon_vma) 2560 if (vma->anon_vma)
2474 vm_lock_anon_vma(mm, vma->anon_vma); 2561 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2562 vm_lock_anon_vma(mm, avc->anon_vma);
2475 } 2563 }
2476 2564
2477 ret = 0; 2565 ret = 0;
@@ -2526,13 +2614,15 @@ static void vm_unlock_mapping(struct address_space *mapping)
2526void mm_drop_all_locks(struct mm_struct *mm) 2614void mm_drop_all_locks(struct mm_struct *mm)
2527{ 2615{
2528 struct vm_area_struct *vma; 2616 struct vm_area_struct *vma;
2617 struct anon_vma_chain *avc;
2529 2618
2530 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2619 BUG_ON(down_read_trylock(&mm->mmap_sem));
2531 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 2620 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2532 2621
2533 for (vma = mm->mmap; vma; vma = vma->vm_next) { 2622 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2534 if (vma->anon_vma) 2623 if (vma->anon_vma)
2535 vm_unlock_anon_vma(vma->anon_vma); 2624 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2625 vm_unlock_anon_vma(avc->anon_vma);
2536 if (vma->vm_file && vma->vm_file->f_mapping) 2626 if (vma->vm_file && vma->vm_file->f_mapping)
2537 vm_unlock_mapping(vma->vm_file->f_mapping); 2627 vm_unlock_mapping(vma->vm_file->f_mapping);
2538 } 2628 }
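Both locking loops above now walk vma->anon_vma_chain and lock every chained anon_vma instead of the single vma->anon_vma. As far as these hunks use it, the chain element looks like the sketch below; the same_anon_vma member (the reverse linkage into the anon_vma's own list) is an assumption rather than something shown in this diff, and the _sketch suffixes mark these as illustrations, not the kernel definitions.

struct list_head_sketch { struct list_head_sketch *next, *prev; };

struct anon_vma;                            /* used only through pointers here */

struct vm_area_struct_sketch {
    struct list_head_sketch anon_vma_chain; /* heads the per-vma chain (same_vma links) */
    struct anon_vma *anon_vma;              /* still kept for quick "any anon_vma?" tests */
};

struct anon_vma_chain_sketch {
    struct vm_area_struct_sketch *vma;
    struct anon_vma *anon_vma;              /* what vm_lock_anon_vma() is handed above */
    struct list_head_sketch same_vma;       /* entry in vma->anon_vma_chain */
    struct list_head_sketch same_anon_vma;  /* assumed entry in the anon_vma's list */
};

int main(void) { return 0; }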
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index ded9081f4021..9e82e937000e 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmu_context.h> 7#include <linux/mmu_context.h>
8#include <linux/module.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
9 10
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm)
37 if (active_mm != mm) 38 if (active_mm != mm)
38 mmdrop(active_mm); 39 mmdrop(active_mm);
39} 40}
41EXPORT_SYMBOL_GPL(use_mm);
40 42
41/* 43/*
42 * unuse_mm 44 * unuse_mm
@@ -51,8 +53,10 @@ void unuse_mm(struct mm_struct *mm)
51 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
52 54
53 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm);
54 tsk->mm = NULL; 57 tsk->mm = NULL;
55 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
56 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk); 60 task_unlock(tsk);
58} 61}
62EXPORT_SYMBOL_GPL(unuse_mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 7e33f2cb3c77..438951d366f2 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -16,6 +16,7 @@
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/slab.h>
19 20
20/* 21/*
21 * This function can't run concurrently against mmu_notifier_register 22 * This function can't run concurrently against mmu_notifier_register
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8bc969d8112d..2d1bf7cf8851 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -10,7 +10,6 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/slab.h>
14#include <linux/shm.h> 13#include <linux/shm.h>
15#include <linux/mman.h> 14#include <linux/mman.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index 845190898d59..cde56ee51ef7 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/slab.h>
13#include <linux/shm.h> 12#include <linux/shm.h>
14#include <linux/ksm.h> 13#include <linux/ksm.h>
15#include <linux/mman.h> 14#include <linux/mman.h>
@@ -285,7 +284,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
285 if (vma->vm_flags & VM_LOCKED) { 284 if (vma->vm_flags & VM_LOCKED) {
286 unsigned long locked, lock_limit; 285 unsigned long locked, lock_limit;
287 locked = mm->locked_vm << PAGE_SHIFT; 286 locked = mm->locked_vm << PAGE_SHIFT;
288 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 287 lock_limit = rlimit(RLIMIT_MEMLOCK);
289 locked += new_len - old_len; 288 locked += new_len - old_len;
290 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 289 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
291 goto Eagain; 290 goto Eagain;
@@ -460,8 +459,11 @@ unsigned long do_mremap(unsigned long addr,
460 if (vma_expandable(vma, new_len - old_len)) { 459 if (vma_expandable(vma, new_len - old_len)) {
461 int pages = (new_len - old_len) >> PAGE_SHIFT; 460 int pages = (new_len - old_len) >> PAGE_SHIFT;
462 461
463 vma_adjust(vma, vma->vm_start, 462 if (vma_adjust(vma, vma->vm_start, addr + new_len,
464 addr + new_len, vma->vm_pgoff, NULL); 463 vma->vm_pgoff, NULL)) {
464 ret = -ENOMEM;
465 goto out;
466 }
465 467
466 mm->total_vm += pages; 468 mm->total_vm += pages;
467 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 469 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
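
The locked-memory check above now reads the limit through the rlimit() helper instead of open-coding the signal->rlim dereference. Roughly what that helper boils down to — a sketch only, the real definition is task_rlimit(current, limit) in <linux/sched.h>:

static inline unsigned long rlimit_sketch(unsigned int limit)
{
	/* current task's soft limit for the given resource, read once */
	return ACCESS_ONCE(current->signal->rlim[limit].rlim_cur);
}
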
diff --git a/mm/nommu.c b/mm/nommu.c
index 48a2ecfaf059..63fa17d121f0 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
162 } 162 }
163 if (vmas) 163 if (vmas)
164 vmas[i] = vma; 164 vmas[i] = vma;
165 start += PAGE_SIZE; 165 start = (start + PAGE_SIZE) & PAGE_MASK;
166 } 166 }
167 167
168 return i; 168 return i;
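
Why the extra masking in the loop above matters — a worked example with PAGE_SIZE == 4096 and an unaligned start:

	/*
	 * start = 0x1010:
	 *	start + PAGE_SIZE               == 0x2010  (old code, still unaligned)
	 *	(start + PAGE_SIZE) & PAGE_MASK == 0x2000  (new code, next page boundary)
	 * so every later iteration lands on a page boundary and the vma
	 * lookup for it behaves as expected.
	 */
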
@@ -1040,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1040 if (ret != -ENOSYS) 1040 if (ret != -ENOSYS)
1041 return ret; 1041 return ret;
1042 1042
1043 /* getting an ENOSYS error indicates that direct mmap isn't 1043 /* getting -ENOSYS indicates that direct mmap isn't possible (as
1044 * possible (as opposed to tried but failed) so we'll fall 1044 * opposed to tried but failed) so we can only give a suitable error as
1045 * through to making a private copy of the data and mapping 1045 * it's not possible to make a private copy if MAP_SHARED was given */
1046 * that if we can */
1047 return -ENODEV; 1046 return -ENODEV;
1048} 1047}
1049 1048
@@ -1209,7 +1208,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1209 region->vm_flags = vm_flags; 1208 region->vm_flags = vm_flags;
1210 region->vm_pgoff = pgoff; 1209 region->vm_pgoff = pgoff;
1211 1210
1212 INIT_LIST_HEAD(&vma->anon_vma_node); 1211 INIT_LIST_HEAD(&vma->anon_vma_chain);
1213 vma->vm_flags = vm_flags; 1212 vma->vm_flags = vm_flags;
1214 vma->vm_pgoff = pgoff; 1213 vma->vm_pgoff = pgoff;
1215 1214
@@ -1428,6 +1427,30 @@ out:
1428 return retval; 1427 return retval;
1429} 1428}
1430 1429
1430#ifdef __ARCH_WANT_SYS_OLD_MMAP
1431struct mmap_arg_struct {
1432 unsigned long addr;
1433 unsigned long len;
1434 unsigned long prot;
1435 unsigned long flags;
1436 unsigned long fd;
1437 unsigned long offset;
1438};
1439
1440SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1441{
1442 struct mmap_arg_struct a;
1443
1444 if (copy_from_user(&a, arg, sizeof(a)))
1445 return -EFAULT;
1446 if (a.offset & ~PAGE_MASK)
1447 return -EINVAL;
1448
1449 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1450 a.offset >> PAGE_SHIFT);
1451}
1452#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1453
1431/* 1454/*
1432 * split a vma into two pieces at address 'addr', a new vma is allocated either 1455 * split a vma into two pieces at address 'addr', a new vma is allocated either
1433 * for the first part or the tail. 1456 * for the first part or the tail.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 237050478f28..b68e802a7a7d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -18,6 +18,7 @@
18#include <linux/oom.h> 18#include <linux/oom.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/err.h> 20#include <linux/err.h>
21#include <linux/gfp.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/swap.h> 23#include <linux/swap.h>
23#include <linux/timex.h> 24#include <linux/timex.h>
@@ -401,8 +402,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
401 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 402 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
402 task_pid_nr(p), p->comm, 403 task_pid_nr(p), p->comm,
403 K(p->mm->total_vm), 404 K(p->mm->total_vm),
404 K(get_mm_counter(p->mm, anon_rss)), 405 K(get_mm_counter(p->mm, MM_ANONPAGES)),
405 K(get_mm_counter(p->mm, file_rss))); 406 K(get_mm_counter(p->mm, MM_FILEPAGES)));
406 task_unlock(p); 407 task_unlock(p);
407 408
408 /* 409 /*
@@ -473,6 +474,8 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
473 unsigned long points = 0; 474 unsigned long points = 0;
474 struct task_struct *p; 475 struct task_struct *p;
475 476
477 if (sysctl_panic_on_oom == 2)
478 panic("out of memory(memcg). panic_on_oom is selected.\n");
476 read_lock(&tasklist_lock); 479 read_lock(&tasklist_lock);
477retry: 480retry:
478 p = select_bad_process(&points, mem); 481 p = select_bad_process(&points, mem);
@@ -601,13 +604,6 @@ void pagefault_out_of_memory(void)
601 /* Got some memory back in the last second. */ 604 /* Got some memory back in the last second. */
602 return; 605 return;
603 606
604 /*
605 * If this is from memcg, oom-killer is already invoked.
606 * and not worth to go system-wide-oom.
607 */
608 if (mem_cgroup_oom_called(current))
609 goto rest_and_return;
610
611 if (sysctl_panic_on_oom) 607 if (sysctl_panic_on_oom)
612 panic("out of memory from page fault. panic_on_oom is selected.\n"); 608 panic("out of memory from page fault. panic_on_oom is selected.\n");
613 609
@@ -619,7 +615,6 @@ void pagefault_out_of_memory(void)
619 * Give "p" a good chance of killing itself before we 615 * Give "p" a good chance of killing itself before we
620 * retry to allocate memory. 616 * retry to allocate memory.
621 */ 617 */
622rest_and_return:
623 if (!test_thread_flag(TIF_MEMDIE)) 618 if (!test_thread_flag(TIF_MEMDIE))
624 schedule_timeout_uninterruptible(1); 619 schedule_timeout_uninterruptible(1);
625} 620}
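
The counter names above come from the switch away from the per-field anon_rss/file_rss counters to an indexed mm counter array; MM_SWAPENTS is added in the same series (see the rmap.c hunks below). A small, illustrative reader using only those accessors:

/* illustrative only: report a task's rss split the way the oom killer does */
static void report_mm_usage(struct mm_struct *mm)
{
	unsigned long anon = get_mm_counter(mm, MM_ANONPAGES);
	unsigned long file = get_mm_counter(mm, MM_FILEPAGES);
	unsigned long swap = get_mm_counter(mm, MM_SWAPENTS);

	printk(KERN_DEBUG "anon-rss:%lukB file-rss:%lukB swapents:%lu\n",
	       anon << (PAGE_SHIFT - 10), file << (PAGE_SHIFT - 10), swap);
}
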
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8deb9d0fd5b1..d03c946d5566 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -50,6 +50,7 @@
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h> 51#include <linux/memory.h>
52#include <trace/events/kmem.h> 52#include <trace/events/kmem.h>
53#include <linux/ftrace_event.h>
53 54
54#include <asm/tlbflush.h> 55#include <asm/tlbflush.h>
55#include <asm/div64.h> 56#include <asm/div64.h>
@@ -76,6 +77,31 @@ unsigned long totalreserve_pages __read_mostly;
76int percpu_pagelist_fraction; 77int percpu_pagelist_fraction;
77gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 78gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
78 79
80#ifdef CONFIG_PM_SLEEP
81/*
82 * The following functions are used by the suspend/hibernate code to temporarily
83 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
84 * while devices are suspended. To avoid races with the suspend/hibernate code,
85 * they should always be called with pm_mutex held (gfp_allowed_mask also should
86 * only be modified with pm_mutex held, unless the suspend/hibernate code is
87 * guaranteed not to run in parallel with that modification).
88 */
89void set_gfp_allowed_mask(gfp_t mask)
90{
91 WARN_ON(!mutex_is_locked(&pm_mutex));
92 gfp_allowed_mask = mask;
93}
94
95gfp_t clear_gfp_allowed_mask(gfp_t mask)
96{
97 gfp_t ret = gfp_allowed_mask;
98
99 WARN_ON(!mutex_is_locked(&pm_mutex));
100 gfp_allowed_mask &= ~mask;
101 return ret;
102}
103#endif /* CONFIG_PM_SLEEP */
104
79#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 105#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
80int pageblock_order __read_mostly; 106int pageblock_order __read_mostly;
81#endif 107#endif
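
A sketch of the calling pattern the comment above describes, as the suspend/hibernate core would use it; the real call sites are outside this diff, so treat the surrounding steps as assumptions:

	gfp_t saved_mask;

	mutex_lock(&pm_mutex);
	saved_mask = clear_gfp_allowed_mask(__GFP_IO | __GFP_FS);
	/* ... suspend devices; allocations made here cannot start I/O ... */
	set_gfp_allowed_mask(saved_mask);
	mutex_unlock(&pm_mutex);
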
@@ -263,10 +289,7 @@ static void bad_page(struct page *page)
263 289
264 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 290 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
265 current->comm, page_to_pfn(page)); 291 current->comm, page_to_pfn(page));
266 printk(KERN_ALERT 292 dump_page(page);
267 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
268 page, (void *)page->flags, page_count(page),
269 page_mapcount(page), page->mapping, page->index);
270 293
271 dump_stack(); 294 dump_stack();
272out: 295out:
@@ -530,7 +553,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
530 int batch_free = 0; 553 int batch_free = 0;
531 554
532 spin_lock(&zone->lock); 555 spin_lock(&zone->lock);
533 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 556 zone->all_unreclaimable = 0;
534 zone->pages_scanned = 0; 557 zone->pages_scanned = 0;
535 558
536 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 559 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -568,7 +591,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
568 int migratetype) 591 int migratetype)
569{ 592{
570 spin_lock(&zone->lock); 593 spin_lock(&zone->lock);
571 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 594 zone->all_unreclaimable = 0;
572 zone->pages_scanned = 0; 595 zone->pages_scanned = 0;
573 596
574 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 597 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
@@ -583,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
583 int bad = 0; 606 int bad = 0;
584 int wasMlocked = __TestClearPageMlocked(page); 607 int wasMlocked = __TestClearPageMlocked(page);
585 608
609 trace_mm_page_free_direct(page, order);
586 kmemcheck_free_shadow(page, order); 610 kmemcheck_free_shadow(page, order);
587 611
588 for (i = 0 ; i < (1 << order) ; ++i) 612 for (i = 0 ; i < (1 << order) ; ++i)
@@ -1009,10 +1033,10 @@ static void drain_pages(unsigned int cpu)
1009 struct per_cpu_pageset *pset; 1033 struct per_cpu_pageset *pset;
1010 struct per_cpu_pages *pcp; 1034 struct per_cpu_pages *pcp;
1011 1035
1012 pset = zone_pcp(zone, cpu); 1036 local_irq_save(flags);
1037 pset = per_cpu_ptr(zone->pageset, cpu);
1013 1038
1014 pcp = &pset->pcp; 1039 pcp = &pset->pcp;
1015 local_irq_save(flags);
1016 free_pcppages_bulk(zone, pcp->count, pcp); 1040 free_pcppages_bulk(zone, pcp->count, pcp);
1017 pcp->count = 0; 1041 pcp->count = 0;
1018 local_irq_restore(flags); 1042 local_irq_restore(flags);
@@ -1073,8 +1097,9 @@ void mark_free_pages(struct zone *zone)
1073 1097
1074/* 1098/*
1075 * Free a 0-order page 1099 * Free a 0-order page
1100 * cold == 1 ? free a cold page : free a hot page
1076 */ 1101 */
1077static void free_hot_cold_page(struct page *page, int cold) 1102void free_hot_cold_page(struct page *page, int cold)
1078{ 1103{
1079 struct zone *zone = page_zone(page); 1104 struct zone *zone = page_zone(page);
1080 struct per_cpu_pages *pcp; 1105 struct per_cpu_pages *pcp;
@@ -1082,6 +1107,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1082 int migratetype; 1107 int migratetype;
1083 int wasMlocked = __TestClearPageMlocked(page); 1108 int wasMlocked = __TestClearPageMlocked(page);
1084 1109
1110 trace_mm_page_free_direct(page, 0);
1085 kmemcheck_free_shadow(page, 0); 1111 kmemcheck_free_shadow(page, 0);
1086 1112
1087 if (PageAnon(page)) 1113 if (PageAnon(page))
@@ -1096,7 +1122,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1096 arch_free_page(page, 0); 1122 arch_free_page(page, 0);
1097 kernel_map_pages(page, 1, 0); 1123 kernel_map_pages(page, 1, 0);
1098 1124
1099 pcp = &zone_pcp(zone, get_cpu())->pcp;
1100 migratetype = get_pageblock_migratetype(page); 1125 migratetype = get_pageblock_migratetype(page);
1101 set_page_private(page, migratetype); 1126 set_page_private(page, migratetype);
1102 local_irq_save(flags); 1127 local_irq_save(flags);
@@ -1119,6 +1144,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1119 migratetype = MIGRATE_MOVABLE; 1144 migratetype = MIGRATE_MOVABLE;
1120 } 1145 }
1121 1146
1147 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1122 if (cold) 1148 if (cold)
1123 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1149 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1124 else 1150 else
@@ -1131,15 +1157,8 @@ static void free_hot_cold_page(struct page *page, int cold)
1131 1157
1132out: 1158out:
1133 local_irq_restore(flags); 1159 local_irq_restore(flags);
1134 put_cpu();
1135} 1160}
1136 1161
1137void free_hot_page(struct page *page)
1138{
1139 trace_mm_page_free_direct(page, 0);
1140 free_hot_cold_page(page, 0);
1141}
1142
1143/* 1162/*
1144 * split_page takes a non-compound higher-order page, and splits it into 1163 * split_page takes a non-compound higher-order page, and splits it into
1145 * n (1<<order) sub-pages: page[0..n] 1164 * n (1<<order) sub-pages: page[0..n]
@@ -1181,17 +1200,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1181 unsigned long flags; 1200 unsigned long flags;
1182 struct page *page; 1201 struct page *page;
1183 int cold = !!(gfp_flags & __GFP_COLD); 1202 int cold = !!(gfp_flags & __GFP_COLD);
1184 int cpu;
1185 1203
1186again: 1204again:
1187 cpu = get_cpu();
1188 if (likely(order == 0)) { 1205 if (likely(order == 0)) {
1189 struct per_cpu_pages *pcp; 1206 struct per_cpu_pages *pcp;
1190 struct list_head *list; 1207 struct list_head *list;
1191 1208
1192 pcp = &zone_pcp(zone, cpu)->pcp;
1193 list = &pcp->lists[migratetype];
1194 local_irq_save(flags); 1209 local_irq_save(flags);
1210 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1211 list = &pcp->lists[migratetype];
1195 if (list_empty(list)) { 1212 if (list_empty(list)) {
1196 pcp->count += rmqueue_bulk(zone, 0, 1213 pcp->count += rmqueue_bulk(zone, 0,
1197 pcp->batch, list, 1214 pcp->batch, list,
@@ -1232,7 +1249,6 @@ again:
1232 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1249 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1233 zone_statistics(preferred_zone, zone); 1250 zone_statistics(preferred_zone, zone);
1234 local_irq_restore(flags); 1251 local_irq_restore(flags);
1235 put_cpu();
1236 1252
1237 VM_BUG_ON(bad_range(zone, page)); 1253 VM_BUG_ON(bad_range(zone, page));
1238 if (prep_new_page(page, order, gfp_flags)) 1254 if (prep_new_page(page, order, gfp_flags))
@@ -1241,7 +1257,6 @@ again:
1241 1257
1242failed: 1258failed:
1243 local_irq_restore(flags); 1259 local_irq_restore(flags);
1244 put_cpu();
1245 return NULL; 1260 return NULL;
1246} 1261}
1247 1262
@@ -2013,9 +2028,8 @@ void __pagevec_free(struct pagevec *pvec)
2013void __free_pages(struct page *page, unsigned int order) 2028void __free_pages(struct page *page, unsigned int order)
2014{ 2029{
2015 if (put_page_testzero(page)) { 2030 if (put_page_testzero(page)) {
2016 trace_mm_page_free_direct(page, order);
2017 if (order == 0) 2031 if (order == 0)
2018 free_hot_page(page); 2032 free_hot_cold_page(page, 0);
2019 else 2033 else
2020 __free_pages_ok(page, order); 2034 __free_pages_ok(page, order);
2021 } 2035 }
@@ -2180,7 +2194,7 @@ void show_free_areas(void)
2180 for_each_online_cpu(cpu) { 2194 for_each_online_cpu(cpu) {
2181 struct per_cpu_pageset *pageset; 2195 struct per_cpu_pageset *pageset;
2182 2196
2183 pageset = zone_pcp(zone, cpu); 2197 pageset = per_cpu_ptr(zone->pageset, cpu);
2184 2198
2185 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2199 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2186 cpu, pageset->pcp.high, 2200 cpu, pageset->pcp.high,
@@ -2271,7 +2285,7 @@ void show_free_areas(void)
2271 K(zone_page_state(zone, NR_BOUNCE)), 2285 K(zone_page_state(zone, NR_BOUNCE)),
2272 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2286 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2273 zone->pages_scanned, 2287 zone->pages_scanned,
2274 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2288 (zone->all_unreclaimable ? "yes" : "no")
2275 ); 2289 );
2276 printk("lowmem_reserve[]:"); 2290 printk("lowmem_reserve[]:");
2277 for (i = 0; i < MAX_NR_ZONES; i++) 2291 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -2745,10 +2759,29 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2745 2759
2746#endif /* CONFIG_NUMA */ 2760#endif /* CONFIG_NUMA */
2747 2761
2762/*
2763 * Boot pageset table. One per cpu which is going to be used for all
2764 * zones and all nodes. The parameters will be set in such a way
2765 * that an item put on a list will immediately be handed over to
2766 * the buddy list. This is safe since pageset manipulation is done
2767 * with interrupts disabled.
2768 *
2769 * The boot_pagesets must be kept even after bootup is complete for
2770 * unused processors and/or zones. They do play a role for bootstrapping
2771 * hotplugged processors.
2772 *
2773 * zoneinfo_show() and maybe other functions do
2774 * not check if the processor is online before following the pageset pointer.
2775 * Other parts of the kernel may not check if the zone is available.
2776 */
2777static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2778static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2779
2748/* return values int ....just for stop_machine() */ 2780/* return values int ....just for stop_machine() */
2749static int __build_all_zonelists(void *dummy) 2781static int __build_all_zonelists(void *dummy)
2750{ 2782{
2751 int nid; 2783 int nid;
2784 int cpu;
2752 2785
2753#ifdef CONFIG_NUMA 2786#ifdef CONFIG_NUMA
2754 memset(node_load, 0, sizeof(node_load)); 2787 memset(node_load, 0, sizeof(node_load));
@@ -2759,6 +2792,23 @@ static int __build_all_zonelists(void *dummy)
2759 build_zonelists(pgdat); 2792 build_zonelists(pgdat);
2760 build_zonelist_cache(pgdat); 2793 build_zonelist_cache(pgdat);
2761 } 2794 }
2795
2796 /*
2797 * Initialize the boot_pagesets that are going to be used
2798 * for bootstrapping processors. The real pagesets for
2799 * each zone will be allocated later when the per cpu
2800 * allocator is available.
2801 *
2802 * boot_pagesets are used also for bootstrapping offline
2803 * cpus if the system is already booted because the pagesets
2804 * are needed to initialize allocators on a specific cpu too.
2805 * F.e. the percpu allocator needs the page allocator which
2806 * needs the percpu allocator in order to allocate its pagesets
2807 * (a chicken-egg dilemma).
2808 */
2809 for_each_possible_cpu(cpu)
2810 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
2811
2762 return 0; 2812 return 0;
2763} 2813}
2764 2814
@@ -3096,121 +3146,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3096 pcp->batch = PAGE_SHIFT * 8; 3146 pcp->batch = PAGE_SHIFT * 8;
3097} 3147}
3098 3148
3099
3100#ifdef CONFIG_NUMA
3101/*
3102 * Boot pageset table. One per cpu which is going to be used for all
3103 * zones and all nodes. The parameters will be set in such a way
3104 * that an item put on a list will immediately be handed over to
3105 * the buddy list. This is safe since pageset manipulation is done
3106 * with interrupts disabled.
3107 *
3108 * Some NUMA counter updates may also be caught by the boot pagesets.
3109 *
3110 * The boot_pagesets must be kept even after bootup is complete for
3111 * unused processors and/or zones. They do play a role for bootstrapping
3112 * hotplugged processors.
3113 *
3114 * zoneinfo_show() and maybe other functions do
3115 * not check if the processor is online before following the pageset pointer.
3116 * Other parts of the kernel may not check if the zone is available.
3117 */
3118static struct per_cpu_pageset boot_pageset[NR_CPUS];
3119
3120/* 3149/*
3121 * Dynamically allocate memory for the 3150 * Allocate per cpu pagesets and initialize them.
3122 * per cpu pageset array in struct zone. 3151 * Before this call only boot pagesets were available.
3152 * Boot pagesets will no longer be used by this processor
3153 * after setup_per_cpu_pageset().
3123 */ 3154 */
3124static int __cpuinit process_zones(int cpu) 3155void __init setup_per_cpu_pageset(void)
3125{ 3156{
3126 struct zone *zone, *dzone; 3157 struct zone *zone;
3127 int node = cpu_to_node(cpu); 3158 int cpu;
3128
3129 node_set_state(node, N_CPU); /* this node has a cpu */
3130 3159
3131 for_each_populated_zone(zone) { 3160 for_each_populated_zone(zone) {
3132 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 3161 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3133 GFP_KERNEL, node);
3134 if (!zone_pcp(zone, cpu))
3135 goto bad;
3136 3162
3137 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 3163 for_each_possible_cpu(cpu) {
3164 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3138 3165
3139 if (percpu_pagelist_fraction) 3166 setup_pageset(pcp, zone_batchsize(zone));
3140 setup_pagelist_highmark(zone_pcp(zone, cpu),
3141 (zone->present_pages / percpu_pagelist_fraction));
3142 }
3143 3167
3144 return 0; 3168 if (percpu_pagelist_fraction)
3145bad: 3169 setup_pagelist_highmark(pcp,
3146 for_each_zone(dzone) { 3170 (zone->present_pages /
3147 if (!populated_zone(dzone)) 3171 percpu_pagelist_fraction));
3148 continue; 3172 }
3149 if (dzone == zone)
3150 break;
3151 kfree(zone_pcp(dzone, cpu));
3152 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
3153 }
3154 return -ENOMEM;
3155}
3156
3157static inline void free_zone_pagesets(int cpu)
3158{
3159 struct zone *zone;
3160
3161 for_each_zone(zone) {
3162 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
3163
3164 /* Free per_cpu_pageset if it is slab allocated */
3165 if (pset != &boot_pageset[cpu])
3166 kfree(pset);
3167 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3168 }
3169}
3170
3171static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
3172 unsigned long action,
3173 void *hcpu)
3174{
3175 int cpu = (long)hcpu;
3176 int ret = NOTIFY_OK;
3177
3178 switch (action) {
3179 case CPU_UP_PREPARE:
3180 case CPU_UP_PREPARE_FROZEN:
3181 if (process_zones(cpu))
3182 ret = NOTIFY_BAD;
3183 break;
3184 case CPU_UP_CANCELED:
3185 case CPU_UP_CANCELED_FROZEN:
3186 case CPU_DEAD:
3187 case CPU_DEAD_FROZEN:
3188 free_zone_pagesets(cpu);
3189 break;
3190 default:
3191 break;
3192 } 3173 }
3193 return ret;
3194} 3174}
3195 3175
3196static struct notifier_block __cpuinitdata pageset_notifier =
3197 { &pageset_cpuup_callback, NULL, 0 };
3198
3199void __init setup_per_cpu_pageset(void)
3200{
3201 int err;
3202
3203 /* Initialize per_cpu_pageset for cpu 0.
3204 * A cpuup callback will do this for every cpu
3205 * as it comes online
3206 */
3207 err = process_zones(smp_processor_id());
3208 BUG_ON(err);
3209 register_cpu_notifier(&pageset_notifier);
3210}
3211
3212#endif
3213
3214static noinline __init_refok 3176static noinline __init_refok
3215int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3177int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3216{ 3178{
@@ -3260,11 +3222,11 @@ static int __zone_pcp_update(void *data)
3260 int cpu; 3222 int cpu;
3261 unsigned long batch = zone_batchsize(zone), flags; 3223 unsigned long batch = zone_batchsize(zone), flags;
3262 3224
3263 for (cpu = 0; cpu < NR_CPUS; cpu++) { 3225 for_each_possible_cpu(cpu) {
3264 struct per_cpu_pageset *pset; 3226 struct per_cpu_pageset *pset;
3265 struct per_cpu_pages *pcp; 3227 struct per_cpu_pages *pcp;
3266 3228
3267 pset = zone_pcp(zone, cpu); 3229 pset = per_cpu_ptr(zone->pageset, cpu);
3268 pcp = &pset->pcp; 3230 pcp = &pset->pcp;
3269 3231
3270 local_irq_save(flags); 3232 local_irq_save(flags);
@@ -3282,21 +3244,17 @@ void zone_pcp_update(struct zone *zone)
3282 3244
3283static __meminit void zone_pcp_init(struct zone *zone) 3245static __meminit void zone_pcp_init(struct zone *zone)
3284{ 3246{
3285 int cpu; 3247 /*
3286 unsigned long batch = zone_batchsize(zone); 3248 * per cpu subsystem is not up at this point. The following code
3249 * relies on the ability of the linker to provide the
3250 * offset of a (static) per cpu variable into the per cpu area.
3251 */
3252 zone->pageset = &boot_pageset;
3287 3253
3288 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3289#ifdef CONFIG_NUMA
3290 /* Early boot. Slab allocator not functional yet */
3291 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3292 setup_pageset(&boot_pageset[cpu],0);
3293#else
3294 setup_pageset(zone_pcp(zone,cpu), batch);
3295#endif
3296 }
3297 if (zone->present_pages) 3254 if (zone->present_pages)
3298 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 3255 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3299 zone->name, zone->present_pages, batch); 3256 zone->name, zone->present_pages,
3257 zone_batchsize(zone));
3300} 3258}
3301 3259
3302__meminit int init_currently_empty_zone(struct zone *zone, 3260__meminit int init_currently_empty_zone(struct zone *zone,
@@ -3435,6 +3393,61 @@ void __init free_bootmem_with_active_regions(int nid,
3435 } 3393 }
3436} 3394}
3437 3395
3396int __init add_from_early_node_map(struct range *range, int az,
3397 int nr_range, int nid)
3398{
3399 int i;
3400 u64 start, end;
3401
3402 /* need to go over early_node_map to find out good range for node */
3403 for_each_active_range_index_in_nid(i, nid) {
3404 start = early_node_map[i].start_pfn;
3405 end = early_node_map[i].end_pfn;
3406 nr_range = add_range(range, az, nr_range, start, end);
3407 }
3408 return nr_range;
3409}
3410
3411#ifdef CONFIG_NO_BOOTMEM
3412void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3413 u64 goal, u64 limit)
3414{
3415 int i;
3416 void *ptr;
3417
3418 /* need to go over early_node_map to find out good range for node */
3419 for_each_active_range_index_in_nid(i, nid) {
3420 u64 addr;
3421 u64 ei_start, ei_last;
3422
3423 ei_last = early_node_map[i].end_pfn;
3424 ei_last <<= PAGE_SHIFT;
3425 ei_start = early_node_map[i].start_pfn;
3426 ei_start <<= PAGE_SHIFT;
3427 addr = find_early_area(ei_start, ei_last,
3428 goal, limit, size, align);
3429
3430 if (addr == -1ULL)
3431 continue;
3432
3433#if 0
3434 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3435 nid,
3436 ei_start, ei_last, goal, limit, size,
3437 align, addr);
3438#endif
3439
3440 ptr = phys_to_virt(addr);
3441 memset(ptr, 0, size);
3442 reserve_early_without_check(addr, addr + size, "BOOTMEM");
3443 return ptr;
3444 }
3445
3446 return NULL;
3447}
3448#endif
3449
3450
3438void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3451void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3439{ 3452{
3440 int i; 3453 int i;
@@ -4377,8 +4390,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4377 for (i = 0; i < MAX_NR_ZONES; i++) { 4390 for (i = 0; i < MAX_NR_ZONES; i++) {
4378 if (i == ZONE_MOVABLE) 4391 if (i == ZONE_MOVABLE)
4379 continue; 4392 continue;
4380 printk(" %-8s %0#10lx -> %0#10lx\n", 4393 printk(" %-8s ", zone_names[i]);
4381 zone_names[i], 4394 if (arch_zone_lowest_possible_pfn[i] ==
4395 arch_zone_highest_possible_pfn[i])
4396 printk("empty\n");
4397 else
4398 printk("%0#10lx -> %0#10lx\n",
4382 arch_zone_lowest_possible_pfn[i], 4399 arch_zone_lowest_possible_pfn[i],
4383 arch_zone_highest_possible_pfn[i]); 4400 arch_zone_highest_possible_pfn[i]);
4384 } 4401 }
@@ -4467,7 +4484,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4467} 4484}
4468 4485
4469#ifndef CONFIG_NEED_MULTIPLE_NODES 4486#ifndef CONFIG_NEED_MULTIPLE_NODES
4470struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; 4487struct pglist_data __refdata contig_page_data = {
4488#ifndef CONFIG_NO_BOOTMEM
4489 .bdata = &bootmem_node_data[0]
4490#endif
4491 };
4471EXPORT_SYMBOL(contig_page_data); 4492EXPORT_SYMBOL(contig_page_data);
4472#endif 4493#endif
4473 4494
@@ -4810,10 +4831,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4810 if (!write || (ret == -EINVAL)) 4831 if (!write || (ret == -EINVAL))
4811 return ret; 4832 return ret;
4812 for_each_populated_zone(zone) { 4833 for_each_populated_zone(zone) {
4813 for_each_online_cpu(cpu) { 4834 for_each_possible_cpu(cpu) {
4814 unsigned long high; 4835 unsigned long high;
4815 high = zone->present_pages / percpu_pagelist_fraction; 4836 high = zone->present_pages / percpu_pagelist_fraction;
4816 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4837 setup_pagelist_highmark(
4838 per_cpu_ptr(zone->pageset, cpu), high);
4817 } 4839 }
4818 } 4840 }
4819 return 0; 4841 return 0;
@@ -5159,3 +5181,80 @@ bool is_free_buddy_page(struct page *page)
5159 return order < MAX_ORDER; 5181 return order < MAX_ORDER;
5160} 5182}
5161#endif 5183#endif
5184
5185static struct trace_print_flags pageflag_names[] = {
5186 {1UL << PG_locked, "locked" },
5187 {1UL << PG_error, "error" },
5188 {1UL << PG_referenced, "referenced" },
5189 {1UL << PG_uptodate, "uptodate" },
5190 {1UL << PG_dirty, "dirty" },
5191 {1UL << PG_lru, "lru" },
5192 {1UL << PG_active, "active" },
5193 {1UL << PG_slab, "slab" },
5194 {1UL << PG_owner_priv_1, "owner_priv_1" },
5195 {1UL << PG_arch_1, "arch_1" },
5196 {1UL << PG_reserved, "reserved" },
5197 {1UL << PG_private, "private" },
5198 {1UL << PG_private_2, "private_2" },
5199 {1UL << PG_writeback, "writeback" },
5200#ifdef CONFIG_PAGEFLAGS_EXTENDED
5201 {1UL << PG_head, "head" },
5202 {1UL << PG_tail, "tail" },
5203#else
5204 {1UL << PG_compound, "compound" },
5205#endif
5206 {1UL << PG_swapcache, "swapcache" },
5207 {1UL << PG_mappedtodisk, "mappedtodisk" },
5208 {1UL << PG_reclaim, "reclaim" },
5209 {1UL << PG_buddy, "buddy" },
5210 {1UL << PG_swapbacked, "swapbacked" },
5211 {1UL << PG_unevictable, "unevictable" },
5212#ifdef CONFIG_MMU
5213 {1UL << PG_mlocked, "mlocked" },
5214#endif
5215#ifdef CONFIG_ARCH_USES_PG_UNCACHED
5216 {1UL << PG_uncached, "uncached" },
5217#endif
5218#ifdef CONFIG_MEMORY_FAILURE
5219 {1UL << PG_hwpoison, "hwpoison" },
5220#endif
5221 {-1UL, NULL },
5222};
5223
5224static void dump_page_flags(unsigned long flags)
5225{
5226 const char *delim = "";
5227 unsigned long mask;
5228 int i;
5229
5230 printk(KERN_ALERT "page flags: %#lx(", flags);
5231
5232 /* remove zone id */
5233 flags &= (1UL << NR_PAGEFLAGS) - 1;
5234
5235 for (i = 0; pageflag_names[i].name && flags; i++) {
5236
5237 mask = pageflag_names[i].mask;
5238 if ((flags & mask) != mask)
5239 continue;
5240
5241 flags &= ~mask;
5242 printk("%s%s", delim, pageflag_names[i].name);
5243 delim = "|";
5244 }
5245
5246 /* check for left over flags */
5247 if (flags)
5248 printk("%s%#lx", delim, flags);
5249
5250 printk(")\n");
5251}
5252
5253void dump_page(struct page *page)
5254{
5255 printk(KERN_ALERT
5256 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5257 page, page_count(page), page_mapcount(page),
5258 page->mapping, page->index);
5259 dump_page_flags(page->flags);
5260}
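
Besides bad_page() above, any MM debugging site can now call dump_page() to get the page line plus the decoded flag names. A hypothetical caller, for illustration only:

	/* hypothetical consistency check, not part of this patch */
	if (unlikely(page_mapcount(page) < 0)) {
		dump_page(page);   /* page:... line plus "page flags: %#lx(locked|lru|...)" */
		BUG();
	}
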
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3d535d594826..6c0081441a32 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -284,6 +284,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex);
284struct swap_cgroup_ctrl { 284struct swap_cgroup_ctrl {
285 struct page **map; 285 struct page **map;
286 unsigned long length; 286 unsigned long length;
287 spinlock_t lock;
287}; 288};
288 289
289struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 290struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
@@ -335,6 +336,43 @@ not_enough_page:
335} 336}
336 337
337/** 338/**
339 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 340 * @ent: swap entry to be cmpxchged
341 * @old: old id
342 * @new: new id
343 *
 344 * Returns the old id on success, 0 on failure.
 345 * (There is no mem_cgroup using 0 as its id.)
346 */
347unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
348 unsigned short old, unsigned short new)
349{
350 int type = swp_type(ent);
351 unsigned long offset = swp_offset(ent);
352 unsigned long idx = offset / SC_PER_PAGE;
353 unsigned long pos = offset & SC_POS_MASK;
354 struct swap_cgroup_ctrl *ctrl;
355 struct page *mappage;
356 struct swap_cgroup *sc;
357 unsigned long flags;
358 unsigned short retval;
359
360 ctrl = &swap_cgroup_ctrl[type];
361
362 mappage = ctrl->map[idx];
363 sc = page_address(mappage);
364 sc += pos;
365 spin_lock_irqsave(&ctrl->lock, flags);
366 retval = sc->id;
367 if (retval == old)
368 sc->id = new;
369 else
370 retval = 0;
371 spin_unlock_irqrestore(&ctrl->lock, flags);
372 return retval;
373}
374
375/**
338 * swap_cgroup_record - record mem_cgroup for this swp_entry. 376 * swap_cgroup_record - record mem_cgroup for this swp_entry.
339 * @ent: swap entry to be recorded into 377 * @ent: swap entry to be recorded into
340 * @mem: mem_cgroup to be recorded 378 * @mem: mem_cgroup to be recorded
@@ -352,14 +390,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
352 struct page *mappage; 390 struct page *mappage;
353 struct swap_cgroup *sc; 391 struct swap_cgroup *sc;
354 unsigned short old; 392 unsigned short old;
393 unsigned long flags;
355 394
356 ctrl = &swap_cgroup_ctrl[type]; 395 ctrl = &swap_cgroup_ctrl[type];
357 396
358 mappage = ctrl->map[idx]; 397 mappage = ctrl->map[idx];
359 sc = page_address(mappage); 398 sc = page_address(mappage);
360 sc += pos; 399 sc += pos;
400 spin_lock_irqsave(&ctrl->lock, flags);
361 old = sc->id; 401 old = sc->id;
362 sc->id = id; 402 sc->id = id;
403 spin_unlock_irqrestore(&ctrl->lock, flags);
363 404
364 return old; 405 return old;
365} 406}
@@ -411,6 +452,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
411 mutex_lock(&swap_cgroup_mutex); 452 mutex_lock(&swap_cgroup_mutex);
412 ctrl->length = length; 453 ctrl->length = length;
413 ctrl->map = array; 454 ctrl->map = array;
455 spin_lock_init(&ctrl->lock);
414 if (swap_cgroup_prepare(type)) { 456 if (swap_cgroup_prepare(type)) {
415 /* memory shortage */ 457 /* memory shortage */
416 ctrl->map = NULL; 458 ctrl->map = NULL;
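
swap_cgroup_cmpxchg() exists so a swap entry's charge can be reassigned only if the entry still belongs to the expected owner, which makes charge moving race-free against concurrent swap_cgroup_record() calls. A minimal caller sketch; the helper name and the id lookup around it are assumptions:

static bool move_swap_charge(swp_entry_t entry,
			     unsigned short old_id, unsigned short new_id)
{
	/*
	 * Returns old_id if we won the race, 0 if the owner changed under
	 * us; since no mem_cgroup uses id 0, the comparison is unambiguous.
	 */
	return swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id;
}
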
diff --git a/mm/page_io.c b/mm/page_io.c
index a19af956ee1b..31a3b962230a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -12,6 +12,7 @@
12 12
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/gfp.h>
15#include <linux/pagemap.h> 16#include <linux/pagemap.h>
16#include <linux/swap.h> 17#include <linux/swap.h>
17#include <linux/bio.h> 18#include <linux/bio.h>
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7b47a57b6646..8b1a2ce21ee5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -80,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
80 return err; 80 return err;
81} 81}
82 82
83#ifdef CONFIG_HUGETLB_PAGE
84static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
85 unsigned long end)
86{
87 unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
88 return boundary < end ? boundary : end;
89}
90
91static int walk_hugetlb_range(struct vm_area_struct *vma,
92 unsigned long addr, unsigned long end,
93 struct mm_walk *walk)
94{
95 struct hstate *h = hstate_vma(vma);
96 unsigned long next;
97 unsigned long hmask = huge_page_mask(h);
98 pte_t *pte;
99 int err = 0;
100
101 do {
102 next = hugetlb_entry_end(h, addr, end);
103 pte = huge_pte_offset(walk->mm, addr & hmask);
104 if (pte && walk->hugetlb_entry)
105 err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
106 if (err)
107 return err;
108 } while (addr = next, addr != end);
109
110 return 0;
111}
112#endif
113
83/** 114/**
84 * walk_page_range - walk a memory map's page tables with a callback 115 * walk_page_range - walk a memory map's page tables with a callback
85 * @mm: memory map to walk 116 * @mm: memory map to walk
@@ -128,20 +159,16 @@ int walk_page_range(unsigned long addr, unsigned long end,
128 vma = find_vma(walk->mm, addr); 159 vma = find_vma(walk->mm, addr);
129#ifdef CONFIG_HUGETLB_PAGE 160#ifdef CONFIG_HUGETLB_PAGE
130 if (vma && is_vm_hugetlb_page(vma)) { 161 if (vma && is_vm_hugetlb_page(vma)) {
131 pte_t *pte;
132 struct hstate *hs;
133
134 if (vma->vm_end < next) 162 if (vma->vm_end < next)
135 next = vma->vm_end; 163 next = vma->vm_end;
136 hs = hstate_vma(vma); 164 /*
137 pte = huge_pte_offset(walk->mm, 165 * Hugepage is very tightly coupled with vma, so
138 addr & huge_page_mask(hs)); 166 * walk through hugetlb entries within a given vma.
139 if (pte && !huge_pte_none(huge_ptep_get(pte)) 167 */
140 && walk->hugetlb_entry) 168 err = walk_hugetlb_range(vma, addr, next, walk);
141 err = walk->hugetlb_entry(pte, addr,
142 next, walk);
143 if (err) 169 if (err)
144 break; 170 break;
171 pgd = pgd_offset(walk->mm, next);
145 continue; 172 continue;
146 } 173 }
147#endif 174#endif
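
With this change a hugetlb_entry callback is invoked once per huge page entry and receives the huge page mask, instead of the single probe the old inline code did. A hedged sketch of a walker written against the new signature; the callback name, counter and calling context are illustrative:

static int count_present_huge(pte_t *pte, unsigned long hmask,
			      unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (!huge_pte_none(huge_ptep_get(pte)))
		(*count)++;
	return 0;
}

/*
 * Caller side, with mmap_sem held for read:
 *	unsigned long count = 0;
 *	struct mm_walk w = { .hugetlb_entry = count_present_huge,
 *			     .mm = mm, .private = &count };
 *	walk_page_range(vma->vm_start, vma->vm_end, &w);
 */
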
diff --git a/mm/percpu.c b/mm/percpu.c
index 083e7c91e5f6..6e09741ddc62 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,13 +80,15 @@
80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
81#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
82#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
83 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ 83 (void __percpu *)((unsigned long)(addr) - \
84 + (unsigned long)__per_cpu_start) 84 (unsigned long)pcpu_base_addr + \
85 (unsigned long)__per_cpu_start)
85#endif 86#endif
86#ifndef __pcpu_ptr_to_addr 87#ifndef __pcpu_ptr_to_addr
87#define __pcpu_ptr_to_addr(ptr) \ 88#define __pcpu_ptr_to_addr(ptr) \
88 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ 89 (void __force *)((unsigned long)(ptr) + \
89 - (unsigned long)__per_cpu_start) 90 (unsigned long)pcpu_base_addr - \
91 (unsigned long)__per_cpu_start)
90#endif 92#endif
91 93
92struct pcpu_chunk { 94struct pcpu_chunk {
@@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
913 int rs, re; 915 int rs, re;
914 916
915 /* quick path, check whether it's empty already */ 917 /* quick path, check whether it's empty already */
916 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 918 rs = page_start;
917 if (rs == page_start && re == page_end) 919 pcpu_next_unpop(chunk, &rs, &re, page_end);
918 return; 920 if (rs == page_start && re == page_end)
919 break; 921 return;
920 }
921 922
922 /* immutable chunks can't be depopulated */ 923 /* immutable chunks can't be depopulated */
923 WARN_ON(chunk->immutable); 924 WARN_ON(chunk->immutable);
@@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
968 int rs, re, rc; 969 int rs, re, rc;
969 970
970 /* quick path, check whether all pages are already there */ 971 /* quick path, check whether all pages are already there */
971 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { 972 rs = page_start;
972 if (rs == page_start && re == page_end) 973 pcpu_next_pop(chunk, &rs, &re, page_end);
973 goto clear; 974 if (rs == page_start && re == page_end)
974 break; 975 goto clear;
975 }
976 976
977 /* need to allocate and map pages, this chunk can't be immutable */ 977 /* need to allocate and map pages, this chunk can't be immutable */
978 WARN_ON(chunk->immutable); 978 WARN_ON(chunk->immutable);
@@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
1067 * RETURNS: 1067 * RETURNS:
1068 * Percpu pointer to the allocated area on success, NULL on failure. 1068 * Percpu pointer to the allocated area on success, NULL on failure.
1069 */ 1069 */
1070static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1070static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
1071{ 1071{
1072 static int warn_limit = 10; 1072 static int warn_limit = 10;
1073 struct pcpu_chunk *chunk; 1073 struct pcpu_chunk *chunk;
@@ -1196,7 +1196,7 @@ fail_unlock_mutex:
1196 * RETURNS: 1196 * RETURNS:
1197 * Percpu pointer to the allocated area on success, NULL on failure. 1197 * Percpu pointer to the allocated area on success, NULL on failure.
1198 */ 1198 */
1199void *__alloc_percpu(size_t size, size_t align) 1199void __percpu *__alloc_percpu(size_t size, size_t align)
1200{ 1200{
1201 return pcpu_alloc(size, align, false); 1201 return pcpu_alloc(size, align, false);
1202} 1202}
@@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
1217 * RETURNS: 1217 * RETURNS:
1218 * Percpu pointer to the allocated area on success, NULL on failure. 1218 * Percpu pointer to the allocated area on success, NULL on failure.
1219 */ 1219 */
1220void *__alloc_reserved_percpu(size_t size, size_t align) 1220void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1221{ 1221{
1222 return pcpu_alloc(size, align, true); 1222 return pcpu_alloc(size, align, true);
1223} 1223}
@@ -1269,7 +1269,7 @@ static void pcpu_reclaim(struct work_struct *work)
1269 * CONTEXT: 1269 * CONTEXT:
1270 * Can be called from atomic context. 1270 * Can be called from atomic context.
1271 */ 1271 */
1272void free_percpu(void *ptr) 1272void free_percpu(void __percpu *ptr)
1273{ 1273{
1274 void *addr; 1274 void *addr;
1275 struct pcpu_chunk *chunk; 1275 struct pcpu_chunk *chunk;
@@ -1304,6 +1304,32 @@ void free_percpu(void *ptr)
1304EXPORT_SYMBOL_GPL(free_percpu); 1304EXPORT_SYMBOL_GPL(free_percpu);
1305 1305
1306/** 1306/**
1307 * is_kernel_percpu_address - test whether address is from static percpu area
1308 * @addr: address to test
1309 *
1310 * Test whether @addr belongs to in-kernel static percpu area. Module
1311 * static percpu areas are not considered. For those, use
1312 * is_module_percpu_address().
1313 *
1314 * RETURNS:
1315 * %true if @addr is from in-kernel static percpu area, %false otherwise.
1316 */
1317bool is_kernel_percpu_address(unsigned long addr)
1318{
1319 const size_t static_size = __per_cpu_end - __per_cpu_start;
1320 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
1321 unsigned int cpu;
1322
1323 for_each_possible_cpu(cpu) {
1324 void *start = per_cpu_ptr(base, cpu);
1325
1326 if ((void *)addr >= start && (void *)addr < start + static_size)
1327 return true;
1328 }
1329 return false;
1330}
1331
1332/**
1307 * per_cpu_ptr_to_phys - convert translated percpu address to physical address 1333 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
1308 * @addr: the address to be converted to physical address 1334 * @addr: the address to be converted to physical address
1309 * 1335 *
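
The intended consumers of is_kernel_percpu_address() are address-validation paths that must treat static percpu storage as legitimate static objects; roughly, paired with the module-side helper the comment above mentions (the surrounding check is an assumption, not from this patch):

	/* inside a "is this pointer a static object?" style check */
	if (is_kernel_percpu_address(addr) || is_module_percpu_address(addr))
		return 1;
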
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
new file mode 100644
index 000000000000..c4351c7f57d2
--- /dev/null
+++ b/mm/percpu_up.c
@@ -0,0 +1,30 @@
1/*
2 * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
3 */
4
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/slab.h>
8
9void __percpu *__alloc_percpu(size_t size, size_t align)
10{
11 /*
12 * Can't easily make larger alignment work with kmalloc. WARN
13 * on it. Larger alignment should only be used for module
14 * percpu sections on SMP for which this path isn't used.
15 */
16 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
17 return kzalloc(size, GFP_KERNEL);
18}
19EXPORT_SYMBOL_GPL(__alloc_percpu);
20
21void free_percpu(void __percpu *p)
22{
23 kfree(p);
24}
25EXPORT_SYMBOL_GPL(free_percpu);
26
27phys_addr_t per_cpu_ptr_to_phys(void *addr)
28{
29 return __pa(addr);
30}
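
With this stub, UP builds hand out plain kzalloc()ed objects, but callers keep the same percpu API on UP and SMP. A small usage sketch; the struct and its field are hypothetical:

	struct hit_stats { unsigned long hits; };
	struct hit_stats __percpu *sp = alloc_percpu(struct hit_stats);

	if (sp) {
		int cpu = get_cpu();            /* pin to a cpu */
		per_cpu_ptr(sp, cpu)->hits++;   /* on UP this is simply 'sp' */
		put_cpu();
		free_percpu(sp);
	}
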
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 6633965bb27b..2876349339a7 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -14,6 +14,7 @@
14 */ 14 */
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16 16
17#include <linux/gfp.h>
17#include <linux/mm.h> 18#include <linux/mm.h>
18#include <linux/mmzone.h> 19#include <linux/mmzone.h>
19#include <linux/module.h> 20#include <linux/module.h>
diff --git a/mm/readahead.c b/mm/readahead.c
index 033bc135a41f..dfa9a1a03a11 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/blkdev.h> 15#include <linux/blkdev.h>
@@ -501,6 +502,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
501 if (!ra->ra_pages) 502 if (!ra->ra_pages)
502 return; 503 return;
503 504
505 /* be dumb */
506 if (filp && (filp->f_mode & FMODE_RANDOM)) {
507 force_page_cache_readahead(mapping, filp, offset, req_size);
508 return;
509 }
510
504 /* do read-ahead */ 511 /* do read-ahead */
505 ondemand_readahead(mapping, ra, filp, false, offset, req_size); 512 ondemand_readahead(mapping, ra, filp, false, offset, req_size);
506} 513}
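
FMODE_RANDOM is set on the struct file by posix_fadvise(POSIX_FADV_RANDOM) (the mm/fadvise.c change in this series), so the branch above is what a random-access reader opts into from userspace; the file path below is illustrative:

	/* userspace side */
	int fd = open("/var/lib/db/index.bin", O_RDONLY);

	if (fd >= 0)
		posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
	/* sync readahead for this fd now goes through force_page_cache_readahead() */
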
diff --git a/mm/rmap.c b/mm/rmap.c
index 278cd277bdec..07fc94758799 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,7 @@
62#include "internal.h" 62#include "internal.h"
63 63
64static struct kmem_cache *anon_vma_cachep; 64static struct kmem_cache *anon_vma_cachep;
65static struct kmem_cache *anon_vma_chain_cachep;
65 66
66static inline struct anon_vma *anon_vma_alloc(void) 67static inline struct anon_vma *anon_vma_alloc(void)
67{ 68{
@@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma)
73 kmem_cache_free(anon_vma_cachep, anon_vma); 74 kmem_cache_free(anon_vma_cachep, anon_vma);
74} 75}
75 76
77static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
78{
79 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
80}
81
82void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
83{
84 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
85}
86
76/** 87/**
77 * anon_vma_prepare - attach an anon_vma to a memory region 88 * anon_vma_prepare - attach an anon_vma to a memory region
78 * @vma: the memory region in question 89 * @vma: the memory region in question
@@ -103,73 +114,140 @@ void anon_vma_free(struct anon_vma *anon_vma)
103int anon_vma_prepare(struct vm_area_struct *vma) 114int anon_vma_prepare(struct vm_area_struct *vma)
104{ 115{
105 struct anon_vma *anon_vma = vma->anon_vma; 116 struct anon_vma *anon_vma = vma->anon_vma;
117 struct anon_vma_chain *avc;
106 118
107 might_sleep(); 119 might_sleep();
108 if (unlikely(!anon_vma)) { 120 if (unlikely(!anon_vma)) {
109 struct mm_struct *mm = vma->vm_mm; 121 struct mm_struct *mm = vma->vm_mm;
110 struct anon_vma *allocated; 122 struct anon_vma *allocated;
111 123
124 avc = anon_vma_chain_alloc();
125 if (!avc)
126 goto out_enomem;
127
112 anon_vma = find_mergeable_anon_vma(vma); 128 anon_vma = find_mergeable_anon_vma(vma);
113 allocated = NULL; 129 allocated = NULL;
114 if (!anon_vma) { 130 if (!anon_vma) {
115 anon_vma = anon_vma_alloc(); 131 anon_vma = anon_vma_alloc();
116 if (unlikely(!anon_vma)) 132 if (unlikely(!anon_vma))
117 return -ENOMEM; 133 goto out_enomem_free_avc;
118 allocated = anon_vma; 134 allocated = anon_vma;
119 } 135 }
120 spin_lock(&anon_vma->lock);
121 136
137 spin_lock(&anon_vma->lock);
122 /* page_table_lock to protect against threads */ 138 /* page_table_lock to protect against threads */
123 spin_lock(&mm->page_table_lock); 139 spin_lock(&mm->page_table_lock);
124 if (likely(!vma->anon_vma)) { 140 if (likely(!vma->anon_vma)) {
125 vma->anon_vma = anon_vma; 141 vma->anon_vma = anon_vma;
126 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 142 avc->anon_vma = anon_vma;
143 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head);
127 allocated = NULL; 146 allocated = NULL;
147 avc = NULL;
128 } 148 }
129 spin_unlock(&mm->page_table_lock); 149 spin_unlock(&mm->page_table_lock);
130
131 spin_unlock(&anon_vma->lock); 150 spin_unlock(&anon_vma->lock);
151
132 if (unlikely(allocated)) 152 if (unlikely(allocated))
133 anon_vma_free(allocated); 153 anon_vma_free(allocated);
154 if (unlikely(avc))
155 anon_vma_chain_free(avc);
134 } 156 }
135 return 0; 157 return 0;
158
159 out_enomem_free_avc:
160 anon_vma_chain_free(avc);
161 out_enomem:
162 return -ENOMEM;
136} 163}
137 164
138void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) 165static void anon_vma_chain_link(struct vm_area_struct *vma,
166 struct anon_vma_chain *avc,
167 struct anon_vma *anon_vma)
139{ 168{
140 BUG_ON(vma->anon_vma != next->anon_vma); 169 avc->vma = vma;
141 list_del(&next->anon_vma_node); 170 avc->anon_vma = anon_vma;
171 list_add(&avc->same_vma, &vma->anon_vma_chain);
172
173 spin_lock(&anon_vma->lock);
174 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
175 spin_unlock(&anon_vma->lock);
142} 176}
143 177
144void __anon_vma_link(struct vm_area_struct *vma) 178/*
179 * Attach the anon_vmas from src to dst.
180 * Returns 0 on success, -ENOMEM on failure.
181 */
182int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
145{ 183{
146 struct anon_vma *anon_vma = vma->anon_vma; 184 struct anon_vma_chain *avc, *pavc;
147 185
148 if (anon_vma) 186 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
149 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 187 avc = anon_vma_chain_alloc();
188 if (!avc)
189 goto enomem_failure;
190 anon_vma_chain_link(dst, avc, pavc->anon_vma);
191 }
192 return 0;
193
194 enomem_failure:
195 unlink_anon_vmas(dst);
196 return -ENOMEM;
150} 197}
151 198
152void anon_vma_link(struct vm_area_struct *vma) 199/*
200 * Attach vma to its own anon_vma, as well as to the anon_vmas that
201 * the corresponding VMA in the parent process is attached to.
202 * Returns 0 on success, non-zero on failure.
203 */
204int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
153{ 205{
154 struct anon_vma *anon_vma = vma->anon_vma; 206 struct anon_vma_chain *avc;
207 struct anon_vma *anon_vma;
155 208
156 if (anon_vma) { 209 /* Don't bother if the parent process has no anon_vma here. */
157 spin_lock(&anon_vma->lock); 210 if (!pvma->anon_vma)
158 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 211 return 0;
159 spin_unlock(&anon_vma->lock); 212
160 } 213 /*
214 * First, attach the new VMA to the parent VMA's anon_vmas,
215 * so rmap can find non-COWed pages in child processes.
216 */
217 if (anon_vma_clone(vma, pvma))
218 return -ENOMEM;
219
220 /* Then add our own anon_vma. */
221 anon_vma = anon_vma_alloc();
222 if (!anon_vma)
223 goto out_error;
224 avc = anon_vma_chain_alloc();
225 if (!avc)
226 goto out_error_free_anon_vma;
227 anon_vma_chain_link(vma, avc, anon_vma);
228 /* Mark this anon_vma as the one where our new (COWed) pages go. */
229 vma->anon_vma = anon_vma;
230
231 return 0;
232
233 out_error_free_anon_vma:
234 anon_vma_free(anon_vma);
235 out_error:
236 unlink_anon_vmas(vma);
237 return -ENOMEM;
161} 238}
162 239
163void anon_vma_unlink(struct vm_area_struct *vma) 240static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
164{ 241{
165 struct anon_vma *anon_vma = vma->anon_vma; 242 struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
166 int empty; 243 int empty;
167 244
245 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
168 if (!anon_vma) 246 if (!anon_vma)
169 return; 247 return;
170 248
171 spin_lock(&anon_vma->lock); 249 spin_lock(&anon_vma->lock);
172 list_del(&vma->anon_vma_node); 250 list_del(&anon_vma_chain->same_anon_vma);
173 251
174 /* We must garbage collect the anon_vma if it's empty */ 252 /* We must garbage collect the anon_vma if it's empty */
175 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); 253 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
@@ -179,6 +257,18 @@ void anon_vma_unlink(struct vm_area_struct *vma)
179 anon_vma_free(anon_vma); 257 anon_vma_free(anon_vma);
180} 258}
181 259
260void unlink_anon_vmas(struct vm_area_struct *vma)
261{
262 struct anon_vma_chain *avc, *next;
263
264 /* Unlink each anon_vma chained to the VMA. */
265 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
266 anon_vma_unlink(avc);
267 list_del(&avc->same_vma);
268 anon_vma_chain_free(avc);
269 }
270}
271
182static void anon_vma_ctor(void *data) 272static void anon_vma_ctor(void *data)
183{ 273{
184 struct anon_vma *anon_vma = data; 274 struct anon_vma *anon_vma = data;
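
The chain object all of these hunks manipulate is declared in include/linux/rmap.h, outside this mm/ diff; for reference, it ties one vma to one anon_vma and sits on the two lists whose walks appear above:

struct anon_vma_chain {
	struct vm_area_struct *vma;     /* the vma this entry belongs to        */
	struct anon_vma *anon_vma;      /* the anon_vma the vma is linked into  */
	struct list_head same_vma;      /* vma->anon_vma_chain, under mmap_sem  */
	struct list_head same_anon_vma; /* anon_vma->head, under anon_vma->lock */
};
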
@@ -192,6 +282,7 @@ void __init anon_vma_init(void)
192{ 282{
193 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 283 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
194 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 284 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
285 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
195} 286}
196 287
197/* 288/*
@@ -396,7 +487,7 @@ static int page_referenced_anon(struct page *page,
396{ 487{
397 unsigned int mapcount; 488 unsigned int mapcount;
398 struct anon_vma *anon_vma; 489 struct anon_vma *anon_vma;
399 struct vm_area_struct *vma; 490 struct anon_vma_chain *avc;
400 int referenced = 0; 491 int referenced = 0;
401 492
402 anon_vma = page_lock_anon_vma(page); 493 anon_vma = page_lock_anon_vma(page);
@@ -404,7 +495,8 @@ static int page_referenced_anon(struct page *page,
404 return referenced; 495 return referenced;
405 496
406 mapcount = page_mapcount(page); 497 mapcount = page_mapcount(page);
407 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 498 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
499 struct vm_area_struct *vma = avc->vma;
408 unsigned long address = vma_address(page, vma); 500 unsigned long address = vma_address(page, vma);
409 if (address == -EFAULT) 501 if (address == -EFAULT)
410 continue; 502 continue;
@@ -511,9 +603,6 @@ int page_referenced(struct page *page,
511 int referenced = 0; 603 int referenced = 0;
512 int we_locked = 0; 604 int we_locked = 0;
513 605
514 if (TestClearPageReferenced(page))
515 referenced++;
516
517 *vm_flags = 0; 606 *vm_flags = 0;
518 if (page_mapped(page) && page_rmapping(page)) { 607 if (page_mapped(page) && page_rmapping(page)) {
519 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 608 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
@@ -614,17 +703,57 @@ int page_mkclean(struct page *page)
614EXPORT_SYMBOL_GPL(page_mkclean); 703EXPORT_SYMBOL_GPL(page_mkclean);
615 704
616/** 705/**
706 * page_move_anon_rmap - move a page to our anon_vma
707 * @page: the page to move to our anon_vma
708 * @vma: the vma the page belongs to
709 * @address: the user virtual address mapped
710 *
711 * When a page belongs exclusively to one process after a COW event,
712 * that page can be moved into the anon_vma that belongs to just that
713 * process, so the rmap code will not search the parent or sibling
714 * processes.
715 */
716void page_move_anon_rmap(struct page *page,
717 struct vm_area_struct *vma, unsigned long address)
718{
719 struct anon_vma *anon_vma = vma->anon_vma;
720
721 VM_BUG_ON(!PageLocked(page));
722 VM_BUG_ON(!anon_vma);
723 VM_BUG_ON(page->index != linear_page_index(vma, address));
724
725 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
726 page->mapping = (struct address_space *) anon_vma;
727}
728
729/**
617 * __page_set_anon_rmap - setup new anonymous rmap 730 * __page_set_anon_rmap - setup new anonymous rmap
618 * @page: the page to add the mapping to 731 * @page: the page to add the mapping to
619 * @vma: the vm area in which the mapping is added 732 * @vma: the vm area in which the mapping is added
620 * @address: the user virtual address mapped 733 * @address: the user virtual address mapped
734 * @exclusive: the page is exclusively owned by the current process
621 */ 735 */
622static void __page_set_anon_rmap(struct page *page, 736static void __page_set_anon_rmap(struct page *page,
623 struct vm_area_struct *vma, unsigned long address) 737 struct vm_area_struct *vma, unsigned long address, int exclusive)
624{ 738{
625 struct anon_vma *anon_vma = vma->anon_vma; 739 struct anon_vma *anon_vma = vma->anon_vma;
626 740
627 BUG_ON(!anon_vma); 741 BUG_ON(!anon_vma);
742
743 /*
744 * If the page isn't exclusively mapped into this vma,
745 * we must use the _oldest_ possible anon_vma for the
746 * page mapping!
747 *
748 * So take the last AVC chain entry in the vma, which is
749 * the deepest ancestor, and use the anon_vma from that.
750 */
751 if (!exclusive) {
752 struct anon_vma_chain *avc;
753 avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
754 anon_vma = avc->anon_vma;
755 }
756
628 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 757 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
629 page->mapping = (struct address_space *) anon_vma; 758 page->mapping = (struct address_space *) anon_vma;
630 page->index = linear_page_index(vma, address); 759 page->index = linear_page_index(vma, address);
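
Both page_move_anon_rmap() and __page_set_anon_rmap() stash the anon_vma in page->mapping with the low PAGE_MAPPING_ANON bit set. A simplified sketch of the reverse lookup rmap performs later (it ignores the KSM case; the function name is illustrative):

static struct anon_vma *page_anon_vma_sketch(struct page *page)
{
	unsigned long mapping = (unsigned long)page->mapping;

	if (!(mapping & PAGE_MAPPING_ANON))
		return NULL;            /* file-backed: mapping is an address_space */
	return (struct anon_vma *)(mapping & ~PAGE_MAPPING_FLAGS);
}
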
@@ -652,9 +781,6 @@ static void __page_check_anon_rmap(struct page *page,
652 * are initially only visible via the pagetables, and the pte is locked 781 * are initially only visible via the pagetables, and the pte is locked
653 * over the call to page_add_new_anon_rmap. 782 * over the call to page_add_new_anon_rmap.
654 */ 783 */
655 struct anon_vma *anon_vma = vma->anon_vma;
656 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
657 BUG_ON(page->mapping != (struct address_space *)anon_vma);
658 BUG_ON(page->index != linear_page_index(vma, address)); 784 BUG_ON(page->index != linear_page_index(vma, address));
659#endif 785#endif
660} 786}
@@ -682,7 +808,7 @@ void page_add_anon_rmap(struct page *page,
682 VM_BUG_ON(!PageLocked(page)); 808 VM_BUG_ON(!PageLocked(page));
683 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 809 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
684 if (first) 810 if (first)
685 __page_set_anon_rmap(page, vma, address); 811 __page_set_anon_rmap(page, vma, address, 0);
686 else 812 else
687 __page_check_anon_rmap(page, vma, address); 813 __page_check_anon_rmap(page, vma, address);
688} 814}
@@ -704,7 +830,7 @@ void page_add_new_anon_rmap(struct page *page,
704 SetPageSwapBacked(page); 830 SetPageSwapBacked(page);
705 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 831 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
706 __inc_zone_page_state(page, NR_ANON_PAGES); 832 __inc_zone_page_state(page, NR_ANON_PAGES);
707 __page_set_anon_rmap(page, vma, address); 833 __page_set_anon_rmap(page, vma, address, 1);
708 if (page_evictable(page, vma)) 834 if (page_evictable(page, vma))
709 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 835 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
710 else 836 else
@@ -815,9 +941,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
815 941
816 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 942 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
817 if (PageAnon(page)) 943 if (PageAnon(page))
818 dec_mm_counter(mm, anon_rss); 944 dec_mm_counter(mm, MM_ANONPAGES);
819 else 945 else
820 dec_mm_counter(mm, file_rss); 946 dec_mm_counter(mm, MM_FILEPAGES);
821 set_pte_at(mm, address, pte, 947 set_pte_at(mm, address, pte,
822 swp_entry_to_pte(make_hwpoison_entry(page))); 948 swp_entry_to_pte(make_hwpoison_entry(page)));
823 } else if (PageAnon(page)) { 949 } else if (PageAnon(page)) {
@@ -839,7 +965,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
839 list_add(&mm->mmlist, &init_mm.mmlist); 965 list_add(&mm->mmlist, &init_mm.mmlist);
840 spin_unlock(&mmlist_lock); 966 spin_unlock(&mmlist_lock);
841 } 967 }
842 dec_mm_counter(mm, anon_rss); 968 dec_mm_counter(mm, MM_ANONPAGES);
969 inc_mm_counter(mm, MM_SWAPENTS);
843 } else if (PAGE_MIGRATION) { 970 } else if (PAGE_MIGRATION) {
844 /* 971 /*
845 * Store the pfn of the page in a special migration 972 * Store the pfn of the page in a special migration
@@ -857,7 +984,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
857 entry = make_migration_entry(page, pte_write(pteval)); 984 entry = make_migration_entry(page, pte_write(pteval));
858 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 985 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
859 } else 986 } else
860 dec_mm_counter(mm, file_rss); 987 dec_mm_counter(mm, MM_FILEPAGES);
861 988
862 page_remove_rmap(page); 989 page_remove_rmap(page);
863 page_cache_release(page); 990 page_cache_release(page);
@@ -996,7 +1123,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
996 1123
997 page_remove_rmap(page); 1124 page_remove_rmap(page);
998 page_cache_release(page); 1125 page_cache_release(page);
999 dec_mm_counter(mm, file_rss); 1126 dec_mm_counter(mm, MM_FILEPAGES);
1000 (*mapcount)--; 1127 (*mapcount)--;
1001 } 1128 }
1002 pte_unmap_unlock(pte - 1, ptl); 1129 pte_unmap_unlock(pte - 1, ptl);
@@ -1024,14 +1151,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1024static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1151static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1025{ 1152{
1026 struct anon_vma *anon_vma; 1153 struct anon_vma *anon_vma;
1027 struct vm_area_struct *vma; 1154 struct anon_vma_chain *avc;
1028 int ret = SWAP_AGAIN; 1155 int ret = SWAP_AGAIN;
1029 1156
1030 anon_vma = page_lock_anon_vma(page); 1157 anon_vma = page_lock_anon_vma(page);
1031 if (!anon_vma) 1158 if (!anon_vma)
1032 return ret; 1159 return ret;
1033 1160
1034 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1161 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1162 struct vm_area_struct *vma = avc->vma;
1035 unsigned long address = vma_address(page, vma); 1163 unsigned long address = vma_address(page, vma);
1036 if (address == -EFAULT) 1164 if (address == -EFAULT)
1037 continue; 1165 continue;
@@ -1222,7 +1350,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1222 struct vm_area_struct *, unsigned long, void *), void *arg) 1350 struct vm_area_struct *, unsigned long, void *), void *arg)
1223{ 1351{
1224 struct anon_vma *anon_vma; 1352 struct anon_vma *anon_vma;
1225 struct vm_area_struct *vma; 1353 struct anon_vma_chain *avc;
1226 int ret = SWAP_AGAIN; 1354 int ret = SWAP_AGAIN;
1227 1355
1228 /* 1356 /*
@@ -1237,7 +1365,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1237 if (!anon_vma) 1365 if (!anon_vma)
1238 return ret; 1366 return ret;
1239 spin_lock(&anon_vma->lock); 1367 spin_lock(&anon_vma->lock);
1240 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1368 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1369 struct vm_area_struct *vma = avc->vma;
1241 unsigned long address = vma_address(page, vma); 1370 unsigned long address = vma_address(page, vma);
1242 if (address == -EFAULT) 1371 if (address == -EFAULT)
1243 continue; 1372 continue;
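
A reduced sketch, not part of the patch, of the list walk the rmap.c hunks above switch to: every user of an anon_vma now iterates its anon_vma_chain entries and reaches each VMA through avc->vma. The structures below are cut down to the fields the hunks reference (head, same_anon_vma, vma); all "demo_" names are invented for this example, and <linux/list.h>, <linux/spinlock.h> and struct vm_area_struct are assumed.

struct demo_anon_vma {
        spinlock_t lock;
        struct list_head head;          /* list of demo_anon_vma_chain entries */
};

struct demo_anon_vma_chain {
        struct vm_area_struct *vma;
        struct list_head same_anon_vma; /* linked into demo_anon_vma.head */
};

/* Visit every VMA currently sharing this anon_vma; this is the shape the
 * new try_to_unmap_anon() and rmap_walk_anon() loops use. */
static int demo_walk_anon_vmas(struct demo_anon_vma *anon_vma,
                               int (*one)(struct vm_area_struct *vma))
{
        struct demo_anon_vma_chain *avc;
        int ret = 0;

        spin_lock(&anon_vma->lock);
        list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
                ret = one(avc->vma);
                if (ret)
                        break;
        }
        spin_unlock(&anon_vma->lock);
        return ret;
}
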
diff --git a/mm/slab.c b/mm/slab.c
index 7451bdacaf18..bac0f4fcc216 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to,
935 935
936 from->avail -= nr; 936 from->avail -= nr;
937 to->avail += nr; 937 to->avail += nr;
938 to->touched = 1;
939 return nr; 938 return nr;
940} 939}
941 940
@@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
983 982
984 if (limit > 1) 983 if (limit > 1)
985 limit = 12; 984 limit = 12;
986 ac_ptr = kmalloc_node(memsize, gfp, node); 985 ac_ptr = kzalloc_node(memsize, gfp, node);
987 if (ac_ptr) { 986 if (ac_ptr) {
988 for_each_node(i) { 987 for_each_node(i) {
989 if (i == node || !node_online(i)) { 988 if (i == node || !node_online(i))
990 ac_ptr[i] = NULL;
991 continue; 989 continue;
992 }
993 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); 990 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
994 if (!ac_ptr[i]) { 991 if (!ac_ptr[i]) {
995 for (i--; i >= 0; i--) 992 for (i--; i >= 0; i--)
@@ -2963,8 +2960,10 @@ retry:
2963 spin_lock(&l3->list_lock); 2960 spin_lock(&l3->list_lock);
2964 2961
2965 /* See if we can refill from the shared array */ 2962 /* See if we can refill from the shared array */
2966 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) 2963 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
2964 l3->shared->touched = 1;
2967 goto alloc_done; 2965 goto alloc_done;
2966 }
2968 2967
2969 while (batchcount > 0) { 2968 while (batchcount > 0) {
2970 struct list_head *entry; 2969 struct list_head *entry;
@@ -3101,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3101 if (cachep == &cache_cache) 3100 if (cachep == &cache_cache)
3102 return false; 3101 return false;
3103 3102
3104 return should_failslab(obj_size(cachep), flags); 3103 return should_failslab(obj_size(cachep), flags, cachep->flags);
3105} 3104}
3106 3105
3107static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3106static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
@@ -3603,21 +3602,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3603 */ 3602 */
3604int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) 3603int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3605{ 3604{
3606 unsigned long addr = (unsigned long)ptr;
3607 unsigned long min_addr = PAGE_OFFSET;
3608 unsigned long align_mask = BYTES_PER_WORD - 1;
3609 unsigned long size = cachep->buffer_size; 3605 unsigned long size = cachep->buffer_size;
3610 struct page *page; 3606 struct page *page;
3611 3607
3612 if (unlikely(addr < min_addr)) 3608 if (unlikely(!kern_ptr_validate(ptr, size)))
3613 goto out;
3614 if (unlikely(addr > (unsigned long)high_memory - size))
3615 goto out;
3616 if (unlikely(addr & align_mask))
3617 goto out;
3618 if (unlikely(!kern_addr_valid(addr)))
3619 goto out;
3620 if (unlikely(!kern_addr_valid(addr + size - 1)))
3621 goto out; 3609 goto out;
3622 page = virt_to_page(ptr); 3610 page = virt_to_page(ptr);
3623 if (unlikely(!PageSlab(page))) 3611 if (unlikely(!PageSlab(page)))
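
The deleted block above open-coded five sanity checks before trusting a caller-supplied pointer; the patch funnels them through kern_ptr_validate() (added to mm/util.c in this series). A hedged reconstruction of those checks, mirroring the removed lines rather than quoting the new helper, which may differ in detail; BYTES_PER_WORD is slab.c-local, so sizeof(void *) stands in for it here.

static inline int demo_kern_ptr_validate(const void *ptr, unsigned long size)
{
        unsigned long addr = (unsigned long)ptr;
        unsigned long align_mask = sizeof(void *) - 1;

        if (unlikely(addr < PAGE_OFFSET))
                return 0;               /* below the kernel mapping */
        if (unlikely(addr > (unsigned long)high_memory - size))
                return 0;               /* object would run past lowmem */
        if (unlikely(addr & align_mask))
                return 0;               /* not word aligned */
        if (unlikely(!kern_addr_valid(addr)))
                return 0;               /* first byte not mapped */
        if (unlikely(!kern_addr_valid(addr + size - 1)))
                return 0;               /* last byte not mapped */
        return 1;
}
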
diff --git a/mm/slub.c b/mm/slub.c
index 8d71aaf888d7..d2a54fe71ea2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -151,7 +151,8 @@
151 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
152 */ 152 */
153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) 154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
155 SLAB_FAILSLAB)
155 156
156#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
157 SLAB_CACHE_DMA | SLAB_NOTRACK) 158 SLAB_CACHE_DMA | SLAB_NOTRACK)
@@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
217 218
218#endif 219#endif
219 220
220static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) 221static inline void stat(struct kmem_cache *s, enum stat_item si)
221{ 222{
222#ifdef CONFIG_SLUB_STATS 223#ifdef CONFIG_SLUB_STATS
223 c->stat[si]++; 224 __this_cpu_inc(s->cpu_slab->stat[si]);
224#endif 225#endif
225} 226}
226 227
@@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
242#endif 243#endif
243} 244}
244 245
245static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
246{
247#ifdef CONFIG_SMP
248 return s->cpu_slab[cpu];
249#else
250 return &s->cpu_slab;
251#endif
252}
253
254/* Verify that a pointer has an address that is valid within a slab page */ 246/* Verify that a pointer has an address that is valid within a slab page */
255static inline int check_valid_pointer(struct kmem_cache *s, 247static inline int check_valid_pointer(struct kmem_cache *s,
256 struct page *page, const void *object) 248 struct page *page, const void *object)
@@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s,
269 return 1; 261 return 1;
270} 262}
271 263
272/*
273 * Slow version of get and set free pointer.
274 *
275 * This version requires touching the cache lines of kmem_cache which
276 * we avoid to do in the fast alloc free paths. There we obtain the offset
277 * from the page struct.
278 */
279static inline void *get_freepointer(struct kmem_cache *s, void *object) 264static inline void *get_freepointer(struct kmem_cache *s, void *object)
280{ 265{
281 return *(void **)(object + s->offset); 266 return *(void **)(object + s->offset);
@@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str)
1020 case 't': 1005 case 't':
1021 slub_debug |= SLAB_TRACE; 1006 slub_debug |= SLAB_TRACE;
1022 break; 1007 break;
1008 case 'a':
1009 slub_debug |= SLAB_FAILSLAB;
1010 break;
1023 default: 1011 default:
1024 printk(KERN_ERR "slub_debug option '%c' " 1012 printk(KERN_ERR "slub_debug option '%c' "
1025 "unknown. skipped\n", *str); 1013 "unknown. skipped\n", *str);
@@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1124 if (!page) 1112 if (!page)
1125 return NULL; 1113 return NULL;
1126 1114
1127 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1115 stat(s, ORDER_FALLBACK);
1128 } 1116 }
1129 1117
1130 if (kmemcheck_enabled 1118 if (kmemcheck_enabled
@@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1422static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1410static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1423{ 1411{
1424 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1412 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1425 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1426 1413
1427 __ClearPageSlubFrozen(page); 1414 __ClearPageSlubFrozen(page);
1428 if (page->inuse) { 1415 if (page->inuse) {
1429 1416
1430 if (page->freelist) { 1417 if (page->freelist) {
1431 add_partial(n, page, tail); 1418 add_partial(n, page, tail);
1432 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1419 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1433 } else { 1420 } else {
1434 stat(c, DEACTIVATE_FULL); 1421 stat(s, DEACTIVATE_FULL);
1435 if (SLABDEBUG && PageSlubDebug(page) && 1422 if (SLABDEBUG && PageSlubDebug(page) &&
1436 (s->flags & SLAB_STORE_USER)) 1423 (s->flags & SLAB_STORE_USER))
1437 add_full(n, page); 1424 add_full(n, page);
1438 } 1425 }
1439 slab_unlock(page); 1426 slab_unlock(page);
1440 } else { 1427 } else {
1441 stat(c, DEACTIVATE_EMPTY); 1428 stat(s, DEACTIVATE_EMPTY);
1442 if (n->nr_partial < s->min_partial) { 1429 if (n->nr_partial < s->min_partial) {
1443 /* 1430 /*
1444 * Adding an empty slab to the partial slabs in order 1431 * Adding an empty slab to the partial slabs in order
@@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1454 slab_unlock(page); 1441 slab_unlock(page);
1455 } else { 1442 } else {
1456 slab_unlock(page); 1443 slab_unlock(page);
1457 stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); 1444 stat(s, FREE_SLAB);
1458 discard_slab(s, page); 1445 discard_slab(s, page);
1459 } 1446 }
1460 } 1447 }
@@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1469 int tail = 1; 1456 int tail = 1;
1470 1457
1471 if (page->freelist) 1458 if (page->freelist)
1472 stat(c, DEACTIVATE_REMOTE_FREES); 1459 stat(s, DEACTIVATE_REMOTE_FREES);
1473 /* 1460 /*
1474 * Merge cpu freelist into slab freelist. Typically we get here 1461 * Merge cpu freelist into slab freelist. Typically we get here
1475 * because both freelists are empty. So this is unlikely 1462 * because both freelists are empty. So this is unlikely
@@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1482 1469
1483 /* Retrieve object from cpu_freelist */ 1470 /* Retrieve object from cpu_freelist */
1484 object = c->freelist; 1471 object = c->freelist;
1485 c->freelist = c->freelist[c->offset]; 1472 c->freelist = get_freepointer(s, c->freelist);
1486 1473
1487 /* And put onto the regular freelist */ 1474 /* And put onto the regular freelist */
1488 object[c->offset] = page->freelist; 1475 set_freepointer(s, object, page->freelist);
1489 page->freelist = object; 1476 page->freelist = object;
1490 page->inuse--; 1477 page->inuse--;
1491 } 1478 }
@@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1495 1482
1496static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1483static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1497{ 1484{
1498 stat(c, CPUSLAB_FLUSH); 1485 stat(s, CPUSLAB_FLUSH);
1499 slab_lock(c->page); 1486 slab_lock(c->page);
1500 deactivate_slab(s, c); 1487 deactivate_slab(s, c);
1501} 1488}
@@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1507 */ 1494 */
1508static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1495static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1509{ 1496{
1510 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1497 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1511 1498
1512 if (likely(c && c->page)) 1499 if (likely(c && c->page))
1513 flush_slab(s, c); 1500 flush_slab(s, c);
@@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1635 if (unlikely(!node_match(c, node))) 1622 if (unlikely(!node_match(c, node)))
1636 goto another_slab; 1623 goto another_slab;
1637 1624
1638 stat(c, ALLOC_REFILL); 1625 stat(s, ALLOC_REFILL);
1639 1626
1640load_freelist: 1627load_freelist:
1641 object = c->page->freelist; 1628 object = c->page->freelist;
@@ -1644,13 +1631,13 @@ load_freelist:
1644 if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) 1631 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1645 goto debug; 1632 goto debug;
1646 1633
1647 c->freelist = object[c->offset]; 1634 c->freelist = get_freepointer(s, object);
1648 c->page->inuse = c->page->objects; 1635 c->page->inuse = c->page->objects;
1649 c->page->freelist = NULL; 1636 c->page->freelist = NULL;
1650 c->node = page_to_nid(c->page); 1637 c->node = page_to_nid(c->page);
1651unlock_out: 1638unlock_out:
1652 slab_unlock(c->page); 1639 slab_unlock(c->page);
1653 stat(c, ALLOC_SLOWPATH); 1640 stat(s, ALLOC_SLOWPATH);
1654 return object; 1641 return object;
1655 1642
1656another_slab: 1643another_slab:
@@ -1660,7 +1647,7 @@ new_slab:
1660 new = get_partial(s, gfpflags, node); 1647 new = get_partial(s, gfpflags, node);
1661 if (new) { 1648 if (new) {
1662 c->page = new; 1649 c->page = new;
1663 stat(c, ALLOC_FROM_PARTIAL); 1650 stat(s, ALLOC_FROM_PARTIAL);
1664 goto load_freelist; 1651 goto load_freelist;
1665 } 1652 }
1666 1653
@@ -1673,8 +1660,8 @@ new_slab:
1673 local_irq_disable(); 1660 local_irq_disable();
1674 1661
1675 if (new) { 1662 if (new) {
1676 c = get_cpu_slab(s, smp_processor_id()); 1663 c = __this_cpu_ptr(s->cpu_slab);
1677 stat(c, ALLOC_SLAB); 1664 stat(s, ALLOC_SLAB);
1678 if (c->page) 1665 if (c->page)
1679 flush_slab(s, c); 1666 flush_slab(s, c);
1680 slab_lock(new); 1667 slab_lock(new);
@@ -1690,7 +1677,7 @@ debug:
1690 goto another_slab; 1677 goto another_slab;
1691 1678
1692 c->page->inuse++; 1679 c->page->inuse++;
1693 c->page->freelist = object[c->offset]; 1680 c->page->freelist = get_freepointer(s, object);
1694 c->node = -1; 1681 c->node = -1;
1695 goto unlock_out; 1682 goto unlock_out;
1696} 1683}
@@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1711 void **object; 1698 void **object;
1712 struct kmem_cache_cpu *c; 1699 struct kmem_cache_cpu *c;
1713 unsigned long flags; 1700 unsigned long flags;
1714 unsigned int objsize;
1715 1701
1716 gfpflags &= gfp_allowed_mask; 1702 gfpflags &= gfp_allowed_mask;
1717 1703
1718 lockdep_trace_alloc(gfpflags); 1704 lockdep_trace_alloc(gfpflags);
1719 might_sleep_if(gfpflags & __GFP_WAIT); 1705 might_sleep_if(gfpflags & __GFP_WAIT);
1720 1706
1721 if (should_failslab(s->objsize, gfpflags)) 1707 if (should_failslab(s->objsize, gfpflags, s->flags))
1722 return NULL; 1708 return NULL;
1723 1709
1724 local_irq_save(flags); 1710 local_irq_save(flags);
1725 c = get_cpu_slab(s, smp_processor_id()); 1711 c = __this_cpu_ptr(s->cpu_slab);
1726 objsize = c->objsize; 1712 object = c->freelist;
1727 if (unlikely(!c->freelist || !node_match(c, node))) 1713 if (unlikely(!object || !node_match(c, node)))
1728 1714
1729 object = __slab_alloc(s, gfpflags, node, addr, c); 1715 object = __slab_alloc(s, gfpflags, node, addr, c);
1730 1716
1731 else { 1717 else {
1732 object = c->freelist; 1718 c->freelist = get_freepointer(s, object);
1733 c->freelist = object[c->offset]; 1719 stat(s, ALLOC_FASTPATH);
1734 stat(c, ALLOC_FASTPATH);
1735 } 1720 }
1736 local_irq_restore(flags); 1721 local_irq_restore(flags);
1737 1722
1738 if (unlikely(gfpflags & __GFP_ZERO) && object) 1723 if (unlikely(gfpflags & __GFP_ZERO) && object)
1739 memset(object, 0, objsize); 1724 memset(object, 0, s->objsize);
1740 1725
1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); 1726 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize);
1742 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); 1727 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1743 1728
1744 return object; 1729 return object;
1745} 1730}
@@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1794 * handling required then we can return immediately. 1779 * handling required then we can return immediately.
1795 */ 1780 */
1796static void __slab_free(struct kmem_cache *s, struct page *page, 1781static void __slab_free(struct kmem_cache *s, struct page *page,
1797 void *x, unsigned long addr, unsigned int offset) 1782 void *x, unsigned long addr)
1798{ 1783{
1799 void *prior; 1784 void *prior;
1800 void **object = (void *)x; 1785 void **object = (void *)x;
1801 struct kmem_cache_cpu *c;
1802 1786
1803 c = get_cpu_slab(s, raw_smp_processor_id()); 1787 stat(s, FREE_SLOWPATH);
1804 stat(c, FREE_SLOWPATH);
1805 slab_lock(page); 1788 slab_lock(page);
1806 1789
1807 if (unlikely(SLABDEBUG && PageSlubDebug(page))) 1790 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1808 goto debug; 1791 goto debug;
1809 1792
1810checks_ok: 1793checks_ok:
1811 prior = object[offset] = page->freelist; 1794 prior = page->freelist;
1795 set_freepointer(s, object, prior);
1812 page->freelist = object; 1796 page->freelist = object;
1813 page->inuse--; 1797 page->inuse--;
1814 1798
1815 if (unlikely(PageSlubFrozen(page))) { 1799 if (unlikely(PageSlubFrozen(page))) {
1816 stat(c, FREE_FROZEN); 1800 stat(s, FREE_FROZEN);
1817 goto out_unlock; 1801 goto out_unlock;
1818 } 1802 }
1819 1803
@@ -1826,7 +1810,7 @@ checks_ok:
1826 */ 1810 */
1827 if (unlikely(!prior)) { 1811 if (unlikely(!prior)) {
1828 add_partial(get_node(s, page_to_nid(page)), page, 1); 1812 add_partial(get_node(s, page_to_nid(page)), page, 1);
1829 stat(c, FREE_ADD_PARTIAL); 1813 stat(s, FREE_ADD_PARTIAL);
1830 } 1814 }
1831 1815
1832out_unlock: 1816out_unlock:
@@ -1839,10 +1823,10 @@ slab_empty:
1839 * Slab still on the partial list. 1823 * Slab still on the partial list.
1840 */ 1824 */
1841 remove_partial(s, page); 1825 remove_partial(s, page);
1842 stat(c, FREE_REMOVE_PARTIAL); 1826 stat(s, FREE_REMOVE_PARTIAL);
1843 } 1827 }
1844 slab_unlock(page); 1828 slab_unlock(page);
1845 stat(c, FREE_SLAB); 1829 stat(s, FREE_SLAB);
1846 discard_slab(s, page); 1830 discard_slab(s, page);
1847 return; 1831 return;
1848 1832
@@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s,
1872 1856
1873 kmemleak_free_recursive(x, s->flags); 1857 kmemleak_free_recursive(x, s->flags);
1874 local_irq_save(flags); 1858 local_irq_save(flags);
1875 c = get_cpu_slab(s, smp_processor_id()); 1859 c = __this_cpu_ptr(s->cpu_slab);
1876 kmemcheck_slab_free(s, object, c->objsize); 1860 kmemcheck_slab_free(s, object, s->objsize);
1877 debug_check_no_locks_freed(object, c->objsize); 1861 debug_check_no_locks_freed(object, s->objsize);
1878 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1862 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1879 debug_check_no_obj_freed(object, c->objsize); 1863 debug_check_no_obj_freed(object, s->objsize);
1880 if (likely(page == c->page && c->node >= 0)) { 1864 if (likely(page == c->page && c->node >= 0)) {
1881 object[c->offset] = c->freelist; 1865 set_freepointer(s, object, c->freelist);
1882 c->freelist = object; 1866 c->freelist = object;
1883 stat(c, FREE_FASTPATH); 1867 stat(s, FREE_FASTPATH);
1884 } else 1868 } else
1885 __slab_free(s, page, x, addr, c->offset); 1869 __slab_free(s, page, x, addr);
1886 1870
1887 local_irq_restore(flags); 1871 local_irq_restore(flags);
1888} 1872}
@@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags,
2069 return ALIGN(align, sizeof(void *)); 2053 return ALIGN(align, sizeof(void *));
2070} 2054}
2071 2055
2072static void init_kmem_cache_cpu(struct kmem_cache *s,
2073 struct kmem_cache_cpu *c)
2074{
2075 c->page = NULL;
2076 c->freelist = NULL;
2077 c->node = 0;
2078 c->offset = s->offset / sizeof(void *);
2079 c->objsize = s->objsize;
2080#ifdef CONFIG_SLUB_STATS
2081 memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
2082#endif
2083}
2084
2085static void 2056static void
2086init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2057init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2087{ 2058{
@@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2095#endif 2066#endif
2096} 2067}
2097 2068
2098#ifdef CONFIG_SMP 2069static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
2099/*
2100 * Per cpu array for per cpu structures.
2101 *
2102 * The per cpu array places all kmem_cache_cpu structures from one processor
2103 * close together meaning that it becomes possible that multiple per cpu
2104 * structures are contained in one cacheline. This may be particularly
2105 * beneficial for the kmalloc caches.
2106 *
2107 * A desktop system typically has around 60-80 slabs. With 100 here we are
2108 * likely able to get per cpu structures for all caches from the array defined
2109 * here. We must be able to cover all kmalloc caches during bootstrap.
2110 *
2111 * If the per cpu array is exhausted then fall back to kmalloc
2112 * of individual cachelines. No sharing is possible then.
2113 */
2114#define NR_KMEM_CACHE_CPU 100
2115
2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2117 kmem_cache_cpu);
2118
2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
2121
2122static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
2123 int cpu, gfp_t flags)
2124{
2125 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
2126
2127 if (c)
2128 per_cpu(kmem_cache_cpu_free, cpu) =
2129 (void *)c->freelist;
2130 else {
2131 /* Table overflow: So allocate ourselves */
2132 c = kmalloc_node(
2133 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
2134 flags, cpu_to_node(cpu));
2135 if (!c)
2136 return NULL;
2137 }
2138
2139 init_kmem_cache_cpu(s, c);
2140 return c;
2141}
2142
2143static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
2144{
2145 if (c < per_cpu(kmem_cache_cpu, cpu) ||
2146 c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2147 kfree(c);
2148 return;
2149 }
2150 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
2151 per_cpu(kmem_cache_cpu_free, cpu) = c;
2152}
2153
2154static void free_kmem_cache_cpus(struct kmem_cache *s)
2155{
2156 int cpu;
2157
2158 for_each_online_cpu(cpu) {
2159 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2160
2161 if (c) {
2162 s->cpu_slab[cpu] = NULL;
2163 free_kmem_cache_cpu(c, cpu);
2164 }
2165 }
2166}
2167
2168static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2169{
2170 int cpu;
2171
2172 for_each_online_cpu(cpu) {
2173 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2174
2175 if (c)
2176 continue;
2177
2178 c = alloc_kmem_cache_cpu(s, cpu, flags);
2179 if (!c) {
2180 free_kmem_cache_cpus(s);
2181 return 0;
2182 }
2183 s->cpu_slab[cpu] = c;
2184 }
2185 return 1;
2186}
2187
2188/*
2189 * Initialize the per cpu array.
2190 */
2191static void init_alloc_cpu_cpu(int cpu)
2192{
2193 int i;
2194 2070
2195 if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) 2071static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2196 return;
2197
2198 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2199 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2200
2201 cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
2202}
2203
2204static void __init init_alloc_cpu(void)
2205{ 2072{
2206 int cpu; 2073 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
2207 2074 /*
2208 for_each_online_cpu(cpu) 2075 * Boot time creation of the kmalloc array. Use static per cpu data
2209 init_alloc_cpu_cpu(cpu); 2076 * since the per cpu allocator is not available yet.
2210 } 2077 */
2078 s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
2079 else
2080 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2211 2081
2212#else 2082 if (!s->cpu_slab)
2213static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} 2083 return 0;
2214static inline void init_alloc_cpu(void) {}
2215 2084
2216static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2217{
2218 init_kmem_cache_cpu(s, &s->cpu_slab);
2219 return 1; 2085 return 1;
2220} 2086}
2221#endif
2222 2087
2223#ifdef CONFIG_NUMA 2088#ifdef CONFIG_NUMA
2224/* 2089/*
@@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2287 int node; 2152 int node;
2288 int local_node; 2153 int local_node;
2289 2154
2290 if (slab_state >= UP) 2155 if (slab_state >= UP && (s < kmalloc_caches ||
2156 s >= kmalloc_caches + KMALLOC_CACHES))
2291 local_node = page_to_nid(virt_to_page(s)); 2157 local_node = page_to_nid(virt_to_page(s));
2292 else 2158 else
2293 local_node = 0; 2159 local_node = 0;
@@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2502 2368
2503 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2369 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2504 return 1; 2370 return 1;
2371
2505 free_kmem_cache_nodes(s); 2372 free_kmem_cache_nodes(s);
2506error: 2373error:
2507 if (flags & SLAB_PANIC) 2374 if (flags & SLAB_PANIC)
@@ -2519,6 +2386,9 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2519{ 2386{
2520 struct page *page; 2387 struct page *page;
2521 2388
2389 if (!kern_ptr_validate(object, s->size))
2390 return 0;
2391
2522 page = get_object_page(object); 2392 page = get_object_page(object);
2523 2393
2524 if (!page || s != page->slab) 2394 if (!page || s != page->slab)
@@ -2609,9 +2479,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2609 int node; 2479 int node;
2610 2480
2611 flush_all(s); 2481 flush_all(s);
2612 2482 free_percpu(s->cpu_slab);
2613 /* Attempt to free all objects */ 2483 /* Attempt to free all objects */
2614 free_kmem_cache_cpus(s);
2615 for_each_node_state(node, N_NORMAL_MEMORY) { 2484 for_each_node_state(node, N_NORMAL_MEMORY) {
2616 struct kmem_cache_node *n = get_node(s, node); 2485 struct kmem_cache_node *n = get_node(s, node);
2617 2486
@@ -2651,7 +2520,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2651 * Kmalloc subsystem 2520 * Kmalloc subsystem
2652 *******************************************************************/ 2521 *******************************************************************/
2653 2522
2654struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; 2523struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
2655EXPORT_SYMBOL(kmalloc_caches); 2524EXPORT_SYMBOL(kmalloc_caches);
2656 2525
2657static int __init setup_slub_min_order(char *str) 2526static int __init setup_slub_min_order(char *str)
@@ -2741,6 +2610,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2741 char *text; 2610 char *text;
2742 size_t realsize; 2611 size_t realsize;
2743 unsigned long slabflags; 2612 unsigned long slabflags;
2613 int i;
2744 2614
2745 s = kmalloc_caches_dma[index]; 2615 s = kmalloc_caches_dma[index];
2746 if (s) 2616 if (s)
@@ -2760,7 +2630,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2760 realsize = kmalloc_caches[index].objsize; 2630 realsize = kmalloc_caches[index].objsize;
2761 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", 2631 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2762 (unsigned int)realsize); 2632 (unsigned int)realsize);
2763 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2633
2634 s = NULL;
2635 for (i = 0; i < KMALLOC_CACHES; i++)
2636 if (!kmalloc_caches[i].size)
2637 break;
2638
2639 BUG_ON(i >= KMALLOC_CACHES);
2640 s = kmalloc_caches + i;
2764 2641
2765 /* 2642 /*
2766 * Must defer sysfs creation to a workqueue because we don't know 2643 * Must defer sysfs creation to a workqueue because we don't know
@@ -2772,9 +2649,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2772 if (slab_state >= SYSFS) 2649 if (slab_state >= SYSFS)
2773 slabflags |= __SYSFS_ADD_DEFERRED; 2650 slabflags |= __SYSFS_ADD_DEFERRED;
2774 2651
2775 if (!s || !text || !kmem_cache_open(s, flags, text, 2652 if (!text || !kmem_cache_open(s, flags, text,
2776 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { 2653 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2777 kfree(s); 2654 s->size = 0;
2778 kfree(text); 2655 kfree(text);
2779 goto unlock_out; 2656 goto unlock_out;
2780 } 2657 }
@@ -3086,7 +2963,7 @@ static void slab_mem_offline_callback(void *arg)
3086 /* 2963 /*
3087 * if n->nr_slabs > 0, slabs still exist on the node 2964 * if n->nr_slabs > 0, slabs still exist on the node
3088 * that is going down. We were unable to free them, 2965 * that is going down. We were unable to free them,
3089 * and offline_pages() function shoudn't call this 2966 * and offline_pages() function shouldn't call this
3090 * callback. So, we must fail. 2967 * callback. So, we must fail.
3091 */ 2968 */
3092 BUG_ON(slabs_node(s, offline_node)); 2969 BUG_ON(slabs_node(s, offline_node));
@@ -3176,8 +3053,6 @@ void __init kmem_cache_init(void)
3176 int i; 3053 int i;
3177 int caches = 0; 3054 int caches = 0;
3178 3055
3179 init_alloc_cpu();
3180
3181#ifdef CONFIG_NUMA 3056#ifdef CONFIG_NUMA
3182 /* 3057 /*
3183 * Must first have the slab cache available for the allocations of the 3058 * Must first have the slab cache available for the allocations of the
@@ -3261,8 +3136,10 @@ void __init kmem_cache_init(void)
3261 3136
3262#ifdef CONFIG_SMP 3137#ifdef CONFIG_SMP
3263 register_cpu_notifier(&slab_notifier); 3138 register_cpu_notifier(&slab_notifier);
3264 kmem_size = offsetof(struct kmem_cache, cpu_slab) + 3139#endif
3265 nr_cpu_ids * sizeof(struct kmem_cache_cpu *); 3140#ifdef CONFIG_NUMA
3141 kmem_size = offsetof(struct kmem_cache, node) +
3142 nr_node_ids * sizeof(struct kmem_cache_node *);
3266#else 3143#else
3267 kmem_size = sizeof(struct kmem_cache); 3144 kmem_size = sizeof(struct kmem_cache);
3268#endif 3145#endif
@@ -3351,22 +3228,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3351 down_write(&slub_lock); 3228 down_write(&slub_lock);
3352 s = find_mergeable(size, align, flags, name, ctor); 3229 s = find_mergeable(size, align, flags, name, ctor);
3353 if (s) { 3230 if (s) {
3354 int cpu;
3355
3356 s->refcount++; 3231 s->refcount++;
3357 /* 3232 /*
3358 * Adjust the object sizes so that we clear 3233 * Adjust the object sizes so that we clear
3359 * the complete object on kzalloc. 3234 * the complete object on kzalloc.
3360 */ 3235 */
3361 s->objsize = max(s->objsize, (int)size); 3236 s->objsize = max(s->objsize, (int)size);
3362
3363 /*
3364 * And then we need to update the object size in the
3365 * per cpu structures
3366 */
3367 for_each_online_cpu(cpu)
3368 get_cpu_slab(s, cpu)->objsize = s->objsize;
3369
3370 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3237 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3371 up_write(&slub_lock); 3238 up_write(&slub_lock);
3372 3239
@@ -3420,29 +3287,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3420 unsigned long flags; 3287 unsigned long flags;
3421 3288
3422 switch (action) { 3289 switch (action) {
3423 case CPU_UP_PREPARE:
3424 case CPU_UP_PREPARE_FROZEN:
3425 init_alloc_cpu_cpu(cpu);
3426 down_read(&slub_lock);
3427 list_for_each_entry(s, &slab_caches, list)
3428 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3429 GFP_KERNEL);
3430 up_read(&slub_lock);
3431 break;
3432
3433 case CPU_UP_CANCELED: 3290 case CPU_UP_CANCELED:
3434 case CPU_UP_CANCELED_FROZEN: 3291 case CPU_UP_CANCELED_FROZEN:
3435 case CPU_DEAD: 3292 case CPU_DEAD:
3436 case CPU_DEAD_FROZEN: 3293 case CPU_DEAD_FROZEN:
3437 down_read(&slub_lock); 3294 down_read(&slub_lock);
3438 list_for_each_entry(s, &slab_caches, list) { 3295 list_for_each_entry(s, &slab_caches, list) {
3439 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3440
3441 local_irq_save(flags); 3296 local_irq_save(flags);
3442 __flush_cpu_slab(s, cpu); 3297 __flush_cpu_slab(s, cpu);
3443 local_irq_restore(flags); 3298 local_irq_restore(flags);
3444 free_kmem_cache_cpu(c, cpu);
3445 s->cpu_slab[cpu] = NULL;
3446 } 3299 }
3447 up_read(&slub_lock); 3300 up_read(&slub_lock);
3448 break; 3301 break;
@@ -3928,7 +3781,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3928 int cpu; 3781 int cpu;
3929 3782
3930 for_each_possible_cpu(cpu) { 3783 for_each_possible_cpu(cpu) {
3931 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3784 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3932 3785
3933 if (!c || c->node < 0) 3786 if (!c || c->node < 0)
3934 continue; 3787 continue;
@@ -4171,6 +4024,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4171} 4024}
4172SLAB_ATTR(trace); 4025SLAB_ATTR(trace);
4173 4026
4027#ifdef CONFIG_FAILSLAB
4028static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4029{
4030 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4031}
4032
4033static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4034 size_t length)
4035{
4036 s->flags &= ~SLAB_FAILSLAB;
4037 if (buf[0] == '1')
4038 s->flags |= SLAB_FAILSLAB;
4039 return length;
4040}
4041SLAB_ATTR(failslab);
4042#endif
4043
4174static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4044static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4175{ 4045{
4176 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4046 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4353,7 +4223,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4353 return -ENOMEM; 4223 return -ENOMEM;
4354 4224
4355 for_each_online_cpu(cpu) { 4225 for_each_online_cpu(cpu) {
4356 unsigned x = get_cpu_slab(s, cpu)->stat[si]; 4226 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4357 4227
4358 data[cpu] = x; 4228 data[cpu] = x;
4359 sum += x; 4229 sum += x;
@@ -4376,7 +4246,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si)
4376 int cpu; 4246 int cpu;
4377 4247
4378 for_each_online_cpu(cpu) 4248 for_each_online_cpu(cpu)
4379 get_cpu_slab(s, cpu)->stat[si] = 0; 4249 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4380} 4250}
4381 4251
4382#define STAT_ATTR(si, text) \ 4252#define STAT_ATTR(si, text) \
@@ -4467,6 +4337,10 @@ static struct attribute *slab_attrs[] = {
4467 &deactivate_remote_frees_attr.attr, 4337 &deactivate_remote_frees_attr.attr,
4468 &order_fallback_attr.attr, 4338 &order_fallback_attr.attr,
4469#endif 4339#endif
4340#ifdef CONFIG_FAILSLAB
4341 &failslab_attr.attr,
4342#endif
4343
4470 NULL 4344 NULL
4471}; 4345};
4472 4346
@@ -4519,7 +4393,7 @@ static void kmem_cache_release(struct kobject *kobj)
4519 kfree(s); 4393 kfree(s);
4520} 4394}
4521 4395
4522static struct sysfs_ops slab_sysfs_ops = { 4396static const struct sysfs_ops slab_sysfs_ops = {
4523 .show = slab_attr_show, 4397 .show = slab_attr_show,
4524 .store = slab_attr_store, 4398 .store = slab_attr_store,
4525}; 4399};
@@ -4538,7 +4412,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
4538 return 0; 4412 return 0;
4539} 4413}
4540 4414
4541static struct kset_uevent_ops slab_uevent_ops = { 4415static const struct kset_uevent_ops slab_uevent_ops = {
4542 .filter = uevent_filter, 4416 .filter = uevent_filter,
4543}; 4417};
4544 4418
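
The slub.c hunks above drop the hand-rolled kmem_cache_cpu array (NR_KMEM_CACHE_CPU, get_cpu_slab(), the CPU notifier setup/teardown) in favour of the generic percpu allocator. A self-contained sketch of the resulting pattern, reduced to a per-cache counter; every "demo_" name is invented, and <linux/percpu.h>, <linux/cpumask.h> and <linux/errno.h> are assumed.

enum demo_stat_item { DEMO_ALLOC_FASTPATH, DEMO_FREE_FASTPATH, DEMO_NR_ITEMS };

struct demo_cache_cpu {
        unsigned stat[DEMO_NR_ITEMS];
};

struct demo_cache {
        struct demo_cache_cpu __percpu *cpu_slab;
};

static int demo_cache_init(struct demo_cache *s)
{
        /* one dynamic percpu area per cache, as alloc_kmem_cache_cpus() now does */
        s->cpu_slab = alloc_percpu(struct demo_cache_cpu);
        return s->cpu_slab ? 0 : -ENOMEM;
}

/* Fast path: bump this CPU's counter.  The patch's stat() callers run with
 * interrupts disabled, so the raw __this_cpu form is sufficient there. */
static inline void demo_stat(struct demo_cache *s, enum demo_stat_item si)
{
        __this_cpu_inc(s->cpu_slab->stat[si]);
}

/* Slow path: fold in every online CPU's copy, as show_stat() now does. */
static unsigned demo_stat_sum(struct demo_cache *s, enum demo_stat_item si)
{
        unsigned sum = 0;
        int cpu;

        for_each_online_cpu(cpu)
                sum += per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
        return sum;
}

static void demo_cache_destroy(struct demo_cache *s)
{
        free_percpu(s->cpu_slab);       /* mirrors kmem_cache_close() above */
}
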
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d9714bdcb4a3..aa33fd67fa41 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -22,6 +22,7 @@
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/slab.h>
25#include <linux/spinlock.h> 26#include <linux/spinlock.h>
26#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
@@ -40,9 +41,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 41 unsigned long align,
41 unsigned long goal) 42 unsigned long goal)
42{ 43{
43 return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); 44 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
44} 45}
45 46
47static void *vmemmap_buf;
48static void *vmemmap_buf_end;
46 49
47void * __meminit vmemmap_alloc_block(unsigned long size, int node) 50void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 51{
@@ -64,6 +67,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
64 __pa(MAX_DMA_ADDRESS)); 67 __pa(MAX_DMA_ADDRESS));
65} 68}
66 69
70/* callers must request the same size throughout the early boot stage */
71void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
72{
73 void *ptr;
74
75 if (!vmemmap_buf)
76 return vmemmap_alloc_block(size, node);
77
78 /* take it from the buffer */
79 ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
80 if (ptr + size > vmemmap_buf_end)
81 return vmemmap_alloc_block(size, node);
82
83 vmemmap_buf = ptr + size;
84
85 return ptr;
86}
87
67void __meminit vmemmap_verify(pte_t *pte, int node, 88void __meminit vmemmap_verify(pte_t *pte, int node,
68 unsigned long start, unsigned long end) 89 unsigned long start, unsigned long end)
69{ 90{
@@ -80,7 +101,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
80 pte_t *pte = pte_offset_kernel(pmd, addr); 101 pte_t *pte = pte_offset_kernel(pmd, addr);
81 if (pte_none(*pte)) { 102 if (pte_none(*pte)) {
82 pte_t entry; 103 pte_t entry;
83 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 104 void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
84 if (!p) 105 if (!p)
85 return NULL; 106 return NULL;
86 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 107 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +184,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
163 184
164 return map; 185 return map;
165} 186}
187
188void __init sparse_mem_maps_populate_node(struct page **map_map,
189 unsigned long pnum_begin,
190 unsigned long pnum_end,
191 unsigned long map_count, int nodeid)
192{
193 unsigned long pnum;
194 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
195 void *vmemmap_buf_start;
196
197 size = ALIGN(size, PMD_SIZE);
198 vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
199 PMD_SIZE, __pa(MAX_DMA_ADDRESS));
200
201 if (vmemmap_buf_start) {
202 vmemmap_buf = vmemmap_buf_start;
203 vmemmap_buf_end = vmemmap_buf_start + size * map_count;
204 }
205
206 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
207 struct mem_section *ms;
208
209 if (!present_section_nr(pnum))
210 continue;
211
212 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
213 if (map_map[pnum])
214 continue;
215 ms = __nr_to_section(pnum);
216 printk(KERN_ERR "%s: sparsemem memory map backing failed, "
217 "some memory will not be available.\n", __func__);
218 ms->section_mem_map = 0;
219 }
220
221 if (vmemmap_buf_start) {
222 /* need to free left buf */
223#ifdef CONFIG_NO_BOOTMEM
224 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
225 if (vmemmap_buf_start < vmemmap_buf) {
226 char name[15];
227
228 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
229 reserve_early_without_check(__pa(vmemmap_buf_start),
230 __pa(vmemmap_buf), name);
231 }
232#else
233 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
234#endif
235 vmemmap_buf = NULL;
236 vmemmap_buf_end = NULL;
237 }
238}
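
vmemmap_alloc_block_buf() above is a bump-pointer allocator over one large per-node bootmem block, falling back to vmemmap_alloc_block() when no buffer was set up or the buffer runs out. The same shape, stripped of the vmemmap specifics (all names are local to this sketch; the ALIGN() to the requested size assumes callers keep asking for the same power-of-two size, which is what the comment above the real function is warning about).

static void *demo_buf, *demo_buf_end;

static void *demo_bump_alloc(unsigned long size,
                             void *(*fallback)(unsigned long size, int node),
                             int node)
{
        void *ptr;

        if (!demo_buf)
                return fallback(size, node);    /* no buffer was set up */

        ptr = (void *)ALIGN((unsigned long)demo_buf, size);
        if (ptr + size > demo_buf_end)
                return fallback(size, node);    /* buffer exhausted */

        demo_buf = ptr + size;                  /* bump past this allocation */
        return ptr;
}
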
diff --git a/mm/sparse.c b/mm/sparse.c
index 6ce4aab69e99..dc0cc4d43ff3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -2,6 +2,7 @@
2 * sparse memory mappings. 2 * sparse memory mappings.
3 */ 3 */
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/slab.h>
5#include <linux/mmzone.h> 6#include <linux/mmzone.h>
6#include <linux/bootmem.h> 7#include <linux/bootmem.h>
7#include <linux/highmem.h> 8#include <linux/highmem.h>
@@ -271,7 +272,8 @@ static unsigned long *__kmalloc_section_usemap(void)
271 272
272#ifdef CONFIG_MEMORY_HOTREMOVE 273#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init 274static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
276 unsigned long count)
275{ 277{
276 unsigned long section_nr; 278 unsigned long section_nr;
277 279
@@ -286,7 +288,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
286 * this problem. 288 * this problem.
287 */ 289 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 290 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr); 291 return alloc_bootmem_section(usemap_size() * count, section_nr);
290} 292}
291 293
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 294static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +331,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
329} 331}
330#else 332#else
331static unsigned long * __init 333static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 334sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
335 unsigned long count)
333{ 336{
334 return NULL; 337 return NULL;
335} 338}
@@ -339,27 +342,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 342}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 343#endif /* CONFIG_MEMORY_HOTREMOVE */
341 344
342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 345static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
346 unsigned long pnum_begin,
347 unsigned long pnum_end,
348 unsigned long usemap_count, int nodeid)
343{ 349{
344 unsigned long *usemap; 350 void *usemap;
345 struct mem_section *ms = __nr_to_section(pnum); 351 unsigned long pnum;
346 int nid = sparse_early_nid(ms); 352 int size = usemap_size();
347
348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
349 if (usemap)
350 return usemap;
351 353
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count);
353 if (usemap) { 356 if (usemap) {
354 check_usemap_section_nr(nid, usemap); 357 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
355 return usemap; 358 if (!present_section_nr(pnum))
359 continue;
360 usemap_map[pnum] = usemap;
361 usemap += size;
362 }
363 return;
356 } 364 }
357 365
358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 366 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
359 nid = 0; 367 if (usemap) {
368 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
369 if (!present_section_nr(pnum))
370 continue;
371 usemap_map[pnum] = usemap;
372 usemap += size;
373 check_usemap_section_nr(nodeid, usemap_map[pnum]);
374 }
375 return;
376 }
360 377
361 printk(KERN_WARNING "%s: allocation failed\n", __func__); 378 printk(KERN_WARNING "%s: allocation failed\n", __func__);
362 return NULL;
363} 379}
364 380
365#ifndef CONFIG_SPARSEMEM_VMEMMAP 381#ifndef CONFIG_SPARSEMEM_VMEMMAP
@@ -375,8 +391,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
375 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 391 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
376 return map; 392 return map;
377} 393}
394void __init sparse_mem_maps_populate_node(struct page **map_map,
395 unsigned long pnum_begin,
396 unsigned long pnum_end,
397 unsigned long map_count, int nodeid)
398{
399 void *map;
400 unsigned long pnum;
401 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
402
403 map = alloc_remap(nodeid, size * map_count);
404 if (map) {
405 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
406 if (!present_section_nr(pnum))
407 continue;
408 map_map[pnum] = map;
409 map += size;
410 }
411 return;
412 }
413
414 size = PAGE_ALIGN(size);
415 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
416 if (map) {
417 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
418 if (!present_section_nr(pnum))
419 continue;
420 map_map[pnum] = map;
421 map += size;
422 }
423 return;
424 }
425
426 /* fallback */
427 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
428 struct mem_section *ms;
429
430 if (!present_section_nr(pnum))
431 continue;
432 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
433 if (map_map[pnum])
434 continue;
435 ms = __nr_to_section(pnum);
436 printk(KERN_ERR "%s: sparsemem memory map backing failed, "
437 "some memory will not be available.\n", __func__);
438 ms->section_mem_map = 0;
439 }
440}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 441#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 442
443#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
444static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
445 unsigned long pnum_begin,
446 unsigned long pnum_end,
447 unsigned long map_count, int nodeid)
448{
449 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
450 map_count, nodeid);
451}
452#else
380static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 453static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 454{
382 struct page *map; 455 struct page *map;
@@ -392,10 +465,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
392 ms->section_mem_map = 0; 465 ms->section_mem_map = 0;
393 return NULL; 466 return NULL;
394} 467}
468#endif
395 469
396void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) 470void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
397{ 471{
398} 472}
473
399/* 474/*
400 * Allocate the accumulated non-linear sections, allocate a mem_map 475 * Allocate the accumulated non-linear sections, allocate a mem_map
401 * for each and record the physical to section mapping. 476 * for each and record the physical to section mapping.
@@ -407,6 +482,14 @@ void __init sparse_init(void)
407 unsigned long *usemap; 482 unsigned long *usemap;
408 unsigned long **usemap_map; 483 unsigned long **usemap_map;
409 int size; 484 int size;
485 int nodeid_begin = 0;
486 unsigned long pnum_begin = 0;
487 unsigned long usemap_count;
488#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
489 unsigned long map_count;
490 int size2;
491 struct page **map_map;
492#endif
410 493
411 /* 494 /*
412 * map is using big page (aka 2M in x86 64 bit) 495 * map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +508,81 @@ void __init sparse_init(void)
425 panic("can not allocate usemap_map\n"); 508 panic("can not allocate usemap_map\n");
426 509
427 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 510 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
511 struct mem_section *ms;
512
428 if (!present_section_nr(pnum)) 513 if (!present_section_nr(pnum))
429 continue; 514 continue;
430 usemap_map[pnum] = sparse_early_usemap_alloc(pnum); 515 ms = __nr_to_section(pnum);
516 nodeid_begin = sparse_early_nid(ms);
517 pnum_begin = pnum;
518 break;
431 } 519 }
520 usemap_count = 1;
521 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
522 struct mem_section *ms;
523 int nodeid;
524
525 if (!present_section_nr(pnum))
526 continue;
527 ms = __nr_to_section(pnum);
528 nodeid = sparse_early_nid(ms);
529 if (nodeid == nodeid_begin) {
530 usemap_count++;
531 continue;
532 }
533 /* ok, we need to take care of pnum_begin to pnum - 1 */
534 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
535 usemap_count, nodeid_begin);
536 /* new start, update count etc*/
537 nodeid_begin = nodeid;
538 pnum_begin = pnum;
539 usemap_count = 1;
540 }
541 /* ok, last chunk */
542 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
543 usemap_count, nodeid_begin);
544
545#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
546 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
547 map_map = alloc_bootmem(size2);
548 if (!map_map)
549 panic("can not allocate map_map\n");
550
551 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
552 struct mem_section *ms;
553
554 if (!present_section_nr(pnum))
555 continue;
556 ms = __nr_to_section(pnum);
557 nodeid_begin = sparse_early_nid(ms);
558 pnum_begin = pnum;
559 break;
560 }
561 map_count = 1;
562 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
563 struct mem_section *ms;
564 int nodeid;
565
566 if (!present_section_nr(pnum))
567 continue;
568 ms = __nr_to_section(pnum);
569 nodeid = sparse_early_nid(ms);
570 if (nodeid == nodeid_begin) {
571 map_count++;
572 continue;
573 }
574 /* ok, we need to take care of pnum_begin to pnum - 1 */
575 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
576 map_count, nodeid_begin);
577 /* new start, update count etc*/
578 nodeid_begin = nodeid;
579 pnum_begin = pnum;
580 map_count = 1;
581 }
582 /* ok, last chunk */
583 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
584 map_count, nodeid_begin);
585#endif
432 586
433 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 587 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
434 if (!present_section_nr(pnum)) 588 if (!present_section_nr(pnum))
@@ -438,7 +592,11 @@ void __init sparse_init(void)
438 if (!usemap) 592 if (!usemap)
439 continue; 593 continue;
440 594
595#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
596 map = map_map[pnum];
597#else
441 map = sparse_early_mem_map_alloc(pnum); 598 map = sparse_early_mem_map_alloc(pnum);
599#endif
442 if (!map) 600 if (!map)
443 continue; 601 continue;
444 602
@@ -448,6 +606,9 @@ void __init sparse_init(void)
448 606
449 vmemmap_populate_print_last(); 607 vmemmap_populate_print_last();
450 608
609#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
610 free_bootmem(__pa(map_map), size2);
611#endif
451 free_bootmem(__pa(usemap_map), size); 612 free_bootmem(__pa(usemap_map), size);
452} 613}
453 614
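
Both new passes in sparse_init() above use the same walk: coalesce consecutive present sections that share a node and hand each run to a per-node batch allocator (usemaps in one pass, mem_maps in the other). The walk on its own, with the per-node allocation abstracted into a callback; the helper name is invented for this sketch, while present_section_nr(), sparse_early_nid(), __nr_to_section() and NR_MEM_SECTIONS are the interfaces used in the hunks.

static void __init demo_foreach_node_run(void (*alloc_batch)(unsigned long pnum_begin,
                                                             unsigned long pnum_end,
                                                             unsigned long count,
                                                             int nodeid))
{
        unsigned long pnum, pnum_begin = 0, count = 0;
        int nodeid_begin = -1;

        for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
                int nodeid;

                if (!present_section_nr(pnum))
                        continue;
                nodeid = sparse_early_nid(__nr_to_section(pnum));
                if (nodeid_begin < 0) {
                        nodeid_begin = nodeid;          /* first present section */
                        pnum_begin = pnum;
                        count = 1;
                        continue;
                }
                if (nodeid == nodeid_begin) {
                        count++;
                        continue;
                }
                /* node changed: flush the previous run, start a new one */
                alloc_batch(pnum_begin, pnum, count, nodeid_begin);
                nodeid_begin = nodeid;
                pnum_begin = pnum;
                count = 1;
        }
        if (count)                                      /* last run, if any */
                alloc_batch(pnum_begin, NR_MEM_SECTIONS, count, nodeid_begin);
}
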
diff --git a/mm/swap.c b/mm/swap.c
index 308e57d8d7ed..7cd60bf0a972 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/gfp.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
@@ -55,7 +56,7 @@ static void __page_cache_release(struct page *page)
55 del_page_from_lru(zone, page); 56 del_page_from_lru(zone, page);
56 spin_unlock_irqrestore(&zone->lru_lock, flags); 57 spin_unlock_irqrestore(&zone->lru_lock, flags);
57 } 58 }
58 free_hot_page(page); 59 free_hot_cold_page(page, 0);
59} 60}
60 61
61static void put_compound_page(struct page *page) 62static void put_compound_page(struct page *page)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d1daeb1cb4a..e10f5833167f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/gfp.h>
11#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13#include <linux/swapops.h> 14#include <linux/swapops.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c0585b16418..6cd0a8f90dc7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
723 return p != NULL; 723 return p != NULL;
724} 724}
725 725
726#ifdef CONFIG_CGROUP_MEM_RES_CTLR
727/**
728 * mem_cgroup_count_swap_user - count the users of a swap entry
729 * @ent: the swap entry to be checked
730 * @pagep: pointer used to return the entry's swap cache page, if any
731 *
732 * Returns the number of users of the swap entry. The count is valid only
733 * for swaps of anonymous pages.
734 * If the entry is found in the swap cache, the page is stored in *pagep
735 * with its refcount incremented.
736 */
737int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
738{
739 struct page *page;
740 struct swap_info_struct *p;
741 int count = 0;
742
743 page = find_get_page(&swapper_space, ent.val);
744 if (page)
745 count += page_mapcount(page);
746 p = swap_info_get(ent);
747 if (p) {
748 count += swap_count(p->swap_map[swp_offset(ent)]);
749 spin_unlock(&swap_lock);
750 }
751
752 *pagep = page;
753 return count;
754}
755#endif
756
726#ifdef CONFIG_HIBERNATION 757#ifdef CONFIG_HIBERNATION
727/* 758/*
728 * Find the swap type that corresponds to given device (if any). 759 * Find the swap type that corresponds to given device (if any).
@@ -840,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
840 goto out; 871 goto out;
841 } 872 }
842 873
843 inc_mm_counter(vma->vm_mm, anon_rss); 874 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
875 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
844 get_page(page); 876 get_page(page);
845 set_pte_at(vma->vm_mm, addr, pte, 877 set_pte_at(vma->vm_mm, addr, pte,
846 pte_mkold(mk_pte(page, vma->vm_page_prot))); 878 pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -1759,11 +1791,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1759 unsigned int type; 1791 unsigned int type;
1760 int i, prev; 1792 int i, prev;
1761 int error; 1793 int error;
1762 union swap_header *swap_header = NULL; 1794 union swap_header *swap_header;
1763 unsigned int nr_good_pages = 0; 1795 unsigned int nr_good_pages;
1764 int nr_extents = 0; 1796 int nr_extents = 0;
1765 sector_t span; 1797 sector_t span;
1766 unsigned long maxpages = 1; 1798 unsigned long maxpages;
1767 unsigned long swapfilepages; 1799 unsigned long swapfilepages;
1768 unsigned char *swap_map = NULL; 1800 unsigned char *swap_map = NULL;
1769 struct page *page = NULL; 1801 struct page *page = NULL;
@@ -1922,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1922 * swap pte. 1954 * swap pte.
1923 */ 1955 */
1924 maxpages = swp_offset(pte_to_swp_entry( 1956 maxpages = swp_offset(pte_to_swp_entry(
1925 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; 1957 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1926 if (maxpages > swap_header->info.last_page) 1958 if (maxpages > swap_header->info.last_page) {
1927 maxpages = swap_header->info.last_page; 1959 maxpages = swap_header->info.last_page + 1;
1960 /* p->max is an unsigned int: don't overflow it */
1961 if ((unsigned int)maxpages == 0)
1962 maxpages = UINT_MAX;
1963 }
1928 p->highest_bit = maxpages - 1; 1964 p->highest_bit = maxpages - 1;
1929 1965
1930 error = -EINVAL; 1966 error = -EINVAL;
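A worked example of the off-by-one fixes above, with hypothetical numbers: if the architecture's swap pte can encode offsets 0 through 0xFFFFF, then swp_offset() of the largest encodable entry is 0xFFFFF and the map can address 0x100000 pages, not the 0xFFFFE the old "- 1" computed; likewise last_page is the index of the last usable page, so the count derived from it is last_page + 1.

    unsigned long max_offset = 0xFFFFF;            /* hypothetical pte limit       */
    unsigned long maxpages   = max_offset + 1;     /* offsets 0..N => N + 1 pages  */

    if (maxpages > swap_header->info.last_page)
            maxpages = swap_header->info.last_page + 1;   /* index -> count */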
@@ -1948,23 +1984,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1948 } 1984 }
1949 1985
1950 memset(swap_map, 0, maxpages); 1986 memset(swap_map, 0, maxpages);
1987 nr_good_pages = maxpages - 1; /* omit header page */
1988
1951 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1989 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1952 int page_nr = swap_header->info.badpages[i]; 1990 unsigned int page_nr = swap_header->info.badpages[i];
1953 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { 1991 if (page_nr == 0 || page_nr > swap_header->info.last_page) {
1954 error = -EINVAL; 1992 error = -EINVAL;
1955 goto bad_swap; 1993 goto bad_swap;
1956 } 1994 }
1957 swap_map[page_nr] = SWAP_MAP_BAD; 1995 if (page_nr < maxpages) {
1996 swap_map[page_nr] = SWAP_MAP_BAD;
1997 nr_good_pages--;
1998 }
1958 } 1999 }
1959 2000
1960 error = swap_cgroup_swapon(type, maxpages); 2001 error = swap_cgroup_swapon(type, maxpages);
1961 if (error) 2002 if (error)
1962 goto bad_swap; 2003 goto bad_swap;
1963 2004
1964 nr_good_pages = swap_header->info.last_page -
1965 swap_header->info.nr_badpages -
1966 1 /* header page */;
1967
1968 if (nr_good_pages) { 2005 if (nr_good_pages) {
1969 swap_map[0] = SWAP_MAP_BAD; 2006 swap_map[0] = SWAP_MAP_BAD;
1970 p->max = maxpages; 2007 p->max = maxpages;
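The relocated nr_good_pages accounting matters when maxpages was clipped below what the header advertises. With hypothetical values last_page = 1000000 but a pte limit of maxpages = 500000, swap_map[] has only 500000 entries:

    nr_good_pages = maxpages - 1;   /* 499999: the header page is never usable              */
    /* badpage 250:    < maxpages  -> swap_map[250] = SWAP_MAP_BAD, nr_good_pages--          */
    /* badpage 750000: passes the <= last_page check, but >= maxpages, so it is skipped      */
    /*                 instead of writing past the end of swap_map[]                         */

The old formula (last_page - nr_badpages - 1) would also have subtracted bad pages that never made it into the clipped map.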
@@ -2155,7 +2192,11 @@ void swap_shmem_alloc(swp_entry_t entry)
2155} 2192}
2156 2193
2157/* 2194/*
2158 * increase reference count of swap entry by 1. 2195 * Increase reference count of swap entry by 1.
2196 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
2197 * but could not be atomically allocated. Returns 0, just as if it succeeded,
2198 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
2199 * might occur if a page table entry has got corrupted.
2159 */ 2200 */
2160int swap_duplicate(swp_entry_t entry) 2201int swap_duplicate(swp_entry_t entry)
2161{ 2202{
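Given the return-value contract spelled out in the new swap_duplicate() comment, a caller typically retries after topping up the count storage. A sketch assuming caller-side names (orig_pte) with locking elided; it is not code from this patch:

    swp_entry_t entry = pte_to_swp_entry(orig_pte);

    while (swap_duplicate(entry) < 0) {
            /* -ENOMEM: the per-entry count needs a continuation page; allocate it */
            /* outside the page-table lock, then retry the duplicate.              */
            if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
                    return -ENOMEM;
    }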
diff --git a/mm/truncate.c b/mm/truncate.c
index e87e37244829..f42675a3615d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/swap.h> 14#include <linux/swap.h>
14#include <linux/module.h> 15#include <linux/module.h>
diff --git a/mm/util.c b/mm/util.c
index 834db7be240f..f5712e8964be 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,6 +186,27 @@ void kzfree(const void *p)
186} 186}
187EXPORT_SYMBOL(kzfree); 187EXPORT_SYMBOL(kzfree);
188 188
189int kern_ptr_validate(const void *ptr, unsigned long size)
190{
191 unsigned long addr = (unsigned long)ptr;
192 unsigned long min_addr = PAGE_OFFSET;
193 unsigned long align_mask = sizeof(void *) - 1;
194
195 if (unlikely(addr < min_addr))
196 goto out;
197 if (unlikely(addr > (unsigned long)high_memory - size))
198 goto out;
199 if (unlikely(addr & align_mask))
200 goto out;
201 if (unlikely(!kern_addr_valid(addr)))
202 goto out;
203 if (unlikely(!kern_addr_valid(addr + size - 1)))
204 goto out;
205 return 1;
206out:
207 return 0;
208}
209
189/* 210/*
190 * strndup_user - duplicate an existing string from user space 211 * strndup_user - duplicate an existing string from user space
191 * @s: The string to duplicate 212 * @s: The string to duplicate
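kern_ptr_validate() above performs only the generic checks: alignment, below PAGE_OFFSET, beyond high_memory, and kern_addr_valid() for both ends of the object. A hedged sketch of how an allocator-side validator might layer on top of it (ptr_plausible() is an illustrative name, not an API from this patch):

    static int ptr_plausible(const void *ptr, unsigned long size)
    {
            if (!kern_ptr_validate(ptr, size))
                    return 0;
            /* allocator-specific checks (e.g. "does ptr fall inside a slab page
             * belonging to the right cache?") would follow here. */
            return 1;
    }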
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c26986c85ce0..3ff3311447f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -13,7 +13,7 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h> 16#include <linux/gfp.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
@@ -262,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
262 return ret; 262 return ret;
263} 263}
264 264
265/* Called without lock on whether page is mapped, so answer is unstable */
266static inline int page_mapping_inuse(struct page *page)
267{
268 struct address_space *mapping;
269
270 /* Page is in somebody's page tables. */
271 if (page_mapped(page))
272 return 1;
273
274 /* Be more reluctant to reclaim swapcache than pagecache */
275 if (PageSwapCache(page))
276 return 1;
277
278 mapping = page_mapping(page);
279 if (!mapping)
280 return 0;
281
282 /* File is mmap'd by somebody? */
283 return mapping_mapped(mapping);
284}
285
286static inline int is_page_cache_freeable(struct page *page) 265static inline int is_page_cache_freeable(struct page *page)
287{ 266{
288 /* 267 /*
@@ -579,6 +558,65 @@ redo:
579 put_page(page); /* drop ref from isolate */ 558 put_page(page); /* drop ref from isolate */
580} 559}
581 560
561enum page_references {
562 PAGEREF_RECLAIM,
563 PAGEREF_RECLAIM_CLEAN,
564 PAGEREF_KEEP,
565 PAGEREF_ACTIVATE,
566};
567
568static enum page_references page_check_references(struct page *page,
569 struct scan_control *sc)
570{
571 int referenced_ptes, referenced_page;
572 unsigned long vm_flags;
573
574 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
575 referenced_page = TestClearPageReferenced(page);
576
577 /* Lumpy reclaim - ignore references */
578 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
579 return PAGEREF_RECLAIM;
580
581 /*
582 * Mlock lost the isolation race with us. Let try_to_unmap()
583 * move the page to the unevictable list.
584 */
585 if (vm_flags & VM_LOCKED)
586 return PAGEREF_RECLAIM;
587
588 if (referenced_ptes) {
589 if (PageAnon(page))
590 return PAGEREF_ACTIVATE;
591 /*
592 * All mapped pages start out with page table
593 * references from the instantiating fault, so we need
594 * to look twice if a mapped file page is used more
595 * than once.
596 *
597 * Mark it and spare it for another trip around the
598 * inactive list. Another page table reference will
599 * lead to its activation.
600 *
601 * Note: the mark is set for activated pages as well
602 * so that recently deactivated but used pages are
603 * quickly recovered.
604 */
605 SetPageReferenced(page);
606
607 if (referenced_page)
608 return PAGEREF_ACTIVATE;
609
610 return PAGEREF_KEEP;
611 }
612
613 /* Reclaim if clean, defer dirty pages to writeback */
614 if (referenced_page)
615 return PAGEREF_RECLAIM_CLEAN;
616
617 return PAGEREF_RECLAIM;
618}
619
582/* 620/*
583 * shrink_page_list() returns the number of reclaimed pages 621 * shrink_page_list() returns the number of reclaimed pages
584 */ 622 */
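The decision implemented by page_check_references() above, condensed into a table (my summary, not comment text from the patch). Lumpy reclaim (order > PAGE_ALLOC_COSTLY_ORDER) and VM_LOCKED both short-circuit to PAGEREF_RECLAIM before any of this is consulted:

    referenced_ptes  PG_referenced  page type   result
         > 0              any         anon      PAGEREF_ACTIVATE
         > 0              set         file      PAGEREF_ACTIVATE       (used at least twice)
         > 0             clear        file      PAGEREF_KEEP           (marked, gets one more lap)
           0              set         any       PAGEREF_RECLAIM_CLEAN  (dirty pages deferred)
           0             clear        any       PAGEREF_RECLAIM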
@@ -590,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
590 struct pagevec freed_pvec; 628 struct pagevec freed_pvec;
591 int pgactivate = 0; 629 int pgactivate = 0;
592 unsigned long nr_reclaimed = 0; 630 unsigned long nr_reclaimed = 0;
593 unsigned long vm_flags;
594 631
595 cond_resched(); 632 cond_resched();
596 633
597 pagevec_init(&freed_pvec, 1); 634 pagevec_init(&freed_pvec, 1);
598 while (!list_empty(page_list)) { 635 while (!list_empty(page_list)) {
636 enum page_references references;
599 struct address_space *mapping; 637 struct address_space *mapping;
600 struct page *page; 638 struct page *page;
601 int may_enter_fs; 639 int may_enter_fs;
602 int referenced;
603 640
604 cond_resched(); 641 cond_resched();
605 642
@@ -641,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
641 goto keep_locked; 678 goto keep_locked;
642 } 679 }
643 680
644 referenced = page_referenced(page, 1, 681 references = page_check_references(page, sc);
645 sc->mem_cgroup, &vm_flags); 682 switch (references) {
646 /* 683 case PAGEREF_ACTIVATE:
647 * In active use or really unfreeable? Activate it.
648 * If page which have PG_mlocked lost isoltation race,
649 * try_to_unmap moves it to unevictable list
650 */
651 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
652 referenced && page_mapping_inuse(page)
653 && !(vm_flags & VM_LOCKED))
654 goto activate_locked; 684 goto activate_locked;
685 case PAGEREF_KEEP:
686 goto keep_locked;
687 case PAGEREF_RECLAIM:
688 case PAGEREF_RECLAIM_CLEAN:
689 ; /* try to reclaim the page below */
690 }
655 691
656 /* 692 /*
657 * Anonymous process memory has backing store? 693 * Anonymous process memory has backing store?
@@ -685,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
685 } 721 }
686 722
687 if (PageDirty(page)) { 723 if (PageDirty(page)) {
688 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) 724 if (references == PAGEREF_RECLAIM_CLEAN)
689 goto keep_locked; 725 goto keep_locked;
690 if (!may_enter_fs) 726 if (!may_enter_fs)
691 goto keep_locked; 727 goto keep_locked;
@@ -1350,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1350 continue; 1386 continue;
1351 } 1387 }
1352 1388
1353 /* page_referenced clears PageReferenced */ 1389 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1354 if (page_mapping_inuse(page) &&
1355 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1356 nr_rotated++; 1390 nr_rotated++;
1357 /* 1391 /*
1358 * Identify referenced, file-backed active pages and 1392 * Identify referenced, file-backed active pages and
@@ -1694,8 +1728,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1694 continue; 1728 continue;
1695 note_zone_scanning_priority(zone, priority); 1729 note_zone_scanning_priority(zone, priority);
1696 1730
1697 if (zone_is_all_unreclaimable(zone) && 1731 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1698 priority != DEF_PRIORITY)
1699 continue; /* Let kswapd poll it */ 1732 continue; /* Let kswapd poll it */
1700 sc->all_unreclaimable = 0; 1733 sc->all_unreclaimable = 0;
1701 } else { 1734 } else {
@@ -1922,7 +1955,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1922 if (!populated_zone(zone)) 1955 if (!populated_zone(zone))
1923 continue; 1956 continue;
1924 1957
1925 if (zone_is_all_unreclaimable(zone)) 1958 if (zone->all_unreclaimable)
1926 continue; 1959 continue;
1927 1960
1928 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 1961 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
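The repeated substitutions in this file (and in vmstat.c below) replace the bit-flag helper with a plain field on struct zone. Roughly, with the old helper's shape inferred from the zone_set_flag() call removed in the balance_pgdat() hunk further down, so treat it as an assumption:

    /* before: a bit in zone->flags, accessed through helpers                                  */
    if (zone_is_all_unreclaimable(zone))      /* test_bit(ZONE_ALL_UNRECLAIMABLE, &zone->flags) */
            return ZONE_RECLAIM_FULL;
    /* after: an ordinary field, written directly where kswapd gives up on the zone            */
    if (zone->all_unreclaimable)
            return ZONE_RECLAIM_FULL;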
@@ -2012,8 +2045,7 @@ loop_again:
2012 if (!populated_zone(zone)) 2045 if (!populated_zone(zone))
2013 continue; 2046 continue;
2014 2047
2015 if (zone_is_all_unreclaimable(zone) && 2048 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2016 priority != DEF_PRIORITY)
2017 continue; 2049 continue;
2018 2050
2019 /* 2051 /*
@@ -2056,13 +2088,9 @@ loop_again:
2056 if (!populated_zone(zone)) 2088 if (!populated_zone(zone))
2057 continue; 2089 continue;
2058 2090
2059 if (zone_is_all_unreclaimable(zone) && 2091 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2060 priority != DEF_PRIORITY)
2061 continue; 2092 continue;
2062 2093
2063 if (!zone_watermark_ok(zone, order,
2064 high_wmark_pages(zone), end_zone, 0))
2065 all_zones_ok = 0;
2066 temp_priority[i] = priority; 2094 temp_priority[i] = priority;
2067 sc.nr_scanned = 0; 2095 sc.nr_scanned = 0;
2068 note_zone_scanning_priority(zone, priority); 2096 note_zone_scanning_priority(zone, priority);
@@ -2087,12 +2115,11 @@ loop_again:
2087 lru_pages); 2115 lru_pages);
2088 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2116 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2089 total_scanned += sc.nr_scanned; 2117 total_scanned += sc.nr_scanned;
2090 if (zone_is_all_unreclaimable(zone)) 2118 if (zone->all_unreclaimable)
2091 continue; 2119 continue;
2092 if (nr_slab == 0 && zone->pages_scanned >= 2120 if (nr_slab == 0 &&
2093 (zone_reclaimable_pages(zone) * 6)) 2121 zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
2094 zone_set_flag(zone, 2122 zone->all_unreclaimable = 1;
2095 ZONE_ALL_UNRECLAIMABLE);
2096 /* 2123 /*
2097 * If we've done a decent amount of scanning and 2124 * If we've done a decent amount of scanning and
2098 * the reclaim ratio is low, start doing writepage 2125 * the reclaim ratio is low, start doing writepage
@@ -2102,13 +2129,18 @@ loop_again:
2102 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2129 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2103 sc.may_writepage = 1; 2130 sc.may_writepage = 1;
2104 2131
2105 /* 2132 if (!zone_watermark_ok(zone, order,
2106 * We are still under min water mark. it mean we have 2133 high_wmark_pages(zone), end_zone, 0)) {
2107 * GFP_ATOMIC allocation failure risk. Hurry up! 2134 all_zones_ok = 0;
2108 */ 2135 /*
2109 if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), 2136 * We are still under min water mark. This
2110 end_zone, 0)) 2137 * means that we have a GFP_ATOMIC allocation
2111 has_under_min_watermark_zone = 1; 2138 * failure risk. Hurry up!
2139 */
2140 if (!zone_watermark_ok(zone, order,
2141 min_wmark_pages(zone), end_zone, 0))
2142 has_under_min_watermark_zone = 1;
2143 }
2112 2144
2113 } 2145 }
2114 if (all_zones_ok) 2146 if (all_zones_ok)
@@ -2550,6 +2582,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2550 * and RECLAIM_SWAP. 2582 * and RECLAIM_SWAP.
2551 */ 2583 */
2552 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 2584 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
2585 lockdep_set_current_reclaim_state(gfp_mask);
2553 reclaim_state.reclaimed_slab = 0; 2586 reclaim_state.reclaimed_slab = 0;
2554 p->reclaim_state = &reclaim_state; 2587 p->reclaim_state = &reclaim_state;
2555 2588
@@ -2593,6 +2626,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2593 2626
2594 p->reclaim_state = NULL; 2627 p->reclaim_state = NULL;
2595 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2628 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2629 lockdep_clear_current_reclaim_state();
2596 return sc.nr_reclaimed >= nr_pages; 2630 return sc.nr_reclaimed >= nr_pages;
2597} 2631}
2598 2632
@@ -2615,7 +2649,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2615 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 2649 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
2616 return ZONE_RECLAIM_FULL; 2650 return ZONE_RECLAIM_FULL;
2617 2651
2618 if (zone_is_all_unreclaimable(zone)) 2652 if (zone->all_unreclaimable)
2619 return ZONE_RECLAIM_FULL; 2653 return ZONE_RECLAIM_FULL;
2620 2654
2621 /* 2655 /*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6051fbab67ba..fa12ea3051fb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/slab.h>
15#include <linux/cpu.h> 16#include <linux/cpu.h>
16#include <linux/vmstat.h> 17#include <linux/vmstat.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
@@ -139,7 +140,8 @@ static void refresh_zone_stat_thresholds(void)
139 threshold = calculate_threshold(zone); 140 threshold = calculate_threshold(zone);
140 141
141 for_each_online_cpu(cpu) 142 for_each_online_cpu(cpu)
142 zone_pcp(zone, cpu)->stat_threshold = threshold; 143 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
144 = threshold;
143 } 145 }
144} 146}
145 147
@@ -149,7 +151,8 @@ static void refresh_zone_stat_thresholds(void)
149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 151void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150 int delta) 152 int delta)
151{ 153{
152 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 154 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
155
153 s8 *p = pcp->vm_stat_diff + item; 156 s8 *p = pcp->vm_stat_diff + item;
154 long x; 157 long x;
155 158
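The vmstat.c conversions in this file all follow one pattern: the per-zone pagesets are now a percpu allocation hung off zone->pageset, so the old zone_pcp(zone, cpu) accessor gives way to the generic percpu helpers:

    struct per_cpu_pageset *pcp;

    pcp = this_cpu_ptr(zone->pageset);        /* this CPU's pageset; the __-prefixed callers    */
                                              /* already run with preemption disabled           */
    pcp = per_cpu_ptr(zone->pageset, cpu);    /* a specific CPU, e.g. in refresh_cpu_vm_stats() */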
@@ -202,7 +205,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
202 */ 205 */
203void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 206void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
204{ 207{
205 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 208 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
206 s8 *p = pcp->vm_stat_diff + item; 209 s8 *p = pcp->vm_stat_diff + item;
207 210
208 (*p)++; 211 (*p)++;
@@ -223,7 +226,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
223 226
224void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 227void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
225{ 228{
226 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 229 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
227 s8 *p = pcp->vm_stat_diff + item; 230 s8 *p = pcp->vm_stat_diff + item;
228 231
229 (*p)--; 232 (*p)--;
@@ -300,7 +303,7 @@ void refresh_cpu_vm_stats(int cpu)
300 for_each_populated_zone(zone) { 303 for_each_populated_zone(zone) {
301 struct per_cpu_pageset *p; 304 struct per_cpu_pageset *p;
302 305
303 p = zone_pcp(zone, cpu); 306 p = per_cpu_ptr(zone->pageset, cpu);
304 307
305 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 308 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
306 if (p->vm_stat_diff[i]) { 309 if (p->vm_stat_diff[i]) {
@@ -741,7 +744,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
741 for_each_online_cpu(i) { 744 for_each_online_cpu(i) {
742 struct per_cpu_pageset *pageset; 745 struct per_cpu_pageset *pageset;
743 746
744 pageset = zone_pcp(zone, i); 747 pageset = per_cpu_ptr(zone->pageset, i);
745 seq_printf(m, 748 seq_printf(m,
746 "\n cpu: %i" 749 "\n cpu: %i"
747 "\n count: %i" 750 "\n count: %i"
@@ -761,7 +764,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
761 "\n prev_priority: %i" 764 "\n prev_priority: %i"
762 "\n start_pfn: %lu" 765 "\n start_pfn: %lu"
763 "\n inactive_ratio: %u", 766 "\n inactive_ratio: %u",
764 zone_is_all_unreclaimable(zone), 767 zone->all_unreclaimable,
765 zone->prev_priority, 768 zone->prev_priority,
766 zone->zone_start_pfn, 769 zone->zone_start_pfn,
767 zone->inactive_ratio); 770 zone->inactive_ratio);
@@ -906,6 +909,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
906 case CPU_ONLINE: 909 case CPU_ONLINE:
907 case CPU_ONLINE_FROZEN: 910 case CPU_ONLINE_FROZEN:
908 start_cpu_timer(cpu); 911 start_cpu_timer(cpu);
912 node_set_state(cpu_to_node(cpu), N_CPU);
909 break; 913 break;
910 case CPU_DOWN_PREPARE: 914 case CPU_DOWN_PREPARE:
911 case CPU_DOWN_PREPARE_FROZEN: 915 case CPU_DOWN_PREPARE_FROZEN: