aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig8
-rw-r--r--mm/bootmem.c195
-rw-r--r--mm/failslab.c18
-rw-r--r--mm/filemap.c103
-rw-r--r--mm/hugetlb.c13
-rw-r--r--mm/maccess.c11
-rw-r--r--mm/memcontrol.c11
-rw-r--r--mm/memory-failure.c9
-rw-r--r--mm/memory.c14
-rw-r--r--mm/migrate.c41
-rw-r--r--mm/mmap.c40
-rw-r--r--mm/mmu_context.c3
-rw-r--r--mm/nommu.c144
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c344
-rw-r--r--mm/percpu.c40
-rw-r--r--mm/slab.c17
-rw-r--r--mm/slub.c29
-rw-r--r--mm/sparse-vmemmap.c76
-rw-r--r--mm/sparse.c196
-rw-r--r--mm/truncate.c30
-rw-r--r--mm/util.c46
-rw-r--r--mm/vmalloc.c114
-rw-r--r--mm/vmscan.c3
-rw-r--r--mm/vmstat.c15
25 files changed, 1108 insertions, 414 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ee9f3e0f2b69..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
115config SPARSEMEM_VMEMMAP_ENABLE 115config SPARSEMEM_VMEMMAP_ENABLE
116 bool 116 bool
117 117
118config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
119 def_bool y
120 depends on SPARSEMEM && X86_64
121
118config SPARSEMEM_VMEMMAP 122config SPARSEMEM_VMEMMAP
119 bool "Sparse Memory virtual memmap" 123 bool "Sparse Memory virtual memmap"
120 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE 124 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
@@ -195,7 +199,7 @@ config BOUNCE
195config NR_QUICK 199config NR_QUICK
196 int 200 int
197 depends on QUICKLIST 201 depends on QUICKLIST
198 default "2" if SUPERH || AVR32 202 default "2" if AVR32
199 default "1" 203 default "1"
200 204
201config VIRT_TO_BUS 205config VIRT_TO_BUS
@@ -253,7 +257,7 @@ config MEMORY_FAILURE
253 257
254config HWPOISON_INJECT 258config HWPOISON_INJECT
255 tristate "HWPoison pages injector" 259 tristate "HWPoison pages injector"
256 depends on MEMORY_FAILURE && DEBUG_KERNEL 260 depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
257 select PROC_PAGE_MONITOR 261 select PROC_PAGE_MONITOR
258 262
259config NOMMU_INITIAL_TRIM_EXCESS 263config NOMMU_INITIAL_TRIM_EXCESS
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..d7c791ef0036 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -13,6 +13,7 @@
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kmemleak.h> 15#include <linux/kmemleak.h>
16#include <linux/range.h>
16 17
17#include <asm/bug.h> 18#include <asm/bug.h>
18#include <asm/io.h> 19#include <asm/io.h>
@@ -32,6 +33,7 @@ unsigned long max_pfn;
32unsigned long saved_max_pfn; 33unsigned long saved_max_pfn;
33#endif 34#endif
34 35
36#ifndef CONFIG_NO_BOOTMEM
35bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 37bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
36 38
37static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 39static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
142 min_low_pfn = start; 144 min_low_pfn = start;
143 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 145 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
144} 146}
145 147#endif
146/* 148/*
147 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
148 * @addr: starting address of the range 150 * @addr: starting address of the range
@@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
167 } 169 }
168} 170}
169 171
172#ifdef CONFIG_NO_BOOTMEM
173static void __init __free_pages_memory(unsigned long start, unsigned long end)
174{
175 int i;
176 unsigned long start_aligned, end_aligned;
177 int order = ilog2(BITS_PER_LONG);
178
179 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
180 end_aligned = end & ~(BITS_PER_LONG - 1);
181
182 if (end_aligned <= start_aligned) {
183#if 1
184 printk(KERN_DEBUG " %lx - %lx\n", start, end);
185#endif
186 for (i = start; i < end; i++)
187 __free_pages_bootmem(pfn_to_page(i), 0);
188
189 return;
190 }
191
192#if 1
193 printk(KERN_DEBUG " %lx %lx - %lx %lx\n",
194 start, start_aligned, end_aligned, end);
195#endif
196 for (i = start; i < start_aligned; i++)
197 __free_pages_bootmem(pfn_to_page(i), 0);
198
199 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
200 __free_pages_bootmem(pfn_to_page(i), order);
201
202 for (i = end_aligned; i < end; i++)
203 __free_pages_bootmem(pfn_to_page(i), 0);
204}
205
206unsigned long __init free_all_memory_core_early(int nodeid)
207{
208 int i;
209 u64 start, end;
210 unsigned long count = 0;
211 struct range *range = NULL;
212 int nr_range;
213
214 nr_range = get_free_all_memory_range(&range, nodeid);
215
216 for (i = 0; i < nr_range; i++) {
217 start = range[i].start;
218 end = range[i].end;
219 count += end - start;
220 __free_pages_memory(start, end);
221 }
222
223 return count;
224}
225#else
170static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 226static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
171{ 227{
172 int aligned; 228 int aligned;
@@ -227,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
227 283
228 return count; 284 return count;
229} 285}
286#endif
230 287
231/** 288/**
232 * free_all_bootmem_node - release a node's free pages to the buddy allocator 289 * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -237,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
237unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 294unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
238{ 295{
239 register_page_bootmem_info_node(pgdat); 296 register_page_bootmem_info_node(pgdat);
297#ifdef CONFIG_NO_BOOTMEM
298 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
299 return 0;
300#else
240 return free_all_bootmem_core(pgdat->bdata); 301 return free_all_bootmem_core(pgdat->bdata);
302#endif
241} 303}
242 304
243/** 305/**
@@ -247,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
247 */ 309 */
248unsigned long __init free_all_bootmem(void) 310unsigned long __init free_all_bootmem(void)
249{ 311{
312#ifdef CONFIG_NO_BOOTMEM
313 return free_all_memory_core_early(NODE_DATA(0)->node_id);
314#else
250 return free_all_bootmem_core(NODE_DATA(0)->bdata); 315 return free_all_bootmem_core(NODE_DATA(0)->bdata);
316#endif
251} 317}
252 318
319#ifndef CONFIG_NO_BOOTMEM
253static void __init __free(bootmem_data_t *bdata, 320static void __init __free(bootmem_data_t *bdata,
254 unsigned long sidx, unsigned long eidx) 321 unsigned long sidx, unsigned long eidx)
255{ 322{
@@ -344,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
344 } 411 }
345 BUG(); 412 BUG();
346} 413}
414#endif
347 415
348/** 416/**
349 * free_bootmem_node - mark a page range as usable 417 * free_bootmem_node - mark a page range as usable
@@ -358,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
358void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 426void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
359 unsigned long size) 427 unsigned long size)
360{ 428{
429#ifdef CONFIG_NO_BOOTMEM
430 free_early(physaddr, physaddr + size);
431#if 0
432 printk(KERN_DEBUG "free %lx %lx\n", physaddr, size);
433#endif
434#else
361 unsigned long start, end; 435 unsigned long start, end;
362 436
363 kmemleak_free_part(__va(physaddr), size); 437 kmemleak_free_part(__va(physaddr), size);
@@ -366,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
366 end = PFN_DOWN(physaddr + size); 440 end = PFN_DOWN(physaddr + size);
367 441
368 mark_bootmem_node(pgdat->bdata, start, end, 0, 0); 442 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
443#endif
369} 444}
370 445
371/** 446/**
@@ -379,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
379 */ 454 */
380void __init free_bootmem(unsigned long addr, unsigned long size) 455void __init free_bootmem(unsigned long addr, unsigned long size)
381{ 456{
457#ifdef CONFIG_NO_BOOTMEM
458 free_early(addr, addr + size);
459#if 0
460 printk(KERN_DEBUG "free %lx %lx\n", addr, size);
461#endif
462#else
382 unsigned long start, end; 463 unsigned long start, end;
383 464
384 kmemleak_free_part(__va(addr), size); 465 kmemleak_free_part(__va(addr), size);
@@ -387,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
387 end = PFN_DOWN(addr + size); 468 end = PFN_DOWN(addr + size);
388 469
389 mark_bootmem(start, end, 0, 0); 470 mark_bootmem(start, end, 0, 0);
471#endif
390} 472}
391 473
392/** 474/**
@@ -403,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
403int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 485int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
404 unsigned long size, int flags) 486 unsigned long size, int flags)
405{ 487{
488#ifdef CONFIG_NO_BOOTMEM
489 panic("no bootmem");
490 return 0;
491#else
406 unsigned long start, end; 492 unsigned long start, end;
407 493
408 start = PFN_DOWN(physaddr); 494 start = PFN_DOWN(physaddr);
409 end = PFN_UP(physaddr + size); 495 end = PFN_UP(physaddr + size);
410 496
411 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 497 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
498#endif
412} 499}
413 500
414/** 501/**
@@ -424,14 +511,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
424int __init reserve_bootmem(unsigned long addr, unsigned long size, 511int __init reserve_bootmem(unsigned long addr, unsigned long size,
425 int flags) 512 int flags)
426{ 513{
514#ifdef CONFIG_NO_BOOTMEM
515 panic("no bootmem");
516 return 0;
517#else
427 unsigned long start, end; 518 unsigned long start, end;
428 519
429 start = PFN_DOWN(addr); 520 start = PFN_DOWN(addr);
430 end = PFN_UP(addr + size); 521 end = PFN_UP(addr + size);
431 522
432 return mark_bootmem(start, end, 1, flags); 523 return mark_bootmem(start, end, 1, flags);
524#endif
433} 525}
434 526
527#ifndef CONFIG_NO_BOOTMEM
435static unsigned long __init align_idx(struct bootmem_data *bdata, 528static unsigned long __init align_idx(struct bootmem_data *bdata,
436 unsigned long idx, unsigned long step) 529 unsigned long idx, unsigned long step)
437{ 530{
@@ -582,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
582#endif 675#endif
583 return NULL; 676 return NULL;
584} 677}
678#endif
585 679
586static void * __init ___alloc_bootmem_nopanic(unsigned long size, 680static void * __init ___alloc_bootmem_nopanic(unsigned long size,
587 unsigned long align, 681 unsigned long align,
588 unsigned long goal, 682 unsigned long goal,
589 unsigned long limit) 683 unsigned long limit)
590{ 684{
685#ifdef CONFIG_NO_BOOTMEM
686 void *ptr;
687
688 if (WARN_ON_ONCE(slab_is_available()))
689 return kzalloc(size, GFP_NOWAIT);
690
691restart:
692
693 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
694
695 if (ptr)
696 return ptr;
697
698 if (goal != 0) {
699 goal = 0;
700 goto restart;
701 }
702
703 return NULL;
704#else
591 bootmem_data_t *bdata; 705 bootmem_data_t *bdata;
592 void *region; 706 void *region;
593 707
@@ -613,6 +727,7 @@ restart:
613 } 727 }
614 728
615 return NULL; 729 return NULL;
730#endif
616} 731}
617 732
618/** 733/**
@@ -631,7 +746,13 @@ restart:
631void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 746void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
632 unsigned long goal) 747 unsigned long goal)
633{ 748{
634 return ___alloc_bootmem_nopanic(size, align, goal, 0); 749 unsigned long limit = 0;
750
751#ifdef CONFIG_NO_BOOTMEM
752 limit = -1UL;
753#endif
754
755 return ___alloc_bootmem_nopanic(size, align, goal, limit);
635} 756}
636 757
637static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, 758static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -665,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
665void * __init __alloc_bootmem(unsigned long size, unsigned long align, 786void * __init __alloc_bootmem(unsigned long size, unsigned long align,
666 unsigned long goal) 787 unsigned long goal)
667{ 788{
668 return ___alloc_bootmem(size, align, goal, 0); 789 unsigned long limit = 0;
790
791#ifdef CONFIG_NO_BOOTMEM
792 limit = -1UL;
793#endif
794
795 return ___alloc_bootmem(size, align, goal, limit);
669} 796}
670 797
798#ifndef CONFIG_NO_BOOTMEM
671static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 799static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
672 unsigned long size, unsigned long align, 800 unsigned long size, unsigned long align,
673 unsigned long goal, unsigned long limit) 801 unsigned long goal, unsigned long limit)
@@ -684,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
684 812
685 return ___alloc_bootmem(size, align, goal, limit); 813 return ___alloc_bootmem(size, align, goal, limit);
686} 814}
815#endif
687 816
688/** 817/**
689 * __alloc_bootmem_node - allocate boot memory from a specific node 818 * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -706,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
706 if (WARN_ON_ONCE(slab_is_available())) 835 if (WARN_ON_ONCE(slab_is_available()))
707 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 836 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
708 837
838#ifdef CONFIG_NO_BOOTMEM
839 return __alloc_memory_core_early(pgdat->node_id, size, align,
840 goal, -1ULL);
841#else
709 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 842 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
843#endif
844}
845
846void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
847 unsigned long align, unsigned long goal)
848{
849#ifdef MAX_DMA32_PFN
850 unsigned long end_pfn;
851
852 if (WARN_ON_ONCE(slab_is_available()))
853 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
854
855 /* update goal according ...MAX_DMA32_PFN */
856 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
857
858 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
859 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
860 void *ptr;
861 unsigned long new_goal;
862
863 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
864#ifdef CONFIG_NO_BOOTMEM
865 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
866 new_goal, -1ULL);
867#else
868 ptr = alloc_bootmem_core(pgdat->bdata, size, align,
869 new_goal, 0);
870#endif
871 if (ptr)
872 return ptr;
873 }
874#endif
875
876 return __alloc_bootmem_node(pgdat, size, align, goal);
877
710} 878}
711 879
712#ifdef CONFIG_SPARSEMEM 880#ifdef CONFIG_SPARSEMEM
@@ -720,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
720void * __init alloc_bootmem_section(unsigned long size, 888void * __init alloc_bootmem_section(unsigned long size,
721 unsigned long section_nr) 889 unsigned long section_nr)
722{ 890{
891#ifdef CONFIG_NO_BOOTMEM
892 unsigned long pfn, goal, limit;
893
894 pfn = section_nr_to_pfn(section_nr);
895 goal = pfn << PAGE_SHIFT;
896 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
897
898 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
899 SMP_CACHE_BYTES, goal, limit);
900#else
723 bootmem_data_t *bdata; 901 bootmem_data_t *bdata;
724 unsigned long pfn, goal, limit; 902 unsigned long pfn, goal, limit;
725 903
@@ -729,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size,
729 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 907 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
730 908
731 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 909 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
910#endif
732} 911}
733#endif 912#endif
734 913
@@ -740,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
740 if (WARN_ON_ONCE(slab_is_available())) 919 if (WARN_ON_ONCE(slab_is_available()))
741 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 920 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
742 921
922#ifdef CONFIG_NO_BOOTMEM
923 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
924 goal, -1ULL);
925#else
743 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 926 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
744 if (ptr) 927 if (ptr)
745 return ptr; 928 return ptr;
746 929
747 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 930 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
931#endif
748 if (ptr) 932 if (ptr)
749 return ptr; 933 return ptr;
750 934
@@ -795,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
795 if (WARN_ON_ONCE(slab_is_available())) 979 if (WARN_ON_ONCE(slab_is_available()))
796 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 980 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
797 981
982#ifdef CONFIG_NO_BOOTMEM
983 return __alloc_memory_core_early(pgdat->node_id, size, align,
984 goal, ARCH_LOW_ADDRESS_LIMIT);
985#else
798 return ___alloc_bootmem_node(pgdat->bdata, size, align, 986 return ___alloc_bootmem_node(pgdat->bdata, size, align,
799 goal, ARCH_LOW_ADDRESS_LIMIT); 987 goal, ARCH_LOW_ADDRESS_LIMIT);
988#endif
800} 989}
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..bb41f98dd8b7 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,22 @@
1#include <linux/fault-inject.h> 1#include <linux/fault-inject.h>
2#include <linux/gfp.h> 2#include <linux/gfp.h>
3#include <linux/slab.h>
3 4
4static struct { 5static struct {
5 struct fault_attr attr; 6 struct fault_attr attr;
6 u32 ignore_gfp_wait; 7 u32 ignore_gfp_wait;
8 int cache_filter;
7#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 9#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
8 struct dentry *ignore_gfp_wait_file; 10 struct dentry *ignore_gfp_wait_file;
11 struct dentry *cache_filter_file;
9#endif 12#endif
10} failslab = { 13} failslab = {
11 .attr = FAULT_ATTR_INITIALIZER, 14 .attr = FAULT_ATTR_INITIALIZER,
12 .ignore_gfp_wait = 1, 15 .ignore_gfp_wait = 1,
16 .cache_filter = 0,
13}; 17};
14 18
15bool should_failslab(size_t size, gfp_t gfpflags) 19bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
16{ 20{
17 if (gfpflags & __GFP_NOFAIL) 21 if (gfpflags & __GFP_NOFAIL)
18 return false; 22 return false;
@@ -20,6 +24,9 @@ bool should_failslab(size_t size, gfp_t gfpflags)
20 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) 24 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
21 return false; 25 return false;
22 26
27 if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
28 return false;
29
23 return should_fail(&failslab.attr, size); 30 return should_fail(&failslab.attr, size);
24} 31}
25 32
@@ -30,7 +37,6 @@ static int __init setup_failslab(char *str)
30__setup("failslab=", setup_failslab); 37__setup("failslab=", setup_failslab);
31 38
32#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 39#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
33
34static int __init failslab_debugfs_init(void) 40static int __init failslab_debugfs_init(void)
35{ 41{
36 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 42 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
@@ -46,8 +52,14 @@ static int __init failslab_debugfs_init(void)
46 debugfs_create_bool("ignore-gfp-wait", mode, dir, 52 debugfs_create_bool("ignore-gfp-wait", mode, dir,
47 &failslab.ignore_gfp_wait); 53 &failslab.ignore_gfp_wait);
48 54
49 if (!failslab.ignore_gfp_wait_file) { 55 failslab.cache_filter_file =
56 debugfs_create_bool("cache-filter", mode, dir,
57 &failslab.cache_filter);
58
59 if (!failslab.ignore_gfp_wait_file ||
60 !failslab.cache_filter_file) {
50 err = -ENOMEM; 61 err = -ENOMEM;
62 debugfs_remove(failslab.cache_filter_file);
51 debugfs_remove(failslab.ignore_gfp_wait_file); 63 debugfs_remove(failslab.ignore_gfp_wait_file);
52 cleanup_fault_attr_dentries(&failslab.attr); 64 cleanup_fault_attr_dentries(&failslab.attr);
53 } 65 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0eb6cb..698ea80f2102 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1634static struct page *__read_cache_page(struct address_space *mapping, 1634static struct page *__read_cache_page(struct address_space *mapping,
1635 pgoff_t index, 1635 pgoff_t index,
1636 int (*filler)(void *,struct page*), 1636 int (*filler)(void *,struct page*),
1637 void *data) 1637 void *data,
1638 gfp_t gfp)
1638{ 1639{
1639 struct page *page; 1640 struct page *page;
1640 int err; 1641 int err;
1641repeat: 1642repeat:
1642 page = find_get_page(mapping, index); 1643 page = find_get_page(mapping, index);
1643 if (!page) { 1644 if (!page) {
1644 page = page_cache_alloc_cold(mapping); 1645 page = __page_cache_alloc(gfp | __GFP_COLD);
1645 if (!page) 1646 if (!page)
1646 return ERR_PTR(-ENOMEM); 1647 return ERR_PTR(-ENOMEM);
1647 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1648 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1661,31 +1662,18 @@ repeat:
1661 return page; 1662 return page;
1662} 1663}
1663 1664
1664/** 1665static struct page *do_read_cache_page(struct address_space *mapping,
1665 * read_cache_page_async - read into page cache, fill it if needed
1666 * @mapping: the page's address_space
1667 * @index: the page index
1668 * @filler: function to perform the read
1669 * @data: destination for read data
1670 *
1671 * Same as read_cache_page, but don't wait for page to become unlocked
1672 * after submitting it to the filler.
1673 *
1674 * Read into the page cache. If a page already exists, and PageUptodate() is
1675 * not set, try to fill the page but don't wait for it to become unlocked.
1676 *
1677 * If the page does not get brought uptodate, return -EIO.
1678 */
1679struct page *read_cache_page_async(struct address_space *mapping,
1680 pgoff_t index, 1666 pgoff_t index,
1681 int (*filler)(void *,struct page*), 1667 int (*filler)(void *,struct page*),
1682 void *data) 1668 void *data,
1669 gfp_t gfp)
1670
1683{ 1671{
1684 struct page *page; 1672 struct page *page;
1685 int err; 1673 int err;
1686 1674
1687retry: 1675retry:
1688 page = __read_cache_page(mapping, index, filler, data); 1676 page = __read_cache_page(mapping, index, filler, data, gfp);
1689 if (IS_ERR(page)) 1677 if (IS_ERR(page))
1690 return page; 1678 return page;
1691 if (PageUptodate(page)) 1679 if (PageUptodate(page))
@@ -1710,8 +1698,67 @@ out:
1710 mark_page_accessed(page); 1698 mark_page_accessed(page);
1711 return page; 1699 return page;
1712} 1700}
1701
1702/**
1703 * read_cache_page_async - read into page cache, fill it if needed
1704 * @mapping: the page's address_space
1705 * @index: the page index
1706 * @filler: function to perform the read
1707 * @data: destination for read data
1708 *
1709 * Same as read_cache_page, but don't wait for page to become unlocked
1710 * after submitting it to the filler.
1711 *
1712 * Read into the page cache. If a page already exists, and PageUptodate() is
1713 * not set, try to fill the page but don't wait for it to become unlocked.
1714 *
1715 * If the page does not get brought uptodate, return -EIO.
1716 */
1717struct page *read_cache_page_async(struct address_space *mapping,
1718 pgoff_t index,
1719 int (*filler)(void *,struct page*),
1720 void *data)
1721{
1722 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
1723}
1713EXPORT_SYMBOL(read_cache_page_async); 1724EXPORT_SYMBOL(read_cache_page_async);
1714 1725
1726static struct page *wait_on_page_read(struct page *page)
1727{
1728 if (!IS_ERR(page)) {
1729 wait_on_page_locked(page);
1730 if (!PageUptodate(page)) {
1731 page_cache_release(page);
1732 page = ERR_PTR(-EIO);
1733 }
1734 }
1735 return page;
1736}
1737
1738/**
1739 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
1740 * @mapping: the page's address_space
1741 * @index: the page index
1742 * @gfp: the page allocator flags to use if allocating
1743 *
1744 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1745 * any new page allocations done using the specified allocation flags. Note
1746 * that the Radix tree operations will still use GFP_KERNEL, so you can't
1747 * expect to do this atomically or anything like that - but you can pass in
1748 * other page requirements.
1749 *
1750 * If the page does not get brought uptodate, return -EIO.
1751 */
1752struct page *read_cache_page_gfp(struct address_space *mapping,
1753 pgoff_t index,
1754 gfp_t gfp)
1755{
1756 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
1757
1758 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
1759}
1760EXPORT_SYMBOL(read_cache_page_gfp);
1761
1715/** 1762/**
1716 * read_cache_page - read into page cache, fill it if needed 1763 * read_cache_page - read into page cache, fill it if needed
1717 * @mapping: the page's address_space 1764 * @mapping: the page's address_space
@@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
1729 int (*filler)(void *,struct page*), 1776 int (*filler)(void *,struct page*),
1730 void *data) 1777 void *data)
1731{ 1778{
1732 struct page *page; 1779 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
1733
1734 page = read_cache_page_async(mapping, index, filler, data);
1735 if (IS_ERR(page))
1736 goto out;
1737 wait_on_page_locked(page);
1738 if (!PageUptodate(page)) {
1739 page_cache_release(page);
1740 page = ERR_PTR(-EIO);
1741 }
1742 out:
1743 return page;
1744} 1780}
1745EXPORT_SYMBOL(read_cache_page); 1781EXPORT_SYMBOL(read_cache_page);
1746 1782
@@ -2196,6 +2232,9 @@ again:
2196 if (unlikely(status)) 2232 if (unlikely(status))
2197 break; 2233 break;
2198 2234
2235 if (mapping_writably_mapped(mapping))
2236 flush_dcache_page(page);
2237
2199 pagefault_disable(); 2238 pagefault_disable();
2200 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2239 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2201 pagefault_enable(); 2240 pagefault_enable();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65f38c218207..3a5aeb37c110 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page,
402{ 402{
403 int i; 403 int i;
404 404
405 if (unlikely(sz > MAX_ORDER_NR_PAGES)) { 405 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
406 clear_gigantic_page(page, addr, sz); 406 clear_gigantic_page(page, addr, sz);
407 return; 407 return;
408 } 408 }
@@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = {
1515 .attrs = hstate_attrs, 1515 .attrs = hstate_attrs,
1516}; 1516};
1517 1517
1518static int __init hugetlb_sysfs_add_hstate(struct hstate *h, 1518static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1519 struct kobject *parent, 1519 struct kobject **hstate_kobjs,
1520 struct kobject **hstate_kobjs, 1520 struct attribute_group *hstate_attr_group)
1521 struct attribute_group *hstate_attr_group)
1522{ 1521{
1523 int retval; 1522 int retval;
1524 int hi = h - hstates; 1523 int hi = h - hstates;
@@ -2088,7 +2087,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2088 2087
2089 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2088 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2090 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2089 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
2091 update_mmu_cache(vma, address, entry); 2090 update_mmu_cache(vma, address, ptep);
2092 } 2091 }
2093} 2092}
2094 2093
@@ -2559,7 +2558,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2559 entry = pte_mkyoung(entry); 2558 entry = pte_mkyoung(entry);
2560 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 2559 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2561 flags & FAULT_FLAG_WRITE)) 2560 flags & FAULT_FLAG_WRITE))
2562 update_mmu_cache(vma, address, entry); 2561 update_mmu_cache(vma, address, ptep);
2563 2562
2564out_page_table_lock: 2563out_page_table_lock:
2565 spin_unlock(&mm->page_table_lock); 2564 spin_unlock(&mm->page_table_lock);
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25f..4e348dbaecd7 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
14 * Safely read from address @src to the buffer at @dst. If a kernel fault 14 * Safely read from address @src to the buffer at @dst. If a kernel fault
15 * happens, handle that and return -EFAULT. 15 * happens, handle that and return -EFAULT.
16 */ 16 */
17long probe_kernel_read(void *dst, void *src, size_t size) 17
18long __weak probe_kernel_read(void *dst, void *src, size_t size)
19 __attribute__((alias("__probe_kernel_read")));
20
21long __probe_kernel_read(void *dst, void *src, size_t size)
18{ 22{
19 long ret; 23 long ret;
20 mm_segment_t old_fs = get_fs(); 24 mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
39 * Safely write to address @dst from the buffer at @src. If a kernel fault 43 * Safely write to address @dst from the buffer at @src. If a kernel fault
40 * happens, handle that and return -EFAULT. 44 * happens, handle that and return -EFAULT.
41 */ 45 */
42long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) 46long __weak probe_kernel_write(void *dst, void *src, size_t size)
47 __attribute__((alias("__probe_kernel_write")));
48
49long __probe_kernel_write(void *dst, void *src, size_t size)
43{ 50{
44 long ret; 51 long ret;
45 mm_segment_t old_fs = get_fs(); 52 mm_segment_t old_fs = get_fs();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 488b644e0e8e..954032b80bed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2586,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2586 if (free_all) 2586 if (free_all)
2587 goto try_to_free; 2587 goto try_to_free;
2588move_account: 2588move_account:
2589 while (mem->res.usage > 0) { 2589 do {
2590 ret = -EBUSY; 2590 ret = -EBUSY;
2591 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2591 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2592 goto out; 2592 goto out;
@@ -2614,8 +2614,8 @@ move_account:
2614 if (ret == -ENOMEM) 2614 if (ret == -ENOMEM)
2615 goto try_to_free; 2615 goto try_to_free;
2616 cond_resched(); 2616 cond_resched();
2617 } 2617 /* "ret" should also be checked to ensure all lists are empty. */
2618 ret = 0; 2618 } while (mem->res.usage > 0 || ret);
2619out: 2619out:
2620 css_put(&mem->css); 2620 css_put(&mem->css);
2621 return ret; 2621 return ret;
@@ -2648,10 +2648,7 @@ try_to_free:
2648 } 2648 }
2649 lru_add_drain(); 2649 lru_add_drain();
2650 /* try move_account...there may be some *locked* pages. */ 2650 /* try move_account...there may be some *locked* pages. */
2651 if (mem->res.usage) 2651 goto move_account;
2652 goto move_account;
2653 ret = 0;
2654 goto out;
2655} 2652}
2656 2653
2657int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2654int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6a0466ed5bfd..17299fd4577c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,8 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
52 52
53atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 53atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
54 54
55#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
56
55u32 hwpoison_filter_enable = 0; 57u32 hwpoison_filter_enable = 0;
56u32 hwpoison_filter_dev_major = ~0U; 58u32 hwpoison_filter_dev_major = ~0U;
57u32 hwpoison_filter_dev_minor = ~0U; 59u32 hwpoison_filter_dev_minor = ~0U;
@@ -164,6 +166,13 @@ int hwpoison_filter(struct page *p)
164 166
165 return 0; 167 return 0;
166} 168}
169#else
170int hwpoison_filter(struct page *p)
171{
172 return 0;
173}
174#endif
175
167EXPORT_SYMBOL_GPL(hwpoison_filter); 176EXPORT_SYMBOL_GPL(hwpoison_filter);
168 177
169/* 178/*
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1be7b67..72fb5f39bccc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1593,7 +1593,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1593 /* Ok, finally just insert the thing.. */ 1593 /* Ok, finally just insert the thing.. */
1594 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1594 entry = pte_mkspecial(pfn_pte(pfn, prot));
1595 set_pte_at(mm, addr, pte, entry); 1595 set_pte_at(mm, addr, pte, entry);
1596 update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ 1596 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1597 1597
1598 retval = 0; 1598 retval = 0;
1599out_unlock: 1599out_unlock:
@@ -2116,7 +2116,7 @@ reuse:
2116 entry = pte_mkyoung(orig_pte); 2116 entry = pte_mkyoung(orig_pte);
2117 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2117 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2118 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2118 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2119 update_mmu_cache(vma, address, entry); 2119 update_mmu_cache(vma, address, page_table);
2120 ret |= VM_FAULT_WRITE; 2120 ret |= VM_FAULT_WRITE;
2121 goto unlock; 2121 goto unlock;
2122 } 2122 }
@@ -2185,7 +2185,7 @@ gotten:
2185 * new page to be mapped directly into the secondary page table. 2185 * new page to be mapped directly into the secondary page table.
2186 */ 2186 */
2187 set_pte_at_notify(mm, address, page_table, entry); 2187 set_pte_at_notify(mm, address, page_table, entry);
2188 update_mmu_cache(vma, address, entry); 2188 update_mmu_cache(vma, address, page_table);
2189 if (old_page) { 2189 if (old_page) {
2190 /* 2190 /*
2191 * Only after switching the pte to the new page may 2191 * Only after switching the pte to the new page may
@@ -2629,7 +2629,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2629 } 2629 }
2630 2630
2631 /* No need to invalidate - it was non-present before */ 2631 /* No need to invalidate - it was non-present before */
2632 update_mmu_cache(vma, address, pte); 2632 update_mmu_cache(vma, address, page_table);
2633unlock: 2633unlock:
2634 pte_unmap_unlock(page_table, ptl); 2634 pte_unmap_unlock(page_table, ptl);
2635out: 2635out:
@@ -2694,7 +2694,7 @@ setpte:
2694 set_pte_at(mm, address, page_table, entry); 2694 set_pte_at(mm, address, page_table, entry);
2695 2695
2696 /* No need to invalidate - it was non-present before */ 2696 /* No need to invalidate - it was non-present before */
2697 update_mmu_cache(vma, address, entry); 2697 update_mmu_cache(vma, address, page_table);
2698unlock: 2698unlock:
2699 pte_unmap_unlock(page_table, ptl); 2699 pte_unmap_unlock(page_table, ptl);
2700 return 0; 2700 return 0;
@@ -2855,7 +2855,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2855 set_pte_at(mm, address, page_table, entry); 2855 set_pte_at(mm, address, page_table, entry);
2856 2856
2857 /* no need to invalidate: a not-present page won't be cached */ 2857 /* no need to invalidate: a not-present page won't be cached */
2858 update_mmu_cache(vma, address, entry); 2858 update_mmu_cache(vma, address, page_table);
2859 } else { 2859 } else {
2860 if (charged) 2860 if (charged)
2861 mem_cgroup_uncharge_page(page); 2861 mem_cgroup_uncharge_page(page);
@@ -2992,7 +2992,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2992 } 2992 }
2993 entry = pte_mkyoung(entry); 2993 entry = pte_mkyoung(entry);
2994 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 2994 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
2995 update_mmu_cache(vma, address, entry); 2995 update_mmu_cache(vma, address, pte);
2996 } else { 2996 } else {
2997 /* 2997 /*
2998 * This is needed only for protection faults but the arch code 2998 * This is needed only for protection faults but the arch code
diff --git a/mm/migrate.c b/mm/migrate.c
index efddbf0926b2..edb6101ed774 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -134,7 +134,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
134 page_add_file_rmap(new); 134 page_add_file_rmap(new);
135 135
136 /* No need to invalidate - it was non-present before */ 136 /* No need to invalidate - it was non-present before */
137 update_mmu_cache(vma, addr, pte); 137 update_mmu_cache(vma, addr, ptep);
138unlock: 138unlock:
139 pte_unmap_unlock(ptep, ptl); 139 pte_unmap_unlock(ptep, ptl);
140out: 140out:
@@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
912 goto out_pm; 912 goto out_pm;
913 913
914 err = -ENODEV; 914 err = -ENODEV;
915 if (node < 0 || node >= MAX_NUMNODES)
916 goto out_pm;
917
915 if (!node_state(node, N_HIGH_MEMORY)) 918 if (!node_state(node, N_HIGH_MEMORY))
916 goto out_pm; 919 goto out_pm;
917 920
@@ -999,33 +1002,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
999#define DO_PAGES_STAT_CHUNK_NR 16 1002#define DO_PAGES_STAT_CHUNK_NR 16
1000 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; 1003 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1001 int chunk_status[DO_PAGES_STAT_CHUNK_NR]; 1004 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1002 unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1003 int err;
1004 1005
1005 for (i = 0; i < nr_pages; i += chunk_nr) { 1006 while (nr_pages) {
1006 if (chunk_nr > nr_pages - i) 1007 unsigned long chunk_nr;
1007 chunk_nr = nr_pages - i;
1008 1008
1009 err = copy_from_user(chunk_pages, &pages[i], 1009 chunk_nr = nr_pages;
1010 chunk_nr * sizeof(*chunk_pages)); 1010 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1011 if (err) { 1011 chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1012 err = -EFAULT; 1012
1013 goto out; 1013 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1014 } 1014 break;
1015 1015
1016 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); 1016 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1017 1017
1018 err = copy_to_user(&status[i], chunk_status, 1018 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1019 chunk_nr * sizeof(*chunk_status)); 1019 break;
1020 if (err) {
1021 err = -EFAULT;
1022 goto out;
1023 }
1024 }
1025 err = 0;
1026 1020
1027out: 1021 pages += chunk_nr;
1028 return err; 1022 status += chunk_nr;
1023 nr_pages -= chunk_nr;
1024 }
1025 return nr_pages ? -EFAULT : 0;
1029} 1026}
1030 1027
1031/* 1028/*
diff --git a/mm/mmap.c b/mm/mmap.c
index d9c77b2dbe9d..ee2298936fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1043} 1043}
1044EXPORT_SYMBOL(do_mmap_pgoff); 1044EXPORT_SYMBOL(do_mmap_pgoff);
1045 1045
1046SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1047 unsigned long, prot, unsigned long, flags,
1048 unsigned long, fd, unsigned long, pgoff)
1049{
1050 struct file *file = NULL;
1051 unsigned long retval = -EBADF;
1052
1053 if (!(flags & MAP_ANONYMOUS)) {
1054 if (unlikely(flags & MAP_HUGETLB))
1055 return -EINVAL;
1056 file = fget(fd);
1057 if (!file)
1058 goto out;
1059 } else if (flags & MAP_HUGETLB) {
1060 struct user_struct *user = NULL;
1061 /*
1062 * VM_NORESERVE is used because the reservations will be
1063 * taken when vm_ops->mmap() is called
1064 * A dummy user value is used because we are not locking
1065 * memory so no accounting is necessary
1066 */
1067 len = ALIGN(len, huge_page_size(&default_hstate));
1068 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
1069 &user, HUGETLB_ANONHUGE_INODE);
1070 if (IS_ERR(file))
1071 return PTR_ERR(file);
1072 }
1073
1074 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1075
1076 down_write(&current->mm->mmap_sem);
1077 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1078 up_write(&current->mm->mmap_sem);
1079
1080 if (file)
1081 fput(file);
1082out:
1083 return retval;
1084}
1085
1046/* 1086/*
1047 * Some shared mappigns will want the pages marked read-only 1087 * Some shared mappigns will want the pages marked read-only
1048 * to track write events. If so, we'll downgrade vm_page_prot 1088 * to track write events. If so, we'll downgrade vm_page_prot
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index ded9081f4021..0777654147c9 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmu_context.h> 7#include <linux/mmu_context.h>
8#include <linux/module.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
9 10
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm)
37 if (active_mm != mm) 38 if (active_mm != mm)
38 mmdrop(active_mm); 39 mmdrop(active_mm);
39} 40}
41EXPORT_SYMBOL_GPL(use_mm);
40 42
41/* 43/*
42 * unuse_mm 44 * unuse_mm
@@ -56,3 +58,4 @@ void unuse_mm(struct mm_struct *mm)
56 enter_lazy_tlb(mm, tsk); 58 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk); 59 task_unlock(tsk);
58} 60}
61EXPORT_SYMBOL_GPL(unuse_mm);
diff --git a/mm/nommu.c b/mm/nommu.c
index 8687973462bb..48a2ecfaf059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
432 /* 432 /*
433 * Ok, looks good - let it rip. 433 * Ok, looks good - let it rip.
434 */ 434 */
435 flush_icache_range(mm->brk, brk);
435 return mm->brk = brk; 436 return mm->brk = brk;
436} 437}
437 438
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
551static void __put_nommu_region(struct vm_region *region) 552static void __put_nommu_region(struct vm_region *region)
552 __releases(nommu_region_sem) 553 __releases(nommu_region_sem)
553{ 554{
554 kenter("%p{%d}", region, atomic_read(&region->vm_usage)); 555 kenter("%p{%d}", region, region->vm_usage);
555 556
556 BUG_ON(!nommu_region_tree.rb_node); 557 BUG_ON(!nommu_region_tree.rb_node);
557 558
558 if (atomic_dec_and_test(&region->vm_usage)) { 559 if (--region->vm_usage == 0) {
559 if (region->vm_top > region->vm_start) 560 if (region->vm_top > region->vm_start)
560 delete_nommu_region(region); 561 delete_nommu_region(region);
561 up_write(&nommu_region_sem); 562 up_write(&nommu_region_sem);
@@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1204 if (!vma) 1205 if (!vma)
1205 goto error_getting_vma; 1206 goto error_getting_vma;
1206 1207
1207 atomic_set(&region->vm_usage, 1); 1208 region->vm_usage = 1;
1208 region->vm_flags = vm_flags; 1209 region->vm_flags = vm_flags;
1209 region->vm_pgoff = pgoff; 1210 region->vm_pgoff = pgoff;
1210 1211
@@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1271 } 1272 }
1272 1273
1273 /* we've found a region we can share */ 1274 /* we've found a region we can share */
1274 atomic_inc(&pregion->vm_usage); 1275 pregion->vm_usage++;
1275 vma->vm_region = pregion; 1276 vma->vm_region = pregion;
1276 start = pregion->vm_start; 1277 start = pregion->vm_start;
1277 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; 1278 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1288 vma->vm_region = NULL; 1289 vma->vm_region = NULL;
1289 vma->vm_start = 0; 1290 vma->vm_start = 0;
1290 vma->vm_end = 0; 1291 vma->vm_end = 0;
1291 atomic_dec(&pregion->vm_usage); 1292 pregion->vm_usage--;
1292 pregion = NULL; 1293 pregion = NULL;
1293 goto error_just_free; 1294 goto error_just_free;
1294 } 1295 }
@@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file,
1353share: 1354share:
1354 add_vma_to_mm(current->mm, vma); 1355 add_vma_to_mm(current->mm, vma);
1355 1356
1356 up_write(&nommu_region_sem); 1357 /* we flush the region from the icache only when the first executable
1358 * mapping of it is made */
1359 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1360 flush_icache_range(region->vm_start, region->vm_end);
1361 region->vm_icache_flushed = true;
1362 }
1357 1363
1358 if (prot & PROT_EXEC) 1364 up_write(&nommu_region_sem);
1359 flush_icache_range(result, result + len);
1360 1365
1361 kleave(" = %lx", result); 1366 kleave(" = %lx", result);
1362 return result; 1367 return result;
@@ -1398,6 +1403,31 @@ error_getting_region:
1398} 1403}
1399EXPORT_SYMBOL(do_mmap_pgoff); 1404EXPORT_SYMBOL(do_mmap_pgoff);
1400 1405
1406SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1407 unsigned long, prot, unsigned long, flags,
1408 unsigned long, fd, unsigned long, pgoff)
1409{
1410 struct file *file = NULL;
1411 unsigned long retval = -EBADF;
1412
1413 if (!(flags & MAP_ANONYMOUS)) {
1414 file = fget(fd);
1415 if (!file)
1416 goto out;
1417 }
1418
1419 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1420
1421 down_write(&current->mm->mmap_sem);
1422 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1423 up_write(&current->mm->mmap_sem);
1424
1425 if (file)
1426 fput(file);
1427out:
1428 return retval;
1429}
1430
1401/* 1431/*
1402 * split a vma into two pieces at address 'addr', a new vma is allocated either 1432 * split a vma into two pieces at address 'addr', a new vma is allocated either
1403 * for the first part or the tail. 1433 * for the first part or the tail.
@@ -1411,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1411 1441
1412 kenter(""); 1442 kenter("");
1413 1443
1414 /* we're only permitted to split anonymous regions that have a single 1444 /* we're only permitted to split anonymous regions (these should have
1415 * owner */ 1445 * only a single usage on the region) */
1416 if (vma->vm_file || 1446 if (vma->vm_file)
1417 atomic_read(&vma->vm_region->vm_usage) != 1)
1418 return -ENOMEM; 1447 return -ENOMEM;
1419 1448
1420 if (mm->map_count >= sysctl_max_map_count) 1449 if (mm->map_count >= sysctl_max_map_count)
@@ -1488,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm,
1488 1517
1489 /* cut the backing region down to size */ 1518 /* cut the backing region down to size */
1490 region = vma->vm_region; 1519 region = vma->vm_region;
1491 BUG_ON(atomic_read(&region->vm_usage) != 1); 1520 BUG_ON(region->vm_usage != 1);
1492 1521
1493 down_write(&nommu_region_sem); 1522 down_write(&nommu_region_sem);
1494 delete_nommu_region(region); 1523 delete_nommu_region(region);
@@ -1732,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping,
1732EXPORT_SYMBOL(unmap_mapping_range); 1761EXPORT_SYMBOL(unmap_mapping_range);
1733 1762
1734/* 1763/*
1735 * ask for an unmapped area at which to create a mapping on a file
1736 */
1737unsigned long get_unmapped_area(struct file *file, unsigned long addr,
1738 unsigned long len, unsigned long pgoff,
1739 unsigned long flags)
1740{
1741 unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
1742 unsigned long, unsigned long);
1743
1744 get_area = current->mm->get_unmapped_area;
1745 if (file && file->f_op && file->f_op->get_unmapped_area)
1746 get_area = file->f_op->get_unmapped_area;
1747
1748 if (!get_area)
1749 return -ENOSYS;
1750
1751 return get_area(file, addr, len, pgoff, flags);
1752}
1753EXPORT_SYMBOL(get_unmapped_area);
1754
1755/*
1756 * Check that a process has enough memory to allocate a new virtual 1764 * Check that a process has enough memory to allocate a new virtual
1757 * mapping. 0 means there is enough memory for the allocation to 1765 * mapping. 0 means there is enough memory for the allocation to
1758 * succeed and -ENOMEM implies there is not. 1766 * succeed and -ENOMEM implies there is not.
@@ -1891,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1891 1899
1892 /* only read or write mappings where it is permitted */ 1900 /* only read or write mappings where it is permitted */
1893 if (write && vma->vm_flags & VM_MAYWRITE) 1901 if (write && vma->vm_flags & VM_MAYWRITE)
1894 len -= copy_to_user((void *) addr, buf, len); 1902 copy_to_user_page(vma, NULL, addr,
1903 (void *) addr, buf, len);
1895 else if (!write && vma->vm_flags & VM_MAYREAD) 1904 else if (!write && vma->vm_flags & VM_MAYREAD)
1896 len -= copy_from_user(buf, (void *) addr, len); 1905 copy_from_user_page(vma, NULL, addr,
1906 buf, (void *) addr, len);
1897 else 1907 else
1898 len = 0; 1908 len = 0;
1899 } else { 1909 } else {
@@ -1904,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1904 mmput(mm); 1914 mmput(mm);
1905 return len; 1915 return len;
1906} 1916}
1917
1918/**
1919 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1920 * @inode: The inode to check
1921 * @size: The current filesize of the inode
1922 * @newsize: The proposed filesize of the inode
1923 *
1924 * Check the shared mappings on an inode on behalf of a shrinking truncate to
1925 * make sure that that any outstanding VMAs aren't broken and then shrink the
1926 * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
1927 * automatically grant mappings that are too large.
1928 */
1929int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1930 size_t newsize)
1931{
1932 struct vm_area_struct *vma;
1933 struct prio_tree_iter iter;
1934 struct vm_region *region;
1935 pgoff_t low, high;
1936 size_t r_size, r_top;
1937
1938 low = newsize >> PAGE_SHIFT;
1939 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1940
1941 down_write(&nommu_region_sem);
1942
1943 /* search for VMAs that fall within the dead zone */
1944 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1945 low, high) {
1946 /* found one - only interested if it's shared out of the page
1947 * cache */
1948 if (vma->vm_flags & VM_SHARED) {
1949 up_write(&nommu_region_sem);
1950 return -ETXTBSY; /* not quite true, but near enough */
1951 }
1952 }
1953
1954 /* reduce any regions that overlap the dead zone - if in existence,
1955 * these will be pointed to by VMAs that don't overlap the dead zone
1956 *
1957 * we don't check for any regions that start beyond the EOF as there
1958 * shouldn't be any
1959 */
1960 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1961 0, ULONG_MAX) {
1962 if (!(vma->vm_flags & VM_SHARED))
1963 continue;
1964
1965 region = vma->vm_region;
1966 r_size = region->vm_top - region->vm_start;
1967 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1968
1969 if (r_top > newsize) {
1970 region->vm_top -= r_top - newsize;
1971 if (region->vm_end > region->vm_top)
1972 region->vm_end = region->vm_top;
1973 }
1974 }
1975
1976 up_write(&nommu_region_sem);
1977 return 0;
1978}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f52481b1c1e5..237050478f28 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -459,6 +459,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
459 list_for_each_entry(c, &p->children, sibling) { 459 list_for_each_entry(c, &p->children, sibling) {
460 if (c->mm == p->mm) 460 if (c->mm == p->mm)
461 continue; 461 continue;
462 if (mem && !task_in_mem_cgroup(c, mem))
463 continue;
462 if (!oom_kill_task(c)) 464 if (!oom_kill_task(c))
463 return 0; 465 return 0;
464 } 466 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 74af449b1f1d..a6b17aa4740b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h>
51#include <trace/events/kmem.h> 52#include <trace/events/kmem.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
@@ -555,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
555 page = list_entry(list->prev, struct page, lru); 556 page = list_entry(list->prev, struct page, lru);
556 /* must delete as __free_one_page list manipulates */ 557 /* must delete as __free_one_page list manipulates */
557 list_del(&page->lru); 558 list_del(&page->lru);
558 __free_one_page(page, zone, 0, migratetype); 559 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
559 trace_mm_page_pcpu_drain(page, 0, migratetype); 560 __free_one_page(page, zone, 0, page_private(page));
561 trace_mm_page_pcpu_drain(page, 0, page_private(page));
560 } while (--count && --batch_free && !list_empty(list)); 562 } while (--count && --batch_free && !list_empty(list));
561 } 563 }
562 spin_unlock(&zone->lock); 564 spin_unlock(&zone->lock);
@@ -1007,10 +1009,10 @@ static void drain_pages(unsigned int cpu)
1007 struct per_cpu_pageset *pset; 1009 struct per_cpu_pageset *pset;
1008 struct per_cpu_pages *pcp; 1010 struct per_cpu_pages *pcp;
1009 1011
1010 pset = zone_pcp(zone, cpu); 1012 local_irq_save(flags);
1013 pset = per_cpu_ptr(zone->pageset, cpu);
1011 1014
1012 pcp = &pset->pcp; 1015 pcp = &pset->pcp;
1013 local_irq_save(flags);
1014 free_pcppages_bulk(zone, pcp->count, pcp); 1016 free_pcppages_bulk(zone, pcp->count, pcp);
1015 pcp->count = 0; 1017 pcp->count = 0;
1016 local_irq_restore(flags); 1018 local_irq_restore(flags);
@@ -1094,7 +1096,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1094 arch_free_page(page, 0); 1096 arch_free_page(page, 0);
1095 kernel_map_pages(page, 1, 0); 1097 kernel_map_pages(page, 1, 0);
1096 1098
1097 pcp = &zone_pcp(zone, get_cpu())->pcp;
1098 migratetype = get_pageblock_migratetype(page); 1099 migratetype = get_pageblock_migratetype(page);
1099 set_page_private(page, migratetype); 1100 set_page_private(page, migratetype);
1100 local_irq_save(flags); 1101 local_irq_save(flags);
@@ -1117,6 +1118,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1117 migratetype = MIGRATE_MOVABLE; 1118 migratetype = MIGRATE_MOVABLE;
1118 } 1119 }
1119 1120
1121 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1120 if (cold) 1122 if (cold)
1121 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1123 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1122 else 1124 else
@@ -1129,7 +1131,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1129 1131
1130out: 1132out:
1131 local_irq_restore(flags); 1133 local_irq_restore(flags);
1132 put_cpu();
1133} 1134}
1134 1135
1135void free_hot_page(struct page *page) 1136void free_hot_page(struct page *page)
@@ -1179,17 +1180,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1179 unsigned long flags; 1180 unsigned long flags;
1180 struct page *page; 1181 struct page *page;
1181 int cold = !!(gfp_flags & __GFP_COLD); 1182 int cold = !!(gfp_flags & __GFP_COLD);
1182 int cpu;
1183 1183
1184again: 1184again:
1185 cpu = get_cpu();
1186 if (likely(order == 0)) { 1185 if (likely(order == 0)) {
1187 struct per_cpu_pages *pcp; 1186 struct per_cpu_pages *pcp;
1188 struct list_head *list; 1187 struct list_head *list;
1189 1188
1190 pcp = &zone_pcp(zone, cpu)->pcp;
1191 list = &pcp->lists[migratetype];
1192 local_irq_save(flags); 1189 local_irq_save(flags);
1190 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1191 list = &pcp->lists[migratetype];
1193 if (list_empty(list)) { 1192 if (list_empty(list)) {
1194 pcp->count += rmqueue_bulk(zone, 0, 1193 pcp->count += rmqueue_bulk(zone, 0,
1195 pcp->batch, list, 1194 pcp->batch, list,
@@ -1221,16 +1220,15 @@ again:
1221 } 1220 }
1222 spin_lock_irqsave(&zone->lock, flags); 1221 spin_lock_irqsave(&zone->lock, flags);
1223 page = __rmqueue(zone, order, migratetype); 1222 page = __rmqueue(zone, order, migratetype);
1224 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1225 spin_unlock(&zone->lock); 1223 spin_unlock(&zone->lock);
1226 if (!page) 1224 if (!page)
1227 goto failed; 1225 goto failed;
1226 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1228 } 1227 }
1229 1228
1230 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1229 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1231 zone_statistics(preferred_zone, zone); 1230 zone_statistics(preferred_zone, zone);
1232 local_irq_restore(flags); 1231 local_irq_restore(flags);
1233 put_cpu();
1234 1232
1235 VM_BUG_ON(bad_range(zone, page)); 1233 VM_BUG_ON(bad_range(zone, page));
1236 if (prep_new_page(page, order, gfp_flags)) 1234 if (prep_new_page(page, order, gfp_flags))
@@ -1239,7 +1237,6 @@ again:
1239 1237
1240failed: 1238failed:
1241 local_irq_restore(flags); 1239 local_irq_restore(flags);
1242 put_cpu();
1243 return NULL; 1240 return NULL;
1244} 1241}
1245 1242
@@ -2178,7 +2175,7 @@ void show_free_areas(void)
2178 for_each_online_cpu(cpu) { 2175 for_each_online_cpu(cpu) {
2179 struct per_cpu_pageset *pageset; 2176 struct per_cpu_pageset *pageset;
2180 2177
2181 pageset = zone_pcp(zone, cpu); 2178 pageset = per_cpu_ptr(zone->pageset, cpu);
2182 2179
2183 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2180 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2184 cpu, pageset->pcp.high, 2181 cpu, pageset->pcp.high,
@@ -2401,13 +2398,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2401{ 2398{
2402 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2399 char saved_string[NUMA_ZONELIST_ORDER_LEN];
2403 int ret; 2400 int ret;
2401 static DEFINE_MUTEX(zl_order_mutex);
2404 2402
2403 mutex_lock(&zl_order_mutex);
2405 if (write) 2404 if (write)
2406 strncpy(saved_string, (char*)table->data, 2405 strcpy(saved_string, (char*)table->data);
2407 NUMA_ZONELIST_ORDER_LEN);
2408 ret = proc_dostring(table, write, buffer, length, ppos); 2406 ret = proc_dostring(table, write, buffer, length, ppos);
2409 if (ret) 2407 if (ret)
2410 return ret; 2408 goto out;
2411 if (write) { 2409 if (write) {
2412 int oldval = user_zonelist_order; 2410 int oldval = user_zonelist_order;
2413 if (__parse_numa_zonelist_order((char*)table->data)) { 2411 if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2420,7 +2418,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2420 } else if (oldval != user_zonelist_order) 2418 } else if (oldval != user_zonelist_order)
2421 build_all_zonelists(); 2419 build_all_zonelists();
2422 } 2420 }
2423 return 0; 2421out:
2422 mutex_unlock(&zl_order_mutex);
2423 return ret;
2424} 2424}
2425 2425
2426 2426
@@ -2740,10 +2740,29 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2740 2740
2741#endif /* CONFIG_NUMA */ 2741#endif /* CONFIG_NUMA */
2742 2742
2743/*
2744 * Boot pageset table. One per cpu which is going to be used for all
2745 * zones and all nodes. The parameters will be set in such a way
2746 * that an item put on a list will immediately be handed over to
2747 * the buddy list. This is safe since pageset manipulation is done
2748 * with interrupts disabled.
2749 *
2750 * The boot_pagesets must be kept even after bootup is complete for
2751 * unused processors and/or zones. They do play a role for bootstrapping
2752 * hotplugged processors.
2753 *
2754 * zoneinfo_show() and maybe other functions do
2755 * not check if the processor is online before following the pageset pointer.
2756 * Other parts of the kernel may not check if the zone is available.
2757 */
2758static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2759static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2760
2743/* return values int ....just for stop_machine() */ 2761/* return values int ....just for stop_machine() */
2744static int __build_all_zonelists(void *dummy) 2762static int __build_all_zonelists(void *dummy)
2745{ 2763{
2746 int nid; 2764 int nid;
2765 int cpu;
2747 2766
2748#ifdef CONFIG_NUMA 2767#ifdef CONFIG_NUMA
2749 memset(node_load, 0, sizeof(node_load)); 2768 memset(node_load, 0, sizeof(node_load));
@@ -2754,6 +2773,23 @@ static int __build_all_zonelists(void *dummy)
2754 build_zonelists(pgdat); 2773 build_zonelists(pgdat);
2755 build_zonelist_cache(pgdat); 2774 build_zonelist_cache(pgdat);
2756 } 2775 }
2776
2777 /*
2778 * Initialize the boot_pagesets that are going to be used
2779 * for bootstrapping processors. The real pagesets for
2780 * each zone will be allocated later when the per cpu
2781 * allocator is available.
2782 *
2783 * boot_pagesets are used also for bootstrapping offline
2784 * cpus if the system is already booted because the pagesets
2785 * are needed to initialize allocators on a specific cpu too.
2786 * F.e. the percpu allocator needs the page allocator which
2787 * needs the percpu allocator in order to allocate its pagesets
2788 * (a chicken-egg dilemma).
2789 */
2790 for_each_possible_cpu(cpu)
2791 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
2792
2757 return 0; 2793 return 0;
2758} 2794}
2759 2795
@@ -3091,121 +3127,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3091 pcp->batch = PAGE_SHIFT * 8; 3127 pcp->batch = PAGE_SHIFT * 8;
3092} 3128}
3093 3129
3094
3095#ifdef CONFIG_NUMA
3096/*
3097 * Boot pageset table. One per cpu which is going to be used for all
3098 * zones and all nodes. The parameters will be set in such a way
3099 * that an item put on a list will immediately be handed over to
3100 * the buddy list. This is safe since pageset manipulation is done
3101 * with interrupts disabled.
3102 *
3103 * Some NUMA counter updates may also be caught by the boot pagesets.
3104 *
3105 * The boot_pagesets must be kept even after bootup is complete for
3106 * unused processors and/or zones. They do play a role for bootstrapping
3107 * hotplugged processors.
3108 *
3109 * zoneinfo_show() and maybe other functions do
3110 * not check if the processor is online before following the pageset pointer.
3111 * Other parts of the kernel may not check if the zone is available.
3112 */
3113static struct per_cpu_pageset boot_pageset[NR_CPUS];
3114
3115/* 3130/*
3116 * Dynamically allocate memory for the 3131 * Allocate per cpu pagesets and initialize them.
3117 * per cpu pageset array in struct zone. 3132 * Before this call only boot pagesets were available.
3133 * Boot pagesets will no longer be used by this processorr
3134 * after setup_per_cpu_pageset().
3118 */ 3135 */
3119static int __cpuinit process_zones(int cpu) 3136void __init setup_per_cpu_pageset(void)
3120{ 3137{
3121 struct zone *zone, *dzone; 3138 struct zone *zone;
3122 int node = cpu_to_node(cpu); 3139 int cpu;
3123
3124 node_set_state(node, N_CPU); /* this node has a cpu */
3125 3140
3126 for_each_populated_zone(zone) { 3141 for_each_populated_zone(zone) {
3127 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 3142 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3128 GFP_KERNEL, node);
3129 if (!zone_pcp(zone, cpu))
3130 goto bad;
3131 3143
3132 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 3144 for_each_possible_cpu(cpu) {
3145 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3133 3146
3134 if (percpu_pagelist_fraction) 3147 setup_pageset(pcp, zone_batchsize(zone));
3135 setup_pagelist_highmark(zone_pcp(zone, cpu),
3136 (zone->present_pages / percpu_pagelist_fraction));
3137 }
3138 3148
3139 return 0; 3149 if (percpu_pagelist_fraction)
3140bad: 3150 setup_pagelist_highmark(pcp,
3141 for_each_zone(dzone) { 3151 (zone->present_pages /
3142 if (!populated_zone(dzone)) 3152 percpu_pagelist_fraction));
3143 continue; 3153 }
3144 if (dzone == zone)
3145 break;
3146 kfree(zone_pcp(dzone, cpu));
3147 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
3148 }
3149 return -ENOMEM;
3150}
3151
3152static inline void free_zone_pagesets(int cpu)
3153{
3154 struct zone *zone;
3155
3156 for_each_zone(zone) {
3157 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
3158
3159 /* Free per_cpu_pageset if it is slab allocated */
3160 if (pset != &boot_pageset[cpu])
3161 kfree(pset);
3162 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3163 } 3154 }
3164} 3155}
3165 3156
3166static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
3167 unsigned long action,
3168 void *hcpu)
3169{
3170 int cpu = (long)hcpu;
3171 int ret = NOTIFY_OK;
3172
3173 switch (action) {
3174 case CPU_UP_PREPARE:
3175 case CPU_UP_PREPARE_FROZEN:
3176 if (process_zones(cpu))
3177 ret = NOTIFY_BAD;
3178 break;
3179 case CPU_UP_CANCELED:
3180 case CPU_UP_CANCELED_FROZEN:
3181 case CPU_DEAD:
3182 case CPU_DEAD_FROZEN:
3183 free_zone_pagesets(cpu);
3184 break;
3185 default:
3186 break;
3187 }
3188 return ret;
3189}
3190
3191static struct notifier_block __cpuinitdata pageset_notifier =
3192 { &pageset_cpuup_callback, NULL, 0 };
3193
3194void __init setup_per_cpu_pageset(void)
3195{
3196 int err;
3197
3198 /* Initialize per_cpu_pageset for cpu 0.
3199 * A cpuup callback will do this for every cpu
3200 * as it comes online
3201 */
3202 err = process_zones(smp_processor_id());
3203 BUG_ON(err);
3204 register_cpu_notifier(&pageset_notifier);
3205}
3206
3207#endif
3208
3209static noinline __init_refok 3157static noinline __init_refok
3210int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3158int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3211{ 3159{
@@ -3259,7 +3207,7 @@ static int __zone_pcp_update(void *data)
3259 struct per_cpu_pageset *pset; 3207 struct per_cpu_pageset *pset;
3260 struct per_cpu_pages *pcp; 3208 struct per_cpu_pages *pcp;
3261 3209
3262 pset = zone_pcp(zone, cpu); 3210 pset = per_cpu_ptr(zone->pageset, cpu);
3263 pcp = &pset->pcp; 3211 pcp = &pset->pcp;
3264 3212
3265 local_irq_save(flags); 3213 local_irq_save(flags);
@@ -3277,21 +3225,17 @@ void zone_pcp_update(struct zone *zone)
3277 3225
3278static __meminit void zone_pcp_init(struct zone *zone) 3226static __meminit void zone_pcp_init(struct zone *zone)
3279{ 3227{
3280 int cpu; 3228 /*
3281 unsigned long batch = zone_batchsize(zone); 3229 * per cpu subsystem is not up at this point. The following code
3230 * relies on the ability of the linker to provide the
3231 * offset of a (static) per cpu variable into the per cpu area.
3232 */
3233 zone->pageset = &boot_pageset;
3282 3234
3283 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3284#ifdef CONFIG_NUMA
3285 /* Early boot. Slab allocator not functional yet */
3286 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3287 setup_pageset(&boot_pageset[cpu],0);
3288#else
3289 setup_pageset(zone_pcp(zone,cpu), batch);
3290#endif
3291 }
3292 if (zone->present_pages) 3235 if (zone->present_pages)
3293 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 3236 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3294 zone->name, zone->present_pages, batch); 3237 zone->name, zone->present_pages,
3238 zone_batchsize(zone));
3295} 3239}
3296 3240
3297__meminit int init_currently_empty_zone(struct zone *zone, 3241__meminit int init_currently_empty_zone(struct zone *zone,
@@ -3430,6 +3374,61 @@ void __init free_bootmem_with_active_regions(int nid,
3430 } 3374 }
3431} 3375}
3432 3376
3377int __init add_from_early_node_map(struct range *range, int az,
3378 int nr_range, int nid)
3379{
3380 int i;
3381 u64 start, end;
3382
3383 /* need to go over early_node_map to find out good range for node */
3384 for_each_active_range_index_in_nid(i, nid) {
3385 start = early_node_map[i].start_pfn;
3386 end = early_node_map[i].end_pfn;
3387 nr_range = add_range(range, az, nr_range, start, end);
3388 }
3389 return nr_range;
3390}
3391
3392#ifdef CONFIG_NO_BOOTMEM
3393void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3394 u64 goal, u64 limit)
3395{
3396 int i;
3397 void *ptr;
3398
3399 /* need to go over early_node_map to find out good range for node */
3400 for_each_active_range_index_in_nid(i, nid) {
3401 u64 addr;
3402 u64 ei_start, ei_last;
3403
3404 ei_last = early_node_map[i].end_pfn;
3405 ei_last <<= PAGE_SHIFT;
3406 ei_start = early_node_map[i].start_pfn;
3407 ei_start <<= PAGE_SHIFT;
3408 addr = find_early_area(ei_start, ei_last,
3409 goal, limit, size, align);
3410
3411 if (addr == -1ULL)
3412 continue;
3413
3414#if 0
3415 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3416 nid,
3417 ei_start, ei_last, goal, limit, size,
3418 align, addr);
3419#endif
3420
3421 ptr = phys_to_virt(addr);
3422 memset(ptr, 0, size);
3423 reserve_early_without_check(addr, addr + size, "BOOTMEM");
3424 return ptr;
3425 }
3426
3427 return NULL;
3428}
3429#endif
3430
3431
3433void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3432void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3434{ 3433{
3435 int i; 3434 int i;
@@ -3579,7 +3578,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3579 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3578 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3580 * then all holes in the requested range will be accounted for. 3579 * then all holes in the requested range will be accounted for.
3581 */ 3580 */
3582static unsigned long __meminit __absent_pages_in_range(int nid, 3581unsigned long __meminit __absent_pages_in_range(int nid,
3583 unsigned long range_start_pfn, 3582 unsigned long range_start_pfn,
3584 unsigned long range_end_pfn) 3583 unsigned long range_end_pfn)
3585{ 3584{
@@ -3994,7 +3993,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3994 } 3993 }
3995 3994
3996 /* Merge backward if suitable */ 3995 /* Merge backward if suitable */
3997 if (start_pfn < early_node_map[i].end_pfn && 3996 if (start_pfn < early_node_map[i].start_pfn &&
3998 end_pfn >= early_node_map[i].start_pfn) { 3997 end_pfn >= early_node_map[i].start_pfn) {
3999 early_node_map[i].start_pfn = start_pfn; 3998 early_node_map[i].start_pfn = start_pfn;
4000 return; 3999 return;
@@ -4108,7 +4107,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
4108} 4107}
4109 4108
4110/* sort the node_map by start_pfn */ 4109/* sort the node_map by start_pfn */
4111static void __init sort_node_map(void) 4110void __init sort_node_map(void)
4112{ 4111{
4113 sort(early_node_map, (size_t)nr_nodemap_entries, 4112 sort(early_node_map, (size_t)nr_nodemap_entries,
4114 sizeof(struct node_active_region), 4113 sizeof(struct node_active_region),
@@ -4462,7 +4461,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4462} 4461}
4463 4462
4464#ifndef CONFIG_NEED_MULTIPLE_NODES 4463#ifndef CONFIG_NEED_MULTIPLE_NODES
4465struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; 4464struct pglist_data __refdata contig_page_data = {
4465#ifndef CONFIG_NO_BOOTMEM
4466 .bdata = &bootmem_node_data[0]
4467#endif
4468 };
4466EXPORT_SYMBOL(contig_page_data); 4469EXPORT_SYMBOL(contig_page_data);
4467#endif 4470#endif
4468 4471
@@ -4805,10 +4808,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4805 if (!write || (ret == -EINVAL)) 4808 if (!write || (ret == -EINVAL))
4806 return ret; 4809 return ret;
4807 for_each_populated_zone(zone) { 4810 for_each_populated_zone(zone) {
4808 for_each_online_cpu(cpu) { 4811 for_each_possible_cpu(cpu) {
4809 unsigned long high; 4812 unsigned long high;
4810 high = zone->present_pages / percpu_pagelist_fraction; 4813 high = zone->present_pages / percpu_pagelist_fraction;
4811 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4814 setup_pagelist_highmark(
4815 per_cpu_ptr(zone->pageset, cpu), high);
4812 } 4816 }
4813 } 4817 }
4814 return 0; 4818 return 0;
@@ -5008,23 +5012,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5008int set_migratetype_isolate(struct page *page) 5012int set_migratetype_isolate(struct page *page)
5009{ 5013{
5010 struct zone *zone; 5014 struct zone *zone;
5011 unsigned long flags; 5015 struct page *curr_page;
5016 unsigned long flags, pfn, iter;
5017 unsigned long immobile = 0;
5018 struct memory_isolate_notify arg;
5019 int notifier_ret;
5012 int ret = -EBUSY; 5020 int ret = -EBUSY;
5013 int zone_idx; 5021 int zone_idx;
5014 5022
5015 zone = page_zone(page); 5023 zone = page_zone(page);
5016 zone_idx = zone_idx(zone); 5024 zone_idx = zone_idx(zone);
5025
5017 spin_lock_irqsave(&zone->lock, flags); 5026 spin_lock_irqsave(&zone->lock, flags);
5027 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5028 zone_idx == ZONE_MOVABLE) {
5029 ret = 0;
5030 goto out;
5031 }
5032
5033 pfn = page_to_pfn(page);
5034 arg.start_pfn = pfn;
5035 arg.nr_pages = pageblock_nr_pages;
5036 arg.pages_found = 0;
5037
5018 /* 5038 /*
5019 * In future, more migrate types will be able to be isolation target. 5039 * It may be possible to isolate a pageblock even if the
5040 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5041 * notifier chain is used by balloon drivers to return the
5042 * number of pages in a range that are held by the balloon
5043 * driver to shrink memory. If all the pages are accounted for
5044 * by balloons, are free, or on the LRU, isolation can continue.
5045 * Later, for example, when memory hotplug notifier runs, these
5046 * pages reported as "can be isolated" should be isolated(freed)
5047 * by the balloon driver through the memory notifier chain.
5020 */ 5048 */
5021 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && 5049 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5022 zone_idx != ZONE_MOVABLE) 5050 notifier_ret = notifier_to_errno(notifier_ret);
5051 if (notifier_ret || !arg.pages_found)
5023 goto out; 5052 goto out;
5024 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5053
5025 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5054 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
5026 ret = 0; 5055 if (!pfn_valid_within(pfn))
5056 continue;
5057
5058 curr_page = pfn_to_page(iter);
5059 if (!page_count(curr_page) || PageLRU(curr_page))
5060 continue;
5061
5062 immobile++;
5063 }
5064
5065 if (arg.pages_found == immobile)
5066 ret = 0;
5067
5027out: 5068out:
5069 if (!ret) {
5070 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5071 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5072 }
5073
5028 spin_unlock_irqrestore(&zone->lock, flags); 5074 spin_unlock_irqrestore(&zone->lock, flags);
5029 if (!ret) 5075 if (!ret)
5030 drain_all_pages(); 5076 drain_all_pages();
diff --git a/mm/percpu.c b/mm/percpu.c
index 442010cc91c6..768419d44ad7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,13 +80,15 @@
80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
81#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
82#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
83 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ 83 (void __percpu *)((unsigned long)(addr) - \
84 + (unsigned long)__per_cpu_start) 84 (unsigned long)pcpu_base_addr + \
85 (unsigned long)__per_cpu_start)
85#endif 86#endif
86#ifndef __pcpu_ptr_to_addr 87#ifndef __pcpu_ptr_to_addr
87#define __pcpu_ptr_to_addr(ptr) \ 88#define __pcpu_ptr_to_addr(ptr) \
88 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ 89 (void __force *)((unsigned long)(ptr) + \
89 - (unsigned long)__per_cpu_start) 90 (unsigned long)pcpu_base_addr - \
91 (unsigned long)__per_cpu_start)
90#endif 92#endif
91 93
92struct pcpu_chunk { 94struct pcpu_chunk {
@@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
913 int rs, re; 915 int rs, re;
914 916
915 /* quick path, check whether it's empty already */ 917 /* quick path, check whether it's empty already */
916 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 918 rs = page_start;
917 if (rs == page_start && re == page_end) 919 pcpu_next_unpop(chunk, &rs, &re, page_end);
918 return; 920 if (rs == page_start && re == page_end)
919 break; 921 return;
920 }
921 922
922 /* immutable chunks can't be depopulated */ 923 /* immutable chunks can't be depopulated */
923 WARN_ON(chunk->immutable); 924 WARN_ON(chunk->immutable);
@@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
968 int rs, re, rc; 969 int rs, re, rc;
969 970
970 /* quick path, check whether all pages are already there */ 971 /* quick path, check whether all pages are already there */
971 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { 972 rs = page_start;
972 if (rs == page_start && re == page_end) 973 pcpu_next_pop(chunk, &rs, &re, page_end);
973 goto clear; 974 if (rs == page_start && re == page_end)
974 break; 975 goto clear;
975 }
976 976
977 /* need to allocate and map pages, this chunk can't be immutable */ 977 /* need to allocate and map pages, this chunk can't be immutable */
978 WARN_ON(chunk->immutable); 978 WARN_ON(chunk->immutable);
@@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
1067 * RETURNS: 1067 * RETURNS:
1068 * Percpu pointer to the allocated area on success, NULL on failure. 1068 * Percpu pointer to the allocated area on success, NULL on failure.
1069 */ 1069 */
1070static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1070static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
1071{ 1071{
1072 static int warn_limit = 10; 1072 static int warn_limit = 10;
1073 struct pcpu_chunk *chunk; 1073 struct pcpu_chunk *chunk;
@@ -1196,7 +1196,7 @@ fail_unlock_mutex:
1196 * RETURNS: 1196 * RETURNS:
1197 * Percpu pointer to the allocated area on success, NULL on failure. 1197 * Percpu pointer to the allocated area on success, NULL on failure.
1198 */ 1198 */
1199void *__alloc_percpu(size_t size, size_t align) 1199void __percpu *__alloc_percpu(size_t size, size_t align)
1200{ 1200{
1201 return pcpu_alloc(size, align, false); 1201 return pcpu_alloc(size, align, false);
1202} 1202}
@@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
1217 * RETURNS: 1217 * RETURNS:
1218 * Percpu pointer to the allocated area on success, NULL on failure. 1218 * Percpu pointer to the allocated area on success, NULL on failure.
1219 */ 1219 */
1220void *__alloc_reserved_percpu(size_t size, size_t align) 1220void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1221{ 1221{
1222 return pcpu_alloc(size, align, true); 1222 return pcpu_alloc(size, align, true);
1223} 1223}
@@ -1269,9 +1269,9 @@ static void pcpu_reclaim(struct work_struct *work)
1269 * CONTEXT: 1269 * CONTEXT:
1270 * Can be called from atomic context. 1270 * Can be called from atomic context.
1271 */ 1271 */
1272void free_percpu(void *ptr) 1272void free_percpu(void __percpu *ptr)
1273{ 1273{
1274 void *addr = __pcpu_ptr_to_addr(ptr); 1274 void *addr;
1275 struct pcpu_chunk *chunk; 1275 struct pcpu_chunk *chunk;
1276 unsigned long flags; 1276 unsigned long flags;
1277 int off; 1277 int off;
@@ -1279,6 +1279,8 @@ void free_percpu(void *ptr)
1279 if (!ptr) 1279 if (!ptr)
1280 return; 1280 return;
1281 1281
1282 addr = __pcpu_ptr_to_addr(ptr);
1283
1282 spin_lock_irqsave(&pcpu_lock, flags); 1284 spin_lock_irqsave(&pcpu_lock, flags);
1283 1285
1284 chunk = pcpu_chunk_addr_search(addr); 1286 chunk = pcpu_chunk_addr_search(addr);
diff --git a/mm/slab.c b/mm/slab.c
index 7d41f15b48d3..a9f325b28bed 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -654,7 +654,7 @@ static void init_node_lock_keys(int q)
654 654
655 l3 = s->cs_cachep->nodelists[q]; 655 l3 = s->cs_cachep->nodelists[q];
656 if (!l3 || OFF_SLAB(s->cs_cachep)) 656 if (!l3 || OFF_SLAB(s->cs_cachep))
657 return; 657 continue;
658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
659 alc = l3->alien; 659 alc = l3->alien;
660 /* 660 /*
@@ -665,7 +665,7 @@ static void init_node_lock_keys(int q)
665 * for alloc_alien_cache, 665 * for alloc_alien_cache,
666 */ 666 */
667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
668 return; 668 continue;
669 for_each_node(r) { 669 for_each_node(r) {
670 if (alc[r]) 670 if (alc[r])
671 lockdep_set_class(&alc[r]->lock, 671 lockdep_set_class(&alc[r]->lock,
@@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to,
935 935
936 from->avail -= nr; 936 from->avail -= nr;
937 to->avail += nr; 937 to->avail += nr;
938 to->touched = 1;
939 return nr; 938 return nr;
940} 939}
941 940
@@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
983 982
984 if (limit > 1) 983 if (limit > 1)
985 limit = 12; 984 limit = 12;
986 ac_ptr = kmalloc_node(memsize, gfp, node); 985 ac_ptr = kzalloc_node(memsize, gfp, node);
987 if (ac_ptr) { 986 if (ac_ptr) {
988 for_each_node(i) { 987 for_each_node(i) {
989 if (i == node || !node_online(i)) { 988 if (i == node || !node_online(i))
990 ac_ptr[i] = NULL;
991 continue; 989 continue;
992 }
993 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); 990 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
994 if (!ac_ptr[i]) { 991 if (!ac_ptr[i]) {
995 for (i--; i >= 0; i--) 992 for (i--; i >= 0; i--)
@@ -2963,8 +2960,10 @@ retry:
2963 spin_lock(&l3->list_lock); 2960 spin_lock(&l3->list_lock);
2964 2961
2965 /* See if we can refill from the shared array */ 2962 /* See if we can refill from the shared array */
2966 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) 2963 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
2964 l3->shared->touched = 1;
2967 goto alloc_done; 2965 goto alloc_done;
2966 }
2968 2967
2969 while (batchcount > 0) { 2968 while (batchcount > 0) {
2970 struct list_head *entry; 2969 struct list_head *entry;
@@ -3101,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3101 if (cachep == &cache_cache) 3100 if (cachep == &cache_cache)
3102 return false; 3101 return false;
3103 3102
3104 return should_failslab(obj_size(cachep), flags); 3103 return should_failslab(obj_size(cachep), flags, cachep->flags);
3105} 3104}
3106 3105
3107static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3106static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
diff --git a/mm/slub.c b/mm/slub.c
index bd4a9e942ace..3525a4ec9794 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -151,7 +151,8 @@
151 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
152 */ 152 */
153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) 154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
155 SLAB_FAILSLAB)
155 156
156#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
157 SLAB_CACHE_DMA | SLAB_NOTRACK) 158 SLAB_CACHE_DMA | SLAB_NOTRACK)
@@ -1004,6 +1005,9 @@ static int __init setup_slub_debug(char *str)
1004 case 't': 1005 case 't':
1005 slub_debug |= SLAB_TRACE; 1006 slub_debug |= SLAB_TRACE;
1006 break; 1007 break;
1008 case 'a':
1009 slub_debug |= SLAB_FAILSLAB;
1010 break;
1007 default: 1011 default:
1008 printk(KERN_ERR "slub_debug option '%c' " 1012 printk(KERN_ERR "slub_debug option '%c' "
1009 "unknown. skipped\n", *str); 1013 "unknown. skipped\n", *str);
@@ -1700,7 +1704,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1700 lockdep_trace_alloc(gfpflags); 1704 lockdep_trace_alloc(gfpflags);
1701 might_sleep_if(gfpflags & __GFP_WAIT); 1705 might_sleep_if(gfpflags & __GFP_WAIT);
1702 1706
1703 if (should_failslab(s->objsize, gfpflags)) 1707 if (should_failslab(s->objsize, gfpflags, s->flags))
1704 return NULL; 1708 return NULL;
1705 1709
1706 local_irq_save(flags); 1710 local_irq_save(flags);
@@ -4017,6 +4021,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4017} 4021}
4018SLAB_ATTR(trace); 4022SLAB_ATTR(trace);
4019 4023
4024#ifdef CONFIG_FAILSLAB
4025static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4026{
4027 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4028}
4029
4030static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4031 size_t length)
4032{
4033 s->flags &= ~SLAB_FAILSLAB;
4034 if (buf[0] == '1')
4035 s->flags |= SLAB_FAILSLAB;
4036 return length;
4037}
4038SLAB_ATTR(failslab);
4039#endif
4040
4020static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4041static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4021{ 4042{
4022 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4043 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4313,6 +4334,10 @@ static struct attribute *slab_attrs[] = {
4313 &deactivate_remote_frees_attr.attr, 4334 &deactivate_remote_frees_attr.attr,
4314 &order_fallback_attr.attr, 4335 &order_fallback_attr.attr,
4315#endif 4336#endif
4337#ifdef CONFIG_FAILSLAB
4338 &failslab_attr.attr,
4339#endif
4340
4316 NULL 4341 NULL
4317}; 4342};
4318 4343
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d9714bdcb4a3..392b9bb5bc01 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,9 +40,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 40 unsigned long align,
41 unsigned long goal) 41 unsigned long goal)
42{ 42{
43 return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); 43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
44} 44}
45 45
46static void *vmemmap_buf;
47static void *vmemmap_buf_end;
46 48
47void * __meminit vmemmap_alloc_block(unsigned long size, int node) 49void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 50{
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
64 __pa(MAX_DMA_ADDRESS)); 66 __pa(MAX_DMA_ADDRESS));
65} 67}
66 68
69/* need to make sure size is all the same during early stage */
70void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
71{
72 void *ptr;
73
74 if (!vmemmap_buf)
75 return vmemmap_alloc_block(size, node);
76
77 /* take the from buf */
78 ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
79 if (ptr + size > vmemmap_buf_end)
80 return vmemmap_alloc_block(size, node);
81
82 vmemmap_buf = ptr + size;
83
84 return ptr;
85}
86
67void __meminit vmemmap_verify(pte_t *pte, int node, 87void __meminit vmemmap_verify(pte_t *pte, int node,
68 unsigned long start, unsigned long end) 88 unsigned long start, unsigned long end)
69{ 89{
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
80 pte_t *pte = pte_offset_kernel(pmd, addr); 100 pte_t *pte = pte_offset_kernel(pmd, addr);
81 if (pte_none(*pte)) { 101 if (pte_none(*pte)) {
82 pte_t entry; 102 pte_t entry;
83 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 103 void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
84 if (!p) 104 if (!p)
85 return NULL; 105 return NULL;
86 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 106 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
163 183
164 return map; 184 return map;
165} 185}
186
187void __init sparse_mem_maps_populate_node(struct page **map_map,
188 unsigned long pnum_begin,
189 unsigned long pnum_end,
190 unsigned long map_count, int nodeid)
191{
192 unsigned long pnum;
193 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
194 void *vmemmap_buf_start;
195
196 size = ALIGN(size, PMD_SIZE);
197 vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
198 PMD_SIZE, __pa(MAX_DMA_ADDRESS));
199
200 if (vmemmap_buf_start) {
201 vmemmap_buf = vmemmap_buf_start;
202 vmemmap_buf_end = vmemmap_buf_start + size * map_count;
203 }
204
205 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
206 struct mem_section *ms;
207
208 if (!present_section_nr(pnum))
209 continue;
210
211 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
212 if (map_map[pnum])
213 continue;
214 ms = __nr_to_section(pnum);
215 printk(KERN_ERR "%s: sparsemem memory map backing failed "
216 "some memory will not be available.\n", __func__);
217 ms->section_mem_map = 0;
218 }
219
220 if (vmemmap_buf_start) {
221 /* need to free left buf */
222#ifdef CONFIG_NO_BOOTMEM
223 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
224 if (vmemmap_buf_start < vmemmap_buf) {
225 char name[15];
226
227 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
228 reserve_early_without_check(__pa(vmemmap_buf_start),
229 __pa(vmemmap_buf), name);
230 }
231#else
232 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
233#endif
234 vmemmap_buf = NULL;
235 vmemmap_buf_end = NULL;
236 }
237}
diff --git a/mm/sparse.c b/mm/sparse.c
index 6ce4aab69e99..22896d589133 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void)
271 271
272#ifdef CONFIG_MEMORY_HOTREMOVE 272#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init 273static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 274sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
275 unsigned long count)
275{ 276{
276 unsigned long section_nr; 277 unsigned long section_nr;
277 278
@@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
286 * this problem. 287 * this problem.
287 */ 288 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 289 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr); 290 return alloc_bootmem_section(usemap_size() * count, section_nr);
290} 291}
291 292
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 293static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
329} 330}
330#else 331#else
331static unsigned long * __init 332static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 333sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
334 unsigned long count)
333{ 335{
334 return NULL; 336 return NULL;
335} 337}
@@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 341}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 342#endif /* CONFIG_MEMORY_HOTREMOVE */
341 343
342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 344static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
345 unsigned long pnum_begin,
346 unsigned long pnum_end,
347 unsigned long usemap_count, int nodeid)
343{ 348{
344 unsigned long *usemap; 349 void *usemap;
345 struct mem_section *ms = __nr_to_section(pnum); 350 unsigned long pnum;
346 int nid = sparse_early_nid(ms); 351 int size = usemap_size();
347
348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
349 if (usemap)
350 return usemap;
351 352
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 353 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
354 usemap_count);
353 if (usemap) { 355 if (usemap) {
354 check_usemap_section_nr(nid, usemap); 356 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
355 return usemap; 357 if (!present_section_nr(pnum))
358 continue;
359 usemap_map[pnum] = usemap;
360 usemap += size;
361 }
362 return;
356 } 363 }
357 364
358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 365 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
359 nid = 0; 366 if (usemap) {
367 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
368 if (!present_section_nr(pnum))
369 continue;
370 usemap_map[pnum] = usemap;
371 usemap += size;
372 check_usemap_section_nr(nodeid, usemap_map[pnum]);
373 }
374 return;
375 }
360 376
361 printk(KERN_WARNING "%s: allocation failed\n", __func__); 377 printk(KERN_WARNING "%s: allocation failed\n", __func__);
362 return NULL;
363} 378}
364 379
365#ifndef CONFIG_SPARSEMEM_VMEMMAP 380#ifndef CONFIG_SPARSEMEM_VMEMMAP
@@ -375,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
375 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 390 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
376 return map; 391 return map;
377} 392}
393void __init sparse_mem_maps_populate_node(struct page **map_map,
394 unsigned long pnum_begin,
395 unsigned long pnum_end,
396 unsigned long map_count, int nodeid)
397{
398 void *map;
399 unsigned long pnum;
400 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
401
402 map = alloc_remap(nodeid, size * map_count);
403 if (map) {
404 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
405 if (!present_section_nr(pnum))
406 continue;
407 map_map[pnum] = map;
408 map += size;
409 }
410 return;
411 }
412
413 size = PAGE_ALIGN(size);
414 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
415 if (map) {
416 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
417 if (!present_section_nr(pnum))
418 continue;
419 map_map[pnum] = map;
420 map += size;
421 }
422 return;
423 }
424
425 /* fallback */
426 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
427 struct mem_section *ms;
428
429 if (!present_section_nr(pnum))
430 continue;
431 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
432 if (map_map[pnum])
433 continue;
434 ms = __nr_to_section(pnum);
435 printk(KERN_ERR "%s: sparsemem memory map backing failed "
436 "some memory will not be available.\n", __func__);
437 ms->section_mem_map = 0;
438 }
439}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 440#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 441
442#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
443static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
444 unsigned long pnum_begin,
445 unsigned long pnum_end,
446 unsigned long map_count, int nodeid)
447{
448 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
449 map_count, nodeid);
450}
451#else
380static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 452static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 453{
382 struct page *map; 454 struct page *map;
@@ -392,10 +464,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
392 ms->section_mem_map = 0; 464 ms->section_mem_map = 0;
393 return NULL; 465 return NULL;
394} 466}
467#endif
395 468
396void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) 469void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
397{ 470{
398} 471}
472
399/* 473/*
400 * Allocate the accumulated non-linear sections, allocate a mem_map 474 * Allocate the accumulated non-linear sections, allocate a mem_map
401 * for each and record the physical to section mapping. 475 * for each and record the physical to section mapping.
@@ -407,6 +481,14 @@ void __init sparse_init(void)
407 unsigned long *usemap; 481 unsigned long *usemap;
408 unsigned long **usemap_map; 482 unsigned long **usemap_map;
409 int size; 483 int size;
484 int nodeid_begin = 0;
485 unsigned long pnum_begin = 0;
486 unsigned long usemap_count;
487#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
488 unsigned long map_count;
489 int size2;
490 struct page **map_map;
491#endif
410 492
411 /* 493 /*
412 * map is using big page (aka 2M in x86 64 bit) 494 * map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +507,81 @@ void __init sparse_init(void)
425 panic("can not allocate usemap_map\n"); 507 panic("can not allocate usemap_map\n");
426 508
427 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 509 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
510 struct mem_section *ms;
511
428 if (!present_section_nr(pnum)) 512 if (!present_section_nr(pnum))
429 continue; 513 continue;
430 usemap_map[pnum] = sparse_early_usemap_alloc(pnum); 514 ms = __nr_to_section(pnum);
515 nodeid_begin = sparse_early_nid(ms);
516 pnum_begin = pnum;
517 break;
431 } 518 }
519 usemap_count = 1;
520 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
521 struct mem_section *ms;
522 int nodeid;
523
524 if (!present_section_nr(pnum))
525 continue;
526 ms = __nr_to_section(pnum);
527 nodeid = sparse_early_nid(ms);
528 if (nodeid == nodeid_begin) {
529 usemap_count++;
530 continue;
531 }
532 /* ok, we need to take cake of from pnum_begin to pnum - 1*/
533 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
534 usemap_count, nodeid_begin);
535 /* new start, update count etc*/
536 nodeid_begin = nodeid;
537 pnum_begin = pnum;
538 usemap_count = 1;
539 }
540 /* ok, last chunk */
541 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
542 usemap_count, nodeid_begin);
543
544#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
545 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
546 map_map = alloc_bootmem(size2);
547 if (!map_map)
548 panic("can not allocate map_map\n");
549
550 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
551 struct mem_section *ms;
552
553 if (!present_section_nr(pnum))
554 continue;
555 ms = __nr_to_section(pnum);
556 nodeid_begin = sparse_early_nid(ms);
557 pnum_begin = pnum;
558 break;
559 }
560 map_count = 1;
561 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
562 struct mem_section *ms;
563 int nodeid;
564
565 if (!present_section_nr(pnum))
566 continue;
567 ms = __nr_to_section(pnum);
568 nodeid = sparse_early_nid(ms);
569 if (nodeid == nodeid_begin) {
570 map_count++;
571 continue;
572 }
573 /* ok, we need to take cake of from pnum_begin to pnum - 1*/
574 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
575 map_count, nodeid_begin);
576 /* new start, update count etc*/
577 nodeid_begin = nodeid;
578 pnum_begin = pnum;
579 map_count = 1;
580 }
581 /* ok, last chunk */
582 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
583 map_count, nodeid_begin);
584#endif
432 585
433 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 586 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
434 if (!present_section_nr(pnum)) 587 if (!present_section_nr(pnum))
@@ -438,7 +591,11 @@ void __init sparse_init(void)
438 if (!usemap) 591 if (!usemap)
439 continue; 592 continue;
440 593
594#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
595 map = map_map[pnum];
596#else
441 map = sparse_early_mem_map_alloc(pnum); 597 map = sparse_early_mem_map_alloc(pnum);
598#endif
442 if (!map) 599 if (!map)
443 continue; 600 continue;
444 601
@@ -448,6 +605,9 @@ void __init sparse_init(void)
448 605
449 vmemmap_populate_print_last(); 606 vmemmap_populate_print_last();
450 607
608#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
609 free_bootmem(__pa(map_map), size2);
610#endif
451 free_bootmem(__pa(usemap_map), size); 611 free_bootmem(__pa(usemap_map), size);
452} 612}
453 613
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee22684..e87e37244829 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
522 */ 522 */
523void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 523void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
524{ 524{
525 if (new < old) { 525 struct address_space *mapping = inode->i_mapping;
526 struct address_space *mapping = inode->i_mapping; 526
527 527 /*
528 /* 528 * unmap_mapping_range is called twice, first simply for
529 * unmap_mapping_range is called twice, first simply for 529 * efficiency so that truncate_inode_pages does fewer
530 * efficiency so that truncate_inode_pages does fewer 530 * single-page unmaps. However after this first call, and
531 * single-page unmaps. However after this first call, and 531 * before truncate_inode_pages finishes, it is possible for
532 * before truncate_inode_pages finishes, it is possible for 532 * private pages to be COWed, which remain after
533 * private pages to be COWed, which remain after 533 * truncate_inode_pages finishes, hence the second
534 * truncate_inode_pages finishes, hence the second 534 * unmap_mapping_range call must be made for correctness.
535 * unmap_mapping_range call must be made for correctness. 535 */
536 */ 536 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
537 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 537 truncate_inode_pages(mapping, new);
538 truncate_inode_pages(mapping, new); 538 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
539 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
540 }
541} 539}
542EXPORT_SYMBOL(truncate_pagecache); 540EXPORT_SYMBOL(truncate_pagecache);
543 541
diff --git a/mm/util.c b/mm/util.c
index b377ce430803..834db7be240f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,10 +4,6 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/hugetlb.h>
8#include <linux/syscalls.h>
9#include <linux/mman.h>
10#include <linux/file.h>
11#include <asm/uaccess.h> 7#include <asm/uaccess.h>
12 8
13#define CREATE_TRACE_POINTS 9#define CREATE_TRACE_POINTS
@@ -224,7 +220,7 @@ char *strndup_user(const char __user *s, long n)
224} 220}
225EXPORT_SYMBOL(strndup_user); 221EXPORT_SYMBOL(strndup_user);
226 222
227#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT 223#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
228void arch_pick_mmap_layout(struct mm_struct *mm) 224void arch_pick_mmap_layout(struct mm_struct *mm)
229{ 225{
230 mm->mmap_base = TASK_UNMAPPED_BASE; 226 mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
272} 268}
273EXPORT_SYMBOL_GPL(get_user_pages_fast); 269EXPORT_SYMBOL_GPL(get_user_pages_fast);
274 270
275SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
276 unsigned long, prot, unsigned long, flags,
277 unsigned long, fd, unsigned long, pgoff)
278{
279 struct file * file = NULL;
280 unsigned long retval = -EBADF;
281
282 if (!(flags & MAP_ANONYMOUS)) {
283 if (unlikely(flags & MAP_HUGETLB))
284 return -EINVAL;
285 file = fget(fd);
286 if (!file)
287 goto out;
288 } else if (flags & MAP_HUGETLB) {
289 struct user_struct *user = NULL;
290 /*
291 * VM_NORESERVE is used because the reservations will be
292 * taken when vm_ops->mmap() is called
293 * A dummy user value is used because we are not locking
294 * memory so no accounting is necessary
295 */
296 len = ALIGN(len, huge_page_size(&default_hstate));
297 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
298 &user, HUGETLB_ANONHUGE_INODE);
299 if (IS_ERR(file))
300 return PTR_ERR(file);
301 }
302
303 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
304
305 down_write(&current->mm->mmap_sem);
306 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
307 up_write(&current->mm->mmap_sem);
308
309 if (file)
310 fput(file);
311out:
312 return retval;
313}
314
315/* Tracepoints definitions. */ 271/* Tracepoints definitions. */
316EXPORT_TRACEPOINT_SYMBOL(kmalloc); 272EXPORT_TRACEPOINT_SYMBOL(kmalloc);
317EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 273EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37e69295f250..ae007462b7f6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void)
509 509
510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); 510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
511 511
512/* for per-CPU blocks */
513static void purge_fragmented_blocks_allcpus(void);
514
512/* 515/*
513 * Purges all lazily-freed vmap areas. 516 * Purges all lazily-freed vmap areas.
514 * 517 *
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
539 } else 542 } else
540 spin_lock(&purge_lock); 543 spin_lock(&purge_lock);
541 544
545 if (sync)
546 purge_fragmented_blocks_allcpus();
547
542 rcu_read_lock(); 548 rcu_read_lock();
543 list_for_each_entry_rcu(va, &vmap_area_list, list) { 549 list_for_each_entry_rcu(va, &vmap_area_list, list) {
544 if (va->flags & VM_LAZY_FREE) { 550 if (va->flags & VM_LAZY_FREE) {
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
555 } 561 }
556 rcu_read_unlock(); 562 rcu_read_unlock();
557 563
558 if (nr) { 564 if (nr)
559 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
560 atomic_sub(nr, &vmap_lazy_nr); 565 atomic_sub(nr, &vmap_lazy_nr);
561 }
562 566
563 if (nr || force_flush) 567 if (nr || force_flush)
564 flush_tlb_kernel_range(*start, *end); 568 flush_tlb_kernel_range(*start, *end);
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false;
669struct vmap_block_queue { 673struct vmap_block_queue {
670 spinlock_t lock; 674 spinlock_t lock;
671 struct list_head free; 675 struct list_head free;
672 struct list_head dirty;
673 unsigned int nr_dirty;
674}; 676};
675 677
676struct vmap_block { 678struct vmap_block {
@@ -680,10 +682,9 @@ struct vmap_block {
680 unsigned long free, dirty; 682 unsigned long free, dirty;
681 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); 683 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
682 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 684 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
683 union { 685 struct list_head free_list;
684 struct list_head free_list; 686 struct rcu_head rcu_head;
685 struct rcu_head rcu_head; 687 struct list_head purge;
686 };
687}; 688};
688 689
689/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 690/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
759 vbq = &get_cpu_var(vmap_block_queue); 760 vbq = &get_cpu_var(vmap_block_queue);
760 vb->vbq = vbq; 761 vb->vbq = vbq;
761 spin_lock(&vbq->lock); 762 spin_lock(&vbq->lock);
762 list_add(&vb->free_list, &vbq->free); 763 list_add_rcu(&vb->free_list, &vbq->free);
763 spin_unlock(&vbq->lock); 764 spin_unlock(&vbq->lock);
764 put_cpu_var(vmap_block_queue); 765 put_cpu_var(vmap_block_queue);
765 766
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb)
778 struct vmap_block *tmp; 779 struct vmap_block *tmp;
779 unsigned long vb_idx; 780 unsigned long vb_idx;
780 781
781 BUG_ON(!list_empty(&vb->free_list));
782
783 vb_idx = addr_to_vb_idx(vb->va->va_start); 782 vb_idx = addr_to_vb_idx(vb->va->va_start);
784 spin_lock(&vmap_block_tree_lock); 783 spin_lock(&vmap_block_tree_lock);
785 tmp = radix_tree_delete(&vmap_block_tree, vb_idx); 784 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb)
790 call_rcu(&vb->rcu_head, rcu_free_vb); 789 call_rcu(&vb->rcu_head, rcu_free_vb);
791} 790}
792 791
792static void purge_fragmented_blocks(int cpu)
793{
794 LIST_HEAD(purge);
795 struct vmap_block *vb;
796 struct vmap_block *n_vb;
797 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
798
799 rcu_read_lock();
800 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
801
802 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
803 continue;
804
805 spin_lock(&vb->lock);
806 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
807 vb->free = 0; /* prevent further allocs after releasing lock */
808 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
809 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
810 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
811 spin_lock(&vbq->lock);
812 list_del_rcu(&vb->free_list);
813 spin_unlock(&vbq->lock);
814 spin_unlock(&vb->lock);
815 list_add_tail(&vb->purge, &purge);
816 } else
817 spin_unlock(&vb->lock);
818 }
819 rcu_read_unlock();
820
821 list_for_each_entry_safe(vb, n_vb, &purge, purge) {
822 list_del(&vb->purge);
823 free_vmap_block(vb);
824 }
825}
826
827static void purge_fragmented_blocks_thiscpu(void)
828{
829 purge_fragmented_blocks(smp_processor_id());
830}
831
832static void purge_fragmented_blocks_allcpus(void)
833{
834 int cpu;
835
836 for_each_possible_cpu(cpu)
837 purge_fragmented_blocks(cpu);
838}
839
793static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 840static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
794{ 841{
795 struct vmap_block_queue *vbq; 842 struct vmap_block_queue *vbq;
796 struct vmap_block *vb; 843 struct vmap_block *vb;
797 unsigned long addr = 0; 844 unsigned long addr = 0;
798 unsigned int order; 845 unsigned int order;
846 int purge = 0;
799 847
800 BUG_ON(size & ~PAGE_MASK); 848 BUG_ON(size & ~PAGE_MASK);
801 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 849 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -808,24 +856,38 @@ again:
808 int i; 856 int i;
809 857
810 spin_lock(&vb->lock); 858 spin_lock(&vb->lock);
859 if (vb->free < 1UL << order)
860 goto next;
861
811 i = bitmap_find_free_region(vb->alloc_map, 862 i = bitmap_find_free_region(vb->alloc_map,
812 VMAP_BBMAP_BITS, order); 863 VMAP_BBMAP_BITS, order);
813 864
814 if (i >= 0) { 865 if (i < 0) {
815 addr = vb->va->va_start + (i << PAGE_SHIFT); 866 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
816 BUG_ON(addr_to_vb_idx(addr) != 867 /* fragmented and no outstanding allocations */
817 addr_to_vb_idx(vb->va->va_start)); 868 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
818 vb->free -= 1UL << order; 869 purge = 1;
819 if (vb->free == 0) {
820 spin_lock(&vbq->lock);
821 list_del_init(&vb->free_list);
822 spin_unlock(&vbq->lock);
823 } 870 }
824 spin_unlock(&vb->lock); 871 goto next;
825 break; 872 }
873 addr = vb->va->va_start + (i << PAGE_SHIFT);
874 BUG_ON(addr_to_vb_idx(addr) !=
875 addr_to_vb_idx(vb->va->va_start));
876 vb->free -= 1UL << order;
877 if (vb->free == 0) {
878 spin_lock(&vbq->lock);
879 list_del_rcu(&vb->free_list);
880 spin_unlock(&vbq->lock);
826 } 881 }
827 spin_unlock(&vb->lock); 882 spin_unlock(&vb->lock);
883 break;
884next:
885 spin_unlock(&vb->lock);
828 } 886 }
887
888 if (purge)
889 purge_fragmented_blocks_thiscpu();
890
829 put_cpu_var(vmap_block_queue); 891 put_cpu_var(vmap_block_queue);
830 rcu_read_unlock(); 892 rcu_read_unlock();
831 893
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size)
862 BUG_ON(!vb); 924 BUG_ON(!vb);
863 925
864 spin_lock(&vb->lock); 926 spin_lock(&vb->lock);
865 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); 927 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
866 928
867 vb->dirty += 1UL << order; 929 vb->dirty += 1UL << order;
868 if (vb->dirty == VMAP_BBMAP_BITS) { 930 if (vb->dirty == VMAP_BBMAP_BITS) {
869 BUG_ON(vb->free || !list_empty(&vb->free_list)); 931 BUG_ON(vb->free);
870 spin_unlock(&vb->lock); 932 spin_unlock(&vb->lock);
871 free_vmap_block(vb); 933 free_vmap_block(vb);
872 } else 934 } else
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void)
1035 vbq = &per_cpu(vmap_block_queue, i); 1097 vbq = &per_cpu(vmap_block_queue, i);
1036 spin_lock_init(&vbq->lock); 1098 spin_lock_init(&vbq->lock);
1037 INIT_LIST_HEAD(&vbq->free); 1099 INIT_LIST_HEAD(&vbq->free);
1038 INIT_LIST_HEAD(&vbq->dirty);
1039 vbq->nr_dirty = 0;
1040 } 1100 }
1041 1101
1042 /* Import existing vmlist entries. */ 1102 /* Import existing vmlist entries. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 885207a6b6b7..c26986c85ce0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1922 if (!populated_zone(zone)) 1922 if (!populated_zone(zone))
1923 continue; 1923 continue;
1924 1924
1925 if (zone_is_all_unreclaimable(zone))
1926 continue;
1927
1925 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 1928 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
1926 0, 0)) 1929 0, 0))
1927 return 1; 1930 return 1;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6051fbab67ba..fc5aa183bc45 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void)
139 threshold = calculate_threshold(zone); 139 threshold = calculate_threshold(zone);
140 140
141 for_each_online_cpu(cpu) 141 for_each_online_cpu(cpu)
142 zone_pcp(zone, cpu)->stat_threshold = threshold; 142 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
143 = threshold;
143 } 144 }
144} 145}
145 146
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void)
149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 150void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150 int delta) 151 int delta)
151{ 152{
152 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 153 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
154
153 s8 *p = pcp->vm_stat_diff + item; 155 s8 *p = pcp->vm_stat_diff + item;
154 long x; 156 long x;
155 157
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
202 */ 204 */
203void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 205void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
204{ 206{
205 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 207 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
206 s8 *p = pcp->vm_stat_diff + item; 208 s8 *p = pcp->vm_stat_diff + item;
207 209
208 (*p)++; 210 (*p)++;
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
223 225
224void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 226void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
225{ 227{
226 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 228 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
227 s8 *p = pcp->vm_stat_diff + item; 229 s8 *p = pcp->vm_stat_diff + item;
228 230
229 (*p)--; 231 (*p)--;
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
300 for_each_populated_zone(zone) { 302 for_each_populated_zone(zone) {
301 struct per_cpu_pageset *p; 303 struct per_cpu_pageset *p;
302 304
303 p = zone_pcp(zone, cpu); 305 p = per_cpu_ptr(zone->pageset, cpu);
304 306
305 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 307 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
306 if (p->vm_stat_diff[i]) { 308 if (p->vm_stat_diff[i]) {
@@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
741 for_each_online_cpu(i) { 743 for_each_online_cpu(i) {
742 struct per_cpu_pageset *pageset; 744 struct per_cpu_pageset *pageset;
743 745
744 pageset = zone_pcp(zone, i); 746 pageset = per_cpu_ptr(zone->pageset, i);
745 seq_printf(m, 747 seq_printf(m,
746 "\n cpu: %i" 748 "\n cpu: %i"
747 "\n count: %i" 749 "\n count: %i"
@@ -906,6 +908,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
906 case CPU_ONLINE: 908 case CPU_ONLINE:
907 case CPU_ONLINE_FROZEN: 909 case CPU_ONLINE_FROZEN:
908 start_cpu_timer(cpu); 910 start_cpu_timer(cpu);
911 node_set_state(cpu_to_node(cpu), N_CPU);
909 break; 912 break;
910 case CPU_DOWN_PREPARE: 913 case CPU_DOWN_PREPARE:
911 case CPU_DOWN_PREPARE_FROZEN: 914 case CPU_DOWN_PREPARE_FROZEN: