Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   6
-rw-r--r--  mm/bootmem.c          | 195
-rw-r--r--  mm/failslab.c         |  18
-rw-r--r--  mm/filemap.c          | 103
-rw-r--r--  mm/hugetlb.c          |  13
-rw-r--r--  mm/maccess.c          |  11
-rw-r--r--  mm/memcontrol.c       |  11
-rw-r--r--  mm/memory.c           |  14
-rw-r--r--  mm/migrate.c          |  41
-rw-r--r--  mm/mmap.c             |  40
-rw-r--r--  mm/mmu_context.c      |   3
-rw-r--r--  mm/nommu.c            | 144
-rw-r--r--  mm/oom_kill.c         |   2
-rw-r--r--  mm/page_alloc.c       | 272
-rw-r--r--  mm/percpu.c           |  40
-rw-r--r--  mm/slab.c             |  11
-rw-r--r--  mm/slub.c             | 337
-rw-r--r--  mm/sparse-vmemmap.c   |  76
-rw-r--r--  mm/sparse.c           | 196
-rw-r--r--  mm/truncate.c         |  30
-rw-r--r--  mm/util.c             |  46
-rw-r--r--  mm/vmalloc.c          | 114
-rw-r--r--  mm/vmscan.c           |   3
-rw-r--r--  mm/vmstat.c           |  15
24 files changed, 1114 insertions(+), 627 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 17b8947aa7da..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
115config SPARSEMEM_VMEMMAP_ENABLE 115config SPARSEMEM_VMEMMAP_ENABLE
116 bool 116 bool
117 117
118config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
119 def_bool y
120 depends on SPARSEMEM && X86_64
121
118config SPARSEMEM_VMEMMAP 122config SPARSEMEM_VMEMMAP
119 bool "Sparse Memory virtual memmap" 123 bool "Sparse Memory virtual memmap"
120 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE 124 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
@@ -195,7 +199,7 @@ config BOUNCE
195config NR_QUICK 199config NR_QUICK
196 int 200 int
197 depends on QUICKLIST 201 depends on QUICKLIST
198 default "2" if SUPERH || AVR32 202 default "2" if AVR32
199 default "1" 203 default "1"
200 204
201config VIRT_TO_BUS 205config VIRT_TO_BUS
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..d7c791ef0036 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -13,6 +13,7 @@
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kmemleak.h> 15#include <linux/kmemleak.h>
16#include <linux/range.h>
16 17
17#include <asm/bug.h> 18#include <asm/bug.h>
18#include <asm/io.h> 19#include <asm/io.h>
@@ -32,6 +33,7 @@ unsigned long max_pfn;
32unsigned long saved_max_pfn; 33unsigned long saved_max_pfn;
33#endif 34#endif
34 35
36#ifndef CONFIG_NO_BOOTMEM
35bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 37bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
36 38
37static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 39static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
142 min_low_pfn = start; 144 min_low_pfn = start;
143 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 145 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
144} 146}
145 147#endif
146/* 148/*
147 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
148 * @addr: starting address of the range 150 * @addr: starting address of the range
@@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
167 } 169 }
168} 170}
169 171
172#ifdef CONFIG_NO_BOOTMEM
173static void __init __free_pages_memory(unsigned long start, unsigned long end)
174{
175 int i;
176 unsigned long start_aligned, end_aligned;
177 int order = ilog2(BITS_PER_LONG);
178
179 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
180 end_aligned = end & ~(BITS_PER_LONG - 1);
181
182 if (end_aligned <= start_aligned) {
183#if 1
184 printk(KERN_DEBUG " %lx - %lx\n", start, end);
185#endif
186 for (i = start; i < end; i++)
187 __free_pages_bootmem(pfn_to_page(i), 0);
188
189 return;
190 }
191
192#if 1
193 printk(KERN_DEBUG " %lx %lx - %lx %lx\n",
194 start, start_aligned, end_aligned, end);
195#endif
196 for (i = start; i < start_aligned; i++)
197 __free_pages_bootmem(pfn_to_page(i), 0);
198
199 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
200 __free_pages_bootmem(pfn_to_page(i), order);
201
202 for (i = end_aligned; i < end; i++)
203 __free_pages_bootmem(pfn_to_page(i), 0);
204}
205
206unsigned long __init free_all_memory_core_early(int nodeid)
207{
208 int i;
209 u64 start, end;
210 unsigned long count = 0;
211 struct range *range = NULL;
212 int nr_range;
213
214 nr_range = get_free_all_memory_range(&range, nodeid);
215
216 for (i = 0; i < nr_range; i++) {
217 start = range[i].start;
218 end = range[i].end;
219 count += end - start;
220 __free_pages_memory(start, end);
221 }
222
223 return count;
224}
225#else
170static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 226static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
171{ 227{
172 int aligned; 228 int aligned;
@@ -227,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
227 283
228 return count; 284 return count;
229} 285}
286#endif
230 287
231/** 288/**
232 * free_all_bootmem_node - release a node's free pages to the buddy allocator 289 * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -237,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
237unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 294unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
238{ 295{
239 register_page_bootmem_info_node(pgdat); 296 register_page_bootmem_info_node(pgdat);
297#ifdef CONFIG_NO_BOOTMEM
298 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
299 return 0;
300#else
240 return free_all_bootmem_core(pgdat->bdata); 301 return free_all_bootmem_core(pgdat->bdata);
302#endif
241} 303}
242 304
243/** 305/**
@@ -247,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
247 */ 309 */
248unsigned long __init free_all_bootmem(void) 310unsigned long __init free_all_bootmem(void)
249{ 311{
312#ifdef CONFIG_NO_BOOTMEM
313 return free_all_memory_core_early(NODE_DATA(0)->node_id);
314#else
250 return free_all_bootmem_core(NODE_DATA(0)->bdata); 315 return free_all_bootmem_core(NODE_DATA(0)->bdata);
316#endif
251} 317}
252 318
319#ifndef CONFIG_NO_BOOTMEM
253static void __init __free(bootmem_data_t *bdata, 320static void __init __free(bootmem_data_t *bdata,
254 unsigned long sidx, unsigned long eidx) 321 unsigned long sidx, unsigned long eidx)
255{ 322{
@@ -344,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
344 } 411 }
345 BUG(); 412 BUG();
346} 413}
414#endif
347 415
348/** 416/**
349 * free_bootmem_node - mark a page range as usable 417 * free_bootmem_node - mark a page range as usable
@@ -358,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
358void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 426void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
359 unsigned long size) 427 unsigned long size)
360{ 428{
429#ifdef CONFIG_NO_BOOTMEM
430 free_early(physaddr, physaddr + size);
431#if 0
432 printk(KERN_DEBUG "free %lx %lx\n", physaddr, size);
433#endif
434#else
361 unsigned long start, end; 435 unsigned long start, end;
362 436
363 kmemleak_free_part(__va(physaddr), size); 437 kmemleak_free_part(__va(physaddr), size);
@@ -366,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
366 end = PFN_DOWN(physaddr + size); 440 end = PFN_DOWN(physaddr + size);
367 441
368 mark_bootmem_node(pgdat->bdata, start, end, 0, 0); 442 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
443#endif
369} 444}
370 445
371/** 446/**
@@ -379,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
379 */ 454 */
380void __init free_bootmem(unsigned long addr, unsigned long size) 455void __init free_bootmem(unsigned long addr, unsigned long size)
381{ 456{
457#ifdef CONFIG_NO_BOOTMEM
458 free_early(addr, addr + size);
459#if 0
460 printk(KERN_DEBUG "free %lx %lx\n", addr, size);
461#endif
462#else
382 unsigned long start, end; 463 unsigned long start, end;
383 464
384 kmemleak_free_part(__va(addr), size); 465 kmemleak_free_part(__va(addr), size);
@@ -387,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
387 end = PFN_DOWN(addr + size); 468 end = PFN_DOWN(addr + size);
388 469
389 mark_bootmem(start, end, 0, 0); 470 mark_bootmem(start, end, 0, 0);
471#endif
390} 472}
391 473
392/** 474/**
@@ -403,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
403int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 485int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
404 unsigned long size, int flags) 486 unsigned long size, int flags)
405{ 487{
488#ifdef CONFIG_NO_BOOTMEM
489 panic("no bootmem");
490 return 0;
491#else
406 unsigned long start, end; 492 unsigned long start, end;
407 493
408 start = PFN_DOWN(physaddr); 494 start = PFN_DOWN(physaddr);
409 end = PFN_UP(physaddr + size); 495 end = PFN_UP(physaddr + size);
410 496
411 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 497 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
498#endif
412} 499}
413 500
414/** 501/**
@@ -424,14 +511,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
424int __init reserve_bootmem(unsigned long addr, unsigned long size, 511int __init reserve_bootmem(unsigned long addr, unsigned long size,
425 int flags) 512 int flags)
426{ 513{
514#ifdef CONFIG_NO_BOOTMEM
515 panic("no bootmem");
516 return 0;
517#else
427 unsigned long start, end; 518 unsigned long start, end;
428 519
429 start = PFN_DOWN(addr); 520 start = PFN_DOWN(addr);
430 end = PFN_UP(addr + size); 521 end = PFN_UP(addr + size);
431 522
432 return mark_bootmem(start, end, 1, flags); 523 return mark_bootmem(start, end, 1, flags);
524#endif
433} 525}
434 526
527#ifndef CONFIG_NO_BOOTMEM
435static unsigned long __init align_idx(struct bootmem_data *bdata, 528static unsigned long __init align_idx(struct bootmem_data *bdata,
436 unsigned long idx, unsigned long step) 529 unsigned long idx, unsigned long step)
437{ 530{
@@ -582,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
582#endif 675#endif
583 return NULL; 676 return NULL;
584} 677}
678#endif
585 679
586static void * __init ___alloc_bootmem_nopanic(unsigned long size, 680static void * __init ___alloc_bootmem_nopanic(unsigned long size,
587 unsigned long align, 681 unsigned long align,
588 unsigned long goal, 682 unsigned long goal,
589 unsigned long limit) 683 unsigned long limit)
590{ 684{
685#ifdef CONFIG_NO_BOOTMEM
686 void *ptr;
687
688 if (WARN_ON_ONCE(slab_is_available()))
689 return kzalloc(size, GFP_NOWAIT);
690
691restart:
692
693 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
694
695 if (ptr)
696 return ptr;
697
698 if (goal != 0) {
699 goal = 0;
700 goto restart;
701 }
702
703 return NULL;
704#else
591 bootmem_data_t *bdata; 705 bootmem_data_t *bdata;
592 void *region; 706 void *region;
593 707
@@ -613,6 +727,7 @@ restart:
613 } 727 }
614 728
615 return NULL; 729 return NULL;
730#endif
616} 731}
617 732
618/** 733/**
@@ -631,7 +746,13 @@ restart:
631void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 746void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
632 unsigned long goal) 747 unsigned long goal)
633{ 748{
634 return ___alloc_bootmem_nopanic(size, align, goal, 0); 749 unsigned long limit = 0;
750
751#ifdef CONFIG_NO_BOOTMEM
752 limit = -1UL;
753#endif
754
755 return ___alloc_bootmem_nopanic(size, align, goal, limit);
635} 756}
636 757
637static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, 758static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -665,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
665void * __init __alloc_bootmem(unsigned long size, unsigned long align, 786void * __init __alloc_bootmem(unsigned long size, unsigned long align,
666 unsigned long goal) 787 unsigned long goal)
667{ 788{
668 return ___alloc_bootmem(size, align, goal, 0); 789 unsigned long limit = 0;
790
791#ifdef CONFIG_NO_BOOTMEM
792 limit = -1UL;
793#endif
794
795 return ___alloc_bootmem(size, align, goal, limit);
669} 796}
670 797
798#ifndef CONFIG_NO_BOOTMEM
671static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 799static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
672 unsigned long size, unsigned long align, 800 unsigned long size, unsigned long align,
673 unsigned long goal, unsigned long limit) 801 unsigned long goal, unsigned long limit)
@@ -684,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
684 812
685 return ___alloc_bootmem(size, align, goal, limit); 813 return ___alloc_bootmem(size, align, goal, limit);
686} 814}
815#endif
687 816
688/** 817/**
689 * __alloc_bootmem_node - allocate boot memory from a specific node 818 * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -706,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
706 if (WARN_ON_ONCE(slab_is_available())) 835 if (WARN_ON_ONCE(slab_is_available()))
707 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 836 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
708 837
838#ifdef CONFIG_NO_BOOTMEM
839 return __alloc_memory_core_early(pgdat->node_id, size, align,
840 goal, -1ULL);
841#else
709 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 842 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
843#endif
844}
845
846void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
847 unsigned long align, unsigned long goal)
848{
849#ifdef MAX_DMA32_PFN
850 unsigned long end_pfn;
851
852 if (WARN_ON_ONCE(slab_is_available()))
853 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
854
855 /* update goal according ...MAX_DMA32_PFN */
856 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
857
858 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
859 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
860 void *ptr;
861 unsigned long new_goal;
862
863 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
864#ifdef CONFIG_NO_BOOTMEM
865 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
866 new_goal, -1ULL);
867#else
868 ptr = alloc_bootmem_core(pgdat->bdata, size, align,
869 new_goal, 0);
870#endif
871 if (ptr)
872 return ptr;
873 }
874#endif
875
876 return __alloc_bootmem_node(pgdat, size, align, goal);
877
710} 878}
711 879
712#ifdef CONFIG_SPARSEMEM 880#ifdef CONFIG_SPARSEMEM
@@ -720,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
720void * __init alloc_bootmem_section(unsigned long size, 888void * __init alloc_bootmem_section(unsigned long size,
721 unsigned long section_nr) 889 unsigned long section_nr)
722{ 890{
891#ifdef CONFIG_NO_BOOTMEM
892 unsigned long pfn, goal, limit;
893
894 pfn = section_nr_to_pfn(section_nr);
895 goal = pfn << PAGE_SHIFT;
896 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
897
898 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
899 SMP_CACHE_BYTES, goal, limit);
900#else
723 bootmem_data_t *bdata; 901 bootmem_data_t *bdata;
724 unsigned long pfn, goal, limit; 902 unsigned long pfn, goal, limit;
725 903
@@ -729,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size,
729 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 907 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
730 908
731 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 909 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
910#endif
732} 911}
733#endif 912#endif
734 913
@@ -740,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
740 if (WARN_ON_ONCE(slab_is_available())) 919 if (WARN_ON_ONCE(slab_is_available()))
741 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 920 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
742 921
922#ifdef CONFIG_NO_BOOTMEM
923 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
924 goal, -1ULL);
925#else
743 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 926 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
744 if (ptr) 927 if (ptr)
745 return ptr; 928 return ptr;
746 929
747 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 930 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
931#endif
748 if (ptr) 932 if (ptr)
749 return ptr; 933 return ptr;
750 934
@@ -795,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
795 if (WARN_ON_ONCE(slab_is_available())) 979 if (WARN_ON_ONCE(slab_is_available()))
796 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 980 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
797 981
982#ifdef CONFIG_NO_BOOTMEM
983 return __alloc_memory_core_early(pgdat->node_id, size, align,
984 goal, ARCH_LOW_ADDRESS_LIMIT);
985#else
798 return ___alloc_bootmem_node(pgdat->bdata, size, align, 986 return ___alloc_bootmem_node(pgdat->bdata, size, align,
799 goal, ARCH_LOW_ADDRESS_LIMIT); 987 goal, ARCH_LOW_ADDRESS_LIMIT);
988#endif
800} 989}
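
An illustrative caller of the bootmem API, which is unchanged by CONFIG_NO_BOOTMEM since __alloc_bootmem() and free_bootmem() above pick the backend (bitmap bootmem or the early_res allocator) at compile time; the function name and table below are made up for this sketch:

#include <linux/bootmem.h>

static void __init demo_early_setup(void)
{
	void *table;

	/* Boot-time allocation; alloc_bootmem() panics on failure,
	 * so no NULL check is needed here. */
	table = alloc_bootmem(PAGE_SIZE);

	/* ... populate 'table' ... */

	/* Hand it back if it turns out to be unneeded;
	 * free_bootmem() takes a physical address. */
	free_bootmem(__pa(table), PAGE_SIZE);
}
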
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..bb41f98dd8b7 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,22 @@
1#include <linux/fault-inject.h> 1#include <linux/fault-inject.h>
2#include <linux/gfp.h> 2#include <linux/gfp.h>
3#include <linux/slab.h>
3 4
4static struct { 5static struct {
5 struct fault_attr attr; 6 struct fault_attr attr;
6 u32 ignore_gfp_wait; 7 u32 ignore_gfp_wait;
8 int cache_filter;
7#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 9#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
8 struct dentry *ignore_gfp_wait_file; 10 struct dentry *ignore_gfp_wait_file;
11 struct dentry *cache_filter_file;
9#endif 12#endif
10} failslab = { 13} failslab = {
11 .attr = FAULT_ATTR_INITIALIZER, 14 .attr = FAULT_ATTR_INITIALIZER,
12 .ignore_gfp_wait = 1, 15 .ignore_gfp_wait = 1,
16 .cache_filter = 0,
13}; 17};
14 18
15bool should_failslab(size_t size, gfp_t gfpflags) 19bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
16{ 20{
17 if (gfpflags & __GFP_NOFAIL) 21 if (gfpflags & __GFP_NOFAIL)
18 return false; 22 return false;
@@ -20,6 +24,9 @@ bool should_failslab(size_t size, gfp_t gfpflags)
20 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) 24 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
21 return false; 25 return false;
22 26
27 if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
28 return false;
29
23 return should_fail(&failslab.attr, size); 30 return should_fail(&failslab.attr, size);
24} 31}
25 32
@@ -30,7 +37,6 @@ static int __init setup_failslab(char *str)
30__setup("failslab=", setup_failslab); 37__setup("failslab=", setup_failslab);
31 38
32#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 39#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
33
34static int __init failslab_debugfs_init(void) 40static int __init failslab_debugfs_init(void)
35{ 41{
36 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 42 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
@@ -46,8 +52,14 @@ static int __init failslab_debugfs_init(void)
46 debugfs_create_bool("ignore-gfp-wait", mode, dir, 52 debugfs_create_bool("ignore-gfp-wait", mode, dir,
47 &failslab.ignore_gfp_wait); 53 &failslab.ignore_gfp_wait);
48 54
49 if (!failslab.ignore_gfp_wait_file) { 55 failslab.cache_filter_file =
56 debugfs_create_bool("cache-filter", mode, dir,
57 &failslab.cache_filter);
58
59 if (!failslab.ignore_gfp_wait_file ||
60 !failslab.cache_filter_file) {
50 err = -ENOMEM; 61 err = -ENOMEM;
62 debugfs_remove(failslab.cache_filter_file);
51 debugfs_remove(failslab.ignore_gfp_wait_file); 63 debugfs_remove(failslab.ignore_gfp_wait_file);
52 cleanup_fault_attr_dentries(&failslab.attr); 64 cleanup_fault_attr_dentries(&failslab.attr);
53 } 65 }
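
With the new cache-filter mode, failures are only injected into caches created with SLAB_FAILSLAB once /sys/kernel/debug/failslab/cache-filter is set to 1. A minimal opt-in sketch (the cache name and object size are made up):

#include <linux/slab.h>

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
	/* SLAB_FAILSLAB marks this cache as a target for failslab
	 * injection whenever cache-filter is enabled. */
	demo_cachep = kmem_cache_create("demo_cache", 128, 0,
					SLAB_FAILSLAB, NULL);
	return demo_cachep ? 0 : -ENOMEM;
}
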
diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0eb6cb..698ea80f2102 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1634static struct page *__read_cache_page(struct address_space *mapping, 1634static struct page *__read_cache_page(struct address_space *mapping,
1635 pgoff_t index, 1635 pgoff_t index,
1636 int (*filler)(void *,struct page*), 1636 int (*filler)(void *,struct page*),
1637 void *data) 1637 void *data,
1638 gfp_t gfp)
1638{ 1639{
1639 struct page *page; 1640 struct page *page;
1640 int err; 1641 int err;
1641repeat: 1642repeat:
1642 page = find_get_page(mapping, index); 1643 page = find_get_page(mapping, index);
1643 if (!page) { 1644 if (!page) {
1644 page = page_cache_alloc_cold(mapping); 1645 page = __page_cache_alloc(gfp | __GFP_COLD);
1645 if (!page) 1646 if (!page)
1646 return ERR_PTR(-ENOMEM); 1647 return ERR_PTR(-ENOMEM);
1647 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1648 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1661,31 +1662,18 @@ repeat:
1661 return page; 1662 return page;
1662} 1663}
1663 1664
1664/** 1665static struct page *do_read_cache_page(struct address_space *mapping,
1665 * read_cache_page_async - read into page cache, fill it if needed
1666 * @mapping: the page's address_space
1667 * @index: the page index
1668 * @filler: function to perform the read
1669 * @data: destination for read data
1670 *
1671 * Same as read_cache_page, but don't wait for page to become unlocked
1672 * after submitting it to the filler.
1673 *
1674 * Read into the page cache. If a page already exists, and PageUptodate() is
1675 * not set, try to fill the page but don't wait for it to become unlocked.
1676 *
1677 * If the page does not get brought uptodate, return -EIO.
1678 */
1679struct page *read_cache_page_async(struct address_space *mapping,
1680 pgoff_t index, 1666 pgoff_t index,
1681 int (*filler)(void *,struct page*), 1667 int (*filler)(void *,struct page*),
1682 void *data) 1668 void *data,
1669 gfp_t gfp)
1670
1683{ 1671{
1684 struct page *page; 1672 struct page *page;
1685 int err; 1673 int err;
1686 1674
1687retry: 1675retry:
1688 page = __read_cache_page(mapping, index, filler, data); 1676 page = __read_cache_page(mapping, index, filler, data, gfp);
1689 if (IS_ERR(page)) 1677 if (IS_ERR(page))
1690 return page; 1678 return page;
1691 if (PageUptodate(page)) 1679 if (PageUptodate(page))
@@ -1710,8 +1698,67 @@ out:
1710 mark_page_accessed(page); 1698 mark_page_accessed(page);
1711 return page; 1699 return page;
1712} 1700}
1701
1702/**
1703 * read_cache_page_async - read into page cache, fill it if needed
1704 * @mapping: the page's address_space
1705 * @index: the page index
1706 * @filler: function to perform the read
1707 * @data: destination for read data
1708 *
1709 * Same as read_cache_page, but don't wait for page to become unlocked
1710 * after submitting it to the filler.
1711 *
1712 * Read into the page cache. If a page already exists, and PageUptodate() is
1713 * not set, try to fill the page but don't wait for it to become unlocked.
1714 *
1715 * If the page does not get brought uptodate, return -EIO.
1716 */
1717struct page *read_cache_page_async(struct address_space *mapping,
1718 pgoff_t index,
1719 int (*filler)(void *,struct page*),
1720 void *data)
1721{
1722 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
1723}
1713EXPORT_SYMBOL(read_cache_page_async); 1724EXPORT_SYMBOL(read_cache_page_async);
1714 1725
1726static struct page *wait_on_page_read(struct page *page)
1727{
1728 if (!IS_ERR(page)) {
1729 wait_on_page_locked(page);
1730 if (!PageUptodate(page)) {
1731 page_cache_release(page);
1732 page = ERR_PTR(-EIO);
1733 }
1734 }
1735 return page;
1736}
1737
1738/**
1739 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
1740 * @mapping: the page's address_space
1741 * @index: the page index
1742 * @gfp: the page allocator flags to use if allocating
1743 *
1744 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1745 * any new page allocations done using the specified allocation flags. Note
1746 * that the Radix tree operations will still use GFP_KERNEL, so you can't
1747 * expect to do this atomically or anything like that - but you can pass in
1748 * other page requirements.
1749 *
1750 * If the page does not get brought uptodate, return -EIO.
1751 */
1752struct page *read_cache_page_gfp(struct address_space *mapping,
1753 pgoff_t index,
1754 gfp_t gfp)
1755{
1756 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
1757
1758 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
1759}
1760EXPORT_SYMBOL(read_cache_page_gfp);
1761
1715/** 1762/**
1716 * read_cache_page - read into page cache, fill it if needed 1763 * read_cache_page - read into page cache, fill it if needed
1717 * @mapping: the page's address_space 1764 * @mapping: the page's address_space
@@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
1729 int (*filler)(void *,struct page*), 1776 int (*filler)(void *,struct page*),
1730 void *data) 1777 void *data)
1731{ 1778{
1732 struct page *page; 1779 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
1733
1734 page = read_cache_page_async(mapping, index, filler, data);
1735 if (IS_ERR(page))
1736 goto out;
1737 wait_on_page_locked(page);
1738 if (!PageUptodate(page)) {
1739 page_cache_release(page);
1740 page = ERR_PTR(-EIO);
1741 }
1742 out:
1743 return page;
1744} 1780}
1745EXPORT_SYMBOL(read_cache_page); 1781EXPORT_SYMBOL(read_cache_page);
1746 1782
@@ -2196,6 +2232,9 @@ again:
2196 if (unlikely(status)) 2232 if (unlikely(status))
2197 break; 2233 break;
2198 2234
2235 if (mapping_writably_mapped(mapping))
2236 flush_dcache_page(page);
2237
2199 pagefault_disable(); 2238 pagefault_disable();
2200 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2239 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2201 pagefault_enable(); 2240 pagefault_enable();
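
A hedged sketch of a filesystem using the new read_cache_page_gfp() helper to read a metadata page without GFP_KERNEL allocation; the mapping and index are assumed to be supplied by the caller:

#include <linux/pagemap.h>

static int demo_read_meta(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	/* Any newly allocated page cache page uses GFP_NOFS, so the
	 * read cannot recurse back into the filesystem. */
	page = read_cache_page_gfp(mapping, index, GFP_NOFS);
	if (IS_ERR(page))
		return PTR_ERR(page);

	/* ... inspect page contents ... */

	page_cache_release(page);
	return 0;
}
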
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65f38c218207..3a5aeb37c110 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page,
402{ 402{
403 int i; 403 int i;
404 404
405 if (unlikely(sz > MAX_ORDER_NR_PAGES)) { 405 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
406 clear_gigantic_page(page, addr, sz); 406 clear_gigantic_page(page, addr, sz);
407 return; 407 return;
408 } 408 }
@@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = {
1515 .attrs = hstate_attrs, 1515 .attrs = hstate_attrs,
1516}; 1516};
1517 1517
1518static int __init hugetlb_sysfs_add_hstate(struct hstate *h, 1518static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1519 struct kobject *parent, 1519 struct kobject **hstate_kobjs,
1520 struct kobject **hstate_kobjs, 1520 struct attribute_group *hstate_attr_group)
1521 struct attribute_group *hstate_attr_group)
1522{ 1521{
1523 int retval; 1522 int retval;
1524 int hi = h - hstates; 1523 int hi = h - hstates;
@@ -2088,7 +2087,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2088 2087
2089 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2088 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2090 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2089 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
2091 update_mmu_cache(vma, address, entry); 2090 update_mmu_cache(vma, address, ptep);
2092 } 2091 }
2093} 2092}
2094 2093
@@ -2559,7 +2558,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2559 entry = pte_mkyoung(entry); 2558 entry = pte_mkyoung(entry);
2560 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 2559 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2561 flags & FAULT_FLAG_WRITE)) 2560 flags & FAULT_FLAG_WRITE))
2562 update_mmu_cache(vma, address, entry); 2561 update_mmu_cache(vma, address, ptep);
2563 2562
2564out_page_table_lock: 2563out_page_table_lock:
2565 spin_unlock(&mm->page_table_lock); 2564 spin_unlock(&mm->page_table_lock);
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25f..4e348dbaecd7 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
14 * Safely read from address @src to the buffer at @dst. If a kernel fault 14 * Safely read from address @src to the buffer at @dst. If a kernel fault
15 * happens, handle that and return -EFAULT. 15 * happens, handle that and return -EFAULT.
16 */ 16 */
17long probe_kernel_read(void *dst, void *src, size_t size) 17
18long __weak probe_kernel_read(void *dst, void *src, size_t size)
19 __attribute__((alias("__probe_kernel_read")));
20
21long __probe_kernel_read(void *dst, void *src, size_t size)
18{ 22{
19 long ret; 23 long ret;
20 mm_segment_t old_fs = get_fs(); 24 mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
39 * Safely write to address @dst from the buffer at @src. If a kernel fault 43 * Safely write to address @dst from the buffer at @src. If a kernel fault
40 * happens, handle that and return -EFAULT. 44 * happens, handle that and return -EFAULT.
41 */ 45 */
42long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) 46long __weak probe_kernel_write(void *dst, void *src, size_t size)
47 __attribute__((alias("__probe_kernel_write")));
48
49long __probe_kernel_write(void *dst, void *src, size_t size)
43{ 50{
44 long ret; 51 long ret;
45 mm_segment_t old_fs = get_fs(); 52 mm_segment_t old_fs = get_fs();
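
Because probe_kernel_read()/probe_kernel_write() are now weak aliases of the generic __probe_* implementations, an architecture can provide a strong override and still fall back to the common code; a hedged sketch using the made-up helpers demo_addr_is_special() and demo_special_read():

#include <linux/uaccess.h>

long probe_kernel_read(void *dst, void *src, size_t size)
{
	/* demo_addr_is_special()/demo_special_read() are hypothetical
	 * arch helpers for memory the generic code cannot touch. */
	if (demo_addr_is_special((unsigned long)src))
		return demo_special_read(dst, src, size);

	return __probe_kernel_read(dst, src, size);
}
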
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 488b644e0e8e..954032b80bed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2586,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2586 if (free_all) 2586 if (free_all)
2587 goto try_to_free; 2587 goto try_to_free;
2588move_account: 2588move_account:
2589 while (mem->res.usage > 0) { 2589 do {
2590 ret = -EBUSY; 2590 ret = -EBUSY;
2591 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2591 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2592 goto out; 2592 goto out;
@@ -2614,8 +2614,8 @@ move_account:
2614 if (ret == -ENOMEM) 2614 if (ret == -ENOMEM)
2615 goto try_to_free; 2615 goto try_to_free;
2616 cond_resched(); 2616 cond_resched();
2617 } 2617 /* "ret" should also be checked to ensure all lists are empty. */
2618 ret = 0; 2618 } while (mem->res.usage > 0 || ret);
2619out: 2619out:
2620 css_put(&mem->css); 2620 css_put(&mem->css);
2621 return ret; 2621 return ret;
@@ -2648,10 +2648,7 @@ try_to_free:
2648 } 2648 }
2649 lru_add_drain(); 2649 lru_add_drain();
2650 /* try move_account...there may be some *locked* pages. */ 2650 /* try move_account...there may be some *locked* pages. */
2651 if (mem->res.usage) 2651 goto move_account;
2652 goto move_account;
2653 ret = 0;
2654 goto out;
2655} 2652}
2656 2653
2657int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2654int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1be7b67..72fb5f39bccc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1593,7 +1593,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1593 /* Ok, finally just insert the thing.. */ 1593 /* Ok, finally just insert the thing.. */
1594 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1594 entry = pte_mkspecial(pfn_pte(pfn, prot));
1595 set_pte_at(mm, addr, pte, entry); 1595 set_pte_at(mm, addr, pte, entry);
1596 update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ 1596 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1597 1597
1598 retval = 0; 1598 retval = 0;
1599out_unlock: 1599out_unlock:
@@ -2116,7 +2116,7 @@ reuse:
2116 entry = pte_mkyoung(orig_pte); 2116 entry = pte_mkyoung(orig_pte);
2117 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2117 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2118 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2118 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2119 update_mmu_cache(vma, address, entry); 2119 update_mmu_cache(vma, address, page_table);
2120 ret |= VM_FAULT_WRITE; 2120 ret |= VM_FAULT_WRITE;
2121 goto unlock; 2121 goto unlock;
2122 } 2122 }
@@ -2185,7 +2185,7 @@ gotten:
2185 * new page to be mapped directly into the secondary page table. 2185 * new page to be mapped directly into the secondary page table.
2186 */ 2186 */
2187 set_pte_at_notify(mm, address, page_table, entry); 2187 set_pte_at_notify(mm, address, page_table, entry);
2188 update_mmu_cache(vma, address, entry); 2188 update_mmu_cache(vma, address, page_table);
2189 if (old_page) { 2189 if (old_page) {
2190 /* 2190 /*
2191 * Only after switching the pte to the new page may 2191 * Only after switching the pte to the new page may
@@ -2629,7 +2629,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2629 } 2629 }
2630 2630
2631 /* No need to invalidate - it was non-present before */ 2631 /* No need to invalidate - it was non-present before */
2632 update_mmu_cache(vma, address, pte); 2632 update_mmu_cache(vma, address, page_table);
2633unlock: 2633unlock:
2634 pte_unmap_unlock(page_table, ptl); 2634 pte_unmap_unlock(page_table, ptl);
2635out: 2635out:
@@ -2694,7 +2694,7 @@ setpte:
2694 set_pte_at(mm, address, page_table, entry); 2694 set_pte_at(mm, address, page_table, entry);
2695 2695
2696 /* No need to invalidate - it was non-present before */ 2696 /* No need to invalidate - it was non-present before */
2697 update_mmu_cache(vma, address, entry); 2697 update_mmu_cache(vma, address, page_table);
2698unlock: 2698unlock:
2699 pte_unmap_unlock(page_table, ptl); 2699 pte_unmap_unlock(page_table, ptl);
2700 return 0; 2700 return 0;
@@ -2855,7 +2855,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2855 set_pte_at(mm, address, page_table, entry); 2855 set_pte_at(mm, address, page_table, entry);
2856 2856
2857 /* no need to invalidate: a not-present page won't be cached */ 2857 /* no need to invalidate: a not-present page won't be cached */
2858 update_mmu_cache(vma, address, entry); 2858 update_mmu_cache(vma, address, page_table);
2859 } else { 2859 } else {
2860 if (charged) 2860 if (charged)
2861 mem_cgroup_uncharge_page(page); 2861 mem_cgroup_uncharge_page(page);
@@ -2992,7 +2992,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2992 } 2992 }
2993 entry = pte_mkyoung(entry); 2993 entry = pte_mkyoung(entry);
2994 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 2994 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
2995 update_mmu_cache(vma, address, entry); 2995 update_mmu_cache(vma, address, pte);
2996 } else { 2996 } else {
2997 /* 2997 /*
2998 * This is needed only for protection faults but the arch code 2998 * This is needed only for protection faults but the arch code
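
These hunks switch update_mmu_cache() from taking the pte value to taking the pte pointer; a no-op stub under the new prototype, as a hypothetical architecture header might define it:

static inline void update_mmu_cache(struct vm_area_struct *vma,
				    unsigned long address, pte_t *ptep)
{
	/* Nothing to do: this (hypothetical) architecture keeps no
	 * software MMU cache that must track PTE updates. */
}
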
diff --git a/mm/migrate.c b/mm/migrate.c
index efddbf0926b2..edb6101ed774 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -134,7 +134,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
134 page_add_file_rmap(new); 134 page_add_file_rmap(new);
135 135
136 /* No need to invalidate - it was non-present before */ 136 /* No need to invalidate - it was non-present before */
137 update_mmu_cache(vma, addr, pte); 137 update_mmu_cache(vma, addr, ptep);
138unlock: 138unlock:
139 pte_unmap_unlock(ptep, ptl); 139 pte_unmap_unlock(ptep, ptl);
140out: 140out:
@@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
912 goto out_pm; 912 goto out_pm;
913 913
914 err = -ENODEV; 914 err = -ENODEV;
915 if (node < 0 || node >= MAX_NUMNODES)
916 goto out_pm;
917
915 if (!node_state(node, N_HIGH_MEMORY)) 918 if (!node_state(node, N_HIGH_MEMORY))
916 goto out_pm; 919 goto out_pm;
917 920
@@ -999,33 +1002,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
999#define DO_PAGES_STAT_CHUNK_NR 16 1002#define DO_PAGES_STAT_CHUNK_NR 16
1000 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; 1003 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1001 int chunk_status[DO_PAGES_STAT_CHUNK_NR]; 1004 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1002 unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1003 int err;
1004 1005
1005 for (i = 0; i < nr_pages; i += chunk_nr) { 1006 while (nr_pages) {
1006 if (chunk_nr > nr_pages - i) 1007 unsigned long chunk_nr;
1007 chunk_nr = nr_pages - i;
1008 1008
1009 err = copy_from_user(chunk_pages, &pages[i], 1009 chunk_nr = nr_pages;
1010 chunk_nr * sizeof(*chunk_pages)); 1010 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1011 if (err) { 1011 chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1012 err = -EFAULT; 1012
1013 goto out; 1013 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1014 } 1014 break;
1015 1015
1016 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); 1016 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1017 1017
1018 err = copy_to_user(&status[i], chunk_status, 1018 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1019 chunk_nr * sizeof(*chunk_status)); 1019 break;
1020 if (err) {
1021 err = -EFAULT;
1022 goto out;
1023 }
1024 }
1025 err = 0;
1026 1020
1027out: 1021 pages += chunk_nr;
1028 return err; 1022 status += chunk_nr;
1023 nr_pages -= chunk_nr;
1024 }
1025 return nr_pages ? -EFAULT : 0;
1029} 1026}
1030 1027
1031/* 1028/*
diff --git a/mm/mmap.c b/mm/mmap.c
index d9c77b2dbe9d..ee2298936fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1043} 1043}
1044EXPORT_SYMBOL(do_mmap_pgoff); 1044EXPORT_SYMBOL(do_mmap_pgoff);
1045 1045
1046SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1047 unsigned long, prot, unsigned long, flags,
1048 unsigned long, fd, unsigned long, pgoff)
1049{
1050 struct file *file = NULL;
1051 unsigned long retval = -EBADF;
1052
1053 if (!(flags & MAP_ANONYMOUS)) {
1054 if (unlikely(flags & MAP_HUGETLB))
1055 return -EINVAL;
1056 file = fget(fd);
1057 if (!file)
1058 goto out;
1059 } else if (flags & MAP_HUGETLB) {
1060 struct user_struct *user = NULL;
1061 /*
1062 * VM_NORESERVE is used because the reservations will be
1063 * taken when vm_ops->mmap() is called
1064 * A dummy user value is used because we are not locking
1065 * memory so no accounting is necessary
1066 */
1067 len = ALIGN(len, huge_page_size(&default_hstate));
1068 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
1069 &user, HUGETLB_ANONHUGE_INODE);
1070 if (IS_ERR(file))
1071 return PTR_ERR(file);
1072 }
1073
1074 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1075
1076 down_write(&current->mm->mmap_sem);
1077 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1078 up_write(&current->mm->mmap_sem);
1079
1080 if (file)
1081 fput(file);
1082out:
1083 return retval;
1084}
1085
1046/* 1086/*
1047 * Some shared mappings will want the pages marked read-only 1087 * Some shared mappings will want the pages marked read-only
1048 * to track write events. If so, we'll downgrade vm_page_prot 1088 * to track write events. If so, we'll downgrade vm_page_prot
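
With the common mmap_pgoff syscall above in place, an architecture's byte-offset mmap entry point can become a thin wrapper; a hedged sketch (the syscall name is illustrative):

#include <linux/syscalls.h>

SYSCALL_DEFINE6(demo_mmap, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, offset)
{
	if (offset & ~PAGE_MASK)
		return -EINVAL;

	return sys_mmap_pgoff(addr, len, prot, flags, fd,
			      offset >> PAGE_SHIFT);
}
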
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index ded9081f4021..0777654147c9 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmu_context.h> 7#include <linux/mmu_context.h>
8#include <linux/module.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
9 10
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm)
37 if (active_mm != mm) 38 if (active_mm != mm)
38 mmdrop(active_mm); 39 mmdrop(active_mm);
39} 40}
41EXPORT_SYMBOL_GPL(use_mm);
40 42
41/* 43/*
42 * unuse_mm 44 * unuse_mm
@@ -56,3 +58,4 @@ void unuse_mm(struct mm_struct *mm)
56 enter_lazy_tlb(mm, tsk); 58 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk); 59 task_unlock(tsk);
58} 60}
61EXPORT_SYMBOL_GPL(unuse_mm);
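
use_mm()/unuse_mm() are now exported for modules, so a kernel thread (an aio or vhost-style worker, for instance) can temporarily adopt a user mm; a minimal sketch, with the mm assumed to have been pinned by the caller:

#include <linux/mmu_context.h>
#include <linux/uaccess.h>

static int demo_copy_from_task(struct mm_struct *mm,
			       const void __user *ubuf,
			       void *kbuf, size_t len)
{
	int ret = 0;

	use_mm(mm);		/* make 'mm' current for this kthread */
	if (copy_from_user(kbuf, ubuf, len))
		ret = -EFAULT;
	unuse_mm(mm);		/* drop back to the lazy-TLB mm */

	return ret;
}
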
diff --git a/mm/nommu.c b/mm/nommu.c
index 8687973462bb..48a2ecfaf059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
432 /* 432 /*
433 * Ok, looks good - let it rip. 433 * Ok, looks good - let it rip.
434 */ 434 */
435 flush_icache_range(mm->brk, brk);
435 return mm->brk = brk; 436 return mm->brk = brk;
436} 437}
437 438
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
551static void __put_nommu_region(struct vm_region *region) 552static void __put_nommu_region(struct vm_region *region)
552 __releases(nommu_region_sem) 553 __releases(nommu_region_sem)
553{ 554{
554 kenter("%p{%d}", region, atomic_read(&region->vm_usage)); 555 kenter("%p{%d}", region, region->vm_usage);
555 556
556 BUG_ON(!nommu_region_tree.rb_node); 557 BUG_ON(!nommu_region_tree.rb_node);
557 558
558 if (atomic_dec_and_test(&region->vm_usage)) { 559 if (--region->vm_usage == 0) {
559 if (region->vm_top > region->vm_start) 560 if (region->vm_top > region->vm_start)
560 delete_nommu_region(region); 561 delete_nommu_region(region);
561 up_write(&nommu_region_sem); 562 up_write(&nommu_region_sem);
@@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1204 if (!vma) 1205 if (!vma)
1205 goto error_getting_vma; 1206 goto error_getting_vma;
1206 1207
1207 atomic_set(&region->vm_usage, 1); 1208 region->vm_usage = 1;
1208 region->vm_flags = vm_flags; 1209 region->vm_flags = vm_flags;
1209 region->vm_pgoff = pgoff; 1210 region->vm_pgoff = pgoff;
1210 1211
@@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1271 } 1272 }
1272 1273
1273 /* we've found a region we can share */ 1274 /* we've found a region we can share */
1274 atomic_inc(&pregion->vm_usage); 1275 pregion->vm_usage++;
1275 vma->vm_region = pregion; 1276 vma->vm_region = pregion;
1276 start = pregion->vm_start; 1277 start = pregion->vm_start;
1277 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; 1278 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1288 vma->vm_region = NULL; 1289 vma->vm_region = NULL;
1289 vma->vm_start = 0; 1290 vma->vm_start = 0;
1290 vma->vm_end = 0; 1291 vma->vm_end = 0;
1291 atomic_dec(&pregion->vm_usage); 1292 pregion->vm_usage--;
1292 pregion = NULL; 1293 pregion = NULL;
1293 goto error_just_free; 1294 goto error_just_free;
1294 } 1295 }
@@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file,
1353share: 1354share:
1354 add_vma_to_mm(current->mm, vma); 1355 add_vma_to_mm(current->mm, vma);
1355 1356
1356 up_write(&nommu_region_sem); 1357 /* we flush the region from the icache only when the first executable
1358 * mapping of it is made */
1359 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1360 flush_icache_range(region->vm_start, region->vm_end);
1361 region->vm_icache_flushed = true;
1362 }
1357 1363
1358 if (prot & PROT_EXEC) 1364 up_write(&nommu_region_sem);
1359 flush_icache_range(result, result + len);
1360 1365
1361 kleave(" = %lx", result); 1366 kleave(" = %lx", result);
1362 return result; 1367 return result;
@@ -1398,6 +1403,31 @@ error_getting_region:
1398} 1403}
1399EXPORT_SYMBOL(do_mmap_pgoff); 1404EXPORT_SYMBOL(do_mmap_pgoff);
1400 1405
1406SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1407 unsigned long, prot, unsigned long, flags,
1408 unsigned long, fd, unsigned long, pgoff)
1409{
1410 struct file *file = NULL;
1411 unsigned long retval = -EBADF;
1412
1413 if (!(flags & MAP_ANONYMOUS)) {
1414 file = fget(fd);
1415 if (!file)
1416 goto out;
1417 }
1418
1419 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1420
1421 down_write(&current->mm->mmap_sem);
1422 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1423 up_write(&current->mm->mmap_sem);
1424
1425 if (file)
1426 fput(file);
1427out:
1428 return retval;
1429}
1430
1401/* 1431/*
1402 * split a vma into two pieces at address 'addr', a new vma is allocated either 1432 * split a vma into two pieces at address 'addr', a new vma is allocated either
1403 * for the first part or the tail. 1433 * for the first part or the tail.
@@ -1411,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1411 1441
1412 kenter(""); 1442 kenter("");
1413 1443
1414 /* we're only permitted to split anonymous regions that have a single 1444 /* we're only permitted to split anonymous regions (these should have
1415 * owner */ 1445 * only a single usage on the region) */
1416 if (vma->vm_file || 1446 if (vma->vm_file)
1417 atomic_read(&vma->vm_region->vm_usage) != 1)
1418 return -ENOMEM; 1447 return -ENOMEM;
1419 1448
1420 if (mm->map_count >= sysctl_max_map_count) 1449 if (mm->map_count >= sysctl_max_map_count)
@@ -1488,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm,
1488 1517
1489 /* cut the backing region down to size */ 1518 /* cut the backing region down to size */
1490 region = vma->vm_region; 1519 region = vma->vm_region;
1491 BUG_ON(atomic_read(&region->vm_usage) != 1); 1520 BUG_ON(region->vm_usage != 1);
1492 1521
1493 down_write(&nommu_region_sem); 1522 down_write(&nommu_region_sem);
1494 delete_nommu_region(region); 1523 delete_nommu_region(region);
@@ -1732,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping,
1732EXPORT_SYMBOL(unmap_mapping_range); 1761EXPORT_SYMBOL(unmap_mapping_range);
1733 1762
1734/* 1763/*
1735 * ask for an unmapped area at which to create a mapping on a file
1736 */
1737unsigned long get_unmapped_area(struct file *file, unsigned long addr,
1738 unsigned long len, unsigned long pgoff,
1739 unsigned long flags)
1740{
1741 unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
1742 unsigned long, unsigned long);
1743
1744 get_area = current->mm->get_unmapped_area;
1745 if (file && file->f_op && file->f_op->get_unmapped_area)
1746 get_area = file->f_op->get_unmapped_area;
1747
1748 if (!get_area)
1749 return -ENOSYS;
1750
1751 return get_area(file, addr, len, pgoff, flags);
1752}
1753EXPORT_SYMBOL(get_unmapped_area);
1754
1755/*
1756 * Check that a process has enough memory to allocate a new virtual 1764 * Check that a process has enough memory to allocate a new virtual
1757 * mapping. 0 means there is enough memory for the allocation to 1765 * mapping. 0 means there is enough memory for the allocation to
1758 * succeed and -ENOMEM implies there is not. 1766 * succeed and -ENOMEM implies there is not.
@@ -1891,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1891 1899
1892 /* only read or write mappings where it is permitted */ 1900 /* only read or write mappings where it is permitted */
1893 if (write && vma->vm_flags & VM_MAYWRITE) 1901 if (write && vma->vm_flags & VM_MAYWRITE)
1894 len -= copy_to_user((void *) addr, buf, len); 1902 copy_to_user_page(vma, NULL, addr,
1903 (void *) addr, buf, len);
1895 else if (!write && vma->vm_flags & VM_MAYREAD) 1904 else if (!write && vma->vm_flags & VM_MAYREAD)
1896 len -= copy_from_user(buf, (void *) addr, len); 1905 copy_from_user_page(vma, NULL, addr,
1906 buf, (void *) addr, len);
1897 else 1907 else
1898 len = 0; 1908 len = 0;
1899 } else { 1909 } else {
@@ -1904,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1904 mmput(mm); 1914 mmput(mm);
1905 return len; 1915 return len;
1906} 1916}
1917
1918/**
1919 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1920 * @inode: The inode to check
1921 * @size: The current filesize of the inode
1922 * @newsize: The proposed filesize of the inode
1923 *
1924 * Check the shared mappings on an inode on behalf of a shrinking truncate to
1925 * make sure that that any outstanding VMAs aren't broken and then shrink the
1926 * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
1927 * automatically grant mappings that are too large.
1928 */
1929int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1930 size_t newsize)
1931{
1932 struct vm_area_struct *vma;
1933 struct prio_tree_iter iter;
1934 struct vm_region *region;
1935 pgoff_t low, high;
1936 size_t r_size, r_top;
1937
1938 low = newsize >> PAGE_SHIFT;
1939 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1940
1941 down_write(&nommu_region_sem);
1942
1943 /* search for VMAs that fall within the dead zone */
1944 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1945 low, high) {
1946 /* found one - only interested if it's shared out of the page
1947 * cache */
1948 if (vma->vm_flags & VM_SHARED) {
1949 up_write(&nommu_region_sem);
1950 return -ETXTBSY; /* not quite true, but near enough */
1951 }
1952 }
1953
1954 /* reduce any regions that overlap the dead zone - if in existence,
1955 * these will be pointed to by VMAs that don't overlap the dead zone
1956 *
1957 * we don't check for any regions that start beyond the EOF as there
1958 * shouldn't be any
1959 */
1960 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1961 0, ULONG_MAX) {
1962 if (!(vma->vm_flags & VM_SHARED))
1963 continue;
1964
1965 region = vma->vm_region;
1966 r_size = region->vm_top - region->vm_start;
1967 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1968
1969 if (r_top > newsize) {
1970 region->vm_top -= r_top - newsize;
1971 if (region->vm_end > region->vm_top)
1972 region->vm_end = region->vm_top;
1973 }
1974 }
1975
1976 up_write(&nommu_region_sem);
1977 return 0;
1978}
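
nommu_shrink_inode_mappings() is intended to be called from a NOMMU filesystem's truncate path before the size change is committed; a hedged sketch of such a caller:

static int demo_nommu_setsize(struct inode *inode, loff_t newsize)
{
	int ret;

	/* Refuse the shrink if it would cut the ground out from under
	 * an existing shared mapping; otherwise trim the regions. */
	ret = nommu_shrink_inode_mappings(inode, inode->i_size, newsize);
	if (ret)
		return ret;

	/* ... release the truncated file data ... */
	i_size_write(inode, newsize);
	return 0;
}
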
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f52481b1c1e5..237050478f28 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -459,6 +459,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
459 list_for_each_entry(c, &p->children, sibling) { 459 list_for_each_entry(c, &p->children, sibling) {
460 if (c->mm == p->mm) 460 if (c->mm == p->mm)
461 continue; 461 continue;
462 if (mem && !task_in_mem_cgroup(c, mem))
463 continue;
462 if (!oom_kill_task(c)) 464 if (!oom_kill_task(c))
463 return 0; 465 return 0;
464 } 466 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e9f5cc5fb59..a6b17aa4740b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -556,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
556 page = list_entry(list->prev, struct page, lru); 556 page = list_entry(list->prev, struct page, lru);
557 /* must delete as __free_one_page list manipulates */ 557 /* must delete as __free_one_page list manipulates */
558 list_del(&page->lru); 558 list_del(&page->lru);
559 __free_one_page(page, zone, 0, migratetype); 559 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
560 trace_mm_page_pcpu_drain(page, 0, migratetype); 560 __free_one_page(page, zone, 0, page_private(page));
561 trace_mm_page_pcpu_drain(page, 0, page_private(page));
561 } while (--count && --batch_free && !list_empty(list)); 562 } while (--count && --batch_free && !list_empty(list));
562 } 563 }
563 spin_unlock(&zone->lock); 564 spin_unlock(&zone->lock);
@@ -1008,10 +1009,10 @@ static void drain_pages(unsigned int cpu)
1008 struct per_cpu_pageset *pset; 1009 struct per_cpu_pageset *pset;
1009 struct per_cpu_pages *pcp; 1010 struct per_cpu_pages *pcp;
1010 1011
1011 pset = zone_pcp(zone, cpu); 1012 local_irq_save(flags);
1013 pset = per_cpu_ptr(zone->pageset, cpu);
1012 1014
1013 pcp = &pset->pcp; 1015 pcp = &pset->pcp;
1014 local_irq_save(flags);
1015 free_pcppages_bulk(zone, pcp->count, pcp); 1016 free_pcppages_bulk(zone, pcp->count, pcp);
1016 pcp->count = 0; 1017 pcp->count = 0;
1017 local_irq_restore(flags); 1018 local_irq_restore(flags);
@@ -1095,7 +1096,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1095 arch_free_page(page, 0); 1096 arch_free_page(page, 0);
1096 kernel_map_pages(page, 1, 0); 1097 kernel_map_pages(page, 1, 0);
1097 1098
1098 pcp = &zone_pcp(zone, get_cpu())->pcp;
1099 migratetype = get_pageblock_migratetype(page); 1099 migratetype = get_pageblock_migratetype(page);
1100 set_page_private(page, migratetype); 1100 set_page_private(page, migratetype);
1101 local_irq_save(flags); 1101 local_irq_save(flags);
@@ -1118,6 +1118,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1118 migratetype = MIGRATE_MOVABLE; 1118 migratetype = MIGRATE_MOVABLE;
1119 } 1119 }
1120 1120
1121 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1121 if (cold) 1122 if (cold)
1122 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1123 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1123 else 1124 else
@@ -1130,7 +1131,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1130 1131
1131out: 1132out:
1132 local_irq_restore(flags); 1133 local_irq_restore(flags);
1133 put_cpu();
1134} 1134}
1135 1135
1136void free_hot_page(struct page *page) 1136void free_hot_page(struct page *page)
@@ -1180,17 +1180,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1180 unsigned long flags; 1180 unsigned long flags;
1181 struct page *page; 1181 struct page *page;
1182 int cold = !!(gfp_flags & __GFP_COLD); 1182 int cold = !!(gfp_flags & __GFP_COLD);
1183 int cpu;
1184 1183
1185again: 1184again:
1186 cpu = get_cpu();
1187 if (likely(order == 0)) { 1185 if (likely(order == 0)) {
1188 struct per_cpu_pages *pcp; 1186 struct per_cpu_pages *pcp;
1189 struct list_head *list; 1187 struct list_head *list;
1190 1188
1191 pcp = &zone_pcp(zone, cpu)->pcp;
1192 list = &pcp->lists[migratetype];
1193 local_irq_save(flags); 1189 local_irq_save(flags);
1190 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1191 list = &pcp->lists[migratetype];
1194 if (list_empty(list)) { 1192 if (list_empty(list)) {
1195 pcp->count += rmqueue_bulk(zone, 0, 1193 pcp->count += rmqueue_bulk(zone, 0,
1196 pcp->batch, list, 1194 pcp->batch, list,
@@ -1222,16 +1220,15 @@ again:
1222 } 1220 }
1223 spin_lock_irqsave(&zone->lock, flags); 1221 spin_lock_irqsave(&zone->lock, flags);
1224 page = __rmqueue(zone, order, migratetype); 1222 page = __rmqueue(zone, order, migratetype);
1225 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1226 spin_unlock(&zone->lock); 1223 spin_unlock(&zone->lock);
1227 if (!page) 1224 if (!page)
1228 goto failed; 1225 goto failed;
1226 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1229 } 1227 }
1230 1228
1231 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1229 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1232 zone_statistics(preferred_zone, zone); 1230 zone_statistics(preferred_zone, zone);
1233 local_irq_restore(flags); 1231 local_irq_restore(flags);
1234 put_cpu();
1235 1232
1236 VM_BUG_ON(bad_range(zone, page)); 1233 VM_BUG_ON(bad_range(zone, page));
1237 if (prep_new_page(page, order, gfp_flags)) 1234 if (prep_new_page(page, order, gfp_flags))
@@ -1240,7 +1237,6 @@ again:
1240 1237
1241failed: 1238failed:
1242 local_irq_restore(flags); 1239 local_irq_restore(flags);
1243 put_cpu();
1244 return NULL; 1240 return NULL;
1245} 1241}
1246 1242
@@ -2179,7 +2175,7 @@ void show_free_areas(void)
2179 for_each_online_cpu(cpu) { 2175 for_each_online_cpu(cpu) {
2180 struct per_cpu_pageset *pageset; 2176 struct per_cpu_pageset *pageset;
2181 2177
2182 pageset = zone_pcp(zone, cpu); 2178 pageset = per_cpu_ptr(zone->pageset, cpu);
2183 2179
2184 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2180 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2185 cpu, pageset->pcp.high, 2181 cpu, pageset->pcp.high,
@@ -2744,10 +2740,29 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2744 2740
2745#endif /* CONFIG_NUMA */ 2741#endif /* CONFIG_NUMA */
2746 2742
2743/*
2744 * Boot pageset table. One per cpu which is going to be used for all
2745 * zones and all nodes. The parameters will be set in such a way
2746 * that an item put on a list will immediately be handed over to
2747 * the buddy list. This is safe since pageset manipulation is done
2748 * with interrupts disabled.
2749 *
2750 * The boot_pagesets must be kept even after bootup is complete for
2751 * unused processors and/or zones. They do play a role for bootstrapping
2752 * hotplugged processors.
2753 *
2754 * zoneinfo_show() and maybe other functions do
2755 * not check if the processor is online before following the pageset pointer.
2756 * Other parts of the kernel may not check if the zone is available.
2757 */
2758static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2759static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2760
2747/* return values int ....just for stop_machine() */ 2761/* return values int ....just for stop_machine() */
2748static int __build_all_zonelists(void *dummy) 2762static int __build_all_zonelists(void *dummy)
2749{ 2763{
2750 int nid; 2764 int nid;
2765 int cpu;
2751 2766
2752#ifdef CONFIG_NUMA 2767#ifdef CONFIG_NUMA
2753 memset(node_load, 0, sizeof(node_load)); 2768 memset(node_load, 0, sizeof(node_load));
@@ -2758,6 +2773,23 @@ static int __build_all_zonelists(void *dummy)
2758 build_zonelists(pgdat); 2773 build_zonelists(pgdat);
2759 build_zonelist_cache(pgdat); 2774 build_zonelist_cache(pgdat);
2760 } 2775 }
2776
2777 /*
2778 * Initialize the boot_pagesets that are going to be used
2779 * for bootstrapping processors. The real pagesets for
2780 * each zone will be allocated later when the per cpu
2781 * allocator is available.
2782 *
2783 * boot_pagesets are used also for bootstrapping offline
2784 * cpus if the system is already booted because the pagesets
2785 * are needed to initialize allocators on a specific cpu too.
2786 * F.e. the percpu allocator needs the page allocator which
2787 * needs the percpu allocator in order to allocate its pagesets
2788 * (a chicken-egg dilemma).
2789 */
2790 for_each_possible_cpu(cpu)
2791 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
2792
2761 return 0; 2793 return 0;
2762} 2794}
2763 2795
@@ -3095,121 +3127,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3095 pcp->batch = PAGE_SHIFT * 8; 3127 pcp->batch = PAGE_SHIFT * 8;
3096} 3128}
3097 3129
3098
3099#ifdef CONFIG_NUMA
3100/*
3101 * Boot pageset table. One per cpu which is going to be used for all
3102 * zones and all nodes. The parameters will be set in such a way
3103 * that an item put on a list will immediately be handed over to
3104 * the buddy list. This is safe since pageset manipulation is done
3105 * with interrupts disabled.
3106 *
3107 * Some NUMA counter updates may also be caught by the boot pagesets.
3108 *
3109 * The boot_pagesets must be kept even after bootup is complete for
3110 * unused processors and/or zones. They do play a role for bootstrapping
3111 * hotplugged processors.
3112 *
3113 * zoneinfo_show() and maybe other functions do
3114 * not check if the processor is online before following the pageset pointer.
3115 * Other parts of the kernel may not check if the zone is available.
3116 */
3117static struct per_cpu_pageset boot_pageset[NR_CPUS];
3118
3119/* 3130/*
3120 * Dynamically allocate memory for the 3131 * Allocate per cpu pagesets and initialize them.
3121 * per cpu pageset array in struct zone. 3132 * Before this call only boot pagesets were available.
3133 * Boot pagesets will no longer be used by this processorr
3134 * after setup_per_cpu_pageset().
3122 */ 3135 */
3123static int __cpuinit process_zones(int cpu) 3136void __init setup_per_cpu_pageset(void)
3124{ 3137{
3125 struct zone *zone, *dzone; 3138 struct zone *zone;
3126 int node = cpu_to_node(cpu); 3139 int cpu;
3127
3128 node_set_state(node, N_CPU); /* this node has a cpu */
3129 3140
3130 for_each_populated_zone(zone) { 3141 for_each_populated_zone(zone) {
3131 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 3142 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3132 GFP_KERNEL, node);
3133 if (!zone_pcp(zone, cpu))
3134 goto bad;
3135 3143
3136 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 3144 for_each_possible_cpu(cpu) {
3145 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3137 3146
3138 if (percpu_pagelist_fraction) 3147 setup_pageset(pcp, zone_batchsize(zone));
3139 setup_pagelist_highmark(zone_pcp(zone, cpu),
3140 (zone->present_pages / percpu_pagelist_fraction));
3141 }
3142 3148
3143 return 0; 3149 if (percpu_pagelist_fraction)
3144bad: 3150 setup_pagelist_highmark(pcp,
3145 for_each_zone(dzone) { 3151 (zone->present_pages /
3146 if (!populated_zone(dzone)) 3152 percpu_pagelist_fraction));
3147 continue; 3153 }
3148 if (dzone == zone)
3149 break;
3150 kfree(zone_pcp(dzone, cpu));
3151 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
3152 }
3153 return -ENOMEM;
3154}
3155
3156static inline void free_zone_pagesets(int cpu)
3157{
3158 struct zone *zone;
3159
3160 for_each_zone(zone) {
3161 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
3162
3163 /* Free per_cpu_pageset if it is slab allocated */
3164 if (pset != &boot_pageset[cpu])
3165 kfree(pset);
3166 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3167 }
3168}
3169
3170static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
3171 unsigned long action,
3172 void *hcpu)
3173{
3174 int cpu = (long)hcpu;
3175 int ret = NOTIFY_OK;
3176
3177 switch (action) {
3178 case CPU_UP_PREPARE:
3179 case CPU_UP_PREPARE_FROZEN:
3180 if (process_zones(cpu))
3181 ret = NOTIFY_BAD;
3182 break;
3183 case CPU_UP_CANCELED:
3184 case CPU_UP_CANCELED_FROZEN:
3185 case CPU_DEAD:
3186 case CPU_DEAD_FROZEN:
3187 free_zone_pagesets(cpu);
3188 break;
3189 default:
3190 break;
3191 } 3154 }
3192 return ret;
3193}
3194
3195static struct notifier_block __cpuinitdata pageset_notifier =
3196 { &pageset_cpuup_callback, NULL, 0 };
3197
3198void __init setup_per_cpu_pageset(void)
3199{
3200 int err;
3201
3202 /* Initialize per_cpu_pageset for cpu 0.
3203 * A cpuup callback will do this for every cpu
3204 * as it comes online
3205 */
3206 err = process_zones(smp_processor_id());
3207 BUG_ON(err);
3208 register_cpu_notifier(&pageset_notifier);
3209} 3155}
3210 3156
3211#endif
3212
3213static noinline __init_refok 3157static noinline __init_refok
3214int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3158int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3215{ 3159{
@@ -3263,7 +3207,7 @@ static int __zone_pcp_update(void *data)
3263 struct per_cpu_pageset *pset; 3207 struct per_cpu_pageset *pset;
3264 struct per_cpu_pages *pcp; 3208 struct per_cpu_pages *pcp;
3265 3209
3266 pset = zone_pcp(zone, cpu); 3210 pset = per_cpu_ptr(zone->pageset, cpu);
3267 pcp = &pset->pcp; 3211 pcp = &pset->pcp;
3268 3212
3269 local_irq_save(flags); 3213 local_irq_save(flags);
@@ -3281,21 +3225,17 @@ void zone_pcp_update(struct zone *zone)
3281 3225
3282static __meminit void zone_pcp_init(struct zone *zone) 3226static __meminit void zone_pcp_init(struct zone *zone)
3283{ 3227{
3284 int cpu; 3228 /*
3285 unsigned long batch = zone_batchsize(zone); 3229 * per cpu subsystem is not up at this point. The following code
3230 * relies on the ability of the linker to provide the
3231 * offset of a (static) per cpu variable into the per cpu area.
3232 */
3233 zone->pageset = &boot_pageset;
3286 3234
3287 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3288#ifdef CONFIG_NUMA
3289 /* Early boot. Slab allocator not functional yet */
3290 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3291 setup_pageset(&boot_pageset[cpu],0);
3292#else
3293 setup_pageset(zone_pcp(zone,cpu), batch);
3294#endif
3295 }
3296 if (zone->present_pages) 3235 if (zone->present_pages)
3297 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 3236 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3298 zone->name, zone->present_pages, batch); 3237 zone->name, zone->present_pages,
3238 zone_batchsize(zone));
3299} 3239}
3300 3240
3301__meminit int init_currently_empty_zone(struct zone *zone, 3241__meminit int init_currently_empty_zone(struct zone *zone,
@@ -3434,6 +3374,61 @@ void __init free_bootmem_with_active_regions(int nid,
3434 } 3374 }
3435} 3375}
3436 3376
3377int __init add_from_early_node_map(struct range *range, int az,
3378 int nr_range, int nid)
3379{
3380 int i;
3381 u64 start, end;
3382
3383 /* need to go over early_node_map to find out good range for node */
3384 for_each_active_range_index_in_nid(i, nid) {
3385 start = early_node_map[i].start_pfn;
3386 end = early_node_map[i].end_pfn;
3387 nr_range = add_range(range, az, nr_range, start, end);
3388 }
3389 return nr_range;
3390}
3391
3392#ifdef CONFIG_NO_BOOTMEM
3393void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3394 u64 goal, u64 limit)
3395{
3396 int i;
3397 void *ptr;
3398
3399 /* need to go over early_node_map to find out good range for node */
3400 for_each_active_range_index_in_nid(i, nid) {
3401 u64 addr;
3402 u64 ei_start, ei_last;
3403
3404 ei_last = early_node_map[i].end_pfn;
3405 ei_last <<= PAGE_SHIFT;
3406 ei_start = early_node_map[i].start_pfn;
3407 ei_start <<= PAGE_SHIFT;
3408 addr = find_early_area(ei_start, ei_last,
3409 goal, limit, size, align);
3410
3411 if (addr == -1ULL)
3412 continue;
3413
3414#if 0
3415 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3416 nid,
3417 ei_start, ei_last, goal, limit, size,
3418 align, addr);
3419#endif
3420
3421 ptr = phys_to_virt(addr);
3422 memset(ptr, 0, size);
3423 reserve_early_without_check(addr, addr + size, "BOOTMEM");
3424 return ptr;
3425 }
3426
3427 return NULL;
3428}
3429#endif
3430
3431
3437void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3432void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3438{ 3433{
3439 int i; 3434 int i;
@@ -3998,7 +3993,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3998 } 3993 }
3999 3994
4000 /* Merge backward if suitable */ 3995 /* Merge backward if suitable */
4001 if (start_pfn < early_node_map[i].end_pfn && 3996 if (start_pfn < early_node_map[i].start_pfn &&
4002 end_pfn >= early_node_map[i].start_pfn) { 3997 end_pfn >= early_node_map[i].start_pfn) {
4003 early_node_map[i].start_pfn = start_pfn; 3998 early_node_map[i].start_pfn = start_pfn;
4004 return; 3999 return;
@@ -4466,7 +4461,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4466} 4461}
4467 4462
4468#ifndef CONFIG_NEED_MULTIPLE_NODES 4463#ifndef CONFIG_NEED_MULTIPLE_NODES
4469struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; 4464struct pglist_data __refdata contig_page_data = {
4465#ifndef CONFIG_NO_BOOTMEM
4466 .bdata = &bootmem_node_data[0]
4467#endif
4468 };
4470EXPORT_SYMBOL(contig_page_data); 4469EXPORT_SYMBOL(contig_page_data);
4471#endif 4470#endif
4472 4471
@@ -4809,10 +4808,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4809 if (!write || (ret == -EINVAL)) 4808 if (!write || (ret == -EINVAL))
4810 return ret; 4809 return ret;
4811 for_each_populated_zone(zone) { 4810 for_each_populated_zone(zone) {
4812 for_each_online_cpu(cpu) { 4811 for_each_possible_cpu(cpu) {
4813 unsigned long high; 4812 unsigned long high;
4814 high = zone->present_pages / percpu_pagelist_fraction; 4813 high = zone->present_pages / percpu_pagelist_fraction;
4815 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4814 setup_pagelist_highmark(
4815 per_cpu_ptr(zone->pageset, cpu), high);
4816 } 4816 }
4817 } 4817 }
4818 return 0; 4818 return 0;
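
The page_alloc.c hunks above replace the static NR_CPUS pageset arrays reached via zone_pcp() with a dynamically allocated zone->pageset that is accessed through per_cpu_ptr() for a given CPU or this_cpu_ptr() for the local one, with the static boot_pageset covering the window before the percpu allocator is up. A minimal sketch of the new access pattern, using only fields visible in the hunks (zone_dump_pcp() is a hypothetical helper, not part of the patch):

static void zone_dump_pcp(struct zone *zone)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		/* zone->pageset is a per-cpu pointer after this patch */
		struct per_cpu_pageset *pset =
			per_cpu_ptr(zone->pageset, cpu);

		printk(KERN_DEBUG "cpu %d: count:%d high:%d batch:%d\n",
		       cpu, pset->pcp.count, pset->pcp.high,
		       pset->pcp.batch);
	}
}
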
diff --git a/mm/percpu.c b/mm/percpu.c
index 442010cc91c6..768419d44ad7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,13 +80,15 @@
80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
81#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
82#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
83 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ 83 (void __percpu *)((unsigned long)(addr) - \
84 + (unsigned long)__per_cpu_start) 84 (unsigned long)pcpu_base_addr + \
85 (unsigned long)__per_cpu_start)
85#endif 86#endif
86#ifndef __pcpu_ptr_to_addr 87#ifndef __pcpu_ptr_to_addr
87#define __pcpu_ptr_to_addr(ptr) \ 88#define __pcpu_ptr_to_addr(ptr) \
88 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ 89 (void __force *)((unsigned long)(ptr) + \
89 - (unsigned long)__per_cpu_start) 90 (unsigned long)pcpu_base_addr - \
91 (unsigned long)__per_cpu_start)
90#endif 92#endif
91 93
92struct pcpu_chunk { 94struct pcpu_chunk {
@@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
913 int rs, re; 915 int rs, re;
914 916
915 /* quick path, check whether it's empty already */ 917 /* quick path, check whether it's empty already */
916 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 918 rs = page_start;
917 if (rs == page_start && re == page_end) 919 pcpu_next_unpop(chunk, &rs, &re, page_end);
918 return; 920 if (rs == page_start && re == page_end)
919 break; 921 return;
920 }
921 922
922 /* immutable chunks can't be depopulated */ 923 /* immutable chunks can't be depopulated */
923 WARN_ON(chunk->immutable); 924 WARN_ON(chunk->immutable);
@@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
968 int rs, re, rc; 969 int rs, re, rc;
969 970
970 /* quick path, check whether all pages are already there */ 971 /* quick path, check whether all pages are already there */
971 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { 972 rs = page_start;
972 if (rs == page_start && re == page_end) 973 pcpu_next_pop(chunk, &rs, &re, page_end);
973 goto clear; 974 if (rs == page_start && re == page_end)
974 break; 975 goto clear;
975 }
976 976
977 /* need to allocate and map pages, this chunk can't be immutable */ 977 /* need to allocate and map pages, this chunk can't be immutable */
978 WARN_ON(chunk->immutable); 978 WARN_ON(chunk->immutable);
@@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
1067 * RETURNS: 1067 * RETURNS:
1068 * Percpu pointer to the allocated area on success, NULL on failure. 1068 * Percpu pointer to the allocated area on success, NULL on failure.
1069 */ 1069 */
1070static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1070static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
1071{ 1071{
1072 static int warn_limit = 10; 1072 static int warn_limit = 10;
1073 struct pcpu_chunk *chunk; 1073 struct pcpu_chunk *chunk;
@@ -1196,7 +1196,7 @@ fail_unlock_mutex:
1196 * RETURNS: 1196 * RETURNS:
1197 * Percpu pointer to the allocated area on success, NULL on failure. 1197 * Percpu pointer to the allocated area on success, NULL on failure.
1198 */ 1198 */
1199void *__alloc_percpu(size_t size, size_t align) 1199void __percpu *__alloc_percpu(size_t size, size_t align)
1200{ 1200{
1201 return pcpu_alloc(size, align, false); 1201 return pcpu_alloc(size, align, false);
1202} 1202}
@@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
1217 * RETURNS: 1217 * RETURNS:
1218 * Percpu pointer to the allocated area on success, NULL on failure. 1218 * Percpu pointer to the allocated area on success, NULL on failure.
1219 */ 1219 */
1220void *__alloc_reserved_percpu(size_t size, size_t align) 1220void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1221{ 1221{
1222 return pcpu_alloc(size, align, true); 1222 return pcpu_alloc(size, align, true);
1223} 1223}
@@ -1269,9 +1269,9 @@ static void pcpu_reclaim(struct work_struct *work)
1269 * CONTEXT: 1269 * CONTEXT:
1270 * Can be called from atomic context. 1270 * Can be called from atomic context.
1271 */ 1271 */
1272void free_percpu(void *ptr) 1272void free_percpu(void __percpu *ptr)
1273{ 1273{
1274 void *addr = __pcpu_ptr_to_addr(ptr); 1274 void *addr;
1275 struct pcpu_chunk *chunk; 1275 struct pcpu_chunk *chunk;
1276 unsigned long flags; 1276 unsigned long flags;
1277 int off; 1277 int off;
@@ -1279,6 +1279,8 @@ void free_percpu(void *ptr)
1279 if (!ptr) 1279 if (!ptr)
1280 return; 1280 return;
1281 1281
1282 addr = __pcpu_ptr_to_addr(ptr);
1283
1282 spin_lock_irqsave(&pcpu_lock, flags); 1284 spin_lock_irqsave(&pcpu_lock, flags);
1283 1285
1284 chunk = pcpu_chunk_addr_search(addr); 1286 chunk = pcpu_chunk_addr_search(addr);
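
The percpu.c hunks above add the sparse __percpu address-space annotation to the allocator's public entry points, so pointers returned by __alloc_percpu() are only dereferenced through the per-cpu accessors. A minimal usage sketch under that annotation (struct foo, foo_stats and the foo_*() helpers are hypothetical names, not part of the patch):

struct foo {
	unsigned long hits;
};

static struct foo __percpu *foo_stats;

static int foo_init(void)
{
	foo_stats = alloc_percpu(struct foo);	/* wraps __alloc_percpu() */
	if (!foo_stats)
		return -ENOMEM;
	return 0;
}

static void foo_bump(void)
{
	this_cpu_inc(foo_stats->hits);		/* sparse-clean per-cpu access */
}

static void foo_exit(void)
{
	free_percpu(foo_stats);			/* now takes a void __percpu * */
}
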
diff --git a/mm/slab.c b/mm/slab.c
index 0c632a946ea1..a9f325b28bed 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -654,7 +654,7 @@ static void init_node_lock_keys(int q)
654 654
655 l3 = s->cs_cachep->nodelists[q]; 655 l3 = s->cs_cachep->nodelists[q];
656 if (!l3 || OFF_SLAB(s->cs_cachep)) 656 if (!l3 || OFF_SLAB(s->cs_cachep))
657 return; 657 continue;
658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
659 alc = l3->alien; 659 alc = l3->alien;
660 /* 660 /*
@@ -665,7 +665,7 @@ static void init_node_lock_keys(int q)
665 * for alloc_alien_cache, 665 * for alloc_alien_cache,
666 */ 666 */
667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
668 return; 668 continue;
669 for_each_node(r) { 669 for_each_node(r) {
670 if (alc[r]) 670 if (alc[r])
671 lockdep_set_class(&alc[r]->lock, 671 lockdep_set_class(&alc[r]->lock,
@@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to,
935 935
936 from->avail -= nr; 936 from->avail -= nr;
937 to->avail += nr; 937 to->avail += nr;
938 to->touched = 1;
939 return nr; 938 return nr;
940} 939}
941 940
@@ -2961,8 +2960,10 @@ retry:
2961 spin_lock(&l3->list_lock); 2960 spin_lock(&l3->list_lock);
2962 2961
2963 /* See if we can refill from the shared array */ 2962 /* See if we can refill from the shared array */
2964 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) 2963 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
2964 l3->shared->touched = 1;
2965 goto alloc_done; 2965 goto alloc_done;
2966 }
2966 2967
2967 while (batchcount > 0) { 2968 while (batchcount > 0) {
2968 struct list_head *entry; 2969 struct list_head *entry;
@@ -3099,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3099 if (cachep == &cache_cache) 3100 if (cachep == &cache_cache)
3100 return false; 3101 return false;
3101 3102
3102 return should_failslab(obj_size(cachep), flags); 3103 return should_failslab(obj_size(cachep), flags, cachep->flags);
3103} 3104}
3104 3105
3105static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3106static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
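
The slab.c hunk above passes the cache's flags into should_failslab(), so slab fault injection can be limited to caches that opted in. A rough sketch of the kind of filter the extra argument enables, assuming a SLAB_FAILSLAB opt-in bit like the one the slub.c hunks below add (failslab_allowed() and the cache_filter knob are illustrative, not quoted from mm/failslab.c):

static bool cache_filter_enabled;	/* illustrative opt-in knob */

static bool failslab_allowed(size_t size, gfp_t gfpflags,
			     unsigned long cacheflags)
{
	if (gfpflags & __GFP_NOFAIL)
		return false;			/* never inject for __GFP_NOFAIL */
	if (cache_filter_enabled && !(cacheflags & SLAB_FAILSLAB))
		return false;			/* cache did not opt in */
	return true;				/* candidate for fault injection */
}
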
diff --git a/mm/slub.c b/mm/slub.c
index 8d71aaf888d7..3525a4ec9794 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -151,7 +151,8 @@
151 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
152 */ 152 */
153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) 154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
155 SLAB_FAILSLAB)
155 156
156#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
157 SLAB_CACHE_DMA | SLAB_NOTRACK) 158 SLAB_CACHE_DMA | SLAB_NOTRACK)
@@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
217 218
218#endif 219#endif
219 220
220static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) 221static inline void stat(struct kmem_cache *s, enum stat_item si)
221{ 222{
222#ifdef CONFIG_SLUB_STATS 223#ifdef CONFIG_SLUB_STATS
223 c->stat[si]++; 224 __this_cpu_inc(s->cpu_slab->stat[si]);
224#endif 225#endif
225} 226}
226 227
@@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
242#endif 243#endif
243} 244}
244 245
245static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
246{
247#ifdef CONFIG_SMP
248 return s->cpu_slab[cpu];
249#else
250 return &s->cpu_slab;
251#endif
252}
253
254/* Verify that a pointer has an address that is valid within a slab page */ 246/* Verify that a pointer has an address that is valid within a slab page */
255static inline int check_valid_pointer(struct kmem_cache *s, 247static inline int check_valid_pointer(struct kmem_cache *s,
256 struct page *page, const void *object) 248 struct page *page, const void *object)
@@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s,
269 return 1; 261 return 1;
270} 262}
271 263
272/*
273 * Slow version of get and set free pointer.
274 *
275 * This version requires touching the cache lines of kmem_cache which
276 * we avoid to do in the fast alloc free paths. There we obtain the offset
277 * from the page struct.
278 */
279static inline void *get_freepointer(struct kmem_cache *s, void *object) 264static inline void *get_freepointer(struct kmem_cache *s, void *object)
280{ 265{
281 return *(void **)(object + s->offset); 266 return *(void **)(object + s->offset);
@@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str)
1020 case 't': 1005 case 't':
1021 slub_debug |= SLAB_TRACE; 1006 slub_debug |= SLAB_TRACE;
1022 break; 1007 break;
1008 case 'a':
1009 slub_debug |= SLAB_FAILSLAB;
1010 break;
1023 default: 1011 default:
1024 printk(KERN_ERR "slub_debug option '%c' " 1012 printk(KERN_ERR "slub_debug option '%c' "
1025 "unknown. skipped\n", *str); 1013 "unknown. skipped\n", *str);
@@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1124 if (!page) 1112 if (!page)
1125 return NULL; 1113 return NULL;
1126 1114
1127 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1115 stat(s, ORDER_FALLBACK);
1128 } 1116 }
1129 1117
1130 if (kmemcheck_enabled 1118 if (kmemcheck_enabled
@@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1422static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1410static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1423{ 1411{
1424 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1412 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1425 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1426 1413
1427 __ClearPageSlubFrozen(page); 1414 __ClearPageSlubFrozen(page);
1428 if (page->inuse) { 1415 if (page->inuse) {
1429 1416
1430 if (page->freelist) { 1417 if (page->freelist) {
1431 add_partial(n, page, tail); 1418 add_partial(n, page, tail);
1432 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1419 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1433 } else { 1420 } else {
1434 stat(c, DEACTIVATE_FULL); 1421 stat(s, DEACTIVATE_FULL);
1435 if (SLABDEBUG && PageSlubDebug(page) && 1422 if (SLABDEBUG && PageSlubDebug(page) &&
1436 (s->flags & SLAB_STORE_USER)) 1423 (s->flags & SLAB_STORE_USER))
1437 add_full(n, page); 1424 add_full(n, page);
1438 } 1425 }
1439 slab_unlock(page); 1426 slab_unlock(page);
1440 } else { 1427 } else {
1441 stat(c, DEACTIVATE_EMPTY); 1428 stat(s, DEACTIVATE_EMPTY);
1442 if (n->nr_partial < s->min_partial) { 1429 if (n->nr_partial < s->min_partial) {
1443 /* 1430 /*
1444 * Adding an empty slab to the partial slabs in order 1431 * Adding an empty slab to the partial slabs in order
@@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1454 slab_unlock(page); 1441 slab_unlock(page);
1455 } else { 1442 } else {
1456 slab_unlock(page); 1443 slab_unlock(page);
1457 stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); 1444 stat(s, FREE_SLAB);
1458 discard_slab(s, page); 1445 discard_slab(s, page);
1459 } 1446 }
1460 } 1447 }
@@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1469 int tail = 1; 1456 int tail = 1;
1470 1457
1471 if (page->freelist) 1458 if (page->freelist)
1472 stat(c, DEACTIVATE_REMOTE_FREES); 1459 stat(s, DEACTIVATE_REMOTE_FREES);
1473 /* 1460 /*
1474 * Merge cpu freelist into slab freelist. Typically we get here 1461 * Merge cpu freelist into slab freelist. Typically we get here
1475 * because both freelists are empty. So this is unlikely 1462 * because both freelists are empty. So this is unlikely
@@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1482 1469
1483 /* Retrieve object from cpu_freelist */ 1470 /* Retrieve object from cpu_freelist */
1484 object = c->freelist; 1471 object = c->freelist;
1485 c->freelist = c->freelist[c->offset]; 1472 c->freelist = get_freepointer(s, c->freelist);
1486 1473
1487 /* And put onto the regular freelist */ 1474 /* And put onto the regular freelist */
1488 object[c->offset] = page->freelist; 1475 set_freepointer(s, object, page->freelist);
1489 page->freelist = object; 1476 page->freelist = object;
1490 page->inuse--; 1477 page->inuse--;
1491 } 1478 }
@@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1495 1482
1496static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1483static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1497{ 1484{
1498 stat(c, CPUSLAB_FLUSH); 1485 stat(s, CPUSLAB_FLUSH);
1499 slab_lock(c->page); 1486 slab_lock(c->page);
1500 deactivate_slab(s, c); 1487 deactivate_slab(s, c);
1501} 1488}
@@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1507 */ 1494 */
1508static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1495static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1509{ 1496{
1510 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1497 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1511 1498
1512 if (likely(c && c->page)) 1499 if (likely(c && c->page))
1513 flush_slab(s, c); 1500 flush_slab(s, c);
@@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1635 if (unlikely(!node_match(c, node))) 1622 if (unlikely(!node_match(c, node)))
1636 goto another_slab; 1623 goto another_slab;
1637 1624
1638 stat(c, ALLOC_REFILL); 1625 stat(s, ALLOC_REFILL);
1639 1626
1640load_freelist: 1627load_freelist:
1641 object = c->page->freelist; 1628 object = c->page->freelist;
@@ -1644,13 +1631,13 @@ load_freelist:
1644 if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) 1631 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1645 goto debug; 1632 goto debug;
1646 1633
1647 c->freelist = object[c->offset]; 1634 c->freelist = get_freepointer(s, object);
1648 c->page->inuse = c->page->objects; 1635 c->page->inuse = c->page->objects;
1649 c->page->freelist = NULL; 1636 c->page->freelist = NULL;
1650 c->node = page_to_nid(c->page); 1637 c->node = page_to_nid(c->page);
1651unlock_out: 1638unlock_out:
1652 slab_unlock(c->page); 1639 slab_unlock(c->page);
1653 stat(c, ALLOC_SLOWPATH); 1640 stat(s, ALLOC_SLOWPATH);
1654 return object; 1641 return object;
1655 1642
1656another_slab: 1643another_slab:
@@ -1660,7 +1647,7 @@ new_slab:
1660 new = get_partial(s, gfpflags, node); 1647 new = get_partial(s, gfpflags, node);
1661 if (new) { 1648 if (new) {
1662 c->page = new; 1649 c->page = new;
1663 stat(c, ALLOC_FROM_PARTIAL); 1650 stat(s, ALLOC_FROM_PARTIAL);
1664 goto load_freelist; 1651 goto load_freelist;
1665 } 1652 }
1666 1653
@@ -1673,8 +1660,8 @@ new_slab:
1673 local_irq_disable(); 1660 local_irq_disable();
1674 1661
1675 if (new) { 1662 if (new) {
1676 c = get_cpu_slab(s, smp_processor_id()); 1663 c = __this_cpu_ptr(s->cpu_slab);
1677 stat(c, ALLOC_SLAB); 1664 stat(s, ALLOC_SLAB);
1678 if (c->page) 1665 if (c->page)
1679 flush_slab(s, c); 1666 flush_slab(s, c);
1680 slab_lock(new); 1667 slab_lock(new);
@@ -1690,7 +1677,7 @@ debug:
1690 goto another_slab; 1677 goto another_slab;
1691 1678
1692 c->page->inuse++; 1679 c->page->inuse++;
1693 c->page->freelist = object[c->offset]; 1680 c->page->freelist = get_freepointer(s, object);
1694 c->node = -1; 1681 c->node = -1;
1695 goto unlock_out; 1682 goto unlock_out;
1696} 1683}
@@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1711 void **object; 1698 void **object;
1712 struct kmem_cache_cpu *c; 1699 struct kmem_cache_cpu *c;
1713 unsigned long flags; 1700 unsigned long flags;
1714 unsigned int objsize;
1715 1701
1716 gfpflags &= gfp_allowed_mask; 1702 gfpflags &= gfp_allowed_mask;
1717 1703
1718 lockdep_trace_alloc(gfpflags); 1704 lockdep_trace_alloc(gfpflags);
1719 might_sleep_if(gfpflags & __GFP_WAIT); 1705 might_sleep_if(gfpflags & __GFP_WAIT);
1720 1706
1721 if (should_failslab(s->objsize, gfpflags)) 1707 if (should_failslab(s->objsize, gfpflags, s->flags))
1722 return NULL; 1708 return NULL;
1723 1709
1724 local_irq_save(flags); 1710 local_irq_save(flags);
1725 c = get_cpu_slab(s, smp_processor_id()); 1711 c = __this_cpu_ptr(s->cpu_slab);
1726 objsize = c->objsize; 1712 object = c->freelist;
1727 if (unlikely(!c->freelist || !node_match(c, node))) 1713 if (unlikely(!object || !node_match(c, node)))
1728 1714
1729 object = __slab_alloc(s, gfpflags, node, addr, c); 1715 object = __slab_alloc(s, gfpflags, node, addr, c);
1730 1716
1731 else { 1717 else {
1732 object = c->freelist; 1718 c->freelist = get_freepointer(s, object);
1733 c->freelist = object[c->offset]; 1719 stat(s, ALLOC_FASTPATH);
1734 stat(c, ALLOC_FASTPATH);
1735 } 1720 }
1736 local_irq_restore(flags); 1721 local_irq_restore(flags);
1737 1722
1738 if (unlikely(gfpflags & __GFP_ZERO) && object) 1723 if (unlikely(gfpflags & __GFP_ZERO) && object)
1739 memset(object, 0, objsize); 1724 memset(object, 0, s->objsize);
1740 1725
1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); 1726 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize);
1742 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); 1727 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1743 1728
1744 return object; 1729 return object;
1745} 1730}
@@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1794 * handling required then we can return immediately. 1779 * handling required then we can return immediately.
1795 */ 1780 */
1796static void __slab_free(struct kmem_cache *s, struct page *page, 1781static void __slab_free(struct kmem_cache *s, struct page *page,
1797 void *x, unsigned long addr, unsigned int offset) 1782 void *x, unsigned long addr)
1798{ 1783{
1799 void *prior; 1784 void *prior;
1800 void **object = (void *)x; 1785 void **object = (void *)x;
1801 struct kmem_cache_cpu *c;
1802 1786
1803 c = get_cpu_slab(s, raw_smp_processor_id()); 1787 stat(s, FREE_SLOWPATH);
1804 stat(c, FREE_SLOWPATH);
1805 slab_lock(page); 1788 slab_lock(page);
1806 1789
1807 if (unlikely(SLABDEBUG && PageSlubDebug(page))) 1790 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1808 goto debug; 1791 goto debug;
1809 1792
1810checks_ok: 1793checks_ok:
1811 prior = object[offset] = page->freelist; 1794 prior = page->freelist;
1795 set_freepointer(s, object, prior);
1812 page->freelist = object; 1796 page->freelist = object;
1813 page->inuse--; 1797 page->inuse--;
1814 1798
1815 if (unlikely(PageSlubFrozen(page))) { 1799 if (unlikely(PageSlubFrozen(page))) {
1816 stat(c, FREE_FROZEN); 1800 stat(s, FREE_FROZEN);
1817 goto out_unlock; 1801 goto out_unlock;
1818 } 1802 }
1819 1803
@@ -1826,7 +1810,7 @@ checks_ok:
1826 */ 1810 */
1827 if (unlikely(!prior)) { 1811 if (unlikely(!prior)) {
1828 add_partial(get_node(s, page_to_nid(page)), page, 1); 1812 add_partial(get_node(s, page_to_nid(page)), page, 1);
1829 stat(c, FREE_ADD_PARTIAL); 1813 stat(s, FREE_ADD_PARTIAL);
1830 } 1814 }
1831 1815
1832out_unlock: 1816out_unlock:
@@ -1839,10 +1823,10 @@ slab_empty:
1839 * Slab still on the partial list. 1823 * Slab still on the partial list.
1840 */ 1824 */
1841 remove_partial(s, page); 1825 remove_partial(s, page);
1842 stat(c, FREE_REMOVE_PARTIAL); 1826 stat(s, FREE_REMOVE_PARTIAL);
1843 } 1827 }
1844 slab_unlock(page); 1828 slab_unlock(page);
1845 stat(c, FREE_SLAB); 1829 stat(s, FREE_SLAB);
1846 discard_slab(s, page); 1830 discard_slab(s, page);
1847 return; 1831 return;
1848 1832
@@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s,
1872 1856
1873 kmemleak_free_recursive(x, s->flags); 1857 kmemleak_free_recursive(x, s->flags);
1874 local_irq_save(flags); 1858 local_irq_save(flags);
1875 c = get_cpu_slab(s, smp_processor_id()); 1859 c = __this_cpu_ptr(s->cpu_slab);
1876 kmemcheck_slab_free(s, object, c->objsize); 1860 kmemcheck_slab_free(s, object, s->objsize);
1877 debug_check_no_locks_freed(object, c->objsize); 1861 debug_check_no_locks_freed(object, s->objsize);
1878 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1862 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1879 debug_check_no_obj_freed(object, c->objsize); 1863 debug_check_no_obj_freed(object, s->objsize);
1880 if (likely(page == c->page && c->node >= 0)) { 1864 if (likely(page == c->page && c->node >= 0)) {
1881 object[c->offset] = c->freelist; 1865 set_freepointer(s, object, c->freelist);
1882 c->freelist = object; 1866 c->freelist = object;
1883 stat(c, FREE_FASTPATH); 1867 stat(s, FREE_FASTPATH);
1884 } else 1868 } else
1885 __slab_free(s, page, x, addr, c->offset); 1869 __slab_free(s, page, x, addr);
1886 1870
1887 local_irq_restore(flags); 1871 local_irq_restore(flags);
1888} 1872}
@@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags,
2069 return ALIGN(align, sizeof(void *)); 2053 return ALIGN(align, sizeof(void *));
2070} 2054}
2071 2055
2072static void init_kmem_cache_cpu(struct kmem_cache *s,
2073 struct kmem_cache_cpu *c)
2074{
2075 c->page = NULL;
2076 c->freelist = NULL;
2077 c->node = 0;
2078 c->offset = s->offset / sizeof(void *);
2079 c->objsize = s->objsize;
2080#ifdef CONFIG_SLUB_STATS
2081 memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
2082#endif
2083}
2084
2085static void 2056static void
2086init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2057init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2087{ 2058{
@@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2095#endif 2066#endif
2096} 2067}
2097 2068
2098#ifdef CONFIG_SMP 2069static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
2099/*
2100 * Per cpu array for per cpu structures.
2101 *
2102 * The per cpu array places all kmem_cache_cpu structures from one processor
2103 * close together meaning that it becomes possible that multiple per cpu
2104 * structures are contained in one cacheline. This may be particularly
2105 * beneficial for the kmalloc caches.
2106 *
2107 * A desktop system typically has around 60-80 slabs. With 100 here we are
2108 * likely able to get per cpu structures for all caches from the array defined
2109 * here. We must be able to cover all kmalloc caches during bootstrap.
2110 *
2111 * If the per cpu array is exhausted then fall back to kmalloc
2112 * of individual cachelines. No sharing is possible then.
2113 */
2114#define NR_KMEM_CACHE_CPU 100
2115
2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2117 kmem_cache_cpu);
2118
2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
2121
2122static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
2123 int cpu, gfp_t flags)
2124{
2125 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
2126
2127 if (c)
2128 per_cpu(kmem_cache_cpu_free, cpu) =
2129 (void *)c->freelist;
2130 else {
2131 /* Table overflow: So allocate ourselves */
2132 c = kmalloc_node(
2133 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
2134 flags, cpu_to_node(cpu));
2135 if (!c)
2136 return NULL;
2137 }
2138
2139 init_kmem_cache_cpu(s, c);
2140 return c;
2141}
2142
2143static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
2144{
2145 if (c < per_cpu(kmem_cache_cpu, cpu) ||
2146 c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2147 kfree(c);
2148 return;
2149 }
2150 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
2151 per_cpu(kmem_cache_cpu_free, cpu) = c;
2152}
2153
2154static void free_kmem_cache_cpus(struct kmem_cache *s)
2155{
2156 int cpu;
2157
2158 for_each_online_cpu(cpu) {
2159 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2160
2161 if (c) {
2162 s->cpu_slab[cpu] = NULL;
2163 free_kmem_cache_cpu(c, cpu);
2164 }
2165 }
2166}
2167
2168static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2169{
2170 int cpu;
2171
2172 for_each_online_cpu(cpu) {
2173 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2174
2175 if (c)
2176 continue;
2177
2178 c = alloc_kmem_cache_cpu(s, cpu, flags);
2179 if (!c) {
2180 free_kmem_cache_cpus(s);
2181 return 0;
2182 }
2183 s->cpu_slab[cpu] = c;
2184 }
2185 return 1;
2186}
2187
2188/*
2189 * Initialize the per cpu array.
2190 */
2191static void init_alloc_cpu_cpu(int cpu)
2192{
2193 int i;
2194 2070
2195 if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) 2071static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2196 return;
2197
2198 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2199 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2200
2201 cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
2202}
2203
2204static void __init init_alloc_cpu(void)
2205{ 2072{
2206 int cpu; 2073 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
2207 2074 /*
2208 for_each_online_cpu(cpu) 2075 * Boot time creation of the kmalloc array. Use static per cpu data
2209 init_alloc_cpu_cpu(cpu); 2076 * since the per cpu allocator is not available yet.
2210 } 2077 */
2078 s->cpu_slab = per_cpu_var(kmalloc_percpu) + (s - kmalloc_caches);
2079 else
2080 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2211 2081
2212#else 2082 if (!s->cpu_slab)
2213static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} 2083 return 0;
2214static inline void init_alloc_cpu(void) {}
2215 2084
2216static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2217{
2218 init_kmem_cache_cpu(s, &s->cpu_slab);
2219 return 1; 2085 return 1;
2220} 2086}
2221#endif
2222 2087
2223#ifdef CONFIG_NUMA 2088#ifdef CONFIG_NUMA
2224/* 2089/*
@@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2287 int node; 2152 int node;
2288 int local_node; 2153 int local_node;
2289 2154
2290 if (slab_state >= UP) 2155 if (slab_state >= UP && (s < kmalloc_caches ||
2156 s > kmalloc_caches + KMALLOC_CACHES))
2291 local_node = page_to_nid(virt_to_page(s)); 2157 local_node = page_to_nid(virt_to_page(s));
2292 else 2158 else
2293 local_node = 0; 2159 local_node = 0;
@@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2502 2368
2503 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2369 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2504 return 1; 2370 return 1;
2371
2505 free_kmem_cache_nodes(s); 2372 free_kmem_cache_nodes(s);
2506error: 2373error:
2507 if (flags & SLAB_PANIC) 2374 if (flags & SLAB_PANIC)
@@ -2609,9 +2476,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2609 int node; 2476 int node;
2610 2477
2611 flush_all(s); 2478 flush_all(s);
2612 2479 free_percpu(s->cpu_slab);
2613 /* Attempt to free all objects */ 2480 /* Attempt to free all objects */
2614 free_kmem_cache_cpus(s);
2615 for_each_node_state(node, N_NORMAL_MEMORY) { 2481 for_each_node_state(node, N_NORMAL_MEMORY) {
2616 struct kmem_cache_node *n = get_node(s, node); 2482 struct kmem_cache_node *n = get_node(s, node);
2617 2483
@@ -2651,7 +2517,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2651 * Kmalloc subsystem 2517 * Kmalloc subsystem
2652 *******************************************************************/ 2518 *******************************************************************/
2653 2519
2654struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; 2520struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
2655EXPORT_SYMBOL(kmalloc_caches); 2521EXPORT_SYMBOL(kmalloc_caches);
2656 2522
2657static int __init setup_slub_min_order(char *str) 2523static int __init setup_slub_min_order(char *str)
@@ -2741,6 +2607,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2741 char *text; 2607 char *text;
2742 size_t realsize; 2608 size_t realsize;
2743 unsigned long slabflags; 2609 unsigned long slabflags;
2610 int i;
2744 2611
2745 s = kmalloc_caches_dma[index]; 2612 s = kmalloc_caches_dma[index];
2746 if (s) 2613 if (s)
@@ -2760,7 +2627,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2760 realsize = kmalloc_caches[index].objsize; 2627 realsize = kmalloc_caches[index].objsize;
2761 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", 2628 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2762 (unsigned int)realsize); 2629 (unsigned int)realsize);
2763 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2630
2631 s = NULL;
2632 for (i = 0; i < KMALLOC_CACHES; i++)
2633 if (!kmalloc_caches[i].size)
2634 break;
2635
 2636 BUG_ON(i >= KMALLOC_CACHES);
2637 s = kmalloc_caches + i;
2764 2638
2765 /* 2639 /*
2766 * Must defer sysfs creation to a workqueue because we don't know 2640 * Must defer sysfs creation to a workqueue because we don't know
@@ -2772,9 +2646,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2772 if (slab_state >= SYSFS) 2646 if (slab_state >= SYSFS)
2773 slabflags |= __SYSFS_ADD_DEFERRED; 2647 slabflags |= __SYSFS_ADD_DEFERRED;
2774 2648
2775 if (!s || !text || !kmem_cache_open(s, flags, text, 2649 if (!text || !kmem_cache_open(s, flags, text,
2776 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { 2650 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2777 kfree(s); 2651 s->size = 0;
2778 kfree(text); 2652 kfree(text);
2779 goto unlock_out; 2653 goto unlock_out;
2780 } 2654 }
@@ -3176,8 +3050,6 @@ void __init kmem_cache_init(void)
3176 int i; 3050 int i;
3177 int caches = 0; 3051 int caches = 0;
3178 3052
3179 init_alloc_cpu();
3180
3181#ifdef CONFIG_NUMA 3053#ifdef CONFIG_NUMA
3182 /* 3054 /*
3183 * Must first have the slab cache available for the allocations of the 3055 * Must first have the slab cache available for the allocations of the
@@ -3261,8 +3133,10 @@ void __init kmem_cache_init(void)
3261 3133
3262#ifdef CONFIG_SMP 3134#ifdef CONFIG_SMP
3263 register_cpu_notifier(&slab_notifier); 3135 register_cpu_notifier(&slab_notifier);
3264 kmem_size = offsetof(struct kmem_cache, cpu_slab) + 3136#endif
3265 nr_cpu_ids * sizeof(struct kmem_cache_cpu *); 3137#ifdef CONFIG_NUMA
3138 kmem_size = offsetof(struct kmem_cache, node) +
3139 nr_node_ids * sizeof(struct kmem_cache_node *);
3266#else 3140#else
3267 kmem_size = sizeof(struct kmem_cache); 3141 kmem_size = sizeof(struct kmem_cache);
3268#endif 3142#endif
@@ -3351,22 +3225,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3351 down_write(&slub_lock); 3225 down_write(&slub_lock);
3352 s = find_mergeable(size, align, flags, name, ctor); 3226 s = find_mergeable(size, align, flags, name, ctor);
3353 if (s) { 3227 if (s) {
3354 int cpu;
3355
3356 s->refcount++; 3228 s->refcount++;
3357 /* 3229 /*
3358 * Adjust the object sizes so that we clear 3230 * Adjust the object sizes so that we clear
3359 * the complete object on kzalloc. 3231 * the complete object on kzalloc.
3360 */ 3232 */
3361 s->objsize = max(s->objsize, (int)size); 3233 s->objsize = max(s->objsize, (int)size);
3362
3363 /*
3364 * And then we need to update the object size in the
3365 * per cpu structures
3366 */
3367 for_each_online_cpu(cpu)
3368 get_cpu_slab(s, cpu)->objsize = s->objsize;
3369
3370 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3234 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3371 up_write(&slub_lock); 3235 up_write(&slub_lock);
3372 3236
@@ -3420,29 +3284,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3420 unsigned long flags; 3284 unsigned long flags;
3421 3285
3422 switch (action) { 3286 switch (action) {
3423 case CPU_UP_PREPARE:
3424 case CPU_UP_PREPARE_FROZEN:
3425 init_alloc_cpu_cpu(cpu);
3426 down_read(&slub_lock);
3427 list_for_each_entry(s, &slab_caches, list)
3428 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3429 GFP_KERNEL);
3430 up_read(&slub_lock);
3431 break;
3432
3433 case CPU_UP_CANCELED: 3287 case CPU_UP_CANCELED:
3434 case CPU_UP_CANCELED_FROZEN: 3288 case CPU_UP_CANCELED_FROZEN:
3435 case CPU_DEAD: 3289 case CPU_DEAD:
3436 case CPU_DEAD_FROZEN: 3290 case CPU_DEAD_FROZEN:
3437 down_read(&slub_lock); 3291 down_read(&slub_lock);
3438 list_for_each_entry(s, &slab_caches, list) { 3292 list_for_each_entry(s, &slab_caches, list) {
3439 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3440
3441 local_irq_save(flags); 3293 local_irq_save(flags);
3442 __flush_cpu_slab(s, cpu); 3294 __flush_cpu_slab(s, cpu);
3443 local_irq_restore(flags); 3295 local_irq_restore(flags);
3444 free_kmem_cache_cpu(c, cpu);
3445 s->cpu_slab[cpu] = NULL;
3446 } 3296 }
3447 up_read(&slub_lock); 3297 up_read(&slub_lock);
3448 break; 3298 break;
@@ -3928,7 +3778,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3928 int cpu; 3778 int cpu;
3929 3779
3930 for_each_possible_cpu(cpu) { 3780 for_each_possible_cpu(cpu) {
3931 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3781 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3932 3782
3933 if (!c || c->node < 0) 3783 if (!c || c->node < 0)
3934 continue; 3784 continue;
@@ -4171,6 +4021,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4171} 4021}
4172SLAB_ATTR(trace); 4022SLAB_ATTR(trace);
4173 4023
4024#ifdef CONFIG_FAILSLAB
4025static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4026{
4027 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4028}
4029
4030static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4031 size_t length)
4032{
4033 s->flags &= ~SLAB_FAILSLAB;
4034 if (buf[0] == '1')
4035 s->flags |= SLAB_FAILSLAB;
4036 return length;
4037}
4038SLAB_ATTR(failslab);
4039#endif
4040
4174static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4041static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4175{ 4042{
4176 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4043 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4353,7 +4220,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4353 return -ENOMEM; 4220 return -ENOMEM;
4354 4221
4355 for_each_online_cpu(cpu) { 4222 for_each_online_cpu(cpu) {
4356 unsigned x = get_cpu_slab(s, cpu)->stat[si]; 4223 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4357 4224
4358 data[cpu] = x; 4225 data[cpu] = x;
4359 sum += x; 4226 sum += x;
@@ -4376,7 +4243,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si)
4376 int cpu; 4243 int cpu;
4377 4244
4378 for_each_online_cpu(cpu) 4245 for_each_online_cpu(cpu)
4379 get_cpu_slab(s, cpu)->stat[si] = 0; 4246 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4380} 4247}
4381 4248
4382#define STAT_ATTR(si, text) \ 4249#define STAT_ATTR(si, text) \
@@ -4467,6 +4334,10 @@ static struct attribute *slab_attrs[] = {
4467 &deactivate_remote_frees_attr.attr, 4334 &deactivate_remote_frees_attr.attr,
4468 &order_fallback_attr.attr, 4335 &order_fallback_attr.attr,
4469#endif 4336#endif
4337#ifdef CONFIG_FAILSLAB
4338 &failslab_attr.attr,
4339#endif
4340
4470 NULL 4341 NULL
4471}; 4342};
4472 4343
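
The slub.c hunks above drop the per-cpu copies of offset and objsize, route all freelist pointer handling through get_freepointer()/set_freepointer(), and count statistics with __this_cpu_inc() on s->cpu_slab. A minimal sketch of walking a freelist with those helpers (count_free_objects() is a hypothetical debugging aid, not part of the patch):

static int count_free_objects(struct kmem_cache *s, void *freelist)
{
	void *p;
	int n = 0;

	/* follow the chain the same way slab_alloc()/slab_free() now do */
	for (p = freelist; p; p = get_freepointer(s, p))
		n++;

	return n;
}
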
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d9714bdcb4a3..392b9bb5bc01 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,9 +40,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 40 unsigned long align,
41 unsigned long goal) 41 unsigned long goal)
42{ 42{
43 return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); 43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
44} 44}
45 45
46static void *vmemmap_buf;
47static void *vmemmap_buf_end;
46 48
47void * __meminit vmemmap_alloc_block(unsigned long size, int node) 49void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 50{
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
64 __pa(MAX_DMA_ADDRESS)); 66 __pa(MAX_DMA_ADDRESS));
65} 67}
66 68
69/* need to make sure size is all the same during early stage */
70void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
71{
72 void *ptr;
73
74 if (!vmemmap_buf)
75 return vmemmap_alloc_block(size, node);
76
 77 /* take it from the buf */
78 ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
79 if (ptr + size > vmemmap_buf_end)
80 return vmemmap_alloc_block(size, node);
81
82 vmemmap_buf = ptr + size;
83
84 return ptr;
85}
86
67void __meminit vmemmap_verify(pte_t *pte, int node, 87void __meminit vmemmap_verify(pte_t *pte, int node,
68 unsigned long start, unsigned long end) 88 unsigned long start, unsigned long end)
69{ 89{
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
80 pte_t *pte = pte_offset_kernel(pmd, addr); 100 pte_t *pte = pte_offset_kernel(pmd, addr);
81 if (pte_none(*pte)) { 101 if (pte_none(*pte)) {
82 pte_t entry; 102 pte_t entry;
83 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 103 void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
84 if (!p) 104 if (!p)
85 return NULL; 105 return NULL;
86 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 106 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
163 183
164 return map; 184 return map;
165} 185}
186
187void __init sparse_mem_maps_populate_node(struct page **map_map,
188 unsigned long pnum_begin,
189 unsigned long pnum_end,
190 unsigned long map_count, int nodeid)
191{
192 unsigned long pnum;
193 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
194 void *vmemmap_buf_start;
195
196 size = ALIGN(size, PMD_SIZE);
197 vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
198 PMD_SIZE, __pa(MAX_DMA_ADDRESS));
199
200 if (vmemmap_buf_start) {
201 vmemmap_buf = vmemmap_buf_start;
202 vmemmap_buf_end = vmemmap_buf_start + size * map_count;
203 }
204
205 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
206 struct mem_section *ms;
207
208 if (!present_section_nr(pnum))
209 continue;
210
211 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
212 if (map_map[pnum])
213 continue;
214 ms = __nr_to_section(pnum);
215 printk(KERN_ERR "%s: sparsemem memory map backing failed "
216 "some memory will not be available.\n", __func__);
217 ms->section_mem_map = 0;
218 }
219
220 if (vmemmap_buf_start) {
221 /* need to free left buf */
222#ifdef CONFIG_NO_BOOTMEM
223 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
224 if (vmemmap_buf_start < vmemmap_buf) {
225 char name[15];
226
227 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
228 reserve_early_without_check(__pa(vmemmap_buf_start),
229 __pa(vmemmap_buf), name);
230 }
231#else
232 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
233#endif
234 vmemmap_buf = NULL;
235 vmemmap_buf_end = NULL;
236 }
237}
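
vmemmap_alloc_block_buf() above is a simple bump allocator over a buffer reserved once per node by sparse_mem_maps_populate_node(), falling back to vmemmap_alloc_block() when the buffer is missing or exhausted. The same pattern reduced to its core, with hypothetical names (buf_cur, buf_end, bump_alloc are not from the patch):

static void *buf_cur, *buf_end;

static void *bump_alloc(unsigned long size)
{
	/* align the cursor to the request size, as the patch does */
	void *ptr = (void *)ALIGN((unsigned long)buf_cur, size);

	if (!buf_cur || ptr + size > buf_end)
		return NULL;		/* caller falls back to the slow path */

	buf_cur = ptr + size;		/* bump the cursor past the allocation */
	return ptr;
}
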
diff --git a/mm/sparse.c b/mm/sparse.c
index 6ce4aab69e99..22896d589133 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void)
271 271
272#ifdef CONFIG_MEMORY_HOTREMOVE 272#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init 273static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 274sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
275 unsigned long count)
275{ 276{
276 unsigned long section_nr; 277 unsigned long section_nr;
277 278
@@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
286 * this problem. 287 * this problem.
287 */ 288 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 289 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr); 290 return alloc_bootmem_section(usemap_size() * count, section_nr);
290} 291}
291 292
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 293static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
329} 330}
330#else 331#else
331static unsigned long * __init 332static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 333sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
334 unsigned long count)
333{ 335{
334 return NULL; 336 return NULL;
335} 337}
@@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 341}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 342#endif /* CONFIG_MEMORY_HOTREMOVE */
341 343
342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 344static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
345 unsigned long pnum_begin,
346 unsigned long pnum_end,
347 unsigned long usemap_count, int nodeid)
343{ 348{
344 unsigned long *usemap; 349 void *usemap;
345 struct mem_section *ms = __nr_to_section(pnum); 350 unsigned long pnum;
346 int nid = sparse_early_nid(ms); 351 int size = usemap_size();
347
348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
349 if (usemap)
350 return usemap;
351 352
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 353 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
354 usemap_count);
353 if (usemap) { 355 if (usemap) {
354 check_usemap_section_nr(nid, usemap); 356 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
355 return usemap; 357 if (!present_section_nr(pnum))
358 continue;
359 usemap_map[pnum] = usemap;
360 usemap += size;
361 }
362 return;
356 } 363 }
357 364
358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 365 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
359 nid = 0; 366 if (usemap) {
367 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
368 if (!present_section_nr(pnum))
369 continue;
370 usemap_map[pnum] = usemap;
371 usemap += size;
372 check_usemap_section_nr(nodeid, usemap_map[pnum]);
373 }
374 return;
375 }
360 376
361 printk(KERN_WARNING "%s: allocation failed\n", __func__); 377 printk(KERN_WARNING "%s: allocation failed\n", __func__);
362 return NULL;
363} 378}
364 379
365#ifndef CONFIG_SPARSEMEM_VMEMMAP 380#ifndef CONFIG_SPARSEMEM_VMEMMAP
@@ -375,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
375 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 390 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
376 return map; 391 return map;
377} 392}
393void __init sparse_mem_maps_populate_node(struct page **map_map,
394 unsigned long pnum_begin,
395 unsigned long pnum_end,
396 unsigned long map_count, int nodeid)
397{
398 void *map;
399 unsigned long pnum;
400 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
401
402 map = alloc_remap(nodeid, size * map_count);
403 if (map) {
404 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
405 if (!present_section_nr(pnum))
406 continue;
407 map_map[pnum] = map;
408 map += size;
409 }
410 return;
411 }
412
413 size = PAGE_ALIGN(size);
414 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
415 if (map) {
416 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
417 if (!present_section_nr(pnum))
418 continue;
419 map_map[pnum] = map;
420 map += size;
421 }
422 return;
423 }
424
425 /* fallback */
426 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
427 struct mem_section *ms;
428
429 if (!present_section_nr(pnum))
430 continue;
431 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
432 if (map_map[pnum])
433 continue;
434 ms = __nr_to_section(pnum);
435 printk(KERN_ERR "%s: sparsemem memory map backing failed "
436 "some memory will not be available.\n", __func__);
437 ms->section_mem_map = 0;
438 }
439}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 440#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 441
442#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
443static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
444 unsigned long pnum_begin,
445 unsigned long pnum_end,
446 unsigned long map_count, int nodeid)
447{
448 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
449 map_count, nodeid);
450}
451#else
380static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 452static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 453{
382 struct page *map; 454 struct page *map;
@@ -392,10 +464,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
392 ms->section_mem_map = 0; 464 ms->section_mem_map = 0;
393 return NULL; 465 return NULL;
394} 466}
467#endif
395 468
396void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) 469void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
397{ 470{
398} 471}
472
399/* 473/*
400 * Allocate the accumulated non-linear sections, allocate a mem_map 474 * Allocate the accumulated non-linear sections, allocate a mem_map
401 * for each and record the physical to section mapping. 475 * for each and record the physical to section mapping.
@@ -407,6 +481,14 @@ void __init sparse_init(void)
407 unsigned long *usemap; 481 unsigned long *usemap;
408 unsigned long **usemap_map; 482 unsigned long **usemap_map;
409 int size; 483 int size;
484 int nodeid_begin = 0;
485 unsigned long pnum_begin = 0;
486 unsigned long usemap_count;
487#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
488 unsigned long map_count;
489 int size2;
490 struct page **map_map;
491#endif
410 492
411 /* 493 /*
412 * map is using big page (aka 2M in x86 64 bit) 494 * map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +507,81 @@ void __init sparse_init(void)
425 panic("can not allocate usemap_map\n"); 507 panic("can not allocate usemap_map\n");
426 508
427 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 509 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
510 struct mem_section *ms;
511
428 if (!present_section_nr(pnum)) 512 if (!present_section_nr(pnum))
429 continue; 513 continue;
430 usemap_map[pnum] = sparse_early_usemap_alloc(pnum); 514 ms = __nr_to_section(pnum);
515 nodeid_begin = sparse_early_nid(ms);
516 pnum_begin = pnum;
517 break;
431 } 518 }
519 usemap_count = 1;
520 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
521 struct mem_section *ms;
522 int nodeid;
523
524 if (!present_section_nr(pnum))
525 continue;
526 ms = __nr_to_section(pnum);
527 nodeid = sparse_early_nid(ms);
528 if (nodeid == nodeid_begin) {
529 usemap_count++;
530 continue;
531 }
532		/* ok, we need to take care of sections from pnum_begin to pnum - 1 */
533 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
534 usemap_count, nodeid_begin);
535		/* new start, update count etc. */
536 nodeid_begin = nodeid;
537 pnum_begin = pnum;
538 usemap_count = 1;
539 }
540 /* ok, last chunk */
541 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
542 usemap_count, nodeid_begin);
543
544#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
545 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
546 map_map = alloc_bootmem(size2);
547 if (!map_map)
548 panic("can not allocate map_map\n");
549
550 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
551 struct mem_section *ms;
552
553 if (!present_section_nr(pnum))
554 continue;
555 ms = __nr_to_section(pnum);
556 nodeid_begin = sparse_early_nid(ms);
557 pnum_begin = pnum;
558 break;
559 }
560 map_count = 1;
561 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
562 struct mem_section *ms;
563 int nodeid;
564
565 if (!present_section_nr(pnum))
566 continue;
567 ms = __nr_to_section(pnum);
568 nodeid = sparse_early_nid(ms);
569 if (nodeid == nodeid_begin) {
570 map_count++;
571 continue;
572 }
573		/* ok, we need to take care of sections from pnum_begin to pnum - 1 */
574 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
575 map_count, nodeid_begin);
576 /* new start, update count etc*/
577 nodeid_begin = nodeid;
578 pnum_begin = pnum;
579 map_count = 1;
580 }
581 /* ok, last chunk */
582 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
583 map_count, nodeid_begin);
584#endif
432 585
433 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 586 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
434 if (!present_section_nr(pnum)) 587 if (!present_section_nr(pnum))
@@ -438,7 +591,11 @@ void __init sparse_init(void)
438 if (!usemap) 591 if (!usemap)
439 continue; 592 continue;
440 593
594#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
595 map = map_map[pnum];
596#else
441 map = sparse_early_mem_map_alloc(pnum); 597 map = sparse_early_mem_map_alloc(pnum);
598#endif
442 if (!map) 599 if (!map)
443 continue; 600 continue;
444 601
@@ -448,6 +605,9 @@ void __init sparse_init(void)
448 605
449 vmemmap_populate_print_last(); 606 vmemmap_populate_print_last();
450 607
608#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
609 free_bootmem(__pa(map_map), size2);
610#endif
451 free_bootmem(__pa(usemap_map), size); 611 free_bootmem(__pa(usemap_map), size);
452} 612}
453 613
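The sparse_init() changes above run the same grouping loop twice, once for usemaps and once for mem_maps: walk every section, extend the current run while the owning node stays the same, and flush each run with one batched per-node allocation. Below is a single-pass userspace sketch of that grouping, assuming an invented present[]/node_of[] table and a stub alloc_for_node() in place of sparse_early_usemaps_alloc_node()/sparse_early_mem_maps_alloc_node().

/* Userspace sketch of the grouping loop added to sparse_init(): whenever the
 * owning node changes, flush the run of present sections seen so far with one
 * batched call.  present[], node_of[] and alloc_for_node() are illustrative
 * stand-ins, not kernel symbols. */
#include <stdio.h>

#define NR_SECTIONS 10
static int present[NR_SECTIONS] = { 0, 1, 1, 1, 0, 1, 1, 0, 1, 1 };
static int node_of[NR_SECTIONS] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2 };

static void alloc_for_node(int begin, int end, int count, int node)
{
	printf("node %d: one batched alloc for %d section(s) in [%d, %d)\n",
	       node, count, begin, end);
}

int main(void)
{
	int begin = -1, node = -1, count = 0;

	for (int s = 0; s < NR_SECTIONS; s++) {
		if (!present[s])
			continue;
		if (begin < 0) {                /* first present section */
			begin = s;
			node = node_of[s];
			count = 1;
			continue;
		}
		if (node_of[s] == node) {       /* same node: extend the run */
			count++;
			continue;
		}
		alloc_for_node(begin, s, count, node);   /* node changed: flush */
		begin = s;
		node = node_of[s];
		count = 1;
	}
	if (begin >= 0)                         /* last chunk */
		alloc_for_node(begin, NR_SECTIONS, count, node);
	return 0;
}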
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee22684..e87e37244829 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
522 */ 522 */
523void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 523void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
524{ 524{
525 if (new < old) { 525 struct address_space *mapping = inode->i_mapping;
526 struct address_space *mapping = inode->i_mapping; 526
527 527 /*
528 /* 528 * unmap_mapping_range is called twice, first simply for
529 * unmap_mapping_range is called twice, first simply for 529 * efficiency so that truncate_inode_pages does fewer
530 * efficiency so that truncate_inode_pages does fewer 530 * single-page unmaps. However after this first call, and
531 * single-page unmaps. However after this first call, and 531 * before truncate_inode_pages finishes, it is possible for
532 * before truncate_inode_pages finishes, it is possible for 532 * private pages to be COWed, which remain after
533 * private pages to be COWed, which remain after 533 * truncate_inode_pages finishes, hence the second
534 * truncate_inode_pages finishes, hence the second 534 * unmap_mapping_range call must be made for correctness.
535 * unmap_mapping_range call must be made for correctness. 535 */
536 */ 536 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
537 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 537 truncate_inode_pages(mapping, new);
538 truncate_inode_pages(mapping, new); 538 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
539 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
540 }
541} 539}
542EXPORT_SYMBOL(truncate_pagecache); 540EXPORT_SYMBOL(truncate_pagecache);
543 541
diff --git a/mm/util.c b/mm/util.c
index b377ce430803..834db7be240f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,10 +4,6 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/hugetlb.h>
8#include <linux/syscalls.h>
9#include <linux/mman.h>
10#include <linux/file.h>
11#include <asm/uaccess.h> 7#include <asm/uaccess.h>
12 8
13#define CREATE_TRACE_POINTS 9#define CREATE_TRACE_POINTS
@@ -224,7 +220,7 @@ char *strndup_user(const char __user *s, long n)
224} 220}
225EXPORT_SYMBOL(strndup_user); 221EXPORT_SYMBOL(strndup_user);
226 222
227#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT 223#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
228void arch_pick_mmap_layout(struct mm_struct *mm) 224void arch_pick_mmap_layout(struct mm_struct *mm)
229{ 225{
230 mm->mmap_base = TASK_UNMAPPED_BASE; 226 mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
272} 268}
273EXPORT_SYMBOL_GPL(get_user_pages_fast); 269EXPORT_SYMBOL_GPL(get_user_pages_fast);
274 270
275SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
276 unsigned long, prot, unsigned long, flags,
277 unsigned long, fd, unsigned long, pgoff)
278{
279 struct file * file = NULL;
280 unsigned long retval = -EBADF;
281
282 if (!(flags & MAP_ANONYMOUS)) {
283 if (unlikely(flags & MAP_HUGETLB))
284 return -EINVAL;
285 file = fget(fd);
286 if (!file)
287 goto out;
288 } else if (flags & MAP_HUGETLB) {
289 struct user_struct *user = NULL;
290 /*
291 * VM_NORESERVE is used because the reservations will be
292 * taken when vm_ops->mmap() is called
293 * A dummy user value is used because we are not locking
294 * memory so no accounting is necessary
295 */
296 len = ALIGN(len, huge_page_size(&default_hstate));
297 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
298 &user, HUGETLB_ANONHUGE_INODE);
299 if (IS_ERR(file))
300 return PTR_ERR(file);
301 }
302
303 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
304
305 down_write(&current->mm->mmap_sem);
306 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
307 up_write(&current->mm->mmap_sem);
308
309 if (file)
310 fput(file);
311out:
312 return retval;
313}
314
315/* Tracepoints definitions. */ 271/* Tracepoints definitions. */
316EXPORT_TRACEPOINT_SYMBOL(kmalloc); 272EXPORT_TRACEPOINT_SYMBOL(kmalloc);
317EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 273EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37e69295f250..ae007462b7f6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void)
509 509
510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); 510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
511 511
512/* for per-CPU blocks */
513static void purge_fragmented_blocks_allcpus(void);
514
512/* 515/*
513 * Purges all lazily-freed vmap areas. 516 * Purges all lazily-freed vmap areas.
514 * 517 *
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
539 } else 542 } else
540 spin_lock(&purge_lock); 543 spin_lock(&purge_lock);
541 544
545 if (sync)
546 purge_fragmented_blocks_allcpus();
547
542 rcu_read_lock(); 548 rcu_read_lock();
543 list_for_each_entry_rcu(va, &vmap_area_list, list) { 549 list_for_each_entry_rcu(va, &vmap_area_list, list) {
544 if (va->flags & VM_LAZY_FREE) { 550 if (va->flags & VM_LAZY_FREE) {
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
555 } 561 }
556 rcu_read_unlock(); 562 rcu_read_unlock();
557 563
558 if (nr) { 564 if (nr)
559 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
560 atomic_sub(nr, &vmap_lazy_nr); 565 atomic_sub(nr, &vmap_lazy_nr);
561 }
562 566
563 if (nr || force_flush) 567 if (nr || force_flush)
564 flush_tlb_kernel_range(*start, *end); 568 flush_tlb_kernel_range(*start, *end);
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false;
669struct vmap_block_queue { 673struct vmap_block_queue {
670 spinlock_t lock; 674 spinlock_t lock;
671 struct list_head free; 675 struct list_head free;
672 struct list_head dirty;
673 unsigned int nr_dirty;
674}; 676};
675 677
676struct vmap_block { 678struct vmap_block {
@@ -680,10 +682,9 @@ struct vmap_block {
680 unsigned long free, dirty; 682 unsigned long free, dirty;
681 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); 683 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
682 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 684 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
683 union { 685 struct list_head free_list;
684 struct list_head free_list; 686 struct rcu_head rcu_head;
685 struct rcu_head rcu_head; 687 struct list_head purge;
686 };
687}; 688};
688 689
689/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 690/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
759 vbq = &get_cpu_var(vmap_block_queue); 760 vbq = &get_cpu_var(vmap_block_queue);
760 vb->vbq = vbq; 761 vb->vbq = vbq;
761 spin_lock(&vbq->lock); 762 spin_lock(&vbq->lock);
762 list_add(&vb->free_list, &vbq->free); 763 list_add_rcu(&vb->free_list, &vbq->free);
763 spin_unlock(&vbq->lock); 764 spin_unlock(&vbq->lock);
764 put_cpu_var(vmap_block_queue); 765 put_cpu_var(vmap_block_queue);
765 766
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb)
778 struct vmap_block *tmp; 779 struct vmap_block *tmp;
779 unsigned long vb_idx; 780 unsigned long vb_idx;
780 781
781 BUG_ON(!list_empty(&vb->free_list));
782
783 vb_idx = addr_to_vb_idx(vb->va->va_start); 782 vb_idx = addr_to_vb_idx(vb->va->va_start);
784 spin_lock(&vmap_block_tree_lock); 783 spin_lock(&vmap_block_tree_lock);
785 tmp = radix_tree_delete(&vmap_block_tree, vb_idx); 784 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb)
790 call_rcu(&vb->rcu_head, rcu_free_vb); 789 call_rcu(&vb->rcu_head, rcu_free_vb);
791} 790}
792 791
792static void purge_fragmented_blocks(int cpu)
793{
794 LIST_HEAD(purge);
795 struct vmap_block *vb;
796 struct vmap_block *n_vb;
797 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
798
799 rcu_read_lock();
800 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
801
802 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
803 continue;
804
805 spin_lock(&vb->lock);
806 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
807 vb->free = 0; /* prevent further allocs after releasing lock */
808 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
809 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
810 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
811 spin_lock(&vbq->lock);
812 list_del_rcu(&vb->free_list);
813 spin_unlock(&vbq->lock);
814 spin_unlock(&vb->lock);
815 list_add_tail(&vb->purge, &purge);
816 } else
817 spin_unlock(&vb->lock);
818 }
819 rcu_read_unlock();
820
821 list_for_each_entry_safe(vb, n_vb, &purge, purge) {
822 list_del(&vb->purge);
823 free_vmap_block(vb);
824 }
825}
826
827static void purge_fragmented_blocks_thiscpu(void)
828{
829 purge_fragmented_blocks(smp_processor_id());
830}
831
832static void purge_fragmented_blocks_allcpus(void)
833{
834 int cpu;
835
836 for_each_possible_cpu(cpu)
837 purge_fragmented_blocks(cpu);
838}
839
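The purge test above reads: every bit of the block is either free or dirty, so no live mapping remains, yet the block is not completely dirty, so its remaining free space is only fragmented leftovers and the whole block can be reclaimed (fully dirty blocks are already handled by vb_free()). Here is a small userspace sketch of just that predicate, with a simplified block structure and no locking or RCU; the kernel version also takes the block and queue locks and defers the free via call_rcu().

/* Userspace sketch of the purge_fragmented_blocks() predicate.  The struct
 * and the sample values are simplified stand-ins for the kernel vmap_block. */
#include <stdio.h>

#define BBMAP_BITS 64

struct block {
	unsigned long free;   /* bits still allocatable */
	unsigned long dirty;  /* bits freed by users, not yet flushed */
};

static int should_purge(const struct block *b)
{
	return b->free + b->dirty == BBMAP_BITS && b->dirty != BBMAP_BITS;
}

int main(void)
{
	struct block blocks[] = {
		{ .free = 16, .dirty = 48 },  /* fragmented, no live users: purge  */
		{ .free = 0,  .dirty = 64 },  /* fully dirty: vb_free() frees it   */
		{ .free = 8,  .dirty = 40 },  /* 16 bits still mapped: keep        */
	};

	for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++)
		printf("block %u: %s\n", i, should_purge(&blocks[i]) ? "purge" : "keep");
	return 0;
}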
793static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 840static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
794{ 841{
795 struct vmap_block_queue *vbq; 842 struct vmap_block_queue *vbq;
796 struct vmap_block *vb; 843 struct vmap_block *vb;
797 unsigned long addr = 0; 844 unsigned long addr = 0;
798 unsigned int order; 845 unsigned int order;
846 int purge = 0;
799 847
800 BUG_ON(size & ~PAGE_MASK); 848 BUG_ON(size & ~PAGE_MASK);
801 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 849 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -808,24 +856,38 @@ again:
808 int i; 856 int i;
809 857
810 spin_lock(&vb->lock); 858 spin_lock(&vb->lock);
859 if (vb->free < 1UL << order)
860 goto next;
861
811 i = bitmap_find_free_region(vb->alloc_map, 862 i = bitmap_find_free_region(vb->alloc_map,
812 VMAP_BBMAP_BITS, order); 863 VMAP_BBMAP_BITS, order);
813 864
814 if (i >= 0) { 865 if (i < 0) {
815 addr = vb->va->va_start + (i << PAGE_SHIFT); 866 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
816 BUG_ON(addr_to_vb_idx(addr) != 867 /* fragmented and no outstanding allocations */
817 addr_to_vb_idx(vb->va->va_start)); 868 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
818 vb->free -= 1UL << order; 869 purge = 1;
819 if (vb->free == 0) {
820 spin_lock(&vbq->lock);
821 list_del_init(&vb->free_list);
822 spin_unlock(&vbq->lock);
823 } 870 }
824 spin_unlock(&vb->lock); 871 goto next;
825 break; 872 }
873 addr = vb->va->va_start + (i << PAGE_SHIFT);
874 BUG_ON(addr_to_vb_idx(addr) !=
875 addr_to_vb_idx(vb->va->va_start));
876 vb->free -= 1UL << order;
877 if (vb->free == 0) {
878 spin_lock(&vbq->lock);
879 list_del_rcu(&vb->free_list);
880 spin_unlock(&vbq->lock);
826 } 881 }
827 spin_unlock(&vb->lock); 882 spin_unlock(&vb->lock);
883 break;
884next:
885 spin_unlock(&vb->lock);
828 } 886 }
887
888 if (purge)
889 purge_fragmented_blocks_thiscpu();
890
829 put_cpu_var(vmap_block_queue); 891 put_cpu_var(vmap_block_queue);
830 rcu_read_unlock(); 892 rcu_read_unlock();
831 893
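The reworked vb_alloc() loop above first skips blocks whose free count is too small, then asks bitmap_find_free_region() for a naturally aligned 2^order run, and when that search fails on a block with no live allocations it flags the block for purging instead of leaving it on the free list. Below is a userspace sketch of that per-block decision, with a byte-per-bit array standing in for the kernel bitmap helpers and the per-CPU queue and locking left out.

/* Userspace sketch of the vb_alloc() per-block logic; find_free_region() is a
 * naive stand-in for bitmap_find_free_region() (naturally aligned 2^order run). */
#include <stdio.h>
#include <string.h>

#define BBMAP_BITS 16

struct block {
	unsigned char alloc_map[BBMAP_BITS];
	int free;
	int dirty;
};

static int find_free_region(unsigned char *map, int nbits, int order)
{
	int size = 1 << order;

	for (int pos = 0; pos + size <= nbits; pos += size) {
		int busy = 0;

		for (int i = 0; i < size; i++)
			if (map[pos + i])
				busy = 1;
		if (busy)
			continue;
		memset(map + pos, 1, size);   /* claim the whole aligned region */
		return pos;
	}
	return -1;
}

static int block_alloc(struct block *vb, int order, int *purge)
{
	int size = 1 << order;
	int i;

	if (vb->free < size)
		return -1;                    /* not enough space, try next block */

	i = find_free_region(vb->alloc_map, BBMAP_BITS, order);
	if (i < 0) {
		if (vb->free + vb->dirty == BBMAP_BITS)
			*purge = 1;           /* fragmented and no live users */
		return -1;
	}
	vb->free -= size;
	return i;                             /* bit offset of the new region */
}

int main(void)
{
	struct block vb = { .free = BBMAP_BITS, .dirty = 0 };
	int purge = 0;

	printf("order-1 alloc at bit %d\n", block_alloc(&vb, 1, &purge));
	printf("order-2 alloc at bit %d\n", block_alloc(&vb, 2, &purge));
	printf("free bits left: %d, purge=%d\n", vb.free, purge);
	return 0;
}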
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size)
862 BUG_ON(!vb); 924 BUG_ON(!vb);
863 925
864 spin_lock(&vb->lock); 926 spin_lock(&vb->lock);
865 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); 927 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
866 928
867 vb->dirty += 1UL << order; 929 vb->dirty += 1UL << order;
868 if (vb->dirty == VMAP_BBMAP_BITS) { 930 if (vb->dirty == VMAP_BBMAP_BITS) {
869 BUG_ON(vb->free || !list_empty(&vb->free_list)); 931 BUG_ON(vb->free);
870 spin_unlock(&vb->lock); 932 spin_unlock(&vb->lock);
871 free_vmap_block(vb); 933 free_vmap_block(vb);
872 } else 934 } else
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void)
1035 vbq = &per_cpu(vmap_block_queue, i); 1097 vbq = &per_cpu(vmap_block_queue, i);
1036 spin_lock_init(&vbq->lock); 1098 spin_lock_init(&vbq->lock);
1037 INIT_LIST_HEAD(&vbq->free); 1099 INIT_LIST_HEAD(&vbq->free);
1038 INIT_LIST_HEAD(&vbq->dirty);
1039 vbq->nr_dirty = 0;
1040 } 1100 }
1041 1101
1042 /* Import existing vmlist entries. */ 1102 /* Import existing vmlist entries. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 885207a6b6b7..c26986c85ce0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1922 if (!populated_zone(zone)) 1922 if (!populated_zone(zone))
1923 continue; 1923 continue;
1924 1924
1925 if (zone_is_all_unreclaimable(zone))
1926 continue;
1927
1925 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 1928 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
1926 0, 0)) 1929 0, 0))
1927 return 1; 1930 return 1;
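The vmscan.c hunk simply teaches sleeping_prematurely() to ignore zones already marked all_unreclaimable, so kswapd is not kept awake by zones it cannot help. A userspace sketch of the resulting per-zone filter follows, with an invented zone array; only the skip-and-watermark logic mirrors the kernel check.

/* Userspace sketch of the zone filter in sleeping_prematurely(); the struct
 * fields are illustrative, not the kernel's struct zone. */
#include <stdio.h>

struct zone {
	const char *name;
	long free_pages;
	long high_wmark;
	int populated;
	int all_unreclaimable;
};

static int sleeping_prematurely(const struct zone *zones, int nr)
{
	for (int i = 0; i < nr; i++) {
		const struct zone *z = &zones[i];

		if (!z->populated)
			continue;
		if (z->all_unreclaimable)       /* the newly added skip */
			continue;
		if (z->free_pages < z->high_wmark)
			return 1;               /* still work to do, keep kswapd awake */
	}
	return 0;
}

int main(void)
{
	struct zone zones[] = {
		{ "DMA",    100,  50, 1, 0 },
		{ "Normal",  10, 200, 1, 1 },   /* below watermark but unreclaimable */
	};

	printf("sleeping prematurely? %d\n",
	       sleeping_prematurely(zones, 2));   /* 0: the bad zone is skipped */
	return 0;
}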
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6051fbab67ba..fc5aa183bc45 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void)
139 threshold = calculate_threshold(zone); 139 threshold = calculate_threshold(zone);
140 140
141 for_each_online_cpu(cpu) 141 for_each_online_cpu(cpu)
142 zone_pcp(zone, cpu)->stat_threshold = threshold; 142 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
143 = threshold;
143 } 144 }
144} 145}
145 146
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void)
149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 150void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150 int delta) 151 int delta)
151{ 152{
152 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 153 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
154
153 s8 *p = pcp->vm_stat_diff + item; 155 s8 *p = pcp->vm_stat_diff + item;
154 long x; 156 long x;
155 157
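The vmstat.c hunks switch the per-CPU statistics from zone_pcp() to the dynamically allocated zone->pageset, but the counting scheme itself is unchanged: each CPU accumulates a small signed diff and only folds it into the shared zone counter once the diff crosses stat_threshold. Below is a userspace sketch of that threshold-folding idea, with plain arrays instead of real per-CPU data and a made-up fixed threshold.

/* Userspace sketch of the vmstat diff/threshold scheme; NR_CPUS,
 * STAT_THRESHOLD and the arrays are illustrative, not kernel state. */
#include <stdio.h>

#define NR_CPUS 2
#define STAT_THRESHOLD 8

static long global_count;
static signed char vm_stat_diff[NR_CPUS];

static void mod_state(int cpu, int delta)
{
	long x = vm_stat_diff[cpu] + delta;

	if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
		global_count += x;          /* fold the accumulated diff */
		x = 0;
	}
	vm_stat_diff[cpu] = (signed char)x;
}

int main(void)
{
	for (int i = 0; i < 20; i++)
		mod_state(i % NR_CPUS, 1);  /* 20 increments spread over 2 "CPUs" */

	/* Drain what is still sitting in the per-CPU diffs, as
	 * refresh_cpu_vm_stats() would. */
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		global_count += vm_stat_diff[cpu];
		vm_stat_diff[cpu] = 0;
	}
	printf("global_count = %ld\n", global_count);   /* prints 20 */
	return 0;
}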
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
202 */ 204 */
203void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 205void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
204{ 206{
205 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 207 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
206 s8 *p = pcp->vm_stat_diff + item; 208 s8 *p = pcp->vm_stat_diff + item;
207 209
208 (*p)++; 210 (*p)++;
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
223 225
224void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 226void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
225{ 227{
226 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 228 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
227 s8 *p = pcp->vm_stat_diff + item; 229 s8 *p = pcp->vm_stat_diff + item;
228 230
229 (*p)--; 231 (*p)--;
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
300 for_each_populated_zone(zone) { 302 for_each_populated_zone(zone) {
301 struct per_cpu_pageset *p; 303 struct per_cpu_pageset *p;
302 304
303 p = zone_pcp(zone, cpu); 305 p = per_cpu_ptr(zone->pageset, cpu);
304 306
305 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 307 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
306 if (p->vm_stat_diff[i]) { 308 if (p->vm_stat_diff[i]) {
@@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
741 for_each_online_cpu(i) { 743 for_each_online_cpu(i) {
742 struct per_cpu_pageset *pageset; 744 struct per_cpu_pageset *pageset;
743 745
744 pageset = zone_pcp(zone, i); 746 pageset = per_cpu_ptr(zone->pageset, i);
745 seq_printf(m, 747 seq_printf(m,
746 "\n cpu: %i" 748 "\n cpu: %i"
747 "\n count: %i" 749 "\n count: %i"
@@ -906,6 +908,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
906 case CPU_ONLINE: 908 case CPU_ONLINE:
907 case CPU_ONLINE_FROZEN: 909 case CPU_ONLINE_FROZEN:
908 start_cpu_timer(cpu); 910 start_cpu_timer(cpu);
911 node_set_state(cpu_to_node(cpu), N_CPU);
909 break; 912 break;
910 case CPU_DOWN_PREPARE: 913 case CPU_DOWN_PREPARE:
911 case CPU_DOWN_PREPARE_FROZEN: 914 case CPU_DOWN_PREPARE_FROZEN: