Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig            |   6
-rw-r--r--   mm/bootmem.c          | 195
-rw-r--r--   mm/hugetlb.c          |   4
-rw-r--r--   mm/memory.c           |  14
-rw-r--r--   mm/migrate.c          |   2
-rw-r--r--   mm/mmu_context.c      |   3
-rw-r--r--   mm/page_alloc.c       | 263
-rw-r--r--   mm/percpu.c           |  36
-rw-r--r--   mm/slab.c             |  11
-rw-r--r--   mm/slub.c             | 308
-rw-r--r--   mm/sparse-vmemmap.c   |  76
-rw-r--r--   mm/sparse.c           | 196
-rw-r--r--   mm/vmstat.c           |  15
13 files changed, 702 insertions, 427 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 17b8947aa7da..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
 config SPARSEMEM_VMEMMAP_ENABLE
 	bool
 
+config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+	def_bool y
+	depends on SPARSEMEM && X86_64
+
 config SPARSEMEM_VMEMMAP
 	bool "Sparse Memory virtual memmap"
 	depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
@@ -195,7 +199,7 @@ config BOUNCE
 config NR_QUICK
 	int
 	depends on QUICKLIST
-	default "2" if SUPERH || AVR32
+	default "2" if AVR32
 	default "1"
 
 config VIRT_TO_BUS
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..d7c791ef0036 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -13,6 +13,7 @@
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/kmemleak.h>
+#include <linux/range.h>
 
 #include <asm/bug.h>
 #include <asm/io.h>
@@ -32,6 +33,7 @@ unsigned long max_pfn;
 unsigned long saved_max_pfn;
 #endif
 
+#ifndef CONFIG_NO_BOOTMEM
 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
 
 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 	min_low_pfn = start;
 	return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
 }
-
+#endif
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
@@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
 	}
 }
 
+#ifdef CONFIG_NO_BOOTMEM
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+	int i;
+	unsigned long start_aligned, end_aligned;
+	int order = ilog2(BITS_PER_LONG);
+
+	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+	end_aligned = end & ~(BITS_PER_LONG - 1);
+
+	if (end_aligned <= start_aligned) {
+#if 1
+		printk(KERN_DEBUG " %lx - %lx\n", start, end);
+#endif
+		for (i = start; i < end; i++)
+			__free_pages_bootmem(pfn_to_page(i), 0);
+
+		return;
+	}
+
+#if 1
+	printk(KERN_DEBUG " %lx %lx - %lx %lx\n",
+		start, start_aligned, end_aligned, end);
+#endif
+	for (i = start; i < start_aligned; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+
+	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+		__free_pages_bootmem(pfn_to_page(i), order);
+
+	for (i = end_aligned; i < end; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+}
+
+unsigned long __init free_all_memory_core_early(int nodeid)
+{
+	int i;
+	u64 start, end;
+	unsigned long count = 0;
+	struct range *range = NULL;
+	int nr_range;
+
+	nr_range = get_free_all_memory_range(&range, nodeid);
+
+	for (i = 0; i < nr_range; i++) {
+		start = range[i].start;
+		end = range[i].end;
+		count += end - start;
+		__free_pages_memory(start, end);
+	}
+
+	return count;
+}
+#else
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
 	int aligned;
@@ -227,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 
 	return count;
 }
+#endif
 
 /**
  * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -237,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 {
 	register_page_bootmem_info_node(pgdat);
+#ifdef CONFIG_NO_BOOTMEM
+	/* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+	return 0;
+#else
 	return free_all_bootmem_core(pgdat->bdata);
+#endif
 }
 
 /**
@@ -247,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
  */
 unsigned long __init free_all_bootmem(void)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	return free_all_memory_core_early(NODE_DATA(0)->node_id);
+#else
 	return free_all_bootmem_core(NODE_DATA(0)->bdata);
+#endif
 }
 
+#ifndef CONFIG_NO_BOOTMEM
 static void __init __free(bootmem_data_t *bdata,
 			unsigned long sidx, unsigned long eidx)
 {
@@ -344,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
 	}
 	BUG();
 }
+#endif
 
 /**
  * free_bootmem_node - mark a page range as usable
@@ -358,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
 void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 			      unsigned long size)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	free_early(physaddr, physaddr + size);
+#if 0
+	printk(KERN_DEBUG "free %lx %lx\n", physaddr, size);
+#endif
+#else
 	unsigned long start, end;
 
 	kmemleak_free_part(__va(physaddr), size);
@@ -366,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 	end = PFN_DOWN(physaddr + size);
 
 	mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
+#endif
 }
 
 /**
@@ -379,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
  */
 void __init free_bootmem(unsigned long addr, unsigned long size)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	free_early(addr, addr + size);
+#if 0
+	printk(KERN_DEBUG "free %lx %lx\n", addr, size);
+#endif
+#else
 	unsigned long start, end;
 
 	kmemleak_free_part(__va(addr), size);
@@ -387,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 	end = PFN_DOWN(addr + size);
 
 	mark_bootmem(start, end, 0, 0);
+#endif
 }
 
 /**
@@ -403,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
				 unsigned long size, int flags)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	panic("no bootmem");
+	return 0;
+#else
 	unsigned long start, end;
 
 	start = PFN_DOWN(physaddr);
 	end = PFN_UP(physaddr + size);
 
 	return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
+#endif
 }
 
 /**
@@ -424,14 +511,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 int __init reserve_bootmem(unsigned long addr, unsigned long size,
			    int flags)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	panic("no bootmem");
+	return 0;
+#else
 	unsigned long start, end;
 
 	start = PFN_DOWN(addr);
 	end = PFN_UP(addr + size);
 
 	return mark_bootmem(start, end, 1, flags);
+#endif
 }
 
+#ifndef CONFIG_NO_BOOTMEM
 static unsigned long __init align_idx(struct bootmem_data *bdata,
				unsigned long idx, unsigned long step)
 {
@@ -582,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
 #endif
 	return NULL;
 }
+#endif
 
 static void * __init ___alloc_bootmem_nopanic(unsigned long size,
					unsigned long align,
					unsigned long goal,
					unsigned long limit)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	void *ptr;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc(size, GFP_NOWAIT);
+
+restart:
+
+	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+
+	if (ptr)
+		return ptr;
+
+	if (goal != 0) {
+		goal = 0;
+		goto restart;
+	}
+
+	return NULL;
+#else
 	bootmem_data_t *bdata;
 	void *region;
 
@@ -613,6 +727,7 @@ restart:
 	}
 
 	return NULL;
+#endif
 }
 
 /**
@@ -631,7 +746,13 @@ restart:
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
					unsigned long goal)
 {
-	return ___alloc_bootmem_nopanic(size, align, goal, 0);
+	unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+	limit = -1UL;
+#endif
+
+	return ___alloc_bootmem_nopanic(size, align, goal, limit);
 }
 
 static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -665,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem(unsigned long size, unsigned long align,
			      unsigned long goal)
 {
-	return ___alloc_bootmem(size, align, goal, 0);
+	unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+	limit = -1UL;
+#endif
+
+	return ___alloc_bootmem(size, align, goal, limit);
 }
 
+#ifndef CONFIG_NO_BOOTMEM
 static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
				unsigned long size, unsigned long align,
				unsigned long goal, unsigned long limit)
@@ -684,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 
 	return ___alloc_bootmem(size, align, goal, limit);
 }
+#endif
 
 /**
  * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -706,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
+#ifdef CONFIG_NO_BOOTMEM
+	return __alloc_memory_core_early(pgdat->node_id, size, align,
+					 goal, -1ULL);
+#else
 	return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+#endif
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+					unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+	unsigned long end_pfn;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	/* update goal according ...MAX_DMA32_PFN */
+	end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+	if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+	    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+		void *ptr;
+		unsigned long new_goal;
+
+		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+#ifdef CONFIG_NO_BOOTMEM
+		ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+						 new_goal, -1ULL);
+#else
+		ptr = alloc_bootmem_core(pgdat->bdata, size, align,
						 new_goal, 0);
+#endif
+		if (ptr)
+			return ptr;
+	}
+#endif
+
+	return __alloc_bootmem_node(pgdat, size, align, goal);
+
 }
 
 #ifdef CONFIG_SPARSEMEM
@@ -720,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 void * __init alloc_bootmem_section(unsigned long size,
				    unsigned long section_nr)
 {
+#ifdef CONFIG_NO_BOOTMEM
+	unsigned long pfn, goal, limit;
+
+	pfn = section_nr_to_pfn(section_nr);
+	goal = pfn << PAGE_SHIFT;
+	limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+
+	return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
+					 SMP_CACHE_BYTES, goal, limit);
+#else
 	bootmem_data_t *bdata;
 	unsigned long pfn, goal, limit;
 
@@ -729,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size,
 	bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
 
 	return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+#endif
 }
 #endif
 
@@ -740,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
+#ifdef CONFIG_NO_BOOTMEM
+	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+						 goal, -1ULL);
+#else
 	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
		return ptr;
 
 	ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+#endif
 	if (ptr)
		return ptr;
 
@@ -795,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
+#ifdef CONFIG_NO_BOOTMEM
+	return __alloc_memory_core_early(pgdat->node_id, size, align,
+				goal, ARCH_LOW_ADDRESS_LIMIT);
+#else
 	return ___alloc_bootmem_node(pgdat->bdata, size, align,
				     goal, ARCH_LOW_ADDRESS_LIMIT);
+#endif
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d16fa6b8c2d..3a5aeb37c110 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2087,7 +2087,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 
 	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, ptep);
 	}
 }
 
@@ -2558,7 +2558,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = pte_mkyoung(entry);
 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
						flags & FAULT_FLAG_WRITE))
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, ptep);
 
 out_page_table_lock:
 	spin_unlock(&mm->page_table_lock);
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1be7b67..72fb5f39bccc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1593,7 +1593,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	/* Ok, finally just insert the thing.. */
 	entry = pte_mkspecial(pfn_pte(pfn, prot));
 	set_pte_at(mm, addr, pte, entry);
-	update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
+	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
 
 	retval = 0;
 out_unlock:
@@ -2116,7 +2116,7 @@ reuse:
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
-			update_mmu_cache(vma, address, entry);
+			update_mmu_cache(vma, address, page_table);
		ret |= VM_FAULT_WRITE;
		goto unlock;
	}
@@ -2185,7 +2185,7 @@ gotten:
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, address, page_table, entry);
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, page_table);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
@@ -2629,7 +2629,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(vma, address, pte);
+	update_mmu_cache(vma, address, page_table);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
@@ -2694,7 +2694,7 @@ setpte:
 	set_pte_at(mm, address, page_table, entry);
 
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(vma, address, entry);
+	update_mmu_cache(vma, address, page_table);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	return 0;
@@ -2855,7 +2855,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		set_pte_at(mm, address, page_table, entry);
 
		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, page_table);
	} else {
		if (charged)
			mem_cgroup_uncharge_page(page);
@@ -2992,7 +2992,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	}
 	entry = pte_mkyoung(entry);
 	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, pte);
 	} else {
		/*
		 * This is needed only for protection faults but the arch code
diff --git a/mm/migrate.c b/mm/migrate.c
index 880bd592d38e..edb6101ed774 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -134,7 +134,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
		page_add_file_rmap(new);
 
	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(vma, addr, pte);
+	update_mmu_cache(vma, addr, ptep);
 unlock:
	pte_unmap_unlock(ptep, ptl);
 out:
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index ded9081f4021..0777654147c9 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,6 +5,7 @@
 
 #include <linux/mm.h>
 #include <linux/mmu_context.h>
+#include <linux/module.h>
 #include <linux/sched.h>
 
 #include <asm/mmu_context.h>
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm)
	if (active_mm != mm)
		mmdrop(active_mm);
 }
+EXPORT_SYMBOL_GPL(use_mm);
 
 /*
  * unuse_mm
@@ -56,3 +58,4 @@ void unuse_mm(struct mm_struct *mm)
	enter_lazy_tlb(mm, tsk);
	task_unlock(tsk);
 }
+EXPORT_SYMBOL_GPL(unuse_mm);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8deb9d0fd5b1..a6b17aa4740b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1009,10 +1009,10 @@ static void drain_pages(unsigned int cpu)
		struct per_cpu_pageset *pset;
		struct per_cpu_pages *pcp;
 
-		pset = zone_pcp(zone, cpu);
+		local_irq_save(flags);
+		pset = per_cpu_ptr(zone->pageset, cpu);
 
		pcp = &pset->pcp;
-		local_irq_save(flags);
		free_pcppages_bulk(zone, pcp->count, pcp);
		pcp->count = 0;
		local_irq_restore(flags);
@@ -1096,7 +1096,6 @@ static void free_hot_cold_page(struct page *page, int cold)
	arch_free_page(page, 0);
	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp;
	migratetype = get_pageblock_migratetype(page);
	set_page_private(page, migratetype);
	local_irq_save(flags);
@@ -1119,6 +1118,7 @@ static void free_hot_cold_page(struct page *page, int cold)
		migratetype = MIGRATE_MOVABLE;
	}
 
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	if (cold)
		list_add_tail(&page->lru, &pcp->lists[migratetype]);
	else
@@ -1131,7 +1131,6 @@ static void free_hot_cold_page(struct page *page, int cold)
 
 out:
	local_irq_restore(flags);
-	put_cpu();
 }
 
 void free_hot_page(struct page *page)
@@ -1181,17 +1180,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
 
 again:
-	cpu = get_cpu();
	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;
		struct list_head *list;
 
-		pcp = &zone_pcp(zone, cpu)->pcp;
-		list = &pcp->lists[migratetype];
		local_irq_save(flags);
+		pcp = &this_cpu_ptr(zone->pageset)->pcp;
+		list = &pcp->lists[migratetype];
		if (list_empty(list)) {
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
@@ -1232,7 +1229,6 @@ again:
	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(preferred_zone, zone);
	local_irq_restore(flags);
-	put_cpu();
 
	VM_BUG_ON(bad_range(zone, page));
	if (prep_new_page(page, order, gfp_flags))
@@ -1241,7 +1237,6 @@ again:
 
 failed:
	local_irq_restore(flags);
-	put_cpu();
	return NULL;
 }
 
@@ -2180,7 +2175,7 @@ void show_free_areas(void)
		for_each_online_cpu(cpu) {
			struct per_cpu_pageset *pageset;
 
-			pageset = zone_pcp(zone, cpu);
+			pageset = per_cpu_ptr(zone->pageset, cpu);
 
			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
			       cpu, pageset->pcp.high,
@@ -2745,10 +2740,29 @@ static void build_zonelist_cache(pg_data_t *pgdat)
 
 #endif	/* CONFIG_NUMA */
 
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+
 /* return values int ....just for stop_machine() */
 static int __build_all_zonelists(void *dummy)
 {
	int nid;
+	int cpu;
 
 #ifdef CONFIG_NUMA
	memset(node_load, 0, sizeof(node_load));
@@ -2759,6 +2773,23 @@ static int __build_all_zonelists(void *dummy)
		build_zonelists(pgdat);
		build_zonelist_cache(pgdat);
	}
+
+	/*
+	 * Initialize the boot_pagesets that are going to be used
+	 * for bootstrapping processors. The real pagesets for
+	 * each zone will be allocated later when the per cpu
+	 * allocator is available.
+	 *
+	 * boot_pagesets are used also for bootstrapping offline
+	 * cpus if the system is already booted because the pagesets
+	 * are needed to initialize allocators on a specific cpu too.
+	 * F.e. the percpu allocator needs the page allocator which
+	 * needs the percpu allocator in order to allocate its pagesets
+	 * (a chicken-egg dilemma).
+	 */
+	for_each_possible_cpu(cpu)
+		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
	return 0;
 }
 
@@ -3096,121 +3127,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
	pcp->batch = PAGE_SHIFT * 8;
 }
 
-
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
 /*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ * Boot pagesets will no longer be used by this processorr
+ * after setup_per_cpu_pageset().
  */
-static int __cpuinit process_zones(int cpu)
+void __init setup_per_cpu_pageset(void)
 {
-	struct zone *zone, *dzone;
-	int node = cpu_to_node(cpu);
-
-	node_set_state(node, N_CPU);	/* this node has a cpu */
+	struct zone *zone;
+	int cpu;
 
	for_each_populated_zone(zone) {
-		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-					 GFP_KERNEL, node);
-		if (!zone_pcp(zone, cpu))
-			goto bad;
+		zone->pageset = alloc_percpu(struct per_cpu_pageset);
 
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+		for_each_possible_cpu(cpu) {
+			struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
 
-		if (percpu_pagelist_fraction)
-			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			    (zone->present_pages / percpu_pagelist_fraction));
-	}
+			setup_pageset(pcp, zone_batchsize(zone));
 
-	return 0;
-bad:
-	for_each_zone(dzone) {
-		if (!populated_zone(dzone))
-			continue;
-		if (dzone == zone)
-			break;
-		kfree(zone_pcp(dzone, cpu));
-		zone_pcp(dzone, cpu) = &boot_pageset[cpu];
-	}
-	return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
-		/* Free per_cpu_pageset if it is slab allocated */
-		if (pset != &boot_pageset[cpu])
-			kfree(pset);
-		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-	}
-}
-
-static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
-		unsigned long action,
-		void *hcpu)
-{
-	int cpu = (long)hcpu;
-	int ret = NOTIFY_OK;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		if (process_zones(cpu))
-			ret = NOTIFY_BAD;
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		free_zone_pagesets(cpu);
-		break;
-	default:
-		break;
+			if (percpu_pagelist_fraction)
+				setup_pagelist_highmark(pcp,
					(zone->present_pages /
						percpu_pagelist_fraction));
+		}
	}
-	return ret;
-}
-
-static struct notifier_block __cpuinitdata pageset_notifier =
-	{ &pageset_cpuup_callback, NULL, 0 };
-
-void __init setup_per_cpu_pageset(void)
-{
-	int err;
-
-	/* Initialize per_cpu_pageset for cpu 0.
-	 * A cpuup callback will do this for every cpu
-	 * as it comes online
-	 */
-	err = process_zones(smp_processor_id());
-	BUG_ON(err);
-	register_cpu_notifier(&pageset_notifier);
 }
 
-#endif
-
 static noinline __init_refok
 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
@@ -3264,7 +3207,7 @@ static int __zone_pcp_update(void *data)
		struct per_cpu_pageset *pset;
		struct per_cpu_pages *pcp;
 
-		pset = zone_pcp(zone, cpu);
+		pset = per_cpu_ptr(zone->pageset, cpu);
		pcp = &pset->pcp;
 
		local_irq_save(flags);
@@ -3282,21 +3225,17 @@ void zone_pcp_update(struct zone *zone)
 
 static __meminit void zone_pcp_init(struct zone *zone)
 {
-	int cpu;
-	unsigned long batch = zone_batchsize(zone);
+	/*
+	 * per cpu subsystem is not up at this point. The following code
+	 * relies on the ability of the linker to provide the
+	 * offset of a (static) per cpu variable into the per cpu area.
+	 */
+	zone->pageset = &boot_pageset;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-		/* Early boot. Slab allocator not functional yet */
-		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
-#else
-		setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-	}
	if (zone->present_pages)
-		printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
-			zone->name, zone->present_pages, batch);
+		printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
+			zone->name, zone->present_pages,
+					 zone_batchsize(zone));
 }
 
 __meminit int init_currently_empty_zone(struct zone *zone,
@@ -3435,6 +3374,61 @@ void __init free_bootmem_with_active_regions(int nid,
	}
 }
 
+int __init add_from_early_node_map(struct range *range, int az,
+				   int nr_range, int nid)
+{
+	int i;
+	u64 start, end;
+
+	/* need to go over early_node_map to find out good range for node */
+	for_each_active_range_index_in_nid(i, nid) {
+		start = early_node_map[i].start_pfn;
+		end = early_node_map[i].end_pfn;
+		nr_range = add_range(range, az, nr_range, start, end);
+	}
+	return nr_range;
+}
+
+#ifdef CONFIG_NO_BOOTMEM
+void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	int i;
+	void *ptr;
+
+	/* need to go over early_node_map to find out good range for node */
+	for_each_active_range_index_in_nid(i, nid) {
+		u64 addr;
+		u64 ei_start, ei_last;
+
+		ei_last = early_node_map[i].end_pfn;
+		ei_last <<= PAGE_SHIFT;
+		ei_start = early_node_map[i].start_pfn;
+		ei_start <<= PAGE_SHIFT;
+		addr = find_early_area(ei_start, ei_last,
+					 goal, limit, size, align);
+
+		if (addr == -1ULL)
+			continue;
+
+#if 0
+		printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
+				nid,
+				ei_start, ei_last, goal, limit, size,
+				align, addr);
+#endif
+
+		ptr = phys_to_virt(addr);
+		memset(ptr, 0, size);
+		reserve_early_without_check(addr, addr + size, "BOOTMEM");
+		return ptr;
+	}
+
+	return NULL;
+}
+#endif
+
+
 void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
 {
	int i;
@@ -4467,7 +4461,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
+struct pglist_data __refdata contig_page_data = {
+#ifndef CONFIG_NO_BOOTMEM
+ .bdata = &bootmem_node_data[0]
+#endif
+ };
 EXPORT_SYMBOL(contig_page_data);
 #endif
 
@@ -4810,10 +4808,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
	if (!write || (ret == -EINVAL))
		return ret;
	for_each_populated_zone(zone) {
-		for_each_online_cpu(cpu) {
+		for_each_possible_cpu(cpu) {
			unsigned long high;
			high = zone->present_pages / percpu_pagelist_fraction;
-			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+			setup_pagelist_highmark(
+				per_cpu_ptr(zone->pageset, cpu), high);
		}
	}
	return 0;
diff --git a/mm/percpu.c b/mm/percpu.c
index 083e7c91e5f6..768419d44ad7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,13 +80,15 @@
 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
 #ifndef __addr_to_pcpu_ptr
 #define __addr_to_pcpu_ptr(addr)					\
-	(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr	\
-	+ (unsigned long)__per_cpu_start)
+	(void __percpu *)((unsigned long)(addr) -			\
+			  (unsigned long)pcpu_base_addr +		\
+			  (unsigned long)__per_cpu_start)
 #endif
 #ifndef __pcpu_ptr_to_addr
 #define __pcpu_ptr_to_addr(ptr)						\
-	(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr	\
-	- (unsigned long)__per_cpu_start)
+	(void __force *)((unsigned long)(ptr) +				\
+			 (unsigned long)pcpu_base_addr -		\
+			 (unsigned long)__per_cpu_start)
 #endif
 
 struct pcpu_chunk {
@@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
	int rs, re;
 
	/* quick path, check whether it's empty already */
-	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
-		if (rs == page_start && re == page_end)
-			return;
-		break;
-	}
+	rs = page_start;
+	pcpu_next_unpop(chunk, &rs, &re, page_end);
+	if (rs == page_start && re == page_end)
+		return;
 
	/* immutable chunks can't be depopulated */
	WARN_ON(chunk->immutable);
@@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
	int rs, re, rc;
 
	/* quick path, check whether all pages are already there */
-	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
-		if (rs == page_start && re == page_end)
-			goto clear;
-		break;
-	}
+	rs = page_start;
+	pcpu_next_pop(chunk, &rs, &re, page_end);
+	if (rs == page_start && re == page_end)
+		goto clear;
 
	/* need to allocate and map pages, this chunk can't be immutable */
	WARN_ON(chunk->immutable);
@@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_alloc(size_t size, size_t align, bool reserved)
+static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
	static int warn_limit = 10;
	struct pcpu_chunk *chunk;
@@ -1196,7 +1196,7 @@ fail_unlock_mutex:
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-void *__alloc_percpu(size_t size, size_t align)
+void __percpu *__alloc_percpu(size_t size, size_t align)
 {
	return pcpu_alloc(size, align, false);
 }
@@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-void *__alloc_reserved_percpu(size_t size, size_t align)
+void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 {
	return pcpu_alloc(size, align, true);
 }
@@ -1269,7 +1269,7 @@ static void pcpu_reclaim(struct work_struct *work)
  * CONTEXT:
  * Can be called from atomic context.
  */
-void free_percpu(void *ptr)
+void free_percpu(void __percpu *ptr)
 {
	void *addr;
	struct pcpu_chunk *chunk;
diff --git a/mm/slab.c b/mm/slab.c
index 33496b704859..a9f325b28bed 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to,
 
	from->avail -= nr;
	to->avail += nr;
-	to->touched = 1;
	return nr;
 }
 
@@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 
	if (limit > 1)
		limit = 12;
-	ac_ptr = kmalloc_node(memsize, gfp, node);
+	ac_ptr = kzalloc_node(memsize, gfp, node);
	if (ac_ptr) {
		for_each_node(i) {
-			if (i == node || !node_online(i)) {
-				ac_ptr[i] = NULL;
+			if (i == node || !node_online(i))
				continue;
-			}
			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
			if (!ac_ptr[i]) {
				for (i--; i >= 0; i--)
@@ -2963,8 +2960,10 @@ retry:
	spin_lock(&l3->list_lock);
 
	/* See if we can refill from the shared array */
-	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
+	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
+		l3->shared->touched = 1;
		goto alloc_done;
+	}
 
	while (batchcount > 0) {
		struct list_head *entry;
diff --git a/mm/slub.c b/mm/slub.c
index cab5288736c8..3525a4ec9794 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -218,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
218 218
219#endif 219#endif
220 220
221static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) 221static inline void stat(struct kmem_cache *s, enum stat_item si)
222{ 222{
223#ifdef CONFIG_SLUB_STATS 223#ifdef CONFIG_SLUB_STATS
224 c->stat[si]++; 224 __this_cpu_inc(s->cpu_slab->stat[si]);
225#endif 225#endif
226} 226}
227 227
@@ -243,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
243#endif 243#endif
244} 244}
245 245
246static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
247{
248#ifdef CONFIG_SMP
249 return s->cpu_slab[cpu];
250#else
251 return &s->cpu_slab;
252#endif
253}
254
255/* Verify that a pointer has an address that is valid within a slab page */ 246/* Verify that a pointer has an address that is valid within a slab page */
256static inline int check_valid_pointer(struct kmem_cache *s, 247static inline int check_valid_pointer(struct kmem_cache *s,
257 struct page *page, const void *object) 248 struct page *page, const void *object)
@@ -270,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s,
270 return 1; 261 return 1;
271} 262}
272 263
273/*
274 * Slow version of get and set free pointer.
275 *
276 * This version requires touching the cache lines of kmem_cache which
277 * we avoid to do in the fast alloc free paths. There we obtain the offset
278 * from the page struct.
279 */
280static inline void *get_freepointer(struct kmem_cache *s, void *object) 264static inline void *get_freepointer(struct kmem_cache *s, void *object)
281{ 265{
282 return *(void **)(object + s->offset); 266 return *(void **)(object + s->offset);
@@ -1128,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1128 if (!page) 1112 if (!page)
1129 return NULL; 1113 return NULL;
1130 1114
1131 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1115 stat(s, ORDER_FALLBACK);
1132 } 1116 }
1133 1117
1134 if (kmemcheck_enabled 1118 if (kmemcheck_enabled
@@ -1426,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1426static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1410static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1427{ 1411{
1428 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1412 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1429 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1430 1413
1431 __ClearPageSlubFrozen(page); 1414 __ClearPageSlubFrozen(page);
1432 if (page->inuse) { 1415 if (page->inuse) {
1433 1416
1434 if (page->freelist) { 1417 if (page->freelist) {
1435 add_partial(n, page, tail); 1418 add_partial(n, page, tail);
1436 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1419 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1437 } else { 1420 } else {
1438 stat(c, DEACTIVATE_FULL); 1421 stat(s, DEACTIVATE_FULL);
1439 if (SLABDEBUG && PageSlubDebug(page) && 1422 if (SLABDEBUG && PageSlubDebug(page) &&
1440 (s->flags & SLAB_STORE_USER)) 1423 (s->flags & SLAB_STORE_USER))
1441 add_full(n, page); 1424 add_full(n, page);
1442 } 1425 }
1443 slab_unlock(page); 1426 slab_unlock(page);
1444 } else { 1427 } else {
1445 stat(c, DEACTIVATE_EMPTY); 1428 stat(s, DEACTIVATE_EMPTY);
1446 if (n->nr_partial < s->min_partial) { 1429 if (n->nr_partial < s->min_partial) {
1447 /* 1430 /*
1448 * Adding an empty slab to the partial slabs in order 1431 * Adding an empty slab to the partial slabs in order
@@ -1458,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1458 slab_unlock(page); 1441 slab_unlock(page);
1459 } else { 1442 } else {
1460 slab_unlock(page); 1443 slab_unlock(page);
1461 stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); 1444 stat(s, FREE_SLAB);
1462 discard_slab(s, page); 1445 discard_slab(s, page);
1463 } 1446 }
1464 } 1447 }
@@ -1473,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1473 int tail = 1; 1456 int tail = 1;
1474 1457
1475 if (page->freelist) 1458 if (page->freelist)
1476 stat(c, DEACTIVATE_REMOTE_FREES); 1459 stat(s, DEACTIVATE_REMOTE_FREES);
1477 /* 1460 /*
1478 * Merge cpu freelist into slab freelist. Typically we get here 1461 * Merge cpu freelist into slab freelist. Typically we get here
1479 * because both freelists are empty. So this is unlikely 1462 * because both freelists are empty. So this is unlikely
@@ -1486,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1486 1469
1487 /* Retrieve object from cpu_freelist */ 1470 /* Retrieve object from cpu_freelist */
1488 object = c->freelist; 1471 object = c->freelist;
1489 c->freelist = c->freelist[c->offset]; 1472 c->freelist = get_freepointer(s, c->freelist);
1490 1473
1491 /* And put onto the regular freelist */ 1474 /* And put onto the regular freelist */
1492 object[c->offset] = page->freelist; 1475 set_freepointer(s, object, page->freelist);
1493 page->freelist = object; 1476 page->freelist = object;
1494 page->inuse--; 1477 page->inuse--;
1495 } 1478 }
@@ -1499,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1499 1482
1500static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1483static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1501{ 1484{
1502 stat(c, CPUSLAB_FLUSH); 1485 stat(s, CPUSLAB_FLUSH);
1503 slab_lock(c->page); 1486 slab_lock(c->page);
1504 deactivate_slab(s, c); 1487 deactivate_slab(s, c);
1505} 1488}
@@ -1511,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1511 */ 1494 */
1512static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1495static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1513{ 1496{
1514 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1497 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1515 1498
1516 if (likely(c && c->page)) 1499 if (likely(c && c->page))
1517 flush_slab(s, c); 1500 flush_slab(s, c);
@@ -1639,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1639 if (unlikely(!node_match(c, node))) 1622 if (unlikely(!node_match(c, node)))
1640 goto another_slab; 1623 goto another_slab;
1641 1624
1642 stat(c, ALLOC_REFILL); 1625 stat(s, ALLOC_REFILL);
1643 1626
1644load_freelist: 1627load_freelist:
1645 object = c->page->freelist; 1628 object = c->page->freelist;
@@ -1648,13 +1631,13 @@ load_freelist:
1648 if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) 1631 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1649 goto debug; 1632 goto debug;
1650 1633
1651 c->freelist = object[c->offset]; 1634 c->freelist = get_freepointer(s, object);
1652 c->page->inuse = c->page->objects; 1635 c->page->inuse = c->page->objects;
1653 c->page->freelist = NULL; 1636 c->page->freelist = NULL;
1654 c->node = page_to_nid(c->page); 1637 c->node = page_to_nid(c->page);
1655unlock_out: 1638unlock_out:
1656 slab_unlock(c->page); 1639 slab_unlock(c->page);
1657 stat(c, ALLOC_SLOWPATH); 1640 stat(s, ALLOC_SLOWPATH);
1658 return object; 1641 return object;
1659 1642
1660another_slab: 1643another_slab:
@@ -1664,7 +1647,7 @@ new_slab:
1664 new = get_partial(s, gfpflags, node); 1647 new = get_partial(s, gfpflags, node);
1665 if (new) { 1648 if (new) {
1666 c->page = new; 1649 c->page = new;
1667 stat(c, ALLOC_FROM_PARTIAL); 1650 stat(s, ALLOC_FROM_PARTIAL);
1668 goto load_freelist; 1651 goto load_freelist;
1669 } 1652 }
1670 1653
@@ -1677,8 +1660,8 @@ new_slab:
1677 local_irq_disable(); 1660 local_irq_disable();
1678 1661
1679 if (new) { 1662 if (new) {
1680 c = get_cpu_slab(s, smp_processor_id()); 1663 c = __this_cpu_ptr(s->cpu_slab);
1681 stat(c, ALLOC_SLAB); 1664 stat(s, ALLOC_SLAB);
1682 if (c->page) 1665 if (c->page)
1683 flush_slab(s, c); 1666 flush_slab(s, c);
1684 slab_lock(new); 1667 slab_lock(new);
@@ -1694,7 +1677,7 @@ debug:
1694 goto another_slab; 1677 goto another_slab;
1695 1678
1696 c->page->inuse++; 1679 c->page->inuse++;
1697 c->page->freelist = object[c->offset]; 1680 c->page->freelist = get_freepointer(s, object);
1698 c->node = -1; 1681 c->node = -1;
1699 goto unlock_out; 1682 goto unlock_out;
1700} 1683}
@@ -1715,7 +1698,6 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1715 void **object; 1698 void **object;
1716 struct kmem_cache_cpu *c; 1699 struct kmem_cache_cpu *c;
1717 unsigned long flags; 1700 unsigned long flags;
1718 unsigned int objsize;
1719 1701
1720 gfpflags &= gfp_allowed_mask; 1702 gfpflags &= gfp_allowed_mask;
1721 1703
@@ -1726,24 +1708,23 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1726 return NULL; 1708 return NULL;
1727 1709
1728 local_irq_save(flags); 1710 local_irq_save(flags);
1729 c = get_cpu_slab(s, smp_processor_id()); 1711 c = __this_cpu_ptr(s->cpu_slab);
1730 objsize = c->objsize; 1712 object = c->freelist;
1731 if (unlikely(!c->freelist || !node_match(c, node))) 1713 if (unlikely(!object || !node_match(c, node)))
1732 1714
1733 object = __slab_alloc(s, gfpflags, node, addr, c); 1715 object = __slab_alloc(s, gfpflags, node, addr, c);
1734 1716
1735 else { 1717 else {
1736 object = c->freelist; 1718 c->freelist = get_freepointer(s, object);
1737 c->freelist = object[c->offset]; 1719 stat(s, ALLOC_FASTPATH);
1738 stat(c, ALLOC_FASTPATH);
1739 } 1720 }
1740 local_irq_restore(flags); 1721 local_irq_restore(flags);
1741 1722
1742 if (unlikely(gfpflags & __GFP_ZERO) && object) 1723 if (unlikely(gfpflags & __GFP_ZERO) && object)
1743 memset(object, 0, objsize); 1724 memset(object, 0, s->objsize);
1744 1725
1745 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); 1726 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize);
1746 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); 1727 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1747 1728
1748 return object; 1729 return object;
1749} 1730}
@@ -1798,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1798 * handling required then we can return immediately. 1779 * handling required then we can return immediately.
1799 */ 1780 */
1800static void __slab_free(struct kmem_cache *s, struct page *page, 1781static void __slab_free(struct kmem_cache *s, struct page *page,
1801 void *x, unsigned long addr, unsigned int offset) 1782 void *x, unsigned long addr)
1802{ 1783{
1803 void *prior; 1784 void *prior;
1804 void **object = (void *)x; 1785 void **object = (void *)x;
1805 struct kmem_cache_cpu *c;
1806 1786
1807 c = get_cpu_slab(s, raw_smp_processor_id()); 1787 stat(s, FREE_SLOWPATH);
1808 stat(c, FREE_SLOWPATH);
1809 slab_lock(page); 1788 slab_lock(page);
1810 1789
1811 if (unlikely(SLABDEBUG && PageSlubDebug(page))) 1790 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1812 goto debug; 1791 goto debug;
1813 1792
1814checks_ok: 1793checks_ok:
1815 prior = object[offset] = page->freelist; 1794 prior = page->freelist;
1795 set_freepointer(s, object, prior);
1816 page->freelist = object; 1796 page->freelist = object;
1817 page->inuse--; 1797 page->inuse--;
1818 1798
1819 if (unlikely(PageSlubFrozen(page))) { 1799 if (unlikely(PageSlubFrozen(page))) {
1820 stat(c, FREE_FROZEN); 1800 stat(s, FREE_FROZEN);
1821 goto out_unlock; 1801 goto out_unlock;
1822 } 1802 }
1823 1803
@@ -1830,7 +1810,7 @@ checks_ok:
1830 */ 1810 */
1831 if (unlikely(!prior)) { 1811 if (unlikely(!prior)) {
1832 add_partial(get_node(s, page_to_nid(page)), page, 1); 1812 add_partial(get_node(s, page_to_nid(page)), page, 1);
1833 stat(c, FREE_ADD_PARTIAL); 1813 stat(s, FREE_ADD_PARTIAL);
1834 } 1814 }
1835 1815
1836out_unlock: 1816out_unlock:
@@ -1843,10 +1823,10 @@ slab_empty:
1843 * Slab still on the partial list. 1823 * Slab still on the partial list.
1844 */ 1824 */
1845 remove_partial(s, page); 1825 remove_partial(s, page);
1846 stat(c, FREE_REMOVE_PARTIAL); 1826 stat(s, FREE_REMOVE_PARTIAL);
1847 } 1827 }
1848 slab_unlock(page); 1828 slab_unlock(page);
1849 stat(c, FREE_SLAB); 1829 stat(s, FREE_SLAB);
1850 discard_slab(s, page); 1830 discard_slab(s, page);
1851 return; 1831 return;
1852 1832
@@ -1876,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s,
1876 1856
1877 kmemleak_free_recursive(x, s->flags); 1857 kmemleak_free_recursive(x, s->flags);
1878 local_irq_save(flags); 1858 local_irq_save(flags);
1879 c = get_cpu_slab(s, smp_processor_id()); 1859 c = __this_cpu_ptr(s->cpu_slab);
1880 kmemcheck_slab_free(s, object, c->objsize); 1860 kmemcheck_slab_free(s, object, s->objsize);
1881 debug_check_no_locks_freed(object, c->objsize); 1861 debug_check_no_locks_freed(object, s->objsize);
1882 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1862 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1883 debug_check_no_obj_freed(object, c->objsize); 1863 debug_check_no_obj_freed(object, s->objsize);
1884 if (likely(page == c->page && c->node >= 0)) { 1864 if (likely(page == c->page && c->node >= 0)) {
1885 object[c->offset] = c->freelist; 1865 set_freepointer(s, object, c->freelist);
1886 c->freelist = object; 1866 c->freelist = object;
1887 stat(c, FREE_FASTPATH); 1867 stat(s, FREE_FASTPATH);
1888 } else 1868 } else
1889 __slab_free(s, page, x, addr, c->offset); 1869 __slab_free(s, page, x, addr);
1890 1870
1891 local_irq_restore(flags); 1871 local_irq_restore(flags);
1892} 1872}
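Both free paths above now link objects onto the freelist through set_freepointer()/get_freepointer() instead of indexing the object with a per-cpu copy of the word offset (object[c->offset], object[offset]). The idea is that the pointer to the next free object is stored inside the free object itself, s->offset bytes from its start. The helpers most likely look like the sketch below; only the names come from the diff, so treat the bodies as an assumption.

/* Sketch (kernel context): the next-free link lives inside the free object. */
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	return *(void **)(object + s->offset);	/* read the stored link */
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	*(void **)(object + s->offset) = fp;	/* store the new link */
}
/* Note: arithmetic on void * is a GCC extension the kernel builds with. */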
@@ -2073,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags,
2073 return ALIGN(align, sizeof(void *)); 2053 return ALIGN(align, sizeof(void *));
2074} 2054}
2075 2055
2076static void init_kmem_cache_cpu(struct kmem_cache *s,
2077 struct kmem_cache_cpu *c)
2078{
2079 c->page = NULL;
2080 c->freelist = NULL;
2081 c->node = 0;
2082 c->offset = s->offset / sizeof(void *);
2083 c->objsize = s->objsize;
2084#ifdef CONFIG_SLUB_STATS
2085 memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
2086#endif
2087}
2088
2089static void 2056static void
2090init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2057init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2091{ 2058{
@@ -2099,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2099#endif 2066#endif
2100} 2067}
2101 2068
2102#ifdef CONFIG_SMP 2069static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
2103/*
2104 * Per cpu array for per cpu structures.
2105 *
2106 * The per cpu array places all kmem_cache_cpu structures from one processor
2107 * close together meaning that it becomes possible that multiple per cpu
2108 * structures are contained in one cacheline. This may be particularly
2109 * beneficial for the kmalloc caches.
2110 *
2111 * A desktop system typically has around 60-80 slabs. With 100 here we are
2112 * likely able to get per cpu structures for all caches from the array defined
2113 * here. We must be able to cover all kmalloc caches during bootstrap.
2114 *
2115 * If the per cpu array is exhausted then fall back to kmalloc
2116 * of individual cachelines. No sharing is possible then.
2117 */
2118#define NR_KMEM_CACHE_CPU 100
2119
2120static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2121 kmem_cache_cpu);
2122
2123static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2124static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
2125
2126static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
2127 int cpu, gfp_t flags)
2128{
2129 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
2130
2131 if (c)
2132 per_cpu(kmem_cache_cpu_free, cpu) =
2133 (void *)c->freelist;
2134 else {
2135 /* Table overflow: So allocate ourselves */
2136 c = kmalloc_node(
2137 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
2138 flags, cpu_to_node(cpu));
2139 if (!c)
2140 return NULL;
2141 }
2142
2143 init_kmem_cache_cpu(s, c);
2144 return c;
2145}
2146
2147static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
2148{
2149 if (c < per_cpu(kmem_cache_cpu, cpu) ||
2150 c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2151 kfree(c);
2152 return;
2153 }
2154 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
2155 per_cpu(kmem_cache_cpu_free, cpu) = c;
2156}
2157
2158static void free_kmem_cache_cpus(struct kmem_cache *s)
2159{
2160 int cpu;
2161
2162 for_each_online_cpu(cpu) {
2163 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2164
2165 if (c) {
2166 s->cpu_slab[cpu] = NULL;
2167 free_kmem_cache_cpu(c, cpu);
2168 }
2169 }
2170}
2171
2172static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2173{
2174 int cpu;
2175
2176 for_each_online_cpu(cpu) {
2177 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2178 2070
2179 if (c) 2071static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2180 continue;
2181
2182 c = alloc_kmem_cache_cpu(s, cpu, flags);
2183 if (!c) {
2184 free_kmem_cache_cpus(s);
2185 return 0;
2186 }
2187 s->cpu_slab[cpu] = c;
2188 }
2189 return 1;
2190}
2191
2192/*
2193 * Initialize the per cpu array.
2194 */
2195static void init_alloc_cpu_cpu(int cpu)
2196{
2197 int i;
2198
2199 if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)))
2200 return;
2201
2202 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2203 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2204
2205 cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
2206}
2207
2208static void __init init_alloc_cpu(void)
2209{ 2072{
2210 int cpu; 2073 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
2211 2074 /*
2212 for_each_online_cpu(cpu) 2075 * Boot time creation of the kmalloc array. Use static per cpu data
2213 init_alloc_cpu_cpu(cpu); 2076 * since the per cpu allocator is not available yet.
2214 } 2077 */
2078 s->cpu_slab = per_cpu_var(kmalloc_percpu) + (s - kmalloc_caches);
2079 else
2080 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2215 2081
2216#else 2082 if (!s->cpu_slab)
2217static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} 2083 return 0;
2218static inline void init_alloc_cpu(void) {}
2219 2084
2220static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2221{
2222 init_kmem_cache_cpu(s, &s->cpu_slab);
2223 return 1; 2085 return 1;
2224} 2086}
2225#endif
2226 2087
2227#ifdef CONFIG_NUMA 2088#ifdef CONFIG_NUMA
2228/* 2089/*
@@ -2291,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2291 int node; 2152 int node;
2292 int local_node; 2153 int local_node;
2293 2154
2294 if (slab_state >= UP) 2155 if (slab_state >= UP && (s < kmalloc_caches ||
2156 s > kmalloc_caches + KMALLOC_CACHES))
2295 local_node = page_to_nid(virt_to_page(s)); 2157 local_node = page_to_nid(virt_to_page(s));
2296 else 2158 else
2297 local_node = 0; 2159 local_node = 0;
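The two hunks above deal with the same bootstrap ordering problem: the kmalloc caches are created before the dynamic percpu allocator is usable, so they get slots in the static DEFINE_PER_CPU array kmalloc_percpu, while every cache created later uses alloc_percpu(); likewise, virt_to_page() on the kmem_cache is only meaningful for caches that were themselves kmalloc'ed, hence the kmalloc_caches range check in init_kmem_cache_nodes(). A condensed sketch of the decision is below; the names match the diff, but the body is a reformulation rather than a quote.

/* Sketch (kernel context): static percpu storage for boot-time kmalloc
 * caches, dynamic percpu storage for everything created afterwards.
 * flags is unused here; the real function keeps the parameter for callers.
 */
static int alloc_kmem_cache_cpus_sketch(struct kmem_cache *s, gfp_t flags)
{
	if (s >= kmalloc_caches && s < kmalloc_caches + KMALLOC_CACHES)
		/* boot-time kmalloc cache: index the static per-cpu array */
		s->cpu_slab = per_cpu_var(kmalloc_percpu) + (s - kmalloc_caches);
	else
		/* created later: the percpu allocator is available by now */
		s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);

	return s->cpu_slab != NULL;
}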
@@ -2506,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2506 2368
2507 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2369 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2508 return 1; 2370 return 1;
2371
2509 free_kmem_cache_nodes(s); 2372 free_kmem_cache_nodes(s);
2510error: 2373error:
2511 if (flags & SLAB_PANIC) 2374 if (flags & SLAB_PANIC)
@@ -2613,9 +2476,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2613 int node; 2476 int node;
2614 2477
2615 flush_all(s); 2478 flush_all(s);
2616 2479 free_percpu(s->cpu_slab);
2617 /* Attempt to free all objects */ 2480 /* Attempt to free all objects */
2618 free_kmem_cache_cpus(s);
2619 for_each_node_state(node, N_NORMAL_MEMORY) { 2481 for_each_node_state(node, N_NORMAL_MEMORY) {
2620 struct kmem_cache_node *n = get_node(s, node); 2482 struct kmem_cache_node *n = get_node(s, node);
2621 2483
@@ -2655,7 +2517,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2655 * Kmalloc subsystem 2517 * Kmalloc subsystem
2656 *******************************************************************/ 2518 *******************************************************************/
2657 2519
2658struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; 2520struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
2659EXPORT_SYMBOL(kmalloc_caches); 2521EXPORT_SYMBOL(kmalloc_caches);
2660 2522
2661static int __init setup_slub_min_order(char *str) 2523static int __init setup_slub_min_order(char *str)
@@ -2745,6 +2607,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2745 char *text; 2607 char *text;
2746 size_t realsize; 2608 size_t realsize;
2747 unsigned long slabflags; 2609 unsigned long slabflags;
2610 int i;
2748 2611
2749 s = kmalloc_caches_dma[index]; 2612 s = kmalloc_caches_dma[index];
2750 if (s) 2613 if (s)
@@ -2764,7 +2627,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2764 realsize = kmalloc_caches[index].objsize; 2627 realsize = kmalloc_caches[index].objsize;
2765 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", 2628 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2766 (unsigned int)realsize); 2629 (unsigned int)realsize);
2767 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2630
2631 s = NULL;
2632 for (i = 0; i < KMALLOC_CACHES; i++)
2633 if (!kmalloc_caches[i].size)
2634 break;
2635
2636 BUG_ON(i >= KMALLOC_CACHES);
2637 s = kmalloc_caches + i;
2768 2638
2769 /* 2639 /*
2770 * Must defer sysfs creation to a workqueue because we don't know 2640 * Must defer sysfs creation to a workqueue because we don't know
@@ -2776,9 +2646,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2776 if (slab_state >= SYSFS) 2646 if (slab_state >= SYSFS)
2777 slabflags |= __SYSFS_ADD_DEFERRED; 2647 slabflags |= __SYSFS_ADD_DEFERRED;
2778 2648
2779 if (!s || !text || !kmem_cache_open(s, flags, text, 2649 if (!text || !kmem_cache_open(s, flags, text,
2780 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { 2650 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2781 kfree(s); 2651 s->size = 0;
2782 kfree(text); 2652 kfree(text);
2783 goto unlock_out; 2653 goto unlock_out;
2784 } 2654 }

@@ -3180,8 +3050,6 @@ void __init kmem_cache_init(void)
3180 int i; 3050 int i;
3181 int caches = 0; 3051 int caches = 0;
3182 3052
3183 init_alloc_cpu();
3184
3185#ifdef CONFIG_NUMA 3053#ifdef CONFIG_NUMA
3186 /* 3054 /*
3187 * Must first have the slab cache available for the allocations of the 3055 * Must first have the slab cache available for the allocations of the
@@ -3265,8 +3133,10 @@ void __init kmem_cache_init(void)
3265 3133
3266#ifdef CONFIG_SMP 3134#ifdef CONFIG_SMP
3267 register_cpu_notifier(&slab_notifier); 3135 register_cpu_notifier(&slab_notifier);
3268 kmem_size = offsetof(struct kmem_cache, cpu_slab) + 3136#endif
3269 nr_cpu_ids * sizeof(struct kmem_cache_cpu *); 3137#ifdef CONFIG_NUMA
3138 kmem_size = offsetof(struct kmem_cache, node) +
3139 nr_node_ids * sizeof(struct kmem_cache_node *);
3270#else 3140#else
3271 kmem_size = sizeof(struct kmem_cache); 3141 kmem_size = sizeof(struct kmem_cache);
3272#endif 3142#endif
@@ -3355,22 +3225,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3355 down_write(&slub_lock); 3225 down_write(&slub_lock);
3356 s = find_mergeable(size, align, flags, name, ctor); 3226 s = find_mergeable(size, align, flags, name, ctor);
3357 if (s) { 3227 if (s) {
3358 int cpu;
3359
3360 s->refcount++; 3228 s->refcount++;
3361 /* 3229 /*
3362 * Adjust the object sizes so that we clear 3230 * Adjust the object sizes so that we clear
3363 * the complete object on kzalloc. 3231 * the complete object on kzalloc.
3364 */ 3232 */
3365 s->objsize = max(s->objsize, (int)size); 3233 s->objsize = max(s->objsize, (int)size);
3366
3367 /*
3368 * And then we need to update the object size in the
3369 * per cpu structures
3370 */
3371 for_each_online_cpu(cpu)
3372 get_cpu_slab(s, cpu)->objsize = s->objsize;
3373
3374 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3234 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3375 up_write(&slub_lock); 3235 up_write(&slub_lock);
3376 3236
@@ -3424,29 +3284,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3424 unsigned long flags; 3284 unsigned long flags;
3425 3285
3426 switch (action) { 3286 switch (action) {
3427 case CPU_UP_PREPARE:
3428 case CPU_UP_PREPARE_FROZEN:
3429 init_alloc_cpu_cpu(cpu);
3430 down_read(&slub_lock);
3431 list_for_each_entry(s, &slab_caches, list)
3432 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3433 GFP_KERNEL);
3434 up_read(&slub_lock);
3435 break;
3436
3437 case CPU_UP_CANCELED: 3287 case CPU_UP_CANCELED:
3438 case CPU_UP_CANCELED_FROZEN: 3288 case CPU_UP_CANCELED_FROZEN:
3439 case CPU_DEAD: 3289 case CPU_DEAD:
3440 case CPU_DEAD_FROZEN: 3290 case CPU_DEAD_FROZEN:
3441 down_read(&slub_lock); 3291 down_read(&slub_lock);
3442 list_for_each_entry(s, &slab_caches, list) { 3292 list_for_each_entry(s, &slab_caches, list) {
3443 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3444
3445 local_irq_save(flags); 3293 local_irq_save(flags);
3446 __flush_cpu_slab(s, cpu); 3294 __flush_cpu_slab(s, cpu);
3447 local_irq_restore(flags); 3295 local_irq_restore(flags);
3448 free_kmem_cache_cpu(c, cpu);
3449 s->cpu_slab[cpu] = NULL;
3450 } 3296 }
3451 up_read(&slub_lock); 3297 up_read(&slub_lock);
3452 break; 3298 break;
@@ -3932,7 +3778,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3932 int cpu; 3778 int cpu;
3933 3779
3934 for_each_possible_cpu(cpu) { 3780 for_each_possible_cpu(cpu) {
3935 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3781 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3936 3782
3937 if (!c || c->node < 0) 3783 if (!c || c->node < 0)
3938 continue; 3784 continue;
@@ -4374,7 +4220,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4374 return -ENOMEM; 4220 return -ENOMEM;
4375 4221
4376 for_each_online_cpu(cpu) { 4222 for_each_online_cpu(cpu) {
4377 unsigned x = get_cpu_slab(s, cpu)->stat[si]; 4223 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4378 4224
4379 data[cpu] = x; 4225 data[cpu] = x;
4380 sum += x; 4226 sum += x;
@@ -4397,7 +4243,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si)
4397 int cpu; 4243 int cpu;
4398 4244
4399 for_each_online_cpu(cpu) 4245 for_each_online_cpu(cpu)
4400 get_cpu_slab(s, cpu)->stat[si] = 0; 4246 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4401} 4247}
4402 4248
4403#define STAT_ATTR(si, text) \ 4249#define STAT_ATTR(si, text) \
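The statistics paths follow the same conversion: stat() now takes the kmem_cache and bumps the counter on the local CPU, while the sysfs code walks all CPUs with per_cpu_ptr(s->cpu_slab, cpu), as in show_stat() and clear_stat() above. A small sketch of the summing pattern, assuming CONFIG_SLUB_STATS so that the stat[] array exists; sum_stat_sketch is an illustrative name, not a kernel function.

/* Sketch (kernel context): sum one SLUB statistic across online CPUs. */
static unsigned long sum_stat_sketch(struct kmem_cache *s, enum stat_item si)
{
	unsigned long sum = 0;
	int cpu;

	for_each_online_cpu(cpu)
		sum += per_cpu_ptr(s->cpu_slab, cpu)->stat[si];

	return sum;
}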
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d9714bdcb4a3..392b9bb5bc01 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,9 +40,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 40 unsigned long align,
41 unsigned long goal) 41 unsigned long goal)
42{ 42{
43 return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); 43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
44} 44}
45 45
46static void *vmemmap_buf;
47static void *vmemmap_buf_end;
46 48
47void * __meminit vmemmap_alloc_block(unsigned long size, int node) 49void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 50{
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
64 __pa(MAX_DMA_ADDRESS)); 66 __pa(MAX_DMA_ADDRESS));
65} 67}
66 68
69/* need to make sure size is all the same during early stage */
70void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
71{
72 void *ptr;
73
74 if (!vmemmap_buf)
75 return vmemmap_alloc_block(size, node);
76
 77	/* take it from the buf */
78 ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
79 if (ptr + size > vmemmap_buf_end)
80 return vmemmap_alloc_block(size, node);
81
82 vmemmap_buf = ptr + size;
83
84 return ptr;
85}
86
67void __meminit vmemmap_verify(pte_t *pte, int node, 87void __meminit vmemmap_verify(pte_t *pte, int node,
68 unsigned long start, unsigned long end) 88 unsigned long start, unsigned long end)
69{ 89{
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
80 pte_t *pte = pte_offset_kernel(pmd, addr); 100 pte_t *pte = pte_offset_kernel(pmd, addr);
81 if (pte_none(*pte)) { 101 if (pte_none(*pte)) {
82 pte_t entry; 102 pte_t entry;
83 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 103 void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
84 if (!p) 104 if (!p)
85 return NULL; 105 return NULL;
86 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 106 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
163 183
164 return map; 184 return map;
165} 185}
186
187void __init sparse_mem_maps_populate_node(struct page **map_map,
188 unsigned long pnum_begin,
189 unsigned long pnum_end,
190 unsigned long map_count, int nodeid)
191{
192 unsigned long pnum;
193 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
194 void *vmemmap_buf_start;
195
196 size = ALIGN(size, PMD_SIZE);
197 vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
198 PMD_SIZE, __pa(MAX_DMA_ADDRESS));
199
200 if (vmemmap_buf_start) {
201 vmemmap_buf = vmemmap_buf_start;
202 vmemmap_buf_end = vmemmap_buf_start + size * map_count;
203 }
204
205 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
206 struct mem_section *ms;
207
208 if (!present_section_nr(pnum))
209 continue;
210
211 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
212 if (map_map[pnum])
213 continue;
214 ms = __nr_to_section(pnum);
215 printk(KERN_ERR "%s: sparsemem memory map backing failed "
216 "some memory will not be available.\n", __func__);
217 ms->section_mem_map = 0;
218 }
219
220 if (vmemmap_buf_start) {
221 /* need to free left buf */
222#ifdef CONFIG_NO_BOOTMEM
223 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
224 if (vmemmap_buf_start < vmemmap_buf) {
225 char name[15];
226
227 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
228 reserve_early_without_check(__pa(vmemmap_buf_start),
229 __pa(vmemmap_buf), name);
230 }
231#else
232 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
233#endif
234 vmemmap_buf = NULL;
235 vmemmap_buf_end = NULL;
236 }
237}
diff --git a/mm/sparse.c b/mm/sparse.c
index 6ce4aab69e99..22896d589133 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void)
271 271
272#ifdef CONFIG_MEMORY_HOTREMOVE 272#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init 273static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 274sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
275 unsigned long count)
275{ 276{
276 unsigned long section_nr; 277 unsigned long section_nr;
277 278
@@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
286 * this problem. 287 * this problem.
287 */ 288 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 289 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr); 290 return alloc_bootmem_section(usemap_size() * count, section_nr);
290} 291}
291 292
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 293static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
329} 330}
330#else 331#else
331static unsigned long * __init 332static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 333sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
334 unsigned long count)
333{ 335{
334 return NULL; 336 return NULL;
335} 337}
@@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 341}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 342#endif /* CONFIG_MEMORY_HOTREMOVE */
341 343
342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 344static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
345 unsigned long pnum_begin,
346 unsigned long pnum_end,
347 unsigned long usemap_count, int nodeid)
343{ 348{
344 unsigned long *usemap; 349 void *usemap;
345 struct mem_section *ms = __nr_to_section(pnum); 350 unsigned long pnum;
346 int nid = sparse_early_nid(ms); 351 int size = usemap_size();
347
348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
349 if (usemap)
350 return usemap;
351 352
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 353 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
354 usemap_count);
353 if (usemap) { 355 if (usemap) {
354 check_usemap_section_nr(nid, usemap); 356 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
355 return usemap; 357 if (!present_section_nr(pnum))
358 continue;
359 usemap_map[pnum] = usemap;
360 usemap += size;
361 }
362 return;
356 } 363 }
357 364
358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 365 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
359 nid = 0; 366 if (usemap) {
367 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
368 if (!present_section_nr(pnum))
369 continue;
370 usemap_map[pnum] = usemap;
371 usemap += size;
372 check_usemap_section_nr(nodeid, usemap_map[pnum]);
373 }
374 return;
375 }
360 376
361 printk(KERN_WARNING "%s: allocation failed\n", __func__); 377 printk(KERN_WARNING "%s: allocation failed\n", __func__);
362 return NULL;
363} 378}
364 379
365#ifndef CONFIG_SPARSEMEM_VMEMMAP 380#ifndef CONFIG_SPARSEMEM_VMEMMAP
@@ -375,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
375 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 390 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
376 return map; 391 return map;
377} 392}
393void __init sparse_mem_maps_populate_node(struct page **map_map,
394 unsigned long pnum_begin,
395 unsigned long pnum_end,
396 unsigned long map_count, int nodeid)
397{
398 void *map;
399 unsigned long pnum;
400 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
401
402 map = alloc_remap(nodeid, size * map_count);
403 if (map) {
404 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
405 if (!present_section_nr(pnum))
406 continue;
407 map_map[pnum] = map;
408 map += size;
409 }
410 return;
411 }
412
413 size = PAGE_ALIGN(size);
414 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
415 if (map) {
416 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
417 if (!present_section_nr(pnum))
418 continue;
419 map_map[pnum] = map;
420 map += size;
421 }
422 return;
423 }
424
425 /* fallback */
426 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
427 struct mem_section *ms;
428
429 if (!present_section_nr(pnum))
430 continue;
431 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
432 if (map_map[pnum])
433 continue;
434 ms = __nr_to_section(pnum);
435 printk(KERN_ERR "%s: sparsemem memory map backing failed "
436 "some memory will not be available.\n", __func__);
437 ms->section_mem_map = 0;
438 }
439}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 440#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 441
442#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
443static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
444 unsigned long pnum_begin,
445 unsigned long pnum_end,
446 unsigned long map_count, int nodeid)
447{
448 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
449 map_count, nodeid);
450}
451#else
380static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 452static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 453{
382 struct page *map; 454 struct page *map;
@@ -392,10 +464,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
392 ms->section_mem_map = 0; 464 ms->section_mem_map = 0;
393 return NULL; 465 return NULL;
394} 466}
467#endif
395 468
396void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) 469void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
397{ 470{
398} 471}
472
399/* 473/*
400 * Allocate the accumulated non-linear sections, allocate a mem_map 474 * Allocate the accumulated non-linear sections, allocate a mem_map
401 * for each and record the physical to section mapping. 475 * for each and record the physical to section mapping.
@@ -407,6 +481,14 @@ void __init sparse_init(void)
407 unsigned long *usemap; 481 unsigned long *usemap;
408 unsigned long **usemap_map; 482 unsigned long **usemap_map;
409 int size; 483 int size;
484 int nodeid_begin = 0;
485 unsigned long pnum_begin = 0;
486 unsigned long usemap_count;
487#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
488 unsigned long map_count;
489 int size2;
490 struct page **map_map;
491#endif
410 492
411 /* 493 /*
412 * map is using big page (aka 2M in x86 64 bit) 494 * map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +507,81 @@ void __init sparse_init(void)
425 panic("can not allocate usemap_map\n"); 507 panic("can not allocate usemap_map\n");
426 508
427 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 509 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
510 struct mem_section *ms;
511
428 if (!present_section_nr(pnum)) 512 if (!present_section_nr(pnum))
429 continue; 513 continue;
430 usemap_map[pnum] = sparse_early_usemap_alloc(pnum); 514 ms = __nr_to_section(pnum);
515 nodeid_begin = sparse_early_nid(ms);
516 pnum_begin = pnum;
517 break;
431 } 518 }
519 usemap_count = 1;
520 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
521 struct mem_section *ms;
522 int nodeid;
523
524 if (!present_section_nr(pnum))
525 continue;
526 ms = __nr_to_section(pnum);
527 nodeid = sparse_early_nid(ms);
528 if (nodeid == nodeid_begin) {
529 usemap_count++;
530 continue;
531 }
 532		/* ok, we need to take care of pnum_begin to pnum - 1 */
533 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
534 usemap_count, nodeid_begin);
535 /* new start, update count etc*/
536 nodeid_begin = nodeid;
537 pnum_begin = pnum;
538 usemap_count = 1;
539 }
540 /* ok, last chunk */
541 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
542 usemap_count, nodeid_begin);
543
544#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
545 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
546 map_map = alloc_bootmem(size2);
547 if (!map_map)
548 panic("can not allocate map_map\n");
549
550 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
551 struct mem_section *ms;
552
553 if (!present_section_nr(pnum))
554 continue;
555 ms = __nr_to_section(pnum);
556 nodeid_begin = sparse_early_nid(ms);
557 pnum_begin = pnum;
558 break;
559 }
560 map_count = 1;
561 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
562 struct mem_section *ms;
563 int nodeid;
564
565 if (!present_section_nr(pnum))
566 continue;
567 ms = __nr_to_section(pnum);
568 nodeid = sparse_early_nid(ms);
569 if (nodeid == nodeid_begin) {
570 map_count++;
571 continue;
572 }
 573		/* ok, we need to take care of pnum_begin to pnum - 1 */
574 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
575 map_count, nodeid_begin);
576 /* new start, update count etc*/
577 nodeid_begin = nodeid;
578 pnum_begin = pnum;
579 map_count = 1;
580 }
581 /* ok, last chunk */
582 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
583 map_count, nodeid_begin);
584#endif
432 585
433 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 586 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
434 if (!present_section_nr(pnum)) 587 if (!present_section_nr(pnum))
@@ -438,7 +591,11 @@ void __init sparse_init(void)
438 if (!usemap) 591 if (!usemap)
439 continue; 592 continue;
440 593
594#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
595 map = map_map[pnum];
596#else
441 map = sparse_early_mem_map_alloc(pnum); 597 map = sparse_early_mem_map_alloc(pnum);
598#endif
442 if (!map) 599 if (!map)
443 continue; 600 continue;
444 601
@@ -448,6 +605,9 @@ void __init sparse_init(void)
448 605
449 vmemmap_populate_print_last(); 606 vmemmap_populate_print_last();
450 607
608#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
609 free_bootmem(__pa(map_map), size2);
610#endif
451 free_bootmem(__pa(usemap_map), size); 611 free_bootmem(__pa(usemap_map), size);
452} 612}
453 613
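sparse_init() above now makes two passes of the same shape: walk the present sections, accumulate a run of consecutive sections that live on the same node, and when the node changes (or the walk ends) allocate all usemaps, and optionally all mem_maps, for that run in one call, so each node gets one large bootmem allocation instead of one tiny allocation per section. The sketch below distills the grouping loop; for_each_node_run() and process_run() are not kernel functions, just names for the pattern, while the section and node accessors are the ones used in the diff.

/* Sketch (kernel context): group consecutive present sections by node
 * and hand each run to a callback such as sparse_early_usemaps_alloc_node()
 * or sparse_early_mem_maps_alloc_node().
 */
static void __init for_each_node_run(void (*process_run)(unsigned long pnum_begin,
							  unsigned long pnum_end,
							  unsigned long count,
							  int nodeid))
{
	unsigned long pnum, pnum_begin = 0, count = 0;
	int nodeid_begin = -1;

	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
		int nodeid;

		if (!present_section_nr(pnum))
			continue;
		nodeid = sparse_early_nid(__nr_to_section(pnum));
		if (count && nodeid == nodeid_begin) {
			count++;
			continue;
		}
		if (count)	/* node changed: close the previous run */
			process_run(pnum_begin, pnum, count, nodeid_begin);
		nodeid_begin = nodeid;
		pnum_begin = pnum;
		count = 1;
	}
	if (count)		/* close the last run */
		process_run(pnum_begin, NR_MEM_SECTIONS, count, nodeid_begin);
}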
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6051fbab67ba..fc5aa183bc45 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void)
139 threshold = calculate_threshold(zone); 139 threshold = calculate_threshold(zone);
140 140
141 for_each_online_cpu(cpu) 141 for_each_online_cpu(cpu)
142 zone_pcp(zone, cpu)->stat_threshold = threshold; 142 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
143 = threshold;
143 } 144 }
144} 145}
145 146
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void)
149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 150void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150 int delta) 151 int delta)
151{ 152{
152 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 153 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
154
153 s8 *p = pcp->vm_stat_diff + item; 155 s8 *p = pcp->vm_stat_diff + item;
154 long x; 156 long x;
155 157
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
202 */ 204 */
203void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 205void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
204{ 206{
205 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 207 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
206 s8 *p = pcp->vm_stat_diff + item; 208 s8 *p = pcp->vm_stat_diff + item;
207 209
208 (*p)++; 210 (*p)++;
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
223 225
224void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 226void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
225{ 227{
226 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 228 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
227 s8 *p = pcp->vm_stat_diff + item; 229 s8 *p = pcp->vm_stat_diff + item;
228 230
229 (*p)--; 231 (*p)--;
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
300 for_each_populated_zone(zone) { 302 for_each_populated_zone(zone) {
301 struct per_cpu_pageset *p; 303 struct per_cpu_pageset *p;
302 304
303 p = zone_pcp(zone, cpu); 305 p = per_cpu_ptr(zone->pageset, cpu);
304 306
305 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 307 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
306 if (p->vm_stat_diff[i]) { 308 if (p->vm_stat_diff[i]) {
@@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
741 for_each_online_cpu(i) { 743 for_each_online_cpu(i) {
742 struct per_cpu_pageset *pageset; 744 struct per_cpu_pageset *pageset;
743 745
744 pageset = zone_pcp(zone, i); 746 pageset = per_cpu_ptr(zone->pageset, i);
745 seq_printf(m, 747 seq_printf(m,
746 "\n cpu: %i" 748 "\n cpu: %i"
747 "\n count: %i" 749 "\n count: %i"
@@ -906,6 +908,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
906 case CPU_ONLINE: 908 case CPU_ONLINE:
907 case CPU_ONLINE_FROZEN: 909 case CPU_ONLINE_FROZEN:
908 start_cpu_timer(cpu); 910 start_cpu_timer(cpu);
911 node_set_state(cpu_to_node(cpu), N_CPU);
909 break; 912 break;
910 case CPU_DOWN_PREPARE: 913 case CPU_DOWN_PREPARE:
911 case CPU_DOWN_PREPARE_FROZEN: 914 case CPU_DOWN_PREPARE_FROZEN:
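The vmstat changes are the same percpu conversion seen in slub: zone_pcp(zone, cpu) becomes per_cpu_ptr(zone->pageset, cpu) when a specific CPU's pageset is wanted, and this_cpu_ptr(zone->pageset) when the caller is already pinned to the current CPU with preemption or interrupts disabled. A short sketch of the two access forms; the function names are illustrative, while the fields and accessors come from the diff.

/* Sketch (kernel context): current-CPU access vs. per-CPU iteration. */
static void bump_local_diff(struct zone *zone, enum zone_stat_item item)
{
	/* caller has preemption or interrupts disabled */
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);

	pcp->vm_stat_diff[item]++;
}

static long sum_diff(struct zone *zone, enum zone_stat_item item)
{
	long sum = 0;
	int cpu;

	for_each_online_cpu(cpu)
		sum += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];

	return sum;
}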