aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt11
-rw-r--r--Makefile2
-rw-r--r--arch/alpha/include/asm/percpu.h100
-rw-r--r--arch/alpha/include/asm/tlbflush.h1
-rw-r--r--arch/alpha/kernel/vmlinux.lds.S9
-rw-r--r--arch/arm/kernel/vmlinux.lds.S1
-rw-r--r--arch/avr32/kernel/vmlinux.lds.S9
-rw-r--r--arch/blackfin/kernel/vmlinux.lds.S5
-rw-r--r--arch/blackfin/mm/sram-alloc.c6
-rw-r--r--arch/cris/include/asm/mmu_context.h3
-rw-r--r--arch/cris/kernel/vmlinux.lds.S9
-rw-r--r--arch/cris/mm/fault.c2
-rw-r--r--arch/frv/kernel/vmlinux.lds.S2
-rw-r--r--arch/h8300/kernel/vmlinux.lds.S5
-rw-r--r--arch/ia64/Kconfig3
-rw-r--r--arch/ia64/kernel/setup.c6
-rw-r--r--arch/ia64/kernel/smp.c3
-rw-r--r--arch/ia64/kernel/vmlinux.lds.S16
-rw-r--r--arch/ia64/sn/kernel/setup.c2
-rw-r--r--arch/m32r/kernel/vmlinux.lds.S10
-rw-r--r--arch/m68k/kernel/vmlinux-std.lds10
-rw-r--r--arch/m68k/kernel/vmlinux-sun3.lds9
-rw-r--r--arch/m68knommu/kernel/vmlinux.lds.S7
-rw-r--r--arch/microblaze/kernel/vmlinux.lds.S6
-rw-r--r--arch/mips/kernel/vmlinux.lds.S21
-rw-r--r--arch/mn10300/kernel/vmlinux.lds.S8
-rw-r--r--arch/parisc/kernel/vmlinux.lds.S8
-rw-r--r--arch/powerpc/Kconfig3
-rw-r--r--arch/powerpc/kernel/setup_64.c61
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S9
-rw-r--r--arch/powerpc/mm/stab.c2
-rw-r--r--arch/powerpc/platforms/ps3/smp.c2
-rw-r--r--arch/s390/include/asm/percpu.h32
-rw-r--r--arch/s390/kernel/vmlinux.lds.S9
-rw-r--r--arch/sh/kernel/vmlinux.lds.S10
-rw-r--r--arch/sparc/Kconfig2
-rw-r--r--arch/sparc/kernel/smp_64.c132
-rw-r--r--arch/sparc/kernel/vmlinux.lds.S8
-rw-r--r--arch/um/include/asm/common.lds.S5
-rw-r--r--arch/um/kernel/dyn.lds.S2
-rw-r--r--arch/um/kernel/uml.lds.S2
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/include/asm/percpu.h9
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c2
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c14
-rw-r--r--arch/x86/kernel/setup_percpu.c364
-rw-r--r--arch/x86/kernel/vmlinux.lds.S11
-rw-r--r--arch/x86/mm/pageattr.c21
-rw-r--r--arch/xtensa/kernel/vmlinux.lds.S13
-rw-r--r--block/as-iosched.c10
-rw-r--r--block/cfq-iosched.c10
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c12
-rw-r--r--drivers/cpufreq/cpufreq_ondemand.c15
-rw-r--r--drivers/xen/events.c13
-rw-r--r--include/asm-generic/vmlinux.lds.h24
-rw-r--r--include/linux/percpu-defs.h66
-rw-r--r--include/linux/percpu.h88
-rw-r--r--include/linux/vmalloc.h6
-rw-r--r--init/main.c24
-rw-r--r--kernel/module.c6
-rw-r--r--kernel/perf_counter.c6
-rw-r--r--kernel/sched.c4
-rw-r--r--kernel/trace/trace_events.c6
-rw-r--r--lib/Kconfig.debug15
-rw-r--r--mm/Makefile2
-rw-r--r--mm/allocpercpu.c28
-rw-r--r--mm/kmemleak-test.c6
-rw-r--r--mm/page-writeback.c5
-rw-r--r--mm/percpu.c1420
-rw-r--r--mm/quicklist.c2
-rw-r--r--mm/slub.c4
-rw-r--r--mm/vmalloc.c338
-rw-r--r--net/ipv4/syncookies.c5
-rw-r--r--net/ipv6/syncookies.c5
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/iw_stats.c2
-rw-r--r--net/rds/page.c2
-rw-r--r--scripts/module-common.lds8
80 files changed, 1910 insertions, 1228 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 7936b801fe6a..e710093e3d32 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1919,11 +1919,12 @@ and is between 256 and 4096 characters. It is defined in the file
1919 Format: { 0 | 1 } 1919 Format: { 0 | 1 }
1920 See arch/parisc/kernel/pdc_chassis.c 1920 See arch/parisc/kernel/pdc_chassis.c
1921 1921
1922 percpu_alloc= [X86] Select which percpu first chunk allocator to use. 1922 percpu_alloc= Select which percpu first chunk allocator to use.
1923 Allowed values are one of "lpage", "embed" and "4k". 1923 Currently supported values are "embed" and "page".
1924 See comments in arch/x86/kernel/setup_percpu.c for 1924 Archs may support subset or none of the selections.
1925 details on each allocator. This parameter is primarily 1925 See comments in mm/percpu.c for details on each
1926 for debugging and performance comparison. 1926 allocator. This parameter is primarily for debugging
1927 and performance comparison.
1927 1928
1928 pf. [PARIDE] 1929 pf. [PARIDE]
1929 See Documentation/blockdev/paride.txt. 1930 See Documentation/blockdev/paride.txt.
diff --git a/Makefile b/Makefile
index abcfa85f8f82..e1e7a71355d8 100644
--- a/Makefile
+++ b/Makefile
@@ -325,7 +325,7 @@ CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
325MODFLAGS = -DMODULE 325MODFLAGS = -DMODULE
326CFLAGS_MODULE = $(MODFLAGS) 326CFLAGS_MODULE = $(MODFLAGS)
327AFLAGS_MODULE = $(MODFLAGS) 327AFLAGS_MODULE = $(MODFLAGS)
328LDFLAGS_MODULE = 328LDFLAGS_MODULE = -T $(srctree)/scripts/module-common.lds
329CFLAGS_KERNEL = 329CFLAGS_KERNEL =
330AFLAGS_KERNEL = 330AFLAGS_KERNEL =
331CFLAGS_GCOV = -fprofile-arcs -ftest-coverage 331CFLAGS_GCOV = -fprofile-arcs -ftest-coverage
diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h
index b663f1f10b6a..2c12378e3aa9 100644
--- a/arch/alpha/include/asm/percpu.h
+++ b/arch/alpha/include/asm/percpu.h
@@ -1,102 +1,18 @@
1#ifndef __ALPHA_PERCPU_H 1#ifndef __ALPHA_PERCPU_H
2#define __ALPHA_PERCPU_H 2#define __ALPHA_PERCPU_H
3 3
4#include <linux/compiler.h>
5#include <linux/threads.h>
6#include <linux/percpu-defs.h>
7
8/*
9 * Determine the real variable name from the name visible in the
10 * kernel sources.
11 */
12#define per_cpu_var(var) per_cpu__##var
13
14#ifdef CONFIG_SMP
15
16/*
17 * per_cpu_offset() is the offset that has to be added to a
18 * percpu variable to get to the instance for a certain processor.
19 */
20extern unsigned long __per_cpu_offset[NR_CPUS];
21
22#define per_cpu_offset(x) (__per_cpu_offset[x])
23
24#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id())
25#ifdef CONFIG_DEBUG_PREEMPT
26#define my_cpu_offset per_cpu_offset(smp_processor_id())
27#else
28#define my_cpu_offset __my_cpu_offset
29#endif
30
31#ifndef MODULE
32#define SHIFT_PERCPU_PTR(var, offset) RELOC_HIDE(&per_cpu_var(var), (offset))
33#define PER_CPU_DEF_ATTRIBUTES
34#else
35/* 4/*
36 * To calculate addresses of locally defined variables, GCC uses 32-bit 5 * To calculate addresses of locally defined variables, GCC uses
37 * displacement from the GP. Which doesn't work for per cpu variables in 6 * 32-bit displacement from the GP. Which doesn't work for per cpu
38 * modules, as an offset to the kernel per cpu area is way above 4G. 7 * variables in modules, as an offset to the kernel per cpu area is
8 * way above 4G.
39 * 9 *
40 * This forces allocation of a GOT entry for per cpu variable using 10 * Always use weak definitions for percpu variables in modules.
41 * ldq instruction with a 'literal' relocation.
42 */
43#define SHIFT_PERCPU_PTR(var, offset) ({ \
44 extern int simple_identifier_##var(void); \
45 unsigned long __ptr, tmp_gp; \
46 asm ( "br %1, 1f \n\
47 1: ldgp %1, 0(%1) \n\
48 ldq %0, per_cpu__" #var"(%1)\t!literal" \
49 : "=&r"(__ptr), "=&r"(tmp_gp)); \
50 (typeof(&per_cpu_var(var)))(__ptr + (offset)); })
51
52#define PER_CPU_DEF_ATTRIBUTES __used
53
54#endif /* MODULE */
55
56/*
57 * A percpu variable may point to a discarded regions. The following are
58 * established ways to produce a usable pointer from the percpu variable
59 * offset.
60 */ 11 */
61#define per_cpu(var, cpu) \ 12#if defined(MODULE) && defined(CONFIG_SMP)
62 (*SHIFT_PERCPU_PTR(var, per_cpu_offset(cpu))) 13#define ARCH_NEEDS_WEAK_PER_CPU
63#define __get_cpu_var(var) \
64 (*SHIFT_PERCPU_PTR(var, my_cpu_offset))
65#define __raw_get_cpu_var(var) \
66 (*SHIFT_PERCPU_PTR(var, __my_cpu_offset))
67
68#else /* ! SMP */
69
70#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var)))
71#define __get_cpu_var(var) per_cpu_var(var)
72#define __raw_get_cpu_var(var) per_cpu_var(var)
73
74#define PER_CPU_DEF_ATTRIBUTES
75
76#endif /* SMP */
77
78#ifdef CONFIG_SMP
79#define PER_CPU_BASE_SECTION ".data.percpu"
80#else
81#define PER_CPU_BASE_SECTION ".data"
82#endif
83
84#ifdef CONFIG_SMP
85
86#ifdef MODULE
87#define PER_CPU_SHARED_ALIGNED_SECTION ""
88#else
89#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
90#endif
91#define PER_CPU_FIRST_SECTION ".first"
92
93#else
94
95#define PER_CPU_SHARED_ALIGNED_SECTION ""
96#define PER_CPU_FIRST_SECTION ""
97
98#endif 14#endif
99 15
100#define PER_CPU_ATTRIBUTES 16#include <asm-generic/percpu.h>
101 17
102#endif /* __ALPHA_PERCPU_H */ 18#endif /* __ALPHA_PERCPU_H */
diff --git a/arch/alpha/include/asm/tlbflush.h b/arch/alpha/include/asm/tlbflush.h
index 9d87aaa08c0d..e89e0c2e15b1 100644
--- a/arch/alpha/include/asm/tlbflush.h
+++ b/arch/alpha/include/asm/tlbflush.h
@@ -2,6 +2,7 @@
2#define _ALPHA_TLBFLUSH_H 2#define _ALPHA_TLBFLUSH_H
3 3
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/sched.h>
5#include <asm/compiler.h> 6#include <asm/compiler.h>
6#include <asm/pgalloc.h> 7#include <asm/pgalloc.h>
7 8
diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index b9d6568e5f7f..6dc03c35caa0 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -134,13 +134,6 @@ SECTIONS
134 __bss_stop = .; 134 __bss_stop = .;
135 _end = .; 135 _end = .;
136 136
137 /* Sections to be discarded */
138 /DISCARD/ : {
139 EXIT_TEXT
140 EXIT_DATA
141 *(.exitcall.exit)
142 }
143
144 .mdebug 0 : { 137 .mdebug 0 : {
145 *(.mdebug) 138 *(.mdebug)
146 } 139 }
@@ -150,4 +143,6 @@ SECTIONS
150 143
151 STABS_DEBUG 144 STABS_DEBUG
152 DWARF_DEBUG 145 DWARF_DEBUG
146
147 DISCARDS
153} 148}
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 69371028a202..5cc4812c9763 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -83,6 +83,7 @@ SECTIONS
83 EXIT_TEXT 83 EXIT_TEXT
84 EXIT_DATA 84 EXIT_DATA
85 *(.exitcall.exit) 85 *(.exitcall.exit)
86 *(.discard)
86 *(.ARM.exidx.exit.text) 87 *(.ARM.exidx.exit.text)
87 *(.ARM.extab.exit.text) 88 *(.ARM.extab.exit.text)
88#ifndef CONFIG_HOTPLUG_CPU 89#ifndef CONFIG_HOTPLUG_CPU
diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S
index 7910d41eb886..c4b56654349a 100644
--- a/arch/avr32/kernel/vmlinux.lds.S
+++ b/arch/avr32/kernel/vmlinux.lds.S
@@ -124,14 +124,11 @@ SECTIONS
124 _end = .; 124 _end = .;
125 } 125 }
126 126
127 DWARF_DEBUG
128
127 /* When something in the kernel is NOT compiled as a module, the module 129 /* When something in the kernel is NOT compiled as a module, the module
128 * cleanup code and data are put into these segments. Both can then be 130 * cleanup code and data are put into these segments. Both can then be
129 * thrown away, as cleanup code is never called unless it's a module. 131 * thrown away, as cleanup code is never called unless it's a module.
130 */ 132 */
131 /DISCARD/ : { 133 DISCARDS
132 EXIT_DATA
133 *(.exitcall.exit)
134 }
135
136 DWARF_DEBUG
137} 134}
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index 6ac307ca0d80..d7ffe299b979 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -277,8 +277,5 @@ SECTIONS
277 277
278 DWARF_DEBUG 278 DWARF_DEBUG
279 279
280 /DISCARD/ : 280 DISCARDS
281 {
282 *(.exitcall.exit)
283 }
284} 281}
diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c
index 0bc3c4ef0aad..99e4dbb1dfd1 100644
--- a/arch/blackfin/mm/sram-alloc.c
+++ b/arch/blackfin/mm/sram-alloc.c
@@ -42,9 +42,9 @@
42#include <asm/mem_map.h> 42#include <asm/mem_map.h>
43#include "blackfin_sram.h" 43#include "blackfin_sram.h"
44 44
45static DEFINE_PER_CPU(spinlock_t, l1sram_lock) ____cacheline_aligned_in_smp; 45static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1sram_lock);
46static DEFINE_PER_CPU(spinlock_t, l1_data_sram_lock) ____cacheline_aligned_in_smp; 46static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_data_sram_lock);
47static DEFINE_PER_CPU(spinlock_t, l1_inst_sram_lock) ____cacheline_aligned_in_smp; 47static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_inst_sram_lock);
48static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp; 48static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp;
49 49
50/* the data structure for L1 scratchpad and DATA SRAM */ 50/* the data structure for L1 scratchpad and DATA SRAM */
diff --git a/arch/cris/include/asm/mmu_context.h b/arch/cris/include/asm/mmu_context.h
index 72ba08dcfd18..1d45fd6365b7 100644
--- a/arch/cris/include/asm/mmu_context.h
+++ b/arch/cris/include/asm/mmu_context.h
@@ -17,7 +17,8 @@ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
17 * registers like cr3 on the i386 17 * registers like cr3 on the i386
18 */ 18 */
19 19
20extern volatile DEFINE_PER_CPU(pgd_t *,current_pgd); /* defined in arch/cris/mm/fault.c */ 20/* defined in arch/cris/mm/fault.c */
21DECLARE_PER_CPU(pgd_t *, current_pgd);
21 22
22static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 23static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
23{ 24{
diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S
index 0d2adfc794d4..6c81836b9229 100644
--- a/arch/cris/kernel/vmlinux.lds.S
+++ b/arch/cris/kernel/vmlinux.lds.S
@@ -140,12 +140,7 @@ SECTIONS
140 _end = .; 140 _end = .;
141 __end = .; 141 __end = .;
142 142
143 /* Sections to be discarded */
144 /DISCARD/ : {
145 EXIT_TEXT
146 EXIT_DATA
147 *(.exitcall.exit)
148 }
149
150 dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024; 143 dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024;
144
145 DISCARDS
151} 146}
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index f925115e3250..4a7cdd9ea1ee 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -29,7 +29,7 @@ extern void die_if_kernel(const char *, struct pt_regs *, long);
29 29
30/* current active page directory */ 30/* current active page directory */
31 31
32volatile DEFINE_PER_CPU(pgd_t *,current_pgd); 32DEFINE_PER_CPU(pgd_t *, current_pgd);
33unsigned long cris_signal_return_page; 33unsigned long cris_signal_return_page;
34 34
35/* 35/*
diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S
index 22d9787406ed..7dbf41f68b52 100644
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -177,6 +177,8 @@ SECTIONS
177 .debug_ranges 0 : { *(.debug_ranges) } 177 .debug_ranges 0 : { *(.debug_ranges) }
178 178
179 .comment 0 : { *(.comment) } 179 .comment 0 : { *(.comment) }
180
181 DISCARDS
180} 182}
181 183
182__kernel_image_size_no_bss = __bss_start - __kernel_image_start; 184__kernel_image_size_no_bss = __bss_start - __kernel_image_start;
diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 43a87b9085b6..662b02ecb86e 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -152,9 +152,6 @@ SECTIONS
152 __end = . ; 152 __end = . ;
153 __ramstart = .; 153 __ramstart = .;
154 } 154 }
155 /DISCARD/ : {
156 *(.exitcall.exit)
157 }
158 .romfs : 155 .romfs :
159 { 156 {
160 *(.romfs*) 157 *(.romfs*)
@@ -165,4 +162,6 @@ SECTIONS
165 COMMAND_START = . - 0x200 ; 162 COMMAND_START = . - 0x200 ;
166 __ramend = . ; 163 __ramend = . ;
167 } 164 }
165
166 DISCARDS
168} 167}
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 170042b420d4..328d2f8b8c3f 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL
89 bool 89 bool
90 default y 90 default y
91 91
92config HAVE_LEGACY_PER_CPU_AREA
93 def_bool y
94
92config HAVE_SETUP_PER_CPU_AREA 95config HAVE_SETUP_PER_CPU_AREA
93 def_bool y 96 def_bool y
94 97
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 1b23ec126b63..1de86c96801d 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -855,11 +855,17 @@ identify_cpu (struct cpuinfo_ia64 *c)
855 c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); 855 c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1));
856} 856}
857 857
858/*
859 * In UP configuration, setup_per_cpu_areas() is defined in
860 * include/linux/percpu.h
861 */
862#ifdef CONFIG_SMP
858void __init 863void __init
859setup_per_cpu_areas (void) 864setup_per_cpu_areas (void)
860{ 865{
861 /* start_kernel() requires this... */ 866 /* start_kernel() requires this... */
862} 867}
868#endif
863 869
864/* 870/*
865 * Do the following calculations: 871 * Do the following calculations:
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index f0c521b0ba4c..93ebfea43c6c 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -58,7 +58,8 @@ static struct local_tlb_flush_counts {
58 unsigned int count; 58 unsigned int count;
59} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; 59} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS];
60 60
61static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned; 61static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS],
62 shadow_flush_counts);
62 63
63#define IPI_CALL_FUNC 0 64#define IPI_CALL_FUNC 0
64#define IPI_CPU_STOP 1 65#define IPI_CPU_STOP 1
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 4a95e86b9ac2..eb4214d1c5af 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -24,14 +24,14 @@ PHDRS {
24} 24}
25SECTIONS 25SECTIONS
26{ 26{
27 /* Sections to be discarded */ 27 /* unwind exit sections must be discarded before the rest of the
28 sections get included. */
28 /DISCARD/ : { 29 /DISCARD/ : {
29 EXIT_TEXT
30 EXIT_DATA
31 *(.exitcall.exit)
32 *(.IA_64.unwind.exit.text) 30 *(.IA_64.unwind.exit.text)
33 *(.IA_64.unwind_info.exit.text) 31 *(.IA_64.unwind_info.exit.text)
34 } 32 *(.comment)
33 *(.note)
34 }
35 35
36 v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ 36 v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */
37 phys_start = _start - LOAD_OFFSET; 37 phys_start = _start - LOAD_OFFSET;
@@ -316,7 +316,7 @@ SECTIONS
316 .debug_funcnames 0 : { *(.debug_funcnames) } 316 .debug_funcnames 0 : { *(.debug_funcnames) }
317 .debug_typenames 0 : { *(.debug_typenames) } 317 .debug_typenames 0 : { *(.debug_typenames) }
318 .debug_varnames 0 : { *(.debug_varnames) } 318 .debug_varnames 0 : { *(.debug_varnames) }
319 /* These must appear regardless of . */ 319
320 /DISCARD/ : { *(.comment) } 320 /* Default discards */
321 /DISCARD/ : { *(.note) } 321 DISCARDS
322} 322}
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index e456f062f241..ece1bf994499 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -71,7 +71,7 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
71DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info); 71DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info);
72EXPORT_PER_CPU_SYMBOL(__sn_hub_info); 72EXPORT_PER_CPU_SYMBOL(__sn_hub_info);
73 73
74DEFINE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]); 74DEFINE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid);
75EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid); 75EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid);
76 76
77DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda); 77DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda);
diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S
index 4179adf6c624..de5e21cca6a5 100644
--- a/arch/m32r/kernel/vmlinux.lds.S
+++ b/arch/m32r/kernel/vmlinux.lds.S
@@ -120,13 +120,6 @@ SECTIONS
120 120
121 _end = . ; 121 _end = . ;
122 122
123 /* Sections to be discarded */
124 /DISCARD/ : {
125 EXIT_TEXT
126 EXIT_DATA
127 *(.exitcall.exit)
128 }
129
130 /* Stabs debugging sections. */ 123 /* Stabs debugging sections. */
131 .stab 0 : { *(.stab) } 124 .stab 0 : { *(.stab) }
132 .stabstr 0 : { *(.stabstr) } 125 .stabstr 0 : { *(.stabstr) }
@@ -135,4 +128,7 @@ SECTIONS
135 .stab.index 0 : { *(.stab.index) } 128 .stab.index 0 : { *(.stab.index) }
136 .stab.indexstr 0 : { *(.stab.indexstr) } 129 .stab.indexstr 0 : { *(.stab.indexstr) }
137 .comment 0 : { *(.comment) } 130 .comment 0 : { *(.comment) }
131
132 /* Sections to be discarded */
133 DISCARDS
138} 134}
diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds
index 01d212bb05a6..47eac19e8f61 100644
--- a/arch/m68k/kernel/vmlinux-std.lds
+++ b/arch/m68k/kernel/vmlinux-std.lds
@@ -82,13 +82,6 @@ SECTIONS
82 82
83 _end = . ; 83 _end = . ;
84 84
85 /* Sections to be discarded */
86 /DISCARD/ : {
87 EXIT_TEXT
88 EXIT_DATA
89 *(.exitcall.exit)
90 }
91
92 /* Stabs debugging sections. */ 85 /* Stabs debugging sections. */
93 .stab 0 : { *(.stab) } 86 .stab 0 : { *(.stab) }
94 .stabstr 0 : { *(.stabstr) } 87 .stabstr 0 : { *(.stabstr) }
@@ -97,4 +90,7 @@ SECTIONS
97 .stab.index 0 : { *(.stab.index) } 90 .stab.index 0 : { *(.stab.index) }
98 .stab.indexstr 0 : { *(.stab.indexstr) } 91 .stab.indexstr 0 : { *(.stab.indexstr) }
99 .comment 0 : { *(.comment) } 92 .comment 0 : { *(.comment) }
93
94 /* Sections to be discarded */
95 DISCARDS
100} 96}
diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds
index c192f773db96..03efaf04d7d7 100644
--- a/arch/m68k/kernel/vmlinux-sun3.lds
+++ b/arch/m68k/kernel/vmlinux-sun3.lds
@@ -77,13 +77,6 @@ __init_begin = .;
77 77
78 _end = . ; 78 _end = . ;
79 79
80 /* Sections to be discarded */
81 /DISCARD/ : {
82 EXIT_TEXT
83 EXIT_DATA
84 *(.exitcall.exit)
85 }
86
87 .crap : { 80 .crap : {
88 /* Stabs debugging sections. */ 81 /* Stabs debugging sections. */
89 *(.stab) 82 *(.stab)
@@ -96,4 +89,6 @@ __init_begin = .;
96 *(.note) 89 *(.note)
97 } 90 }
98 91
92 /* Sections to be discarded */
93 DISCARDS
99} 94}
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index b7fe505e358d..2736a5e309c0 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -184,12 +184,6 @@ SECTIONS {
184 __init_end = .; 184 __init_end = .;
185 } > INIT 185 } > INIT
186 186
187 /DISCARD/ : {
188 EXIT_TEXT
189 EXIT_DATA
190 *(.exitcall.exit)
191 }
192
193 .bss : { 187 .bss : {
194 . = ALIGN(4); 188 . = ALIGN(4);
195 _sbss = . ; 189 _sbss = . ;
@@ -200,5 +194,6 @@ SECTIONS {
200 _end = . ; 194 _end = . ;
201 } > BSS 195 } > BSS
202 196
197 DISCARDS
203} 198}
204 199
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index d34d38dcd12c..ec5fa91a48d8 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -23,8 +23,8 @@ SECTIONS {
23 _stext = . ; 23 _stext = . ;
24 *(.text .text.*) 24 *(.text .text.*)
25 *(.fixup) 25 *(.fixup)
26 26 EXIT_TEXT
27 *(.exitcall.exit) 27 EXIT_CALL
28 SCHED_TEXT 28 SCHED_TEXT
29 LOCK_TEXT 29 LOCK_TEXT
30 KPROBES_TEXT 30 KPROBES_TEXT
@@ -162,4 +162,6 @@ SECTIONS {
162 } 162 }
163 . = ALIGN(4096); 163 . = ALIGN(4096);
164 _end = .; 164 _end = .;
165
166 DISCARDS
165} 167}
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 58738c8d754f..1474c18fb777 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -176,17 +176,6 @@ SECTIONS
176 176
177 _end = . ; 177 _end = . ;
178 178
179 /* Sections to be discarded */
180 /DISCARD/ : {
181 *(.exitcall.exit)
182
183 /* ABI crap starts here */
184 *(.MIPS.options)
185 *(.options)
186 *(.pdr)
187 *(.reginfo)
188 }
189
190 /* These mark the ABI of the kernel for debuggers. */ 179 /* These mark the ABI of the kernel for debuggers. */
191 .mdebug.abi32 : { 180 .mdebug.abi32 : {
192 KEEP(*(.mdebug.abi32)) 181 KEEP(*(.mdebug.abi32))
@@ -212,4 +201,14 @@ SECTIONS
212 *(.gptab.bss) 201 *(.gptab.bss)
213 *(.gptab.sbss) 202 *(.gptab.sbss)
214 } 203 }
204
205 /* Sections to be discarded */
206 DISCARDS
207 /DISCARD/ : {
208 /* ABI crap starts here */
209 *(.MIPS.options)
210 *(.options)
211 *(.pdr)
212 *(.reginfo)
213 }
215} 214}
diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S
index f4aa07934654..76f41bdb79c4 100644
--- a/arch/mn10300/kernel/vmlinux.lds.S
+++ b/arch/mn10300/kernel/vmlinux.lds.S
@@ -115,12 +115,10 @@ SECTIONS
115 . = ALIGN(PAGE_SIZE); 115 . = ALIGN(PAGE_SIZE);
116 pg0 = .; 116 pg0 = .;
117 117
118 /* Sections to be discarded */
119 /DISCARD/ : {
120 EXIT_CALL
121 }
122
123 STABS_DEBUG 118 STABS_DEBUG
124 119
125 DWARF_DEBUG 120 DWARF_DEBUG
121
122 /* Sections to be discarded */
123 DISCARDS
126} 124}
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index fd2cc4fd2b65..aea1784edbd1 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -237,9 +237,12 @@ SECTIONS
237 /* freed after init ends here */ 237 /* freed after init ends here */
238 _end = . ; 238 _end = . ;
239 239
240 STABS_DEBUG
241 .note 0 : { *(.note) }
242
240 /* Sections to be discarded */ 243 /* Sections to be discarded */
244 DISCARDS
241 /DISCARD/ : { 245 /DISCARD/ : {
242 *(.exitcall.exit)
243#ifdef CONFIG_64BIT 246#ifdef CONFIG_64BIT
244 /* temporary hack until binutils is fixed to not emit these 247 /* temporary hack until binutils is fixed to not emit these
245 * for static binaries 248 * for static binaries
@@ -252,7 +255,4 @@ SECTIONS
252 *(.gnu.hash) 255 *(.gnu.hash)
253#endif 256#endif
254 } 257 }
255
256 STABS_DEBUG
257 .note 0 : { *(.note) }
258} 258}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d00131ca0835..2c42e1526d03 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -49,6 +49,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
49config HAVE_SETUP_PER_CPU_AREA 49config HAVE_SETUP_PER_CPU_AREA
50 def_bool PPC64 50 def_bool PPC64
51 51
52config NEED_PER_CPU_EMBED_FIRST_CHUNK
53 def_bool PPC64
54
52config IRQ_PER_CPU 55config IRQ_PER_CPU
53 bool 56 bool
54 default y 57 default y
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 1f6816003ebe..aa6e4500635f 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -57,6 +57,7 @@
57#include <asm/cache.h> 57#include <asm/cache.h>
58#include <asm/page.h> 58#include <asm/page.h>
59#include <asm/mmu.h> 59#include <asm/mmu.h>
60#include <asm/mmu-hash64.h>
60#include <asm/firmware.h> 61#include <asm/firmware.h>
61#include <asm/xmon.h> 62#include <asm/xmon.h>
62#include <asm/udbg.h> 63#include <asm/udbg.h>
@@ -569,25 +570,53 @@ void cpu_die(void)
569} 570}
570 571
571#ifdef CONFIG_SMP 572#ifdef CONFIG_SMP
572void __init setup_per_cpu_areas(void) 573#define PCPU_DYN_SIZE ()
574
575static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
573{ 576{
574 int i; 577 return __alloc_bootmem_node(NODE_DATA(cpu_to_node(cpu)), size, align,
575 unsigned long size; 578 __pa(MAX_DMA_ADDRESS));
576 char *ptr; 579}
577
578 /* Copy section for each CPU (we discard the original) */
579 size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
580#ifdef CONFIG_MODULES
581 if (size < PERCPU_ENOUGH_ROOM)
582 size = PERCPU_ENOUGH_ROOM;
583#endif
584 580
585 for_each_possible_cpu(i) { 581static void __init pcpu_fc_free(void *ptr, size_t size)
586 ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); 582{
583 free_bootmem(__pa(ptr), size);
584}
587 585
588 paca[i].data_offset = ptr - __per_cpu_start; 586static int pcpu_cpu_distance(unsigned int from, unsigned int to)
589 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 587{
590 } 588 if (cpu_to_node(from) == cpu_to_node(to))
589 return LOCAL_DISTANCE;
590 else
591 return REMOTE_DISTANCE;
592}
593
594void __init setup_per_cpu_areas(void)
595{
596 const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
597 size_t atom_size;
598 unsigned long delta;
599 unsigned int cpu;
600 int rc;
601
602 /*
603 * Linear mapping is one of 4K, 1M and 16M. For 4K, no need
604 * to group units. For larger mappings, use 1M atom which
605 * should be large enough to contain a number of units.
606 */
607 if (mmu_linear_psize == MMU_PAGE_4K)
608 atom_size = PAGE_SIZE;
609 else
610 atom_size = 1 << 20;
611
612 rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance,
613 pcpu_fc_alloc, pcpu_fc_free);
614 if (rc < 0)
615 panic("cannot initialize percpu area (err=%d)", rc);
616
617 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
618 for_each_possible_cpu(cpu)
619 paca[cpu].data_offset = delta + pcpu_unit_offsets[cpu];
591} 620}
592#endif 621#endif
593 622
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 8ef8a14abc95..244e3658983c 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -37,12 +37,6 @@ jiffies = jiffies_64 + 4;
37#endif 37#endif
38SECTIONS 38SECTIONS
39{ 39{
40 /* Sections to be discarded. */
41 /DISCARD/ : {
42 *(.exitcall.exit)
43 EXIT_DATA
44 }
45
46 . = KERNELBASE; 40 . = KERNELBASE;
47 41
48/* 42/*
@@ -298,4 +292,7 @@ SECTIONS
298 . = ALIGN(PAGE_SIZE); 292 . = ALIGN(PAGE_SIZE);
299 _end = . ; 293 _end = . ;
300 PROVIDE32 (end = .); 294 PROVIDE32 (end = .);
295
296 /* Sections to be discarded. */
297 DISCARDS
301} 298}
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 98cd1dc2ae75..6e9b69c99856 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -31,7 +31,7 @@ struct stab_entry {
31 31
32#define NR_STAB_CACHE_ENTRIES 8 32#define NR_STAB_CACHE_ENTRIES 8
33static DEFINE_PER_CPU(long, stab_cache_ptr); 33static DEFINE_PER_CPU(long, stab_cache_ptr);
34static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); 34static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
35 35
36/* 36/*
37 * Create a segment table entry for the given esid/vsid pair. 37 * Create a segment table entry for the given esid/vsid pair.
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index f6e04bcc70ef..51ffde40af2b 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -37,7 +37,7 @@
37 */ 37 */
38 38
39#define MSG_COUNT 4 39#define MSG_COUNT 4
40static DEFINE_PER_CPU(unsigned int, ps3_ipi_virqs[MSG_COUNT]); 40static DEFINE_PER_CPU(unsigned int [MSG_COUNT], ps3_ipi_virqs);
41 41
42static void do_message_pass(int target, int msg) 42static void do_message_pass(int target, int msg)
43{ 43{
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 408d60b4f75b..f7ad8719d02d 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -1,37 +1,21 @@
1#ifndef __ARCH_S390_PERCPU__ 1#ifndef __ARCH_S390_PERCPU__
2#define __ARCH_S390_PERCPU__ 2#define __ARCH_S390_PERCPU__
3 3
4#include <linux/compiler.h>
5#include <asm/lowcore.h>
6
7/* 4/*
8 * s390 uses its own implementation for per cpu data, the offset of 5 * s390 uses its own implementation for per cpu data, the offset of
9 * the cpu local data area is cached in the cpu's lowcore memory. 6 * the cpu local data area is cached in the cpu's lowcore memory.
10 * For 64 bit module code s390 forces the use of a GOT slot for the
11 * address of the per cpu variable. This is needed because the module
12 * may be more than 4G above the per cpu area.
13 */ 7 */
14#if defined(__s390x__) && defined(MODULE) 8#define __my_cpu_offset S390_lowcore.percpu_offset
15
16#define SHIFT_PERCPU_PTR(ptr,offset) (({ \
17 extern int simple_identifier_##var(void); \
18 unsigned long *__ptr; \
19 asm ( "larl %0, %1@GOTENT" \
20 : "=a" (__ptr) : "X" (ptr) ); \
21 (typeof(ptr))((*__ptr) + (offset)); }))
22
23#else
24
25#define SHIFT_PERCPU_PTR(ptr, offset) (({ \
26 extern int simple_identifier_##var(void); \
27 unsigned long __ptr; \
28 asm ( "" : "=a" (__ptr) : "0" (ptr) ); \
29 (typeof(ptr)) (__ptr + (offset)); }))
30 9
10/*
11 * For 64 bit module code, the module may be more than 4G above the
12 * per cpu area, use weak definitions to force the compiler to
13 * generate external references.
14 */
15#if defined(CONFIG_SMP) && defined(__s390x__) && defined(MODULE)
16#define ARCH_NEEDS_WEAK_PER_CPU
31#endif 17#endif
32 18
33#define __my_cpu_offset S390_lowcore.percpu_offset
34
35#include <asm-generic/percpu.h> 19#include <asm-generic/percpu.h>
36 20
37#endif /* __ARCH_S390_PERCPU__ */ 21#endif /* __ARCH_S390_PERCPU__ */
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index a53db23ee092..82415c75b996 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -157,13 +157,10 @@ SECTIONS
157 157
158 _end = . ; 158 _end = . ;
159 159
160 /* Sections to be discarded */
161 /DISCARD/ : {
162 EXIT_DATA
163 *(.exitcall.exit)
164 }
165
166 /* Debugging sections. */ 160 /* Debugging sections. */
167 STABS_DEBUG 161 STABS_DEBUG
168 DWARF_DEBUG 162 DWARF_DEBUG
163
164 /* Sections to be discarded */
165 DISCARDS
169} 166}
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index f53c76acaede..0ce254bca92f 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -163,16 +163,14 @@ SECTIONS
163 _end = . ; 163 _end = . ;
164 } 164 }
165 165
166 STABS_DEBUG
167 DWARF_DEBUG
168
166 /* 169 /*
167 * When something in the kernel is NOT compiled as a module, the 170 * When something in the kernel is NOT compiled as a module, the
168 * module cleanup code and data are put into these segments. Both 171 * module cleanup code and data are put into these segments. Both
169 * can then be thrown away, as cleanup code is never called unless 172 * can then be thrown away, as cleanup code is never called unless
170 * it's a module. 173 * it's a module.
171 */ 174 */
172 /DISCARD/ : { 175 DISCARDS
173 *(.exitcall.exit)
174 }
175
176 STABS_DEBUG
177 DWARF_DEBUG
178} 176}
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 3f8b6a92eabd..fbd1233b392d 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -95,7 +95,7 @@ config AUDIT_ARCH
95config HAVE_SETUP_PER_CPU_AREA 95config HAVE_SETUP_PER_CPU_AREA
96 def_bool y if SPARC64 96 def_bool y if SPARC64
97 97
98config HAVE_DYNAMIC_PER_CPU_AREA 98config NEED_PER_CPU_EMBED_FIRST_CHUNK
99 def_bool y if SPARC64 99 def_bool y if SPARC64
100 100
101config GENERIC_HARDIRQS_NO__DO_IRQ 101config GENERIC_HARDIRQS_NO__DO_IRQ
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 3691907a43b4..ff68373ce6d6 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1389,8 +1389,8 @@ void smp_send_stop(void)
1389 * RETURNS: 1389 * RETURNS:
1390 * Pointer to the allocated area on success, NULL on failure. 1390 * Pointer to the allocated area on success, NULL on failure.
1391 */ 1391 */
1392static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, 1392static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
1393 unsigned long align) 1393 size_t align)
1394{ 1394{
1395 const unsigned long goal = __pa(MAX_DMA_ADDRESS); 1395 const unsigned long goal = __pa(MAX_DMA_ADDRESS);
1396#ifdef CONFIG_NEED_MULTIPLE_NODES 1396#ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -1415,127 +1415,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
1415#endif 1415#endif
1416} 1416}
1417 1417
1418static size_t pcpur_size __initdata; 1418static void __init pcpu_free_bootmem(void *ptr, size_t size)
1419static void **pcpur_ptrs __initdata;
1420
1421static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
1422{ 1419{
1423 size_t off = (size_t)pageno << PAGE_SHIFT; 1420 free_bootmem(__pa(ptr), size);
1424
1425 if (off >= pcpur_size)
1426 return NULL;
1427
1428 return virt_to_page(pcpur_ptrs[cpu] + off);
1429} 1421}
1430 1422
1431#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL) 1423static int pcpu_cpu_distance(unsigned int from, unsigned int to)
1432
1433static void __init pcpu_map_range(unsigned long start, unsigned long end,
1434 struct page *page)
1435{ 1424{
1436 unsigned long pfn = page_to_pfn(page); 1425 if (cpu_to_node(from) == cpu_to_node(to))
1437 unsigned long pte_base; 1426 return LOCAL_DISTANCE;
1438 1427 else
1439 BUG_ON((pfn<<PAGE_SHIFT)&(PCPU_CHUNK_SIZE - 1UL)); 1428 return REMOTE_DISTANCE;
1440
1441 pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
1442 _PAGE_CP_4U | _PAGE_CV_4U |
1443 _PAGE_P_4U | _PAGE_W_4U);
1444 if (tlb_type == hypervisor)
1445 pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
1446 _PAGE_CP_4V | _PAGE_CV_4V |
1447 _PAGE_P_4V | _PAGE_W_4V);
1448
1449 while (start < end) {
1450 pgd_t *pgd = pgd_offset_k(start);
1451 unsigned long this_end;
1452 pud_t *pud;
1453 pmd_t *pmd;
1454 pte_t *pte;
1455
1456 pud = pud_offset(pgd, start);
1457 if (pud_none(*pud)) {
1458 pmd_t *new;
1459
1460 new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
1461 pud_populate(&init_mm, pud, new);
1462 }
1463
1464 pmd = pmd_offset(pud, start);
1465 if (!pmd_present(*pmd)) {
1466 pte_t *new;
1467
1468 new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
1469 pmd_populate_kernel(&init_mm, pmd, new);
1470 }
1471
1472 pte = pte_offset_kernel(pmd, start);
1473 this_end = (start + PMD_SIZE) & PMD_MASK;
1474 if (this_end > end)
1475 this_end = end;
1476
1477 while (start < this_end) {
1478 unsigned long paddr = pfn << PAGE_SHIFT;
1479
1480 pte_val(*pte) = (paddr | pte_base);
1481
1482 start += PAGE_SIZE;
1483 pte++;
1484 pfn++;
1485 }
1486 }
1487} 1429}
1488 1430
1489void __init setup_per_cpu_areas(void) 1431void __init setup_per_cpu_areas(void)
1490{ 1432{
1491 size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; 1433 unsigned long delta;
1492 static struct vm_struct vm; 1434 unsigned int cpu;
1493 unsigned long delta, cpu; 1435 int rc;
1494 size_t pcpu_unit_size;
1495 size_t ptrs_size;
1496
1497 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
1498 PERCPU_DYNAMIC_RESERVE);
1499 dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
1500
1501 1436
1502 ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0])); 1437 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
1503 pcpur_ptrs = alloc_bootmem(ptrs_size); 1438 PERCPU_DYNAMIC_RESERVE, 4 << 20,
1504 1439 pcpu_cpu_distance, pcpu_alloc_bootmem,
1505 for_each_possible_cpu(cpu) { 1440 pcpu_free_bootmem);
1506 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, 1441 if (rc)
1507 PCPU_CHUNK_SIZE); 1442 panic("failed to initialize first chunk (%d)", rc);
1508
1509 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
1510 PCPU_CHUNK_SIZE - pcpur_size);
1511
1512 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
1513 }
1514
1515 /* allocate address and map */
1516 vm.flags = VM_ALLOC;
1517 vm.size = nr_cpu_ids * PCPU_CHUNK_SIZE;
1518 vm_area_register_early(&vm, PCPU_CHUNK_SIZE);
1519
1520 for_each_possible_cpu(cpu) {
1521 unsigned long start = (unsigned long) vm.addr;
1522 unsigned long end;
1523
1524 start += cpu * PCPU_CHUNK_SIZE;
1525 end = start + PCPU_CHUNK_SIZE;
1526 pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu]));
1527 }
1528
1529 pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
1530 PERCPU_MODULE_RESERVE, dyn_size,
1531 PCPU_CHUNK_SIZE, vm.addr, NULL);
1532
1533 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
1534 1443
1535 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 1444 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1536 for_each_possible_cpu(cpu) { 1445 for_each_possible_cpu(cpu)
1537 __per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; 1446 __per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
1538 }
1539 1447
1540 /* Setup %g5 for the boot cpu. */ 1448 /* Setup %g5 for the boot cpu. */
1541 __local_per_cpu_offset = __per_cpu_offset(smp_processor_id()); 1449 __local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index fcbbd000ec08..866390feb683 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -171,12 +171,8 @@ SECTIONS
171 } 171 }
172 _end = . ; 172 _end = . ;
173 173
174 /DISCARD/ : {
175 EXIT_TEXT
176 EXIT_DATA
177 *(.exitcall.exit)
178 }
179
180 STABS_DEBUG 174 STABS_DEBUG
181 DWARF_DEBUG 175 DWARF_DEBUG
176
177 DISCARDS
182} 178}
diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S
index cb0248616d49..37ecc5577a9a 100644
--- a/arch/um/include/asm/common.lds.S
+++ b/arch/um/include/asm/common.lds.S
@@ -123,8 +123,3 @@
123 __initramfs_end = .; 123 __initramfs_end = .;
124 } 124 }
125 125
126 /* Sections to be discarded */
127 /DISCARD/ : {
128 *(.exitcall.exit)
129 }
130
diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S
index 9975e1ab44fb..715a188c0472 100644
--- a/arch/um/kernel/dyn.lds.S
+++ b/arch/um/kernel/dyn.lds.S
@@ -156,4 +156,6 @@ SECTIONS
156 STABS_DEBUG 156 STABS_DEBUG
157 157
158 DWARF_DEBUG 158 DWARF_DEBUG
159
160 DISCARDS
159} 161}
diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S
index 11b835248b86..2ebd39765db8 100644
--- a/arch/um/kernel/uml.lds.S
+++ b/arch/um/kernel/uml.lds.S
@@ -100,4 +100,6 @@ SECTIONS
100 STABS_DEBUG 100 STABS_DEBUG
101 101
102 DWARF_DEBUG 102 DWARF_DEBUG
103
104 DISCARDS
103} 105}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5df37d7..869d7d301448 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -150,7 +150,10 @@ config ARCH_HAS_CACHE_LINE_SIZE
150config HAVE_SETUP_PER_CPU_AREA 150config HAVE_SETUP_PER_CPU_AREA
151 def_bool y 151 def_bool y
152 152
153config HAVE_DYNAMIC_PER_CPU_AREA 153config NEED_PER_CPU_EMBED_FIRST_CHUNK
154 def_bool y
155
156config NEED_PER_CPU_PAGE_FIRST_CHUNK
154 def_bool y 157 def_bool y
155 158
156config HAVE_CPUMASK_OF_CPU_MAP 159config HAVE_CPUMASK_OF_CPU_MAP
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 103f1ddb0d85..a18c038a3079 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -156,15 +156,6 @@ do { \
156/* We can use this directly for local CPU (faster). */ 156/* We can use this directly for local CPU (faster). */
157DECLARE_PER_CPU(unsigned long, this_cpu_off); 157DECLARE_PER_CPU(unsigned long, this_cpu_off);
158 158
159#ifdef CONFIG_NEED_MULTIPLE_NODES
160void *pcpu_lpage_remapped(void *kaddr);
161#else
162static inline void *pcpu_lpage_remapped(void *kaddr)
163{
164 return NULL;
165}
166#endif
167
168#endif /* !__ASSEMBLY__ */ 159#endif /* !__ASSEMBLY__ */
169 160
170#ifdef CONFIG_SMP 161#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 6b2a52dd0403..dca325c03999 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,8 +30,8 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
36 36
37static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1cfb623ce11c..14ce5d49b2ad 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1091,7 +1091,7 @@ void mce_log_therm_throt_event(__u64 status)
1091 */ 1091 */
1092static int check_interval = 5 * 60; /* 5 minutes */ 1092static int check_interval = 5 * 60; /* 5 minutes */
1093 1093
1094static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1094static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1095static DEFINE_PER_CPU(struct timer_list, mce_timer); 1095static DEFINE_PER_CPU(struct timer_list, mce_timer);
1096 1096
1097static void mcheck_timer(unsigned long data) 1097static void mcheck_timer(unsigned long data)
@@ -1110,7 +1110,7 @@ static void mcheck_timer(unsigned long data)
1110 * Alert userspace if needed. If we logged an MCE, reduce the 1110 * Alert userspace if needed. If we logged an MCE, reduce the
1111 * polling interval, otherwise increase the polling interval. 1111 * polling interval, otherwise increase the polling interval.
1112 */ 1112 */
1113 n = &__get_cpu_var(next_interval); 1113 n = &__get_cpu_var(mce_next_interval);
1114 if (mce_notify_irq()) 1114 if (mce_notify_irq())
1115 *n = max(*n/2, HZ/100); 1115 *n = max(*n/2, HZ/100);
1116 else 1116 else
@@ -1311,7 +1311,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1311static void mce_init_timer(void) 1311static void mce_init_timer(void)
1312{ 1312{
1313 struct timer_list *t = &__get_cpu_var(mce_timer); 1313 struct timer_list *t = &__get_cpu_var(mce_timer);
1314 int *n = &__get_cpu_var(next_interval); 1314 int *n = &__get_cpu_var(mce_next_interval);
1315 1315
1316 if (mce_ignore_ce) 1316 if (mce_ignore_ce)
1317 return; 1317 return;
@@ -1912,7 +1912,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1912 case CPU_DOWN_FAILED: 1912 case CPU_DOWN_FAILED:
1913 case CPU_DOWN_FAILED_FROZEN: 1913 case CPU_DOWN_FAILED_FROZEN:
1914 t->expires = round_jiffies(jiffies + 1914 t->expires = round_jiffies(jiffies +
1915 __get_cpu_var(next_interval)); 1915 __get_cpu_var(mce_next_interval));
1916 add_timer_on(t, cpu); 1916 add_timer_on(t, cpu);
1917 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1917 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1918 break; 1918 break;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ddae21620bda..bd2a2fa84628 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -69,7 +69,7 @@ struct threshold_bank {
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
73 73
74#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
75static unsigned char shared_bank[NR_BANKS] = { 75static unsigned char shared_bank[NR_BANKS] = {
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 900332b800f8..3d4ebbd2e129 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -976,7 +976,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
976 x86_pmu_disable_counter(hwc, idx); 976 x86_pmu_disable_counter(hwc, idx);
977} 977}
978 978
979static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); 979static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
980 980
981/* 981/*
982 * Set the next IRQ period, based on the hwc->period_left value. 982 * Set the next IRQ period, based on the hwc->period_left value.
@@ -1015,7 +1015,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1015 if (left > x86_pmu.max_period) 1015 if (left > x86_pmu.max_period)
1016 left = x86_pmu.max_period; 1016 left = x86_pmu.max_period;
1017 1017
1018 per_cpu(prev_left[idx], smp_processor_id()) = left; 1018 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1019 1019
1020 /* 1020 /*
1021 * The hw counter starts counting from this counter offset, 1021 * The hw counter starts counting from this counter offset,
@@ -1211,7 +1211,7 @@ void perf_counter_print_debug(void)
1211 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1211 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1212 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1212 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1213 1213
1214 prev_left = per_cpu(prev_left[idx], cpu); 1214 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1215 1215
1216 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", 1216 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1217 cpu, idx, pmc_ctrl); 1217 cpu, idx, pmc_ctrl);
@@ -1798,8 +1798,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1798 entry->ip[entry->nr++] = ip; 1798 entry->ip[entry->nr++] = ip;
1799} 1799}
1800 1800
1801static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 1801static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
1802static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 1802static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
1803static DEFINE_PER_CPU(int, in_nmi_frame); 1803static DEFINE_PER_CPU(int, in_nmi_frame);
1804 1804
1805 1805
@@ -1952,9 +1952,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1952 struct perf_callchain_entry *entry; 1952 struct perf_callchain_entry *entry;
1953 1953
1954 if (in_nmi()) 1954 if (in_nmi())
1955 entry = &__get_cpu_var(nmi_entry); 1955 entry = &__get_cpu_var(pmc_nmi_entry);
1956 else 1956 else
1957 entry = &__get_cpu_var(irq_entry); 1957 entry = &__get_cpu_var(pmc_irq_entry);
1958 1958
1959 entry->nr = 0; 1959 entry->nr = 0;
1960 1960
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 07d81916f212..d559af913e1f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
55#define PERCPU_FIRST_CHUNK_RESERVE 0 55#define PERCPU_FIRST_CHUNK_RESERVE 0
56#endif 56#endif
57 57
58#ifdef CONFIG_X86_32
58/** 59/**
59 * pcpu_need_numa - determine percpu allocation needs to consider NUMA 60 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
60 * 61 *
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
83#endif 84#endif
84 return false; 85 return false;
85} 86}
87#endif
86 88
87/** 89/**
88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu 90 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 126}
125 127
126/* 128/*
127 * Large page remap allocator 129 * Helpers for first chunk memory allocation
128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 130 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 131static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
140struct pcpul_ent {
141 unsigned int cpu;
142 void *ptr;
143};
144
145static size_t pcpul_size;
146static struct pcpul_ent *pcpul_map;
147static struct vm_struct pcpul_vm;
148
149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
150{ 132{
151 size_t off = (size_t)pageno << PAGE_SHIFT; 133 return pcpu_alloc_bootmem(cpu, size, align);
152
153 if (off >= pcpul_size)
154 return NULL;
155
156 return virt_to_page(pcpul_map[cpu].ptr + off);
157} 134}
158 135
159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 136static void __init pcpu_fc_free(void *ptr, size_t size)
160{ 137{
161 size_t map_size, dyn_size; 138 free_bootmem(__pa(ptr), size);
162 unsigned int cpu;
163 int i, j;
164 ssize_t ret;
165
166 if (!chosen) {
167 size_t vm_size = VMALLOC_END - VMALLOC_START;
168 size_t tot_size = nr_cpu_ids * PMD_SIZE;
169
170 /* on non-NUMA, embedding is better */
171 if (!pcpu_need_numa())
172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181
182 /* need PSE */
183 if (!cpu_has_pse) {
184 pr_warning("PERCPU: lpage allocator requires PSE\n");
185 return -EINVAL;
186 }
187
188 /*
189 * Currently supports only single page. Supporting multiple
190 * pages won't be too difficult if it ever becomes necessary.
191 */
192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
193 PERCPU_DYNAMIC_RESERVE);
194 if (pcpul_size > PMD_SIZE) {
195 pr_warning("PERCPU: static data is larger than large page, "
196 "can't use large page\n");
197 return -EINVAL;
198 }
199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
200
201 /* allocate pointer array and alloc large pages */
202 map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
203 pcpul_map = alloc_bootmem(map_size);
204
205 for_each_possible_cpu(cpu) {
206 pcpul_map[cpu].cpu = cpu;
207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
212 goto enomem;
213 }
214
215 /*
216 * Only use pcpul_size bytes and give back the rest.
217 *
218 * Ingo: The 2MB up-rounding bootmem is needed to make
219 * sure the partial 2MB page is still fully RAM - it's
220 * not well-specified to have a PAT-incompatible area
221 * (unmapped RAM, device memory, etc.) in that hole.
222 */
223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
224 PMD_SIZE - pcpul_size);
225
226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
227 }
228
229 /* allocate address and map */
230 pcpul_vm.flags = VM_ALLOC;
231 pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
232 vm_area_register_early(&pcpul_vm, PMD_SIZE);
233
234 for_each_possible_cpu(cpu) {
235 pmd_t *pmd, pmd_v;
236
237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
238 cpu * PMD_SIZE);
239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
242 }
243
244 /* we're ready, commit */
245 pr_info("PERCPU: Remapped at %p with large pages, static data "
246 "%zu bytes\n", pcpul_vm.addr, static_size);
247
248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
250 PMD_SIZE, pcpul_vm.addr, NULL);
251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < nr_cpu_ids - 1; i++)
254 for (j = i + 1; j < nr_cpu_ids; j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
262
263enomem:
264 for_each_possible_cpu(cpu)
265 if (pcpul_map[cpu].ptr)
266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
267 free_bootmem(__pa(pcpul_map), map_size);
268 return -ENOMEM;
269} 139}
270 140
271/** 141static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
273 * @kaddr: the kernel address in question
274 *
275 * Determine whether @kaddr falls in the pcpul recycled area. This is
276 * used by pageattr to detect VM aliases and break up the pcpu PMD
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{ 142{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); 143#ifdef CONFIG_NEED_MULTIPLE_NODES
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; 144 if (early_cpu_to_node(from) == early_cpu_to_node(to))
291 int left = 0, right = nr_cpu_ids - 1; 145 return LOCAL_DISTANCE;
292 int pos; 146 else
293 147 return REMOTE_DISTANCE;
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 }
313 }
314
315 return NULL;
316}
317#else 148#else
318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 149 return LOCAL_DISTANCE;
319{
320 return -EINVAL;
321}
322#endif 150#endif
323
324/*
325 * Embedding allocator
326 *
327 * The first chunk is sized to just contain the static area plus
328 * module and dynamic reserves and embedded into linear physical
329 * mapping so that it can use PMD mapping without additional TLB
330 * pressure.
331 */
332static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
333{
334 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
335
336 /*
337 * If large page isn't supported, there's no benefit in doing
338 * this. Also, embedding allocation doesn't play well with
339 * NUMA.
340 */
341 if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
342 return -EINVAL;
343
344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
345 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
346} 151}
347 152
348/* 153static void __init pcpup_populate_pte(unsigned long addr)
349 * 4k page allocator
350 *
351 * This is the basic allocator. Static percpu area is allocated
352 * page-by-page and most of initialization is done by the generic
353 * setup function.
354 */
355static struct page **pcpu4k_pages __initdata;
356static int pcpu4k_nr_static_pages __initdata;
357
358static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
359{
360 if (pageno < pcpu4k_nr_static_pages)
361 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
362 return NULL;
363}
364
365static void __init pcpu4k_populate_pte(unsigned long addr)
366{ 154{
367 populate_extra_pte(addr); 155 populate_extra_pte(addr);
368} 156}
369 157
370static ssize_t __init setup_pcpu_4k(size_t static_size)
371{
372 size_t pages_size;
373 unsigned int cpu;
374 int i, j;
375 ssize_t ret;
376
377 pcpu4k_nr_static_pages = PFN_UP(static_size);
378
379 /* unaligned allocations can't be freed, round up to page size */
380 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
381 * sizeof(pcpu4k_pages[0]));
382 pcpu4k_pages = alloc_bootmem(pages_size);
383
384 /* allocate and copy */
385 j = 0;
386 for_each_possible_cpu(cpu)
387 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
388 void *ptr;
389
390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
394 goto enomem;
395 }
396
397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
398 pcpu4k_pages[j++] = virt_to_page(ptr);
399 }
400
401 /* we're ready, commit */
402 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
403 pcpu4k_nr_static_pages, static_size);
404
405 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
406 PERCPU_FIRST_CHUNK_RESERVE, -1,
407 -1, NULL, pcpu4k_populate_pte);
408 goto out_free_ar;
409
410enomem:
411 while (--j >= 0)
412 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
413 ret = -ENOMEM;
414out_free_ar:
415 free_bootmem(__pa(pcpu4k_pages), pages_size);
416 return ret;
417}
418
419/* for explicit first chunk allocator selection */
420static char pcpu_chosen_alloc[16] __initdata;
421
422static int __init percpu_alloc_setup(char *str)
423{
424 strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
425 return 0;
426}
427early_param("percpu_alloc", percpu_alloc_setup);
428
429static inline void setup_percpu_segment(int cpu) 158static inline void setup_percpu_segment(int cpu)
430{ 159{
431#ifdef CONFIG_X86_32 160#ifdef CONFIG_X86_32
@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu)
441 170
442void __init setup_per_cpu_areas(void) 171void __init setup_per_cpu_areas(void)
443{ 172{
444 size_t static_size = __per_cpu_end - __per_cpu_start;
445 unsigned int cpu; 173 unsigned int cpu;
446 unsigned long delta; 174 unsigned long delta;
447 size_t pcpu_unit_size; 175 int rc;
448 ssize_t ret;
449 176
450 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 177 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
451 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 178 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
452 179
453 /* 180 /*
454 * Allocate percpu area. If PSE is supported, try to make use 181 * Allocate percpu area. Embedding allocator is our favorite;
455 * of large page mappings. Please read comments on top of 182 * however, on NUMA configurations, it can result in very
456 * each allocator for details. 183 * sparse unit mapping and vmalloc area isn't spacious enough
184 * on 32bit. Use page in that case.
457 */ 185 */
458 ret = -EINVAL; 186#ifdef CONFIG_X86_32
459 if (strlen(pcpu_chosen_alloc)) { 187 if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
460 if (strcmp(pcpu_chosen_alloc, "4k")) { 188 pcpu_chosen_fc = PCPU_FC_PAGE;
461 if (!strcmp(pcpu_chosen_alloc, "lpage")) 189#endif
462 ret = setup_pcpu_lpage(static_size, true); 190 rc = -EINVAL;
463 else if (!strcmp(pcpu_chosen_alloc, "embed")) 191 if (pcpu_chosen_fc != PCPU_FC_PAGE) {
464 ret = setup_pcpu_embed(static_size, true); 192 const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
465 else 193 const size_t dyn_size = PERCPU_MODULE_RESERVE +
466 pr_warning("PERCPU: unknown allocator %s " 194 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
467 "specified\n", pcpu_chosen_alloc); 195
468 if (ret < 0) 196 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
469 pr_warning("PERCPU: %s allocator failed (%zd), " 197 dyn_size, atom_size,
470 "falling back to 4k\n", 198 pcpu_cpu_distance,
471 pcpu_chosen_alloc, ret); 199 pcpu_fc_alloc, pcpu_fc_free);
472 } 200 if (rc < 0)
473 } else { 201 pr_warning("PERCPU: %s allocator failed (%d), "
474 ret = setup_pcpu_lpage(static_size, false); 202 "falling back to page size\n",
475 if (ret < 0) 203 pcpu_fc_names[pcpu_chosen_fc], rc);
476 ret = setup_pcpu_embed(static_size, false);
477 } 204 }
478 if (ret < 0) 205 if (rc < 0)
479 ret = setup_pcpu_4k(static_size); 206 rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
480 if (ret < 0) 207 pcpu_fc_alloc, pcpu_fc_free,
481 panic("cannot allocate static percpu area (%zu bytes, err=%zd)", 208 pcpup_populate_pte);
482 static_size, ret); 209 if (rc < 0)
483 210 panic("cannot initialize percpu area (err=%d)", rc);
484 pcpu_unit_size = ret;
485 211
486 /* alrighty, percpu areas up and running */ 212 /* alrighty, percpu areas up and running */
487 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 213 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
488 for_each_possible_cpu(cpu) { 214 for_each_possible_cpu(cpu) {
489 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; 215 per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
490 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 216 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
491 per_cpu(cpu_number, cpu) = cpu; 217 per_cpu(cpu_number, cpu) = cpu;
492 setup_percpu_segment(cpu); 218 setup_percpu_segment(cpu);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 78d185d797de..bbf4fd044d07 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -380,15 +380,12 @@ SECTIONS
380 _end = .; 380 _end = .;
381 } 381 }
382 382
383 /* Sections to be discarded */
384 /DISCARD/ : {
385 *(.exitcall.exit)
386 *(.eh_frame)
387 *(.discard)
388 }
389
390 STABS_DEBUG 383 STABS_DEBUG
391 DWARF_DEBUG 384 DWARF_DEBUG
385
386 /* Sections to be discarded */
387 DISCARDS
388 /DISCARD/ : { *(.eh_frame) }
392} 389}
393 390
394 391
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7e600c1962db..f53cfc7f963d 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/pfn.h> 14#include <linux/pfn.h>
15#include <linux/percpu.h>
15 16
16#include <asm/e820.h> 17#include <asm/e820.h>
17#include <asm/processor.h> 18#include <asm/processor.h>
@@ -686,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
686{ 687{
687 struct cpa_data alias_cpa; 688 struct cpa_data alias_cpa;
688 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); 689 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
689 unsigned long vaddr, remapped; 690 unsigned long vaddr;
690 int ret; 691 int ret;
691 692
692 if (cpa->pfn >= max_pfn_mapped) 693 if (cpa->pfn >= max_pfn_mapped)
@@ -744,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa)
744 } 745 }
745#endif 746#endif
746 747
747 /*
748 * If the PMD page was partially used for per-cpu remapping,
749 * the recycled area needs to be split and modified. Because
750 * the area is always proper subset of a PMD page
751 * cpa->numpages is guaranteed to be 1 for these areas, so
752 * there's no need to loop over and check for further remaps.
753 */
754 remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
755 if (remapped) {
756 WARN_ON(cpa->numpages > 1);
757 alias_cpa = *cpa;
758 alias_cpa.vaddr = &remapped;
759 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
760 ret = __change_page_attr_set_clr(&alias_cpa, 0);
761 if (ret)
762 return ret;
763 }
764
765 return 0; 748 return 0;
766} 749}
767 750
diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S
index 41c159cd872f..921b6ff3b645 100644
--- a/arch/xtensa/kernel/vmlinux.lds.S
+++ b/arch/xtensa/kernel/vmlinux.lds.S
@@ -280,15 +280,6 @@ SECTIONS
280 *(.ResetVector.text) 280 *(.ResetVector.text)
281 } 281 }
282 282
283 /* Sections to be discarded */
284 /DISCARD/ :
285 {
286 *(.exit.literal)
287 EXIT_TEXT
288 EXIT_DATA
289 *(.exitcall.exit)
290 }
291
292 .xt.lit : { *(.xt.lit) } 283 .xt.lit : { *(.xt.lit) }
293 .xt.prop : { *(.xt.prop) } 284 .xt.prop : { *(.xt.prop) }
294 285
@@ -321,4 +312,8 @@ SECTIONS
321 *(.xt.lit) 312 *(.xt.lit)
322 *(.gnu.linkonce.p*) 313 *(.gnu.linkonce.p*)
323 } 314 }
315
316 /* Sections to be discarded */
317 DISCARDS
318 /DISCARD/ : { *(.exit.literal) }
324} 319}
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 7a12cf6ee1d3..ce8ba57c6557 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -146,7 +146,7 @@ enum arq_state {
146#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) 146#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2)
147#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) 147#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state)
148 148
149static DEFINE_PER_CPU(unsigned long, ioc_count); 149static DEFINE_PER_CPU(unsigned long, as_ioc_count);
150static struct completion *ioc_gone; 150static struct completion *ioc_gone;
151static DEFINE_SPINLOCK(ioc_gone_lock); 151static DEFINE_SPINLOCK(ioc_gone_lock);
152 152
@@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad);
161static void free_as_io_context(struct as_io_context *aic) 161static void free_as_io_context(struct as_io_context *aic)
162{ 162{
163 kfree(aic); 163 kfree(aic);
164 elv_ioc_count_dec(ioc_count); 164 elv_ioc_count_dec(as_ioc_count);
165 if (ioc_gone) { 165 if (ioc_gone) {
166 /* 166 /*
167 * AS scheduler is exiting, grab exit lock and check 167 * AS scheduler is exiting, grab exit lock and check
@@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic)
169 * complete ioc_gone and set it back to NULL. 169 * complete ioc_gone and set it back to NULL.
170 */ 170 */
171 spin_lock(&ioc_gone_lock); 171 spin_lock(&ioc_gone_lock);
172 if (ioc_gone && !elv_ioc_count_read(ioc_count)) { 172 if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) {
173 complete(ioc_gone); 173 complete(ioc_gone);
174 ioc_gone = NULL; 174 ioc_gone = NULL;
175 } 175 }
@@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void)
211 ret->seek_total = 0; 211 ret->seek_total = 0;
212 ret->seek_samples = 0; 212 ret->seek_samples = 0;
213 ret->seek_mean = 0; 213 ret->seek_mean = 0;
214 elv_ioc_count_inc(ioc_count); 214 elv_ioc_count_inc(as_ioc_count);
215 } 215 }
216 216
217 return ret; 217 return ret;
@@ -1507,7 +1507,7 @@ static void __exit as_exit(void)
1507 ioc_gone = &all_gone; 1507 ioc_gone = &all_gone;
1508 /* ioc_gone's update must be visible before reading ioc_count */ 1508 /* ioc_gone's update must be visible before reading ioc_count */
1509 smp_wmb(); 1509 smp_wmb();
1510 if (elv_ioc_count_read(ioc_count)) 1510 if (elv_ioc_count_read(as_ioc_count))
1511 wait_for_completion(&all_gone); 1511 wait_for_completion(&all_gone);
1512 synchronize_rcu(); 1512 synchronize_rcu();
1513} 1513}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fd7080ed7935..1b2d12cda43e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125;
48static struct kmem_cache *cfq_pool; 48static struct kmem_cache *cfq_pool;
49static struct kmem_cache *cfq_ioc_pool; 49static struct kmem_cache *cfq_ioc_pool;
50 50
51static DEFINE_PER_CPU(unsigned long, ioc_count); 51static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
52static struct completion *ioc_gone; 52static struct completion *ioc_gone;
53static DEFINE_SPINLOCK(ioc_gone_lock); 53static DEFINE_SPINLOCK(ioc_gone_lock);
54 54
@@ -1427,7 +1427,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head)
1427 cic = container_of(head, struct cfq_io_context, rcu_head); 1427 cic = container_of(head, struct cfq_io_context, rcu_head);
1428 1428
1429 kmem_cache_free(cfq_ioc_pool, cic); 1429 kmem_cache_free(cfq_ioc_pool, cic);
1430 elv_ioc_count_dec(ioc_count); 1430 elv_ioc_count_dec(cfq_ioc_count);
1431 1431
1432 if (ioc_gone) { 1432 if (ioc_gone) {
1433 /* 1433 /*
@@ -1436,7 +1436,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head)
1436 * complete ioc_gone and set it back to NULL 1436 * complete ioc_gone and set it back to NULL
1437 */ 1437 */
1438 spin_lock(&ioc_gone_lock); 1438 spin_lock(&ioc_gone_lock);
1439 if (ioc_gone && !elv_ioc_count_read(ioc_count)) { 1439 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
1440 complete(ioc_gone); 1440 complete(ioc_gone);
1441 ioc_gone = NULL; 1441 ioc_gone = NULL;
1442 } 1442 }
@@ -1562,7 +1562,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1562 INIT_HLIST_NODE(&cic->cic_list); 1562 INIT_HLIST_NODE(&cic->cic_list);
1563 cic->dtor = cfq_free_io_context; 1563 cic->dtor = cfq_free_io_context;
1564 cic->exit = cfq_exit_io_context; 1564 cic->exit = cfq_exit_io_context;
1565 elv_ioc_count_inc(ioc_count); 1565 elv_ioc_count_inc(cfq_ioc_count);
1566 } 1566 }
1567 1567
1568 return cic; 1568 return cic;
@@ -2668,7 +2668,7 @@ static void __exit cfq_exit(void)
2668 * this also protects us from entering cfq_slab_kill() with 2668 * this also protects us from entering cfq_slab_kill() with
2669 * pending RCU callbacks 2669 * pending RCU callbacks
2670 */ 2670 */
2671 if (elv_ioc_count_read(ioc_count)) 2671 if (elv_ioc_count_read(cfq_ioc_count))
2672 wait_for_completion(&all_gone); 2672 wait_for_completion(&all_gone);
2673 cfq_slab_kill(); 2673 cfq_slab_kill();
2674} 2674}
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index bdea7e2f94ba..bc33ddc9c97c 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -71,7 +71,7 @@ struct cpu_dbs_info_s {
71 */ 71 */
72 struct mutex timer_mutex; 72 struct mutex timer_mutex;
73}; 73};
74static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); 74static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);
75 75
76static unsigned int dbs_enable; /* number of CPUs using this policy */ 76static unsigned int dbs_enable; /* number of CPUs using this policy */
77 77
@@ -137,7 +137,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
137 void *data) 137 void *data)
138{ 138{
139 struct cpufreq_freqs *freq = data; 139 struct cpufreq_freqs *freq = data;
140 struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, 140 struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info,
141 freq->cpu); 141 freq->cpu);
142 142
143 struct cpufreq_policy *policy; 143 struct cpufreq_policy *policy;
@@ -297,7 +297,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
297 /* we need to re-evaluate prev_cpu_idle */ 297 /* we need to re-evaluate prev_cpu_idle */
298 for_each_online_cpu(j) { 298 for_each_online_cpu(j) {
299 struct cpu_dbs_info_s *dbs_info; 299 struct cpu_dbs_info_s *dbs_info;
300 dbs_info = &per_cpu(cpu_dbs_info, j); 300 dbs_info = &per_cpu(cs_cpu_dbs_info, j);
301 dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 301 dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
302 &dbs_info->prev_cpu_wall); 302 &dbs_info->prev_cpu_wall);
303 if (dbs_tuners_ins.ignore_nice) 303 if (dbs_tuners_ins.ignore_nice)
@@ -387,7 +387,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
387 cputime64_t cur_wall_time, cur_idle_time; 387 cputime64_t cur_wall_time, cur_idle_time;
388 unsigned int idle_time, wall_time; 388 unsigned int idle_time, wall_time;
389 389
390 j_dbs_info = &per_cpu(cpu_dbs_info, j); 390 j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
391 391
392 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); 392 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
393 393
@@ -521,7 +521,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
521 unsigned int j; 521 unsigned int j;
522 int rc; 522 int rc;
523 523
524 this_dbs_info = &per_cpu(cpu_dbs_info, cpu); 524 this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu);
525 525
526 switch (event) { 526 switch (event) {
527 case CPUFREQ_GOV_START: 527 case CPUFREQ_GOV_START:
@@ -538,7 +538,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
538 538
539 for_each_cpu(j, policy->cpus) { 539 for_each_cpu(j, policy->cpus) {
540 struct cpu_dbs_info_s *j_dbs_info; 540 struct cpu_dbs_info_s *j_dbs_info;
541 j_dbs_info = &per_cpu(cpu_dbs_info, j); 541 j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
542 j_dbs_info->cur_policy = policy; 542 j_dbs_info->cur_policy = policy;
543 543
544 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 544 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index d6ba14276bb1..d7a528c80de8 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -78,7 +78,7 @@ struct cpu_dbs_info_s {
78 */ 78 */
79 struct mutex timer_mutex; 79 struct mutex timer_mutex;
80}; 80};
81static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); 81static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);
82 82
83static unsigned int dbs_enable; /* number of CPUs using this policy */ 83static unsigned int dbs_enable; /* number of CPUs using this policy */
84 84
@@ -149,7 +149,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
149 unsigned int freq_hi, freq_lo; 149 unsigned int freq_hi, freq_lo;
150 unsigned int index = 0; 150 unsigned int index = 0;
151 unsigned int jiffies_total, jiffies_hi, jiffies_lo; 151 unsigned int jiffies_total, jiffies_hi, jiffies_lo;
152 struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu); 152 struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
153 policy->cpu);
153 154
154 if (!dbs_info->freq_table) { 155 if (!dbs_info->freq_table) {
155 dbs_info->freq_lo = 0; 156 dbs_info->freq_lo = 0;
@@ -192,7 +193,7 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
192 193
193static void ondemand_powersave_bias_init_cpu(int cpu) 194static void ondemand_powersave_bias_init_cpu(int cpu)
194{ 195{
195 struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu); 196 struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
196 dbs_info->freq_table = cpufreq_frequency_get_table(cpu); 197 dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
197 dbs_info->freq_lo = 0; 198 dbs_info->freq_lo = 0;
198} 199}
@@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
297 /* we need to re-evaluate prev_cpu_idle */ 298 /* we need to re-evaluate prev_cpu_idle */
298 for_each_online_cpu(j) { 299 for_each_online_cpu(j) {
299 struct cpu_dbs_info_s *dbs_info; 300 struct cpu_dbs_info_s *dbs_info;
300 dbs_info = &per_cpu(cpu_dbs_info, j); 301 dbs_info = &per_cpu(od_cpu_dbs_info, j);
301 dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 302 dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
302 &dbs_info->prev_cpu_wall); 303 &dbs_info->prev_cpu_wall);
303 if (dbs_tuners_ins.ignore_nice) 304 if (dbs_tuners_ins.ignore_nice)
@@ -388,7 +389,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
388 unsigned int load, load_freq; 389 unsigned int load, load_freq;
389 int freq_avg; 390 int freq_avg;
390 391
391 j_dbs_info = &per_cpu(cpu_dbs_info, j); 392 j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
392 393
393 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); 394 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
394 395
@@ -535,7 +536,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
535 unsigned int j; 536 unsigned int j;
536 int rc; 537 int rc;
537 538
538 this_dbs_info = &per_cpu(cpu_dbs_info, cpu); 539 this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
539 540
540 switch (event) { 541 switch (event) {
541 case CPUFREQ_GOV_START: 542 case CPUFREQ_GOV_START:
@@ -553,7 +554,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
553 dbs_enable++; 554 dbs_enable++;
554 for_each_cpu(j, policy->cpus) { 555 for_each_cpu(j, policy->cpus) {
555 struct cpu_dbs_info_s *j_dbs_info; 556 struct cpu_dbs_info_s *j_dbs_info;
556 j_dbs_info = &per_cpu(cpu_dbs_info, j); 557 j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
557 j_dbs_info->cur_policy = policy; 558 j_dbs_info->cur_policy = policy;
558 559
559 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 560 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index abad71b1632b..2f57276e87a2 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -47,10 +47,10 @@
47static DEFINE_SPINLOCK(irq_mapping_update_lock); 47static DEFINE_SPINLOCK(irq_mapping_update_lock);
48 48
49/* IRQ <-> VIRQ mapping. */ 49/* IRQ <-> VIRQ mapping. */
50static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; 50static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
51 51
52/* IRQ <-> IPI mapping */ 52/* IRQ <-> IPI mapping */
53static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; 53static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
54 54
55/* Interrupt types. */ 55/* Interrupt types. */
56enum xen_irq_type { 56enum xen_irq_type {
@@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
602 return IRQ_HANDLED; 602 return IRQ_HANDLED;
603} 603}
604 604
605static DEFINE_PER_CPU(unsigned, xed_nesting_count);
606
605/* 607/*
606 * Search the CPUs pending events bitmasks. For each one found, map 608 * Search the CPUs pending events bitmasks. For each one found, map
607 * the event number to an irq, and feed it into do_IRQ() for 609 * the event number to an irq, and feed it into do_IRQ() for
@@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
617 struct pt_regs *old_regs = set_irq_regs(regs); 619 struct pt_regs *old_regs = set_irq_regs(regs);
618 struct shared_info *s = HYPERVISOR_shared_info; 620 struct shared_info *s = HYPERVISOR_shared_info;
619 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); 621 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
620 static DEFINE_PER_CPU(unsigned, nesting_count);
621 unsigned count; 622 unsigned count;
622 623
623 exit_idle(); 624 exit_idle();
@@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
628 629
629 vcpu_info->evtchn_upcall_pending = 0; 630 vcpu_info->evtchn_upcall_pending = 0;
630 631
631 if (__get_cpu_var(nesting_count)++) 632 if (__get_cpu_var(xed_nesting_count)++)
632 goto out; 633 goto out;
633 634
634#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ 635#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
@@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
653 654
654 BUG_ON(!irqs_disabled()); 655 BUG_ON(!irqs_disabled());
655 656
656 count = __get_cpu_var(nesting_count); 657 count = __get_cpu_var(xed_nesting_count);
657 __get_cpu_var(nesting_count) = 0; 658 __get_cpu_var(xed_nesting_count) = 0;
658 } while(count != 1); 659 } while(count != 1);
659 660
660out: 661out:
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 6ad76bf5fb40..a43223af98b6 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -33,13 +33,10 @@
33 * BSS_SECTION(0, 0, 0) 33 * BSS_SECTION(0, 0, 0)
34 * _end = .; 34 * _end = .;
35 * 35 *
36 * /DISCARD/ : {
37 * EXIT_TEXT
38 * EXIT_DATA
39 * EXIT_CALL
40 * }
41 * STABS_DEBUG 36 * STABS_DEBUG
42 * DWARF_DEBUG 37 * DWARF_DEBUG
38 *
39 * DISCARDS // must be the last
43 * } 40 * }
44 * 41 *
45 * [__init_begin, __init_end] is the init section that may be freed after init 42 * [__init_begin, __init_end] is the init section that may be freed after init
@@ -626,6 +623,23 @@
626#define INIT_RAM_FS 623#define INIT_RAM_FS
627#endif 624#endif
628 625
626/*
627 * Default discarded sections.
628 *
629 * Some archs want to discard exit text/data at runtime rather than
630 * link time due to cross-section references such as alt instructions,
631 * bug table, eh_frame, etc. DISCARDS must be the last of output
632 * section definitions so that such archs put those in earlier section
633 * definitions.
634 */
635#define DISCARDS \
636 /DISCARD/ : { \
637 EXIT_TEXT \
638 EXIT_DATA \
639 EXIT_CALL \
640 *(.discard) \
641 }
642
629/** 643/**
630 * PERCPU_VADDR - define output section for percpu area 644 * PERCPU_VADDR - define output section for percpu area
631 * @vaddr: explicit base address (optional) 645 * @vaddr: explicit base address (optional)
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 68438e18fff4..aefc2f12b48c 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -10,22 +10,70 @@
10/* 10/*
11 * Base implementations of per-CPU variable declarations and definitions, where 11 * Base implementations of per-CPU variable declarations and definitions, where
12 * the section in which the variable is to be placed is provided by the 12 * the section in which the variable is to be placed is provided by the
13 * 'section' argument. This may be used to affect the parameters governing the 13 * 'sec' argument. This may be used to affect the parameters governing the
14 * variable's storage. 14 * variable's storage.
15 * 15 *
16 * NOTE! The sections for the DECLARE and for the DEFINE must match, lest 16 * NOTE! The sections for the DECLARE and for the DEFINE must match, lest
17 * linkage errors occur due the compiler generating the wrong code to access 17 * linkage errors occur due the compiler generating the wrong code to access
18 * that section. 18 * that section.
19 */ 19 */
20#define DECLARE_PER_CPU_SECTION(type, name, section) \ 20#define __PCPU_ATTRS(sec) \
21 extern \ 21 __attribute__((section(PER_CPU_BASE_SECTION sec))) \
22 __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ 22 PER_CPU_ATTRIBUTES
23 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name 23
24 24#define __PCPU_DUMMY_ATTRS \
25#define DEFINE_PER_CPU_SECTION(type, name, section) \ 25 __attribute__((section(".discard"), unused))
26 __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ 26
27 PER_CPU_ATTRIBUTES PER_CPU_DEF_ATTRIBUTES \ 27/*
28 * s390 and alpha modules require percpu variables to be defined as
29 * weak to force the compiler to generate GOT based external
30 * references for them. This is necessary because percpu sections
31 * will be located outside of the usually addressable area.
32 *
33 * This definition puts the following two extra restrictions when
34 * defining percpu variables.
35 *
36 * 1. The symbol must be globally unique, even the static ones.
37 * 2. Static percpu variables cannot be defined inside a function.
38 *
39 * Archs which need weak percpu definitions should define
40 * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary.
41 *
42 * To ensure that the generic code observes the above two
43 * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak
44 * definition is used for all cases.
45 */
46#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU)
47/*
48 * __pcpu_scope_* dummy variable is used to enforce scope. It
49 * receives the static modifier when it's used in front of
50 * DEFINE_PER_CPU() and will trigger build failure if
51 * DECLARE_PER_CPU() is used for the same variable.
52 *
53 * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness
54 * such that hidden weak symbol collision, which will cause unrelated
55 * variables to share the same address, can be detected during build.
56 */
57#define DECLARE_PER_CPU_SECTION(type, name, sec) \
58 extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \
59 extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
60
61#define DEFINE_PER_CPU_SECTION(type, name, sec) \
62 __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \
63 __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \
64 __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES __weak \
65 __typeof__(type) per_cpu__##name
66#else
67/*
68 * Normal declaration and definition macros.
69 */
70#define DECLARE_PER_CPU_SECTION(type, name, sec) \
71 extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
72
73#define DEFINE_PER_CPU_SECTION(type, name, sec) \
74 __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES \
28 __typeof__(type) per_cpu__##name 75 __typeof__(type) per_cpu__##name
76#endif
29 77
30/* 78/*
31 * Variant on the per-CPU variable declaration/definition theme used for 79 * Variant on the per-CPU variable declaration/definition theme used for
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 26fd9d12f050..878836ca999c 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -34,7 +34,7 @@
34 34
35#ifdef CONFIG_SMP 35#ifdef CONFIG_SMP
36 36
37#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 37#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
38 38
39/* minimum unit size, also is the maximum supported allocation size */ 39/* minimum unit size, also is the maximum supported allocation size */
40#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) 40#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10)
@@ -57,19 +57,70 @@
57#endif 57#endif
58 58
59extern void *pcpu_base_addr; 59extern void *pcpu_base_addr;
60extern const unsigned long *pcpu_unit_offsets;
60 61
61typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); 62struct pcpu_group_info {
62typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); 63 int nr_units; /* aligned # of units */
64 unsigned long base_offset; /* base address offset */
65 unsigned int *cpu_map; /* unit->cpu map, empty
66 * entries contain NR_CPUS */
67};
68
69struct pcpu_alloc_info {
70 size_t static_size;
71 size_t reserved_size;
72 size_t dyn_size;
73 size_t unit_size;
74 size_t atom_size;
75 size_t alloc_size;
76 size_t __ai_size; /* internal, don't use */
77 int nr_groups; /* 0 if grouping unnecessary */
78 struct pcpu_group_info groups[];
79};
63 80
64extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 81enum pcpu_fc {
65 size_t static_size, size_t reserved_size, 82 PCPU_FC_AUTO,
66 ssize_t dyn_size, ssize_t unit_size, 83 PCPU_FC_EMBED,
67 void *base_addr, 84 PCPU_FC_PAGE,
68 pcpu_populate_pte_fn_t populate_pte_fn);
69 85
70extern ssize_t __init pcpu_embed_first_chunk( 86 PCPU_FC_NR,
71 size_t static_size, size_t reserved_size, 87};
72 ssize_t dyn_size, ssize_t unit_size); 88extern const char *pcpu_fc_names[PCPU_FC_NR];
89
90extern enum pcpu_fc pcpu_chosen_fc;
91
92typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size,
93 size_t align);
94typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
95typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
96typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
97
98extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
99 int nr_units);
100extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai);
101
102extern struct pcpu_alloc_info * __init pcpu_build_alloc_info(
103 size_t reserved_size, ssize_t dyn_size,
104 size_t atom_size,
105 pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
106
107extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
108 void *base_addr);
109
110#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
111extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
112 size_t atom_size,
113 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
114 pcpu_fc_alloc_fn_t alloc_fn,
115 pcpu_fc_free_fn_t free_fn);
116#endif
117
118#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
119extern int __init pcpu_page_first_chunk(size_t reserved_size,
120 pcpu_fc_alloc_fn_t alloc_fn,
121 pcpu_fc_free_fn_t free_fn,
122 pcpu_fc_populate_pte_fn_t populate_pte_fn);
123#endif
73 124
74/* 125/*
75 * Use this to get to a cpu's version of the per-cpu object 126 * Use this to get to a cpu's version of the per-cpu object
@@ -80,7 +131,7 @@ extern ssize_t __init pcpu_embed_first_chunk(
80 131
81extern void *__alloc_reserved_percpu(size_t size, size_t align); 132extern void *__alloc_reserved_percpu(size_t size, size_t align);
82 133
83#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 134#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
84 135
85struct percpu_data { 136struct percpu_data {
86 void *ptrs[1]; 137 void *ptrs[1];
@@ -99,11 +150,15 @@ struct percpu_data {
99 (__typeof__(ptr))__p->ptrs[(cpu)]; \ 150 (__typeof__(ptr))__p->ptrs[(cpu)]; \
100}) 151})
101 152
102#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 153#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
103 154
104extern void *__alloc_percpu(size_t size, size_t align); 155extern void *__alloc_percpu(size_t size, size_t align);
105extern void free_percpu(void *__pdata); 156extern void free_percpu(void *__pdata);
106 157
158#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
159extern void __init setup_per_cpu_areas(void);
160#endif
161
107#else /* CONFIG_SMP */ 162#else /* CONFIG_SMP */
108 163
109#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) 164#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
@@ -124,6 +179,13 @@ static inline void free_percpu(void *p)
124 kfree(p); 179 kfree(p);
125} 180}
126 181
182static inline void __init setup_per_cpu_areas(void) { }
183
184static inline void *pcpu_lpage_remapped(void *kaddr)
185{
186 return NULL;
187}
188
127#endif /* CONFIG_SMP */ 189#endif /* CONFIG_SMP */
128 190
129#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ 191#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index a43ebec3a7b9..227c2a585e4f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -115,4 +115,10 @@ extern rwlock_t vmlist_lock;
115extern struct vm_struct *vmlist; 115extern struct vm_struct *vmlist;
116extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); 116extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
117 117
118struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
119 const size_t *sizes, int nr_vms,
120 size_t align, gfp_t gfp_mask);
121
122void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
123
118#endif /* _LINUX_VMALLOC_H */ 124#endif /* _LINUX_VMALLOC_H */
diff --git a/init/main.c b/init/main.c
index 2d9d6bdfe7c9..2f9544d8435a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -353,7 +353,6 @@ static void __init smp_init(void)
353#define smp_init() do { } while (0) 353#define smp_init() do { } while (0)
354#endif 354#endif
355 355
356static inline void setup_per_cpu_areas(void) { }
357static inline void setup_nr_cpu_ids(void) { } 356static inline void setup_nr_cpu_ids(void) { }
358static inline void smp_prepare_cpus(unsigned int maxcpus) { } 357static inline void smp_prepare_cpus(unsigned int maxcpus) { }
359 358
@@ -374,29 +373,6 @@ static void __init setup_nr_cpu_ids(void)
374 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; 373 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
375} 374}
376 375
377#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
378unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
379
380EXPORT_SYMBOL(__per_cpu_offset);
381
382static void __init setup_per_cpu_areas(void)
383{
384 unsigned long size, i;
385 char *ptr;
386 unsigned long nr_possible_cpus = num_possible_cpus();
387
388 /* Copy section for each CPU (we discard the original) */
389 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
390 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
391
392 for_each_possible_cpu(i) {
393 __per_cpu_offset[i] = ptr - __per_cpu_start;
394 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
395 ptr += size;
396 }
397}
398#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
399
400/* Called by boot processor to activate the rest. */ 376/* Called by boot processor to activate the rest. */
401static void __init smp_init(void) 377static void __init smp_init(void)
402{ 378{
diff --git a/kernel/module.c b/kernel/module.c
index fd1411403558..3a4db71ea494 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module);
364 364
365#ifdef CONFIG_SMP 365#ifdef CONFIG_SMP
366 366
367#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 367#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
368 368
369static void *percpu_modalloc(unsigned long size, unsigned long align, 369static void *percpu_modalloc(unsigned long size, unsigned long align,
370 const char *name) 370 const char *name)
@@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme)
389 free_percpu(freeme); 389 free_percpu(freeme);
390} 390}
391 391
392#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 392#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
393 393
394/* Number of blocks used and allocated. */ 394/* Number of blocks used and allocated. */
395static unsigned int pcpu_num_used, pcpu_num_allocated; 395static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -535,7 +535,7 @@ static int percpu_modinit(void)
535} 535}
536__initcall(percpu_modinit); 536__initcall(percpu_modinit);
537 537
538#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 538#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
539 539
540static unsigned int find_pcpusec(Elf_Ehdr *hdr, 540static unsigned int find_pcpusec(Elf_Ehdr *hdr,
541 Elf_Shdr *sechdrs, 541 Elf_Shdr *sechdrs,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 534e20d14d63..b0bdb36ccfc8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -100,16 +100,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader,
100 100
101void __weak perf_counter_print_debug(void) { } 101void __weak perf_counter_print_debug(void) { }
102 102
103static DEFINE_PER_CPU(int, disable_count); 103static DEFINE_PER_CPU(int, perf_disable_count);
104 104
105void __perf_disable(void) 105void __perf_disable(void)
106{ 106{
107 __get_cpu_var(disable_count)++; 107 __get_cpu_var(perf_disable_count)++;
108} 108}
109 109
110bool __perf_enable(void) 110bool __perf_enable(void)
111{ 111{
112 return !--__get_cpu_var(disable_count); 112 return !--__get_cpu_var(perf_disable_count);
113} 113}
114 114
115void perf_disable(void) 115void perf_disable(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..d3d7e7694da6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -318,12 +318,12 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 318/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 320/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 321static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq);
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 322#endif /* CONFIG_FAIR_GROUP_SCHED */
323 323
324#ifdef CONFIG_RT_GROUP_SCHED 324#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 326static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
327#endif /* CONFIG_RT_GROUP_SCHED */ 327#endif /* CONFIG_RT_GROUP_SCHED */
328#else /* !CONFIG_USER_SCHED */ 328#else /* !CONFIG_USER_SCHED */
329#define root_task_group init_task_group 329#define root_task_group init_task_group
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e75276a49cf5..0db0a41e0079 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1334,7 +1334,7 @@ static __init void event_trace_self_tests(void)
1334 1334
1335#ifdef CONFIG_FUNCTION_TRACER 1335#ifdef CONFIG_FUNCTION_TRACER
1336 1336
1337static DEFINE_PER_CPU(atomic_t, test_event_disable); 1337static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1338 1338
1339static void 1339static void
1340function_test_events_call(unsigned long ip, unsigned long parent_ip) 1340function_test_events_call(unsigned long ip, unsigned long parent_ip)
@@ -1350,7 +1350,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1350 pc = preempt_count(); 1350 pc = preempt_count();
1351 resched = ftrace_preempt_disable(); 1351 resched = ftrace_preempt_disable();
1352 cpu = raw_smp_processor_id(); 1352 cpu = raw_smp_processor_id();
1353 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1353 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1354 1354
1355 if (disabled != 1) 1355 if (disabled != 1)
1356 goto out; 1356 goto out;
@@ -1368,7 +1368,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1368 trace_nowake_buffer_unlock_commit(event, flags, pc); 1368 trace_nowake_buffer_unlock_commit(event, flags, pc);
1369 1369
1370 out: 1370 out:
1371 atomic_dec(&per_cpu(test_event_disable, cpu)); 1371 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1372 ftrace_preempt_enable(resched); 1372 ftrace_preempt_enable(resched);
1373} 1373}
1374 1374
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 12327b2bb785..43173c4e0ade 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -790,6 +790,21 @@ config DEBUG_BLOCK_EXT_DEVT
790 790
791 Say N if you are unsure. 791 Say N if you are unsure.
792 792
793config DEBUG_FORCE_WEAK_PER_CPU
794 bool "Force weak per-cpu definitions"
795 depends on DEBUG_KERNEL
796 help
797 s390 and alpha require percpu variables in modules to be
798 defined weak to work around addressing range issue which
799 puts the following two restrictions on percpu variable
800 definitions.
801
802 1. percpu symbols must be unique whether static or not
803 2. percpu variables can't be defined inside a function
804
805 To ensure that generic code follows the above rules, this
806 option forces all percpu variables to be defined as weak.
807
793config LKDTM 808config LKDTM
794 tristate "Linux Kernel Dump Test Tool Module" 809 tristate "Linux Kernel Dump Test Tool Module"
795 depends on DEBUG_KERNEL 810 depends on DEBUG_KERNEL
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..c77c6487552f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
34obj-$(CONFIG_FS_XIP) += filemap_xip.o 34obj-$(CONFIG_FS_XIP) += filemap_xip.o
35obj-$(CONFIG_MIGRATION) += migrate.o 35obj-$(CONFIG_MIGRATION) += migrate.o
36ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 36ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
37obj-$(CONFIG_SMP) += percpu.o 37obj-$(CONFIG_SMP) += percpu.o
38else 38else
39obj-$(CONFIG_SMP) += allocpercpu.o 39obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index dfdee6a47359..df34ceae0c67 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -5,6 +5,8 @@
5 */ 5 */
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/bootmem.h>
9#include <asm/sections.h>
8 10
9#ifndef cache_line_size 11#ifndef cache_line_size
10#define cache_line_size() L1_CACHE_BYTES 12#define cache_line_size() L1_CACHE_BYTES
@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
147 kfree(__percpu_disguise(__pdata)); 149 kfree(__percpu_disguise(__pdata));
148} 150}
149EXPORT_SYMBOL_GPL(free_percpu); 151EXPORT_SYMBOL_GPL(free_percpu);
152
153/*
154 * Generic percpu area setup.
155 */
156#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
157unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
158
159EXPORT_SYMBOL(__per_cpu_offset);
160
161void __init setup_per_cpu_areas(void)
162{
163 unsigned long size, i;
164 char *ptr;
165 unsigned long nr_possible_cpus = num_possible_cpus();
166
167 /* Copy section for each CPU (we discard the original) */
168 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
169 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
170
171 for_each_possible_cpu(i) {
172 __per_cpu_offset[i] = ptr - __per_cpu_start;
173 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
174 ptr += size;
175 }
176}
177#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index d5292fc6f523..177a5169bbde 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -36,7 +36,7 @@ struct test_node {
36}; 36};
37 37
38static LIST_HEAD(test_list); 38static LIST_HEAD(test_list);
39static DEFINE_PER_CPU(void *, test_pointer); 39static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
40 40
41/* 41/*
42 * Some very simple testing. This function needs to be extended for 42 * Some very simple testing. This function needs to be extended for
@@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void)
86 } 86 }
87 87
88 for_each_possible_cpu(i) { 88 for_each_possible_cpu(i) {
89 per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); 89 per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
90 pr_info("kmemleak: kmalloc(129) = %p\n", 90 pr_info("kmemleak: kmalloc(129) = %p\n",
91 per_cpu(test_pointer, i)); 91 per_cpu(kmemleak_test_pointer, i));
92 } 92 }
93 93
94 return 0; 94 return 0;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..997186c0b519 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -610,6 +610,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
610 } 610 }
611} 611}
612 612
613static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
614
613/** 615/**
614 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 616 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
615 * @mapping: address_space which was dirtied 617 * @mapping: address_space which was dirtied
@@ -627,7 +629,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
627void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 629void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
628 unsigned long nr_pages_dirtied) 630 unsigned long nr_pages_dirtied)
629{ 631{
630 static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
631 unsigned long ratelimit; 632 unsigned long ratelimit;
632 unsigned long *p; 633 unsigned long *p;
633 634
@@ -640,7 +641,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
640 * tasks in balance_dirty_pages(). Period. 641 * tasks in balance_dirty_pages(). Period.
641 */ 642 */
642 preempt_disable(); 643 preempt_disable();
643 p = &__get_cpu_var(ratelimits); 644 p = &__get_cpu_var(bdp_ratelimits);
644 *p += nr_pages_dirtied; 645 *p += nr_pages_dirtied;
645 if (unlikely(*p >= ratelimit)) { 646 if (unlikely(*p >= ratelimit)) {
646 *p = 0; 647 *p = 0;
diff --git a/mm/percpu.c b/mm/percpu.c
index 3311c8919f37..43d8cacfdaa5 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk is consisted of nr_cpu_ids units and the first chunk is used 11 * chunk is consisted of boot-time determined number of units and the
12 * for static percpu variables in the kernel image (special boot time 12 * first chunk is used for static percpu variables in the kernel image
13 * alloc/init handling necessary as these areas need to be brought up 13 * (special boot time alloc/init handling necessary as these areas
14 * before allocation services are running). Unit grows as necessary 14 * need to be brought up before allocation services are running).
15 * and all units grow or shrink in unison. When a chunk is filled up, 15 * Unit grows as necessary and all units grow or shrink in unison.
16 * another chunk is allocated. ie. in vmalloc area 16 * When a chunk is filled up, another chunk is allocated. ie. in
17 * vmalloc area
17 * 18 *
18 * c0 c1 c2 19 * c0 c1 c2
19 * ------------------- ------------------- ------------ 20 * ------------------- ------------------- ------------
@@ -22,11 +23,13 @@
22 * 23 *
23 * Allocation is done in offset-size areas of single unit space. Ie, 24 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 25 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 26 * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
26 * percpu base registers pcpu_unit_size apart. 27 * cpus. On NUMA, the mapping can be non-linear and even sparse.
28 * Percpu access can be done by configuring percpu base registers
29 * according to cpu to unit mapping and pcpu_unit_size.
27 * 30 *
28 * There are usually many small percpu allocations many of them as 31 * There are usually many small percpu allocations many of them being
29 * small as 4 bytes. The allocator organizes chunks into lists 32 * as small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one. 33 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is 34 * Each chunk keeps the maximum contiguous area size hint which is
32 * guaranteed to be eqaul to or larger than the maximum contiguous 35 * guaranteed to be eqaul to or larger than the maximum contiguous
@@ -43,7 +46,7 @@
43 * 46 *
44 * To use this allocator, arch code should do the followings. 47 * To use this allocator, arch code should do the followings.
45 * 48 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 49 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
47 * 50 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 51 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back if they need to be 52 * regular address to percpu pointer and back if they need to be
@@ -55,7 +58,9 @@
55 58
56#include <linux/bitmap.h> 59#include <linux/bitmap.h>
57#include <linux/bootmem.h> 60#include <linux/bootmem.h>
61#include <linux/err.h>
58#include <linux/list.h> 62#include <linux/list.h>
63#include <linux/log2.h>
59#include <linux/mm.h> 64#include <linux/mm.h>
60#include <linux/module.h> 65#include <linux/module.h>
61#include <linux/mutex.h> 66#include <linux/mutex.h>
@@ -89,25 +94,38 @@ struct pcpu_chunk {
89 struct list_head list; /* linked to pcpu_slot lists */ 94 struct list_head list; /* linked to pcpu_slot lists */
90 int free_size; /* free bytes in the chunk */ 95 int free_size; /* free bytes in the chunk */
91 int contig_hint; /* max contiguous size hint */ 96 int contig_hint; /* max contiguous size hint */
92 struct vm_struct *vm; /* mapped vmalloc region */ 97 void *base_addr; /* base address of this chunk */
93 int map_used; /* # of map entries used */ 98 int map_used; /* # of map entries used */
94 int map_alloc; /* # of map entries allocated */ 99 int map_alloc; /* # of map entries allocated */
95 int *map; /* allocation map */ 100 int *map; /* allocation map */
101 struct vm_struct **vms; /* mapped vmalloc regions */
96 bool immutable; /* no [de]population allowed */ 102 bool immutable; /* no [de]population allowed */
97 struct page **page; /* points to page array */ 103 unsigned long populated[]; /* populated bitmap */
98 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
99}; 104};
100 105
101static int pcpu_unit_pages __read_mostly; 106static int pcpu_unit_pages __read_mostly;
102static int pcpu_unit_size __read_mostly; 107static int pcpu_unit_size __read_mostly;
103static int pcpu_chunk_size __read_mostly; 108static int pcpu_nr_units __read_mostly;
109static int pcpu_atom_size __read_mostly;
104static int pcpu_nr_slots __read_mostly; 110static int pcpu_nr_slots __read_mostly;
105static size_t pcpu_chunk_struct_size __read_mostly; 111static size_t pcpu_chunk_struct_size __read_mostly;
106 112
113/* cpus with the lowest and highest unit numbers */
114static unsigned int pcpu_first_unit_cpu __read_mostly;
115static unsigned int pcpu_last_unit_cpu __read_mostly;
116
107/* the address of the first chunk which starts with the kernel static area */ 117/* the address of the first chunk which starts with the kernel static area */
108void *pcpu_base_addr __read_mostly; 118void *pcpu_base_addr __read_mostly;
109EXPORT_SYMBOL_GPL(pcpu_base_addr); 119EXPORT_SYMBOL_GPL(pcpu_base_addr);
110 120
121static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
122const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
123
124/* group information, used for vm allocation */
125static int pcpu_nr_groups __read_mostly;
126static const unsigned long *pcpu_group_offsets __read_mostly;
127static const size_t *pcpu_group_sizes __read_mostly;
128
111/* 129/*
112 * The first chunk which always exists. Note that unlike other 130 * The first chunk which always exists. Note that unlike other
113 * chunks, this one can be allocated and mapped in several different 131 * chunks, this one can be allocated and mapped in several different
@@ -129,9 +147,9 @@ static int pcpu_reserved_chunk_limit;
129 * Synchronization rules. 147 * Synchronization rules.
130 * 148 *
131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 149 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
132 * protects allocation/reclaim paths, chunks and chunk->page arrays. 150 * protects allocation/reclaim paths, chunks, populated bitmap and
133 * The latter is a spinlock and protects the index data structures - 151 * vmalloc mapping. The latter is a spinlock and protects the index
134 * chunk slots, chunks and area maps in chunks. 152 * data structures - chunk slots, chunks and area maps in chunks.
135 * 153 *
136 * During allocation, pcpu_alloc_mutex is kept locked all the time and 154 * During allocation, pcpu_alloc_mutex is kept locked all the time and
137 * pcpu_lock is grabbed and released as necessary. All actual memory 155 * pcpu_lock is grabbed and released as necessary. All actual memory
@@ -178,31 +196,23 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
178 196
179static int pcpu_page_idx(unsigned int cpu, int page_idx) 197static int pcpu_page_idx(unsigned int cpu, int page_idx)
180{ 198{
181 return cpu * pcpu_unit_pages + page_idx; 199 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
182}
183
184static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
185 unsigned int cpu, int page_idx)
186{
187 return &chunk->page[pcpu_page_idx(cpu, page_idx)];
188} 200}
189 201
190static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 202static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
191 unsigned int cpu, int page_idx) 203 unsigned int cpu, int page_idx)
192{ 204{
193 return (unsigned long)chunk->vm->addr + 205 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
194 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); 206 (page_idx << PAGE_SHIFT);
195} 207}
196 208
197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 209static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
198 int page_idx) 210 unsigned int cpu, int page_idx)
199{ 211{
200 /* 212 /* must not be used on pre-mapped chunk */
201 * Any possible cpu id can be used here, so there's no need to 213 WARN_ON(chunk->immutable);
202 * worry about preemption or cpu hotplug. 214
203 */ 215 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
204 return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(),
205 page_idx) != NULL;
206} 216}
207 217
208/* set the pointer to a chunk in a page struct */ 218/* set the pointer to a chunk in a page struct */
@@ -217,6 +227,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
217 return (struct pcpu_chunk *)page->index; 227 return (struct pcpu_chunk *)page->index;
218} 228}
219 229
230static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
231{
232 *rs = find_next_zero_bit(chunk->populated, end, *rs);
233 *re = find_next_bit(chunk->populated, end, *rs + 1);
234}
235
236static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
237{
238 *rs = find_next_bit(chunk->populated, end, *rs);
239 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
240}
241
242/*
243 * (Un)populated page region iterators. Iterate over (un)populated
244 * page regions betwen @start and @end in @chunk. @rs and @re should
245 * be integer variables and will be set to start and end page index of
246 * the current region.
247 */
248#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
249 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
250 (rs) < (re); \
251 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
252
253#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
254 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
255 (rs) < (re); \
256 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
257
220/** 258/**
221 * pcpu_mem_alloc - allocate memory 259 * pcpu_mem_alloc - allocate memory
222 * @size: bytes to allocate 260 * @size: bytes to allocate
@@ -292,10 +330,10 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
292 */ 330 */
293static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 331static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
294{ 332{
295 void *first_start = pcpu_first_chunk->vm->addr; 333 void *first_start = pcpu_first_chunk->base_addr;
296 334
297 /* is it in the first chunk? */ 335 /* is it in the first chunk? */
298 if (addr >= first_start && addr < first_start + pcpu_chunk_size) { 336 if (addr >= first_start && addr < first_start + pcpu_unit_size) {
299 /* is it in the reserved area? */ 337 /* is it in the reserved area? */
300 if (addr < first_start + pcpu_reserved_chunk_limit) 338 if (addr < first_start + pcpu_reserved_chunk_limit)
301 return pcpu_reserved_chunk; 339 return pcpu_reserved_chunk;
@@ -309,7 +347,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
309 * space. Note that any possible cpu id can be used here, so 347 * space. Note that any possible cpu id can be used here, so
310 * there's no need to worry about preemption or cpu hotplug. 348 * there's no need to worry about preemption or cpu hotplug.
311 */ 349 */
312 addr += raw_smp_processor_id() * pcpu_unit_size; 350 addr += pcpu_unit_offsets[raw_smp_processor_id()];
313 return pcpu_get_page_chunk(vmalloc_to_page(addr)); 351 return pcpu_get_page_chunk(vmalloc_to_page(addr));
314} 352}
315 353
@@ -558,125 +596,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
558} 596}
559 597
560/** 598/**
561 * pcpu_unmap - unmap pages out of a pcpu_chunk 599 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
562 * @chunk: chunk of interest 600 * @chunk: chunk of interest
563 * @page_start: page index of the first page to unmap 601 * @bitmapp: output parameter for bitmap
564 * @page_end: page index of the last page to unmap + 1 602 * @may_alloc: may allocate the array
565 * @flush_tlb: whether to flush tlb or not
566 * 603 *
567 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 604 * Returns pointer to array of pointers to struct page and bitmap,
568 * If @flush is true, vcache is flushed before unmapping and tlb 605 * both of which can be indexed with pcpu_page_idx(). The returned
569 * after. 606 * array is cleared to zero and *@bitmapp is copied from
607 * @chunk->populated. Note that there is only one array and bitmap
608 * and access exclusion is the caller's responsibility.
609 *
610 * CONTEXT:
611 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
612 * Otherwise, don't care.
613 *
614 * RETURNS:
615 * Pointer to temp pages array on success, NULL on failure.
570 */ 616 */
571static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 617static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
572 bool flush_tlb) 618 unsigned long **bitmapp,
619 bool may_alloc)
573{ 620{
574 unsigned int last = nr_cpu_ids - 1; 621 static struct page **pages;
575 unsigned int cpu; 622 static unsigned long *bitmap;
623 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
624 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
625 sizeof(unsigned long);
626
627 if (!pages || !bitmap) {
628 if (may_alloc && !pages)
629 pages = pcpu_mem_alloc(pages_size);
630 if (may_alloc && !bitmap)
631 bitmap = pcpu_mem_alloc(bitmap_size);
632 if (!pages || !bitmap)
633 return NULL;
634 }
576 635
577 /* unmap must not be done on immutable chunk */ 636 memset(pages, 0, pages_size);
578 WARN_ON(chunk->immutable); 637 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
579 638
580 /* 639 *bitmapp = bitmap;
581 * Each flushing trial can be very expensive, issue flush on 640 return pages;
582 * the whole region at once rather than doing it for each cpu. 641}
583 * This could be an overkill but is more scalable.
584 */
585 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
586 pcpu_chunk_addr(chunk, last, page_end));
587 642
588 for_each_possible_cpu(cpu) 643/**
589 unmap_kernel_range_noflush( 644 * pcpu_free_pages - free pages which were allocated for @chunk
590 pcpu_chunk_addr(chunk, cpu, page_start), 645 * @chunk: chunk pages were allocated for
591 (page_end - page_start) << PAGE_SHIFT); 646 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
592 647 * @populated: populated bitmap
593 /* ditto as flush_cache_vunmap() */ 648 * @page_start: page index of the first page to be freed
594 if (flush_tlb) 649 * @page_end: page index of the last page to be freed + 1
595 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 650 *
596 pcpu_chunk_addr(chunk, last, page_end)); 651 * Free pages [@page_start and @page_end) in @pages for all units.
652 * The pages were allocated for @chunk.
653 */
654static void pcpu_free_pages(struct pcpu_chunk *chunk,
655 struct page **pages, unsigned long *populated,
656 int page_start, int page_end)
657{
658 unsigned int cpu;
659 int i;
660
661 for_each_possible_cpu(cpu) {
662 for (i = page_start; i < page_end; i++) {
663 struct page *page = pages[pcpu_page_idx(cpu, i)];
664
665 if (page)
666 __free_page(page);
667 }
668 }
597} 669}
598 670
599/** 671/**
600 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 672 * pcpu_alloc_pages - allocates pages for @chunk
601 * @chunk: chunk to depopulate 673 * @chunk: target chunk
602 * @off: offset to the area to depopulate 674 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
603 * @size: size of the area to depopulate in bytes 675 * @populated: populated bitmap
604 * @flush: whether to flush cache and tlb or not 676 * @page_start: page index of the first page to be allocated
605 * 677 * @page_end: page index of the last page to be allocated + 1
606 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 678 *
607 * from @chunk. If @flush is true, vcache is flushed before unmapping 679 * Allocate pages [@page_start,@page_end) into @pages for all units.
608 * and tlb after. 680 * The allocation is for @chunk. Percpu core doesn't care about the
609 * 681 * content of @pages and will pass it verbatim to pcpu_map_pages().
610 * CONTEXT:
611 * pcpu_alloc_mutex.
612 */ 682 */
613static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, 683static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
614 bool flush) 684 struct page **pages, unsigned long *populated,
685 int page_start, int page_end)
615{ 686{
616 int page_start = PFN_DOWN(off); 687 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
617 int page_end = PFN_UP(off + size);
618 int unmap_start = -1;
619 int uninitialized_var(unmap_end);
620 unsigned int cpu; 688 unsigned int cpu;
621 int i; 689 int i;
622 690
623 for (i = page_start; i < page_end; i++) { 691 for_each_possible_cpu(cpu) {
624 for_each_possible_cpu(cpu) { 692 for (i = page_start; i < page_end; i++) {
625 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 693 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
694
695 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
696 if (!*pagep) {
697 pcpu_free_pages(chunk, pages, populated,
698 page_start, page_end);
699 return -ENOMEM;
700 }
701 }
702 }
703 return 0;
704}
626 705
627 if (!*pagep) 706/**
628 continue; 707 * pcpu_pre_unmap_flush - flush cache prior to unmapping
708 * @chunk: chunk the regions to be flushed belongs to
709 * @page_start: page index of the first page to be flushed
710 * @page_end: page index of the last page to be flushed + 1
711 *
712 * Pages in [@page_start,@page_end) of @chunk are about to be
713 * unmapped. Flush cache. As each flushing trial can be very
714 * expensive, issue flush on the whole region at once rather than
715 * doing it for each cpu. This could be an overkill but is more
716 * scalable.
717 */
718static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
719 int page_start, int page_end)
720{
721 flush_cache_vunmap(
722 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
723 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
724}
725
726static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
727{
728 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
729}
629 730
630 __free_page(*pagep); 731/**
732 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
733 * @chunk: chunk of interest
734 * @pages: pages array which can be used to pass information to free
735 * @populated: populated bitmap
736 * @page_start: page index of the first page to unmap
737 * @page_end: page index of the last page to unmap + 1
738 *
739 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
740 * Corresponding elements in @pages were cleared by the caller and can
741 * be used to carry information to pcpu_free_pages() which will be
742 * called after all unmaps are finished. The caller should call
743 * proper pre/post flush functions.
744 */
745static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
746 struct page **pages, unsigned long *populated,
747 int page_start, int page_end)
748{
749 unsigned int cpu;
750 int i;
631 751
632 /* 752 for_each_possible_cpu(cpu) {
633 * If it's partial depopulation, it might get 753 for (i = page_start; i < page_end; i++) {
634 * populated or depopulated again. Mark the 754 struct page *page;
635 * page gone.
636 */
637 *pagep = NULL;
638 755
639 unmap_start = unmap_start < 0 ? i : unmap_start; 756 page = pcpu_chunk_page(chunk, cpu, i);
640 unmap_end = i + 1; 757 WARN_ON(!page);
758 pages[pcpu_page_idx(cpu, i)] = page;
641 } 759 }
760 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
761 page_end - page_start);
642 } 762 }
643 763
644 if (unmap_start >= 0) 764 for (i = page_start; i < page_end; i++)
645 pcpu_unmap(chunk, unmap_start, unmap_end, flush); 765 __clear_bit(i, populated);
766}
767
768/**
769 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
770 * @chunk: pcpu_chunk the regions to be flushed belong to
771 * @page_start: page index of the first page to be flushed
772 * @page_end: page index of the last page to be flushed + 1
773 *
774 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
775 * TLB for the regions. This can be skipped if the area is to be
776 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
777 *
778 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
779 * for the whole region.
780 */
781static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
782 int page_start, int page_end)
783{
784 flush_tlb_kernel_range(
785 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
786 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
787}
788
789static int __pcpu_map_pages(unsigned long addr, struct page **pages,
790 int nr_pages)
791{
792 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
793 PAGE_KERNEL, pages);
646} 794}
647 795
648/** 796/**
649 * pcpu_map - map pages into a pcpu_chunk 797 * pcpu_map_pages - map pages into a pcpu_chunk
650 * @chunk: chunk of interest 798 * @chunk: chunk of interest
799 * @pages: pages array containing pages to be mapped
800 * @populated: populated bitmap
651 * @page_start: page index of the first page to map 801 * @page_start: page index of the first page to map
652 * @page_end: page index of the last page to map + 1 802 * @page_end: page index of the last page to map + 1
653 * 803 *
654 * For each cpu, map pages [@page_start,@page_end) into @chunk. 804 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
655 * vcache is flushed afterwards. 805 * caller is responsible for calling pcpu_post_map_flush() after all
806 * mappings are complete.
807 *
808 * This function is responsible for setting corresponding bits in
809 * @chunk->populated bitmap and whatever is necessary for reverse
810 * lookup (addr -> chunk).
656 */ 811 */
657static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 812static int pcpu_map_pages(struct pcpu_chunk *chunk,
813 struct page **pages, unsigned long *populated,
814 int page_start, int page_end)
658{ 815{
659 unsigned int last = nr_cpu_ids - 1; 816 unsigned int cpu, tcpu;
660 unsigned int cpu; 817 int i, err;
661 int err;
662
663 /* map must not be done on immutable chunk */
664 WARN_ON(chunk->immutable);
665 818
666 for_each_possible_cpu(cpu) { 819 for_each_possible_cpu(cpu) {
667 err = map_kernel_range_noflush( 820 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
668 pcpu_chunk_addr(chunk, cpu, page_start), 821 &pages[pcpu_page_idx(cpu, page_start)],
669 (page_end - page_start) << PAGE_SHIFT, 822 page_end - page_start);
670 PAGE_KERNEL,
671 pcpu_chunk_pagep(chunk, cpu, page_start));
672 if (err < 0) 823 if (err < 0)
673 return err; 824 goto err;
825 }
826
827 /* mapping successful, link chunk and mark populated */
828 for (i = page_start; i < page_end; i++) {
829 for_each_possible_cpu(cpu)
830 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
831 chunk);
832 __set_bit(i, populated);
674 } 833 }
675 834
676 /* flush at once, please read comments in pcpu_unmap() */
677 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
678 pcpu_chunk_addr(chunk, last, page_end));
679 return 0; 835 return 0;
836
837err:
838 for_each_possible_cpu(tcpu) {
839 if (tcpu == cpu)
840 break;
841 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
842 page_end - page_start);
843 }
844 return err;
845}
846
847/**
848 * pcpu_post_map_flush - flush cache after mapping
849 * @chunk: pcpu_chunk the regions to be flushed belong to
850 * @page_start: page index of the first page to be flushed
851 * @page_end: page index of the last page to be flushed + 1
852 *
853 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
854 * cache.
855 *
856 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
857 * for the whole region.
858 */
859static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
860 int page_start, int page_end)
861{
862 flush_cache_vmap(
863 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
864 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
865}
866
867/**
868 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
869 * @chunk: chunk to depopulate
870 * @off: offset to the area to depopulate
871 * @size: size of the area to depopulate in bytes
872 * @flush: whether to flush cache and tlb or not
873 *
874 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
875 * from @chunk. If @flush is true, vcache is flushed before unmapping
876 * and tlb after.
877 *
878 * CONTEXT:
879 * pcpu_alloc_mutex.
880 */
881static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
882{
883 int page_start = PFN_DOWN(off);
884 int page_end = PFN_UP(off + size);
885 struct page **pages;
886 unsigned long *populated;
887 int rs, re;
888
889 /* quick path, check whether it's empty already */
890 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
891 if (rs == page_start && re == page_end)
892 return;
893 break;
894 }
895
896 /* immutable chunks can't be depopulated */
897 WARN_ON(chunk->immutable);
898
899 /*
900 * If control reaches here, there must have been at least one
901 * successful population attempt so the temp pages array must
902 * be available now.
903 */
904 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
905 BUG_ON(!pages);
906
907 /* unmap and free */
908 pcpu_pre_unmap_flush(chunk, page_start, page_end);
909
910 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
911 pcpu_unmap_pages(chunk, pages, populated, rs, re);
912
913 /* no need to flush tlb, vmalloc will handle it lazily */
914
915 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
916 pcpu_free_pages(chunk, pages, populated, rs, re);
917
918 /* commit new bitmap */
919 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
680} 920}
681 921
682/** 922/**
@@ -693,58 +933,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
693 */ 933 */
694static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 934static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
695{ 935{
696 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
697 int page_start = PFN_DOWN(off); 936 int page_start = PFN_DOWN(off);
698 int page_end = PFN_UP(off + size); 937 int page_end = PFN_UP(off + size);
699 int map_start = -1; 938 int free_end = page_start, unmap_end = page_start;
700 int uninitialized_var(map_end); 939 struct page **pages;
940 unsigned long *populated;
701 unsigned int cpu; 941 unsigned int cpu;
702 int i; 942 int rs, re, rc;
703 943
704 for (i = page_start; i < page_end; i++) { 944 /* quick path, check whether all pages are already there */
705 if (pcpu_chunk_page_occupied(chunk, i)) { 945 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
706 if (map_start >= 0) { 946 if (rs == page_start && re == page_end)
707 if (pcpu_map(chunk, map_start, map_end)) 947 goto clear;
708 goto err; 948 break;
709 map_start = -1; 949 }
710 }
711 continue;
712 }
713 950
714 map_start = map_start < 0 ? i : map_start; 951 /* need to allocate and map pages, this chunk can't be immutable */
715 map_end = i + 1; 952 WARN_ON(chunk->immutable);
716 953
717 for_each_possible_cpu(cpu) { 954 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
718 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 955 if (!pages)
956 return -ENOMEM;
719 957
720 *pagep = alloc_pages_node(cpu_to_node(cpu), 958 /* alloc and map */
721 alloc_mask, 0); 959 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
722 if (!*pagep) 960 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
723 goto err; 961 if (rc)
724 pcpu_set_page_chunk(*pagep, chunk); 962 goto err_free;
725 } 963 free_end = re;
726 } 964 }
727 965
728 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) 966 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
729 goto err; 967 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
968 if (rc)
969 goto err_unmap;
970 unmap_end = re;
971 }
972 pcpu_post_map_flush(chunk, page_start, page_end);
730 973
974 /* commit new bitmap */
975 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
976clear:
731 for_each_possible_cpu(cpu) 977 for_each_possible_cpu(cpu)
732 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 978 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
733 size);
734
735 return 0; 979 return 0;
736err: 980
737 /* likely under heavy memory pressure, give memory back */ 981err_unmap:
738 pcpu_depopulate_chunk(chunk, off, size, true); 982 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
739 return -ENOMEM; 983 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
984 pcpu_unmap_pages(chunk, pages, populated, rs, re);
985 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
986err_free:
987 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
988 pcpu_free_pages(chunk, pages, populated, rs, re);
989 return rc;
740} 990}
741 991
742static void free_pcpu_chunk(struct pcpu_chunk *chunk) 992static void free_pcpu_chunk(struct pcpu_chunk *chunk)
743{ 993{
744 if (!chunk) 994 if (!chunk)
745 return; 995 return;
746 if (chunk->vm) 996 if (chunk->vms)
747 free_vm_area(chunk->vm); 997 pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
748 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 998 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
749 kfree(chunk); 999 kfree(chunk);
750} 1000}
@@ -760,10 +1010,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
760 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); 1010 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
761 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 1011 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
762 chunk->map[chunk->map_used++] = pcpu_unit_size; 1012 chunk->map[chunk->map_used++] = pcpu_unit_size;
763 chunk->page = chunk->page_ar;
764 1013
765 chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); 1014 chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
766 if (!chunk->vm) { 1015 pcpu_nr_groups, pcpu_atom_size,
1016 GFP_KERNEL);
1017 if (!chunk->vms) {
767 free_pcpu_chunk(chunk); 1018 free_pcpu_chunk(chunk);
768 return NULL; 1019 return NULL;
769 } 1020 }
@@ -771,6 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
771 INIT_LIST_HEAD(&chunk->list); 1022 INIT_LIST_HEAD(&chunk->list);
772 chunk->free_size = pcpu_unit_size; 1023 chunk->free_size = pcpu_unit_size;
773 chunk->contig_hint = pcpu_unit_size; 1024 chunk->contig_hint = pcpu_unit_size;
1025 chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
774 1026
775 return chunk; 1027 return chunk;
776} 1028}
@@ -860,7 +1112,8 @@ area_found:
860 1112
861 mutex_unlock(&pcpu_alloc_mutex); 1113 mutex_unlock(&pcpu_alloc_mutex);
862 1114
863 return __addr_to_pcpu_ptr(chunk->vm->addr + off); 1115 /* return address relative to base address */
1116 return __addr_to_pcpu_ptr(chunk->base_addr + off);
864 1117
865fail_unlock: 1118fail_unlock:
866 spin_unlock_irq(&pcpu_lock); 1119 spin_unlock_irq(&pcpu_lock);
@@ -938,12 +1191,13 @@ static void pcpu_reclaim(struct work_struct *work)
938 } 1191 }
939 1192
940 spin_unlock_irq(&pcpu_lock); 1193 spin_unlock_irq(&pcpu_lock);
941 mutex_unlock(&pcpu_alloc_mutex);
942 1194
943 list_for_each_entry_safe(chunk, next, &todo, list) { 1195 list_for_each_entry_safe(chunk, next, &todo, list) {
944 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); 1196 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
945 free_pcpu_chunk(chunk); 1197 free_pcpu_chunk(chunk);
946 } 1198 }
1199
1200 mutex_unlock(&pcpu_alloc_mutex);
947} 1201}
948 1202
949/** 1203/**
@@ -968,7 +1222,7 @@ void free_percpu(void *ptr)
968 spin_lock_irqsave(&pcpu_lock, flags); 1222 spin_lock_irqsave(&pcpu_lock, flags);
969 1223
970 chunk = pcpu_chunk_addr_search(addr); 1224 chunk = pcpu_chunk_addr_search(addr);
971 off = addr - chunk->vm->addr; 1225 off = addr - chunk->base_addr;
972 1226
973 pcpu_free_area(chunk, off); 1227 pcpu_free_area(chunk, off);
974 1228
@@ -987,30 +1241,295 @@ void free_percpu(void *ptr)
987} 1241}
988EXPORT_SYMBOL_GPL(free_percpu); 1242EXPORT_SYMBOL_GPL(free_percpu);
989 1243
1244static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1245 size_t reserved_size,
1246 ssize_t *dyn_sizep)
1247{
1248 size_t size_sum;
1249
1250 size_sum = PFN_ALIGN(static_size + reserved_size +
1251 (*dyn_sizep >= 0 ? *dyn_sizep : 0));
1252 if (*dyn_sizep != 0)
1253 *dyn_sizep = size_sum - static_size - reserved_size;
1254
1255 return size_sum;
1256}
1257
990/** 1258/**
991 * pcpu_setup_first_chunk - initialize the first percpu chunk 1259 * pcpu_alloc_alloc_info - allocate percpu allocation info
992 * @get_page_fn: callback to fetch page pointer 1260 * @nr_groups: the number of groups
993 * @static_size: the size of static percpu area in bytes 1261 * @nr_units: the number of units
1262 *
1263 * Allocate ai which is large enough for @nr_groups groups containing
1264 * @nr_units units. The returned ai's groups[0].cpu_map points to the
1265 * cpu_map array which is long enough for @nr_units and filled with
1266 * NR_CPUS. It's the caller's responsibility to initialize cpu_map
1267 * pointer of other groups.
1268 *
1269 * RETURNS:
1270 * Pointer to the allocated pcpu_alloc_info on success, NULL on
1271 * failure.
1272 */
1273struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1274 int nr_units)
1275{
1276 struct pcpu_alloc_info *ai;
1277 size_t base_size, ai_size;
1278 void *ptr;
1279 int unit;
1280
1281 base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1282 __alignof__(ai->groups[0].cpu_map[0]));
1283 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1284
1285 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
1286 if (!ptr)
1287 return NULL;
1288 ai = ptr;
1289 ptr += base_size;
1290
1291 ai->groups[0].cpu_map = ptr;
1292
1293 for (unit = 0; unit < nr_units; unit++)
1294 ai->groups[0].cpu_map[unit] = NR_CPUS;
1295
1296 ai->nr_groups = nr_groups;
1297 ai->__ai_size = PFN_ALIGN(ai_size);
1298
1299 return ai;
1300}
1301
1302/**
1303 * pcpu_free_alloc_info - free percpu allocation info
1304 * @ai: pcpu_alloc_info to free
1305 *
1306 * Free @ai which was allocated by pcpu_alloc_alloc_info().
1307 */
1308void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1309{
1310 free_bootmem(__pa(ai), ai->__ai_size);
1311}
1312
1313/**
1314 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
994 * @reserved_size: the size of reserved percpu area in bytes 1315 * @reserved_size: the size of reserved percpu area in bytes
995 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1316 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
996 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1317 * @atom_size: allocation atom size
997 * @base_addr: mapped address, NULL for auto 1318 * @cpu_distance_fn: callback to determine distance between cpus, optional
998 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary 1319 *
1320 * This function determines grouping of units, their mappings to cpus
1321 * and other parameters considering needed percpu size, allocation
1322 * atom size and distances between CPUs.
1323 *
1324 * Groups are always mutliples of atom size and CPUs which are of
1325 * LOCAL_DISTANCE both ways are grouped together and share space for
1326 * units in the same group. The returned configuration is guaranteed
1327 * to have CPUs on different nodes on different groups and >=75% usage
1328 * of allocated virtual address space.
1329 *
1330 * RETURNS:
1331 * On success, pointer to the new allocation_info is returned. On
1332 * failure, ERR_PTR value is returned.
1333 */
1334struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1335 size_t reserved_size, ssize_t dyn_size,
1336 size_t atom_size,
1337 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1338{
1339 static int group_map[NR_CPUS] __initdata;
1340 static int group_cnt[NR_CPUS] __initdata;
1341 const size_t static_size = __per_cpu_end - __per_cpu_start;
1342 int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
1343 size_t size_sum, min_unit_size, alloc_size;
1344 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1345 int last_allocs, group, unit;
1346 unsigned int cpu, tcpu;
1347 struct pcpu_alloc_info *ai;
1348 unsigned int *cpu_map;
1349
1350 /*
1351 * Determine min_unit_size, alloc_size and max_upa such that
1352 * alloc_size is multiple of atom_size and is the smallest
1353 * which can accomodate 4k aligned segments which are equal to
1354 * or larger than min_unit_size.
1355 */
1356 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1357 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1358
1359 alloc_size = roundup(min_unit_size, atom_size);
1360 upa = alloc_size / min_unit_size;
1361 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1362 upa--;
1363 max_upa = upa;
1364
1365 /* group cpus according to their proximity */
1366 for_each_possible_cpu(cpu) {
1367 group = 0;
1368 next_group:
1369 for_each_possible_cpu(tcpu) {
1370 if (cpu == tcpu)
1371 break;
1372 if (group_map[tcpu] == group && cpu_distance_fn &&
1373 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1374 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1375 group++;
1376 nr_groups = max(nr_groups, group + 1);
1377 goto next_group;
1378 }
1379 }
1380 group_map[cpu] = group;
1381 group_cnt[group]++;
1382 group_cnt_max = max(group_cnt_max, group_cnt[group]);
1383 }
1384
1385 /*
1386 * Expand unit size until address space usage goes over 75%
1387 * and then as much as possible without using more address
1388 * space.
1389 */
1390 last_allocs = INT_MAX;
1391 for (upa = max_upa; upa; upa--) {
1392 int allocs = 0, wasted = 0;
1393
1394 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1395 continue;
1396
1397 for (group = 0; group < nr_groups; group++) {
1398 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1399 allocs += this_allocs;
1400 wasted += this_allocs * upa - group_cnt[group];
1401 }
1402
1403 /*
1404 * Don't accept if wastage is over 25%. The
1405 * greater-than comparison ensures upa==1 always
1406 * passes the following check.
1407 */
1408 if (wasted > num_possible_cpus() / 3)
1409 continue;
1410
1411 /* and then don't consume more memory */
1412 if (allocs > last_allocs)
1413 break;
1414 last_allocs = allocs;
1415 best_upa = upa;
1416 }
1417 upa = best_upa;
1418
1419 /* allocate and fill alloc_info */
1420 for (group = 0; group < nr_groups; group++)
1421 nr_units += roundup(group_cnt[group], upa);
1422
1423 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1424 if (!ai)
1425 return ERR_PTR(-ENOMEM);
1426 cpu_map = ai->groups[0].cpu_map;
1427
1428 for (group = 0; group < nr_groups; group++) {
1429 ai->groups[group].cpu_map = cpu_map;
1430 cpu_map += roundup(group_cnt[group], upa);
1431 }
1432
1433 ai->static_size = static_size;
1434 ai->reserved_size = reserved_size;
1435 ai->dyn_size = dyn_size;
1436 ai->unit_size = alloc_size / upa;
1437 ai->atom_size = atom_size;
1438 ai->alloc_size = alloc_size;
1439
1440 for (group = 0, unit = 0; group_cnt[group]; group++) {
1441 struct pcpu_group_info *gi = &ai->groups[group];
1442
1443 /*
1444 * Initialize base_offset as if all groups are located
1445 * back-to-back. The caller should update this to
1446 * reflect actual allocation.
1447 */
1448 gi->base_offset = unit * ai->unit_size;
1449
1450 for_each_possible_cpu(cpu)
1451 if (group_map[cpu] == group)
1452 gi->cpu_map[gi->nr_units++] = cpu;
1453 gi->nr_units = roundup(gi->nr_units, upa);
1454 unit += gi->nr_units;
1455 }
1456 BUG_ON(unit != nr_units);
1457
1458 return ai;
1459}
1460
1461/**
1462 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1463 * @lvl: loglevel
1464 * @ai: allocation info to dump
1465 *
1466 * Print out information about @ai using loglevel @lvl.
1467 */
1468static void pcpu_dump_alloc_info(const char *lvl,
1469 const struct pcpu_alloc_info *ai)
1470{
1471 int group_width = 1, cpu_width = 1, width;
1472 char empty_str[] = "--------";
1473 int alloc = 0, alloc_end = 0;
1474 int group, v;
1475 int upa, apl; /* units per alloc, allocs per line */
1476
1477 v = ai->nr_groups;
1478 while (v /= 10)
1479 group_width++;
1480
1481 v = num_possible_cpus();
1482 while (v /= 10)
1483 cpu_width++;
1484 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1485
1486 upa = ai->alloc_size / ai->unit_size;
1487 width = upa * (cpu_width + 1) + group_width + 3;
1488 apl = rounddown_pow_of_two(max(60 / width, 1));
1489
1490 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1491 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1492 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1493
1494 for (group = 0; group < ai->nr_groups; group++) {
1495 const struct pcpu_group_info *gi = &ai->groups[group];
1496 int unit = 0, unit_end = 0;
1497
1498 BUG_ON(gi->nr_units % upa);
1499 for (alloc_end += gi->nr_units / upa;
1500 alloc < alloc_end; alloc++) {
1501 if (!(alloc % apl)) {
1502 printk("\n");
1503 printk("%spcpu-alloc: ", lvl);
1504 }
1505 printk("[%0*d] ", group_width, group);
1506
1507 for (unit_end += upa; unit < unit_end; unit++)
1508 if (gi->cpu_map[unit] != NR_CPUS)
1509 printk("%0*d ", cpu_width,
1510 gi->cpu_map[unit]);
1511 else
1512 printk("%s ", empty_str);
1513 }
1514 }
1515 printk("\n");
1516}
1517
1518/**
1519 * pcpu_setup_first_chunk - initialize the first percpu chunk
1520 * @ai: pcpu_alloc_info describing how to percpu area is shaped
1521 * @base_addr: mapped address
999 * 1522 *
1000 * Initialize the first percpu chunk which contains the kernel static 1523 * Initialize the first percpu chunk which contains the kernel static
1001 * perpcu area. This function is to be called from arch percpu area 1524 * perpcu area. This function is to be called from arch percpu area
1002 * setup path. The first two parameters are mandatory. The rest are 1525 * setup path.
1003 * optional. 1526 *
1004 * 1527 * @ai contains all information necessary to initialize the first
1005 * @get_page_fn() should return pointer to percpu page given cpu 1528 * chunk and prime the dynamic percpu allocator.
1006 * number and page number. It should at least return enough pages to 1529 *
1007 * cover the static area. The returned pages for static area should 1530 * @ai->static_size is the size of static percpu area.
1008 * have been initialized with valid data. If @unit_size is specified, 1531 *
1009 * it can also return pages after the static area. NULL return 1532 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
1010 * indicates end of pages for the cpu. Note that @get_page_fn() must
1011 * return the same number of pages for all cpus.
1012 *
1013 * @reserved_size, if non-zero, specifies the amount of bytes to
1014 * reserve after the static area in the first chunk. This reserves 1533 * reserve after the static area in the first chunk. This reserves
1015 * the first chunk such that it's available only through reserved 1534 * the first chunk such that it's available only through reserved
1016 * percpu allocation. This is primarily used to serve module percpu 1535 * percpu allocation. This is primarily used to serve module percpu
@@ -1018,22 +1537,29 @@ EXPORT_SYMBOL_GPL(free_percpu);
1018 * limited offset range for symbol relocations to guarantee module 1537 * limited offset range for symbol relocations to guarantee module
1019 * percpu symbols fall inside the relocatable range. 1538 * percpu symbols fall inside the relocatable range.
1020 * 1539 *
1021 * @dyn_size, if non-negative, determines the number of bytes 1540 * @ai->dyn_size determines the number of bytes available for dynamic
1022 * available for dynamic allocation in the first chunk. Specifying 1541 * allocation in the first chunk. The area between @ai->static_size +
1023 * non-negative value makes percpu leave alone the area beyond 1542 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
1024 * @static_size + @reserved_size + @dyn_size.
1025 * 1543 *
1026 * @unit_size, if non-negative, specifies unit size and must be 1544 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
1027 * aligned to PAGE_SIZE and equal to or larger than @static_size + 1545 * and equal to or larger than @ai->static_size + @ai->reserved_size +
1028 * @reserved_size + if non-negative, @dyn_size. 1546 * @ai->dyn_size.
1029 * 1547 *
1030 * Non-null @base_addr means that the caller already allocated virtual 1548 * @ai->atom_size is the allocation atom size and used as alignment
1031 * region for the first chunk and mapped it. percpu must not mess 1549 * for vm areas.
1032 * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
1033 * @populate_pte_fn doesn't make any sense.
1034 * 1550 *
1035 * @populate_pte_fn is used to populate the pagetable. NULL means the 1551 * @ai->alloc_size is the allocation size and always multiple of
1036 * caller already populated the pagetable. 1552 * @ai->atom_size. This is larger than @ai->atom_size if
1553 * @ai->unit_size is larger than @ai->atom_size.
1554 *
1555 * @ai->nr_groups and @ai->groups describe virtual memory layout of
1556 * percpu areas. Units which should be colocated are put into the
1557 * same group. Dynamic VM areas will be allocated according to these
1558 * groupings. If @ai->nr_groups is zero, a single group containing
1559 * all units is assumed.
1560 *
1561 * The caller should have mapped the first chunk at @base_addr and
1562 * copied static data to each unit.
1037 * 1563 *
1038 * If the first chunk ends up with both reserved and dynamic areas, it 1564 * If the first chunk ends up with both reserved and dynamic areas, it
1039 * is served by two chunks - one to serve the core static and reserved 1565 * is served by two chunks - one to serve the core static and reserved
@@ -1043,49 +1569,83 @@ EXPORT_SYMBOL_GPL(free_percpu);
1043 * and available for dynamic allocation like any other chunks. 1569 * and available for dynamic allocation like any other chunks.
1044 * 1570 *
1045 * RETURNS: 1571 * RETURNS:
1046 * The determined pcpu_unit_size which can be used to initialize 1572 * 0 on success, -errno on failure.
1047 * percpu access.
1048 */ 1573 */
1049size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1574int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1050 size_t static_size, size_t reserved_size, 1575 void *base_addr)
1051 ssize_t dyn_size, ssize_t unit_size,
1052 void *base_addr,
1053 pcpu_populate_pte_fn_t populate_pte_fn)
1054{ 1576{
1055 static struct vm_struct first_vm;
1056 static int smap[2], dmap[2]; 1577 static int smap[2], dmap[2];
1057 size_t size_sum = static_size + reserved_size + 1578 size_t dyn_size = ai->dyn_size;
1058 (dyn_size >= 0 ? dyn_size : 0); 1579 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1059 struct pcpu_chunk *schunk, *dchunk = NULL; 1580 struct pcpu_chunk *schunk, *dchunk = NULL;
1581 unsigned long *group_offsets;
1582 size_t *group_sizes;
1583 unsigned long *unit_off;
1060 unsigned int cpu; 1584 unsigned int cpu;
1061 int nr_pages; 1585 int *unit_map;
1062 int err, i; 1586 int group, unit, i;
1063 1587
1064 /* santiy checks */ 1588 /* sanity checks */
1065 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1589 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1066 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1590 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1067 BUG_ON(!static_size); 1591 BUG_ON(ai->nr_groups <= 0);
1068 if (unit_size >= 0) { 1592 BUG_ON(!ai->static_size);
1069 BUG_ON(unit_size < size_sum); 1593 BUG_ON(!base_addr);
1070 BUG_ON(unit_size & ~PAGE_MASK); 1594 BUG_ON(ai->unit_size < size_sum);
1071 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); 1595 BUG_ON(ai->unit_size & ~PAGE_MASK);
1072 } else 1596 BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1073 BUG_ON(base_addr); 1597
1074 BUG_ON(base_addr && populate_pte_fn); 1598 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1075 1599
1076 if (unit_size >= 0) 1600 /* process group information and build config tables accordingly */
1077 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1601 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
1078 else 1602 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
1079 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, 1603 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
1080 PFN_UP(size_sum)); 1604 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
1605
1606 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1607 unit_map[cpu] = NR_CPUS;
1608 pcpu_first_unit_cpu = NR_CPUS;
1609
1610 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1611 const struct pcpu_group_info *gi = &ai->groups[group];
1612
1613 group_offsets[group] = gi->base_offset;
1614 group_sizes[group] = gi->nr_units * ai->unit_size;
1615
1616 for (i = 0; i < gi->nr_units; i++) {
1617 cpu = gi->cpu_map[i];
1618 if (cpu == NR_CPUS)
1619 continue;
1081 1620
1082 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1621 BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
1083 pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; 1622 BUG_ON(unit_map[cpu] != NR_CPUS);
1084 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
1085 + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
1086 1623
1087 if (dyn_size < 0) 1624 unit_map[cpu] = unit + i;
1088 dyn_size = pcpu_unit_size - static_size - reserved_size; 1625 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1626
1627 if (pcpu_first_unit_cpu == NR_CPUS)
1628 pcpu_first_unit_cpu = cpu;
1629 }
1630 }
1631 pcpu_last_unit_cpu = cpu;
1632 pcpu_nr_units = unit;
1633
1634 for_each_possible_cpu(cpu)
1635 BUG_ON(unit_map[cpu] == NR_CPUS);
1636
1637 pcpu_nr_groups = ai->nr_groups;
1638 pcpu_group_offsets = group_offsets;
1639 pcpu_group_sizes = group_sizes;
1640 pcpu_unit_map = unit_map;
1641 pcpu_unit_offsets = unit_off;
1642
1643 /* determine basic parameters */
1644 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
1645 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1646 pcpu_atom_size = ai->atom_size;
1647 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1648 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1089 1649
1090 /* 1650 /*
1091 * Allocate chunk slots. The additional last slot is for 1651 * Allocate chunk slots. The additional last slot is for
@@ -1105,189 +1665,351 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1105 */ 1665 */
1106 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1666 schunk = alloc_bootmem(pcpu_chunk_struct_size);
1107 INIT_LIST_HEAD(&schunk->list); 1667 INIT_LIST_HEAD(&schunk->list);
1108 schunk->vm = &first_vm; 1668 schunk->base_addr = base_addr;
1109 schunk->map = smap; 1669 schunk->map = smap;
1110 schunk->map_alloc = ARRAY_SIZE(smap); 1670 schunk->map_alloc = ARRAY_SIZE(smap);
1111 schunk->page = schunk->page_ar; 1671 schunk->immutable = true;
1672 bitmap_fill(schunk->populated, pcpu_unit_pages);
1112 1673
1113 if (reserved_size) { 1674 if (ai->reserved_size) {
1114 schunk->free_size = reserved_size; 1675 schunk->free_size = ai->reserved_size;
1115 pcpu_reserved_chunk = schunk; 1676 pcpu_reserved_chunk = schunk;
1116 pcpu_reserved_chunk_limit = static_size + reserved_size; 1677 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
1117 } else { 1678 } else {
1118 schunk->free_size = dyn_size; 1679 schunk->free_size = dyn_size;
1119 dyn_size = 0; /* dynamic area covered */ 1680 dyn_size = 0; /* dynamic area covered */
1120 } 1681 }
1121 schunk->contig_hint = schunk->free_size; 1682 schunk->contig_hint = schunk->free_size;
1122 1683
1123 schunk->map[schunk->map_used++] = -static_size; 1684 schunk->map[schunk->map_used++] = -ai->static_size;
1124 if (schunk->free_size) 1685 if (schunk->free_size)
1125 schunk->map[schunk->map_used++] = schunk->free_size; 1686 schunk->map[schunk->map_used++] = schunk->free_size;
1126 1687
1127 /* init dynamic chunk if necessary */ 1688 /* init dynamic chunk if necessary */
1128 if (dyn_size) { 1689 if (dyn_size) {
1129 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1690 dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1130 INIT_LIST_HEAD(&dchunk->list); 1691 INIT_LIST_HEAD(&dchunk->list);
1131 dchunk->vm = &first_vm; 1692 dchunk->base_addr = base_addr;
1132 dchunk->map = dmap; 1693 dchunk->map = dmap;
1133 dchunk->map_alloc = ARRAY_SIZE(dmap); 1694 dchunk->map_alloc = ARRAY_SIZE(dmap);
1134 dchunk->page = schunk->page_ar; /* share page map with schunk */ 1695 dchunk->immutable = true;
1696 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1135 1697
1136 dchunk->contig_hint = dchunk->free_size = dyn_size; 1698 dchunk->contig_hint = dchunk->free_size = dyn_size;
1137 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1699 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1138 dchunk->map[dchunk->map_used++] = dchunk->free_size; 1700 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1139 } 1701 }
1140 1702
1141 /* allocate vm address */
1142 first_vm.flags = VM_ALLOC;
1143 first_vm.size = pcpu_chunk_size;
1144
1145 if (!base_addr)
1146 vm_area_register_early(&first_vm, PAGE_SIZE);
1147 else {
1148 /*
1149 * Pages already mapped. No need to remap into
1150 * vmalloc area. In this case the first chunks can't
1151 * be mapped or unmapped by percpu and are marked
1152 * immutable.
1153 */
1154 first_vm.addr = base_addr;
1155 schunk->immutable = true;
1156 if (dchunk)
1157 dchunk->immutable = true;
1158 }
1159
1160 /* assign pages */
1161 nr_pages = -1;
1162 for_each_possible_cpu(cpu) {
1163 for (i = 0; i < pcpu_unit_pages; i++) {
1164 struct page *page = get_page_fn(cpu, i);
1165
1166 if (!page)
1167 break;
1168 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1169 }
1170
1171 BUG_ON(i < PFN_UP(static_size));
1172
1173 if (nr_pages < 0)
1174 nr_pages = i;
1175 else
1176 BUG_ON(nr_pages != i);
1177 }
1178
1179 /* map them */
1180 if (populate_pte_fn) {
1181 for_each_possible_cpu(cpu)
1182 for (i = 0; i < nr_pages; i++)
1183 populate_pte_fn(pcpu_chunk_addr(schunk,
1184 cpu, i));
1185
1186 err = pcpu_map(schunk, 0, nr_pages);
1187 if (err)
1188 panic("failed to setup static percpu area, err=%d\n",
1189 err);
1190 }
1191
1192 /* link the first chunk in */ 1703 /* link the first chunk in */
1193 pcpu_first_chunk = dchunk ?: schunk; 1704 pcpu_first_chunk = dchunk ?: schunk;
1194 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1705 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1195 1706
1196 /* we're done */ 1707 /* we're done */
1197 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1708 pcpu_base_addr = base_addr;
1198 return pcpu_unit_size; 1709 return 0;
1199} 1710}
1200 1711
1201/* 1712const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1202 * Embedding first chunk setup helper. 1713 [PCPU_FC_AUTO] = "auto",
1203 */ 1714 [PCPU_FC_EMBED] = "embed",
1204static void *pcpue_ptr __initdata; 1715 [PCPU_FC_PAGE] = "page",
1205static size_t pcpue_size __initdata; 1716};
1206static size_t pcpue_unit_size __initdata;
1207 1717
1208static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) 1718enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1209{
1210 size_t off = (size_t)pageno << PAGE_SHIFT;
1211 1719
1212 if (off >= pcpue_size) 1720static int __init percpu_alloc_setup(char *str)
1213 return NULL; 1721{
1722 if (0)
1723 /* nada */;
1724#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
1725 else if (!strcmp(str, "embed"))
1726 pcpu_chosen_fc = PCPU_FC_EMBED;
1727#endif
1728#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1729 else if (!strcmp(str, "page"))
1730 pcpu_chosen_fc = PCPU_FC_PAGE;
1731#endif
1732 else
1733 pr_warning("PERCPU: unknown allocator %s specified\n", str);
1214 1734
1215 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); 1735 return 0;
1216} 1736}
1737early_param("percpu_alloc", percpu_alloc_setup);
1217 1738
1739#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1740 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1218/** 1741/**
1219 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1742 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1220 * @static_size: the size of static percpu area in bytes
1221 * @reserved_size: the size of reserved percpu area in bytes 1743 * @reserved_size: the size of reserved percpu area in bytes
1222 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1744 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1223 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1745 * @atom_size: allocation atom size
1746 * @cpu_distance_fn: callback to determine distance between cpus, optional
1747 * @alloc_fn: function to allocate percpu page
1748 * @free_fn: funtion to free percpu page
1224 * 1749 *
1225 * This is a helper to ease setting up embedded first percpu chunk and 1750 * This is a helper to ease setting up embedded first percpu chunk and
1226 * can be called where pcpu_setup_first_chunk() is expected. 1751 * can be called where pcpu_setup_first_chunk() is expected.
1227 * 1752 *
1228 * If this function is used to setup the first chunk, it is allocated 1753 * If this function is used to setup the first chunk, it is allocated
1229 * as a contiguous area using bootmem allocator and used as-is without 1754 * by calling @alloc_fn and used as-is without being mapped into
1230 * being mapped into vmalloc area. This enables the first chunk to 1755 * vmalloc area. Allocations are always whole multiples of @atom_size
1231 * piggy back on the linear physical mapping which often uses larger 1756 * aligned to @atom_size.
1232 * page size. 1757 *
1758 * This enables the first chunk to piggy back on the linear physical
1759 * mapping which often uses larger page size. Please note that this
1760 * can result in very sparse cpu->unit mapping on NUMA machines thus
1761 * requiring large vmalloc address space. Don't use this allocator if
1762 * vmalloc space is not orders of magnitude larger than distances
1763 * between node memory addresses (ie. 32bit NUMA machines).
1233 * 1764 *
1234 * When @dyn_size is positive, dynamic area might be larger than 1765 * When @dyn_size is positive, dynamic area might be larger than
1235 * specified to fill page alignment. Also, when @dyn_size is auto, 1766 * specified to fill page alignment. When @dyn_size is auto,
1236 * @dyn_size does not fill the whole first chunk but only what's 1767 * @dyn_size is just big enough to fill page alignment after static
1237 * necessary for page alignment after static and reserved areas. 1768 * and reserved areas.
1238 * 1769 *
1239 * If the needed size is smaller than the minimum or specified unit 1770 * If the needed size is smaller than the minimum or specified unit
1240 * size, the leftover is returned to the bootmem allocator. 1771 * size, the leftover is returned using @free_fn.
1241 * 1772 *
1242 * RETURNS: 1773 * RETURNS:
1243 * The determined pcpu_unit_size which can be used to initialize 1774 * 0 on success, -errno on failure.
1244 * percpu access on success, -errno on failure.
1245 */ 1775 */
1246ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1776int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
1247 ssize_t dyn_size, ssize_t unit_size) 1777 size_t atom_size,
1778 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1779 pcpu_fc_alloc_fn_t alloc_fn,
1780 pcpu_fc_free_fn_t free_fn)
1248{ 1781{
1249 size_t chunk_size; 1782 void *base = (void *)ULONG_MAX;
1250 unsigned int cpu; 1783 void **areas = NULL;
1784 struct pcpu_alloc_info *ai;
1785 size_t size_sum, areas_size;
1786 int group, i, rc;
1787
1788 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
1789 cpu_distance_fn);
1790 if (IS_ERR(ai))
1791 return PTR_ERR(ai);
1792
1793 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1794 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1795
1796 areas = alloc_bootmem_nopanic(areas_size);
1797 if (!areas) {
1798 rc = -ENOMEM;
1799 goto out_free;
1800 }
1251 1801
1252 /* determine parameters and allocate */ 1802 /* allocate, copy and determine base address */
1253 pcpue_size = PFN_ALIGN(static_size + reserved_size + 1803 for (group = 0; group < ai->nr_groups; group++) {
1254 (dyn_size >= 0 ? dyn_size : 0)); 1804 struct pcpu_group_info *gi = &ai->groups[group];
1255 if (dyn_size != 0) 1805 unsigned int cpu = NR_CPUS;
1256 dyn_size = pcpue_size - static_size - reserved_size; 1806 void *ptr;
1257 1807
1258 if (unit_size >= 0) { 1808 for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
1259 BUG_ON(unit_size < pcpue_size); 1809 cpu = gi->cpu_map[i];
1260 pcpue_unit_size = unit_size; 1810 BUG_ON(cpu == NR_CPUS);
1261 } else 1811
1262 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1812 /* allocate space for the whole group */
1263 1813 ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
1264 chunk_size = pcpue_unit_size * nr_cpu_ids; 1814 if (!ptr) {
1265 1815 rc = -ENOMEM;
1266 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, 1816 goto out_free_areas;
1267 __pa(MAX_DMA_ADDRESS)); 1817 }
1268 if (!pcpue_ptr) { 1818 areas[group] = ptr;
1269 pr_warning("PERCPU: failed to allocate %zu bytes for " 1819
1270 "embedding\n", chunk_size); 1820 base = min(ptr, base);
1271 return -ENOMEM; 1821
1822 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1823 if (gi->cpu_map[i] == NR_CPUS) {
1824 /* unused unit, free whole */
1825 free_fn(ptr, ai->unit_size);
1826 continue;
1827 }
1828 /* copy and return the unused part */
1829 memcpy(ptr, __per_cpu_load, ai->static_size);
1830 free_fn(ptr + size_sum, ai->unit_size - size_sum);
1831 }
1272 } 1832 }
1273 1833
1274 /* return the leftover and copy */ 1834 /* base address is now known, determine group base offsets */
1275 for (cpu = 0; cpu < nr_cpu_ids; cpu++) { 1835 for (group = 0; group < ai->nr_groups; group++)
1276 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1836 ai->groups[group].base_offset = areas[group] - base;
1837
1838 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
1839 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
1840 ai->dyn_size, ai->unit_size);
1841
1842 rc = pcpu_setup_first_chunk(ai, base);
1843 goto out_free;
1844
1845out_free_areas:
1846 for (group = 0; group < ai->nr_groups; group++)
1847 free_fn(areas[group],
1848 ai->groups[group].nr_units * ai->unit_size);
1849out_free:
1850 pcpu_free_alloc_info(ai);
1851 if (areas)
1852 free_bootmem(__pa(areas), areas_size);
1853 return rc;
1854}
1855#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
1856 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
1857
1858#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1859/**
1860 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1861 * @reserved_size: the size of reserved percpu area in bytes
1862 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
1863 * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
1864 * @populate_pte_fn: function to populate pte
1865 *
1866 * This is a helper to ease setting up page-remapped first percpu
1867 * chunk and can be called where pcpu_setup_first_chunk() is expected.
1868 *
1869 * This is the basic allocator. Static percpu area is allocated
1870 * page-by-page into vmalloc area.
1871 *
1872 * RETURNS:
1873 * 0 on success, -errno on failure.
1874 */
1875int __init pcpu_page_first_chunk(size_t reserved_size,
1876 pcpu_fc_alloc_fn_t alloc_fn,
1877 pcpu_fc_free_fn_t free_fn,
1878 pcpu_fc_populate_pte_fn_t populate_pte_fn)
1879{
1880 static struct vm_struct vm;
1881 struct pcpu_alloc_info *ai;
1882 char psize_str[16];
1883 int unit_pages;
1884 size_t pages_size;
1885 struct page **pages;
1886 int unit, i, j, rc;
1887
1888 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
1889
1890 ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
1891 if (IS_ERR(ai))
1892 return PTR_ERR(ai);
1893 BUG_ON(ai->nr_groups != 1);
1894 BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
1895
1896 unit_pages = ai->unit_size >> PAGE_SHIFT;
1897
1898 /* unaligned allocations can't be freed, round up to page size */
1899 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1900 sizeof(pages[0]));
1901 pages = alloc_bootmem(pages_size);
1902
1903 /* allocate pages */
1904 j = 0;
1905 for (unit = 0; unit < num_possible_cpus(); unit++)
1906 for (i = 0; i < unit_pages; i++) {
1907 unsigned int cpu = ai->groups[0].cpu_map[unit];
1908 void *ptr;
1909
1910 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
1911 if (!ptr) {
1912 pr_warning("PERCPU: failed to allocate %s page "
1913 "for cpu%u\n", psize_str, cpu);
1914 goto enomem;
1915 }
1916 pages[j++] = virt_to_page(ptr);
1917 }
1918
1919 /* allocate vm area, map the pages and copy static data */
1920 vm.flags = VM_ALLOC;
1921 vm.size = num_possible_cpus() * ai->unit_size;
1922 vm_area_register_early(&vm, PAGE_SIZE);
1923
1924 for (unit = 0; unit < num_possible_cpus(); unit++) {
1925 unsigned long unit_addr =
1926 (unsigned long)vm.addr + unit * ai->unit_size;
1927
1928 for (i = 0; i < unit_pages; i++)
1929 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
1930
1931 /* pte already populated, the following shouldn't fail */
1932 rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
1933 unit_pages);
1934 if (rc < 0)
1935 panic("failed to map percpu area, err=%d\n", rc);
1277 1936
1278 if (cpu_possible(cpu)) { 1937 /*
1279 free_bootmem(__pa(ptr + pcpue_size), 1938 * FIXME: Archs with virtual cache should flush local
1280 pcpue_unit_size - pcpue_size); 1939 * cache for the linear mapping here - something
1281 memcpy(ptr, __per_cpu_load, static_size); 1940 * equivalent to flush_cache_vmap() on the local cpu.
1282 } else 1941 * flush_cache_vmap() can't be used as most supporting
1283 free_bootmem(__pa(ptr), pcpue_unit_size); 1942 * data structures are not set up yet.
1943 */
1944
1945 /* copy static data */
1946 memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
1284 } 1947 }
1285 1948
1286 /* we're ready, commit */ 1949 /* we're ready, commit */
1287 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", 1950 pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
1288 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); 1951 unit_pages, psize_str, vm.addr, ai->static_size,
1952 ai->reserved_size, ai->dyn_size);
1953
1954 rc = pcpu_setup_first_chunk(ai, vm.addr);
1955 goto out_free_ar;
1956
1957enomem:
1958 while (--j >= 0)
1959 free_fn(page_address(pages[j]), PAGE_SIZE);
1960 rc = -ENOMEM;
1961out_free_ar:
1962 free_bootmem(__pa(pages), pages_size);
1963 pcpu_free_alloc_info(ai);
1964 return rc;
1965}
1966#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
1967
1968/*
1969 * Generic percpu area setup.
1970 *
1971 * The embedding helper is used because its behavior closely resembles
1972 * the original non-dynamic generic percpu area setup. This is
1973 * important because many archs have addressing restrictions and might
1974 * fail if the percpu area is located far away from the previous
1975 * location. As an added bonus, in non-NUMA cases, embedding is
1976 * generally a good idea TLB-wise because percpu area can piggy back
1977 * on the physical linear memory mapping which uses large page
1978 * mappings on applicable archs.
1979 */
1980#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1981unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
1982EXPORT_SYMBOL(__per_cpu_offset);
1983
1984static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
1985 size_t align)
1986{
1987 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
1988}
1289 1989
1290 return pcpu_setup_first_chunk(pcpue_get_page, static_size, 1990static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
1291 reserved_size, dyn_size, 1991{
1292 pcpue_unit_size, pcpue_ptr, NULL); 1992 free_bootmem(__pa(ptr), size);
1993}
1994
1995void __init setup_per_cpu_areas(void)
1996{
1997 unsigned long delta;
1998 unsigned int cpu;
1999 int rc;
2000
2001 /*
2002 * Always reserve area for module percpu variables. That's
2003 * what the legacy allocator did.
2004 */
2005 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2006 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2007 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2008 if (rc < 0)
2009 panic("Failed to initialized percpu areas.");
2010
2011 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2012 for_each_possible_cpu(cpu)
2013 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1293} 2014}
2015#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/quicklist.c b/mm/quicklist.c
index e66d07d1b4ff..6eedf7e473d1 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/quicklist.h> 20#include <linux/quicklist.h>
21 21
22DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; 22DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
23 23
24#define FRACTION_OF_NODE_MEM 16 24#define FRACTION_OF_NODE_MEM 16
25 25
diff --git a/mm/slub.c b/mm/slub.c
index b9f1491a58a1..dc9765bb49dc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2091,8 +2091,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2091 */ 2091 */
2092#define NR_KMEM_CACHE_CPU 100 2092#define NR_KMEM_CACHE_CPU 100
2093 2093
2094static DEFINE_PER_CPU(struct kmem_cache_cpu, 2094static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2095 kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; 2095 kmem_cache_cpu);
2096 2096
2097static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); 2097static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2098static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); 2098static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f8189a4b3e13..204b8243d8ab 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -265,6 +265,7 @@ struct vmap_area {
265static DEFINE_SPINLOCK(vmap_area_lock); 265static DEFINE_SPINLOCK(vmap_area_lock);
266static struct rb_root vmap_area_root = RB_ROOT; 266static struct rb_root vmap_area_root = RB_ROOT;
267static LIST_HEAD(vmap_area_list); 267static LIST_HEAD(vmap_area_list);
268static unsigned long vmap_area_pcpu_hole;
268 269
269static struct vmap_area *__find_vmap_area(unsigned long addr) 270static struct vmap_area *__find_vmap_area(unsigned long addr)
270{ 271{
@@ -431,6 +432,15 @@ static void __free_vmap_area(struct vmap_area *va)
431 RB_CLEAR_NODE(&va->rb_node); 432 RB_CLEAR_NODE(&va->rb_node);
432 list_del_rcu(&va->list); 433 list_del_rcu(&va->list);
433 434
435 /*
436 * Track the highest possible candidate for pcpu area
437 * allocation. Areas outside of vmalloc area can be returned
438 * here too, consider only end addresses which fall inside
439 * vmalloc area proper.
440 */
441 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
442 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
443
434 call_rcu(&va->rcu_head, rcu_free_va); 444 call_rcu(&va->rcu_head, rcu_free_va);
435} 445}
436 446
@@ -1038,6 +1048,9 @@ void __init vmalloc_init(void)
1038 va->va_end = va->va_start + tmp->size; 1048 va->va_end = va->va_start + tmp->size;
1039 __insert_vmap_area(va); 1049 __insert_vmap_area(va);
1040 } 1050 }
1051
1052 vmap_area_pcpu_hole = VMALLOC_END;
1053
1041 vmap_initialized = true; 1054 vmap_initialized = true;
1042} 1055}
1043 1056
@@ -1122,13 +1135,34 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1122DEFINE_RWLOCK(vmlist_lock); 1135DEFINE_RWLOCK(vmlist_lock);
1123struct vm_struct *vmlist; 1136struct vm_struct *vmlist;
1124 1137
1138static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1139 unsigned long flags, void *caller)
1140{
1141 struct vm_struct *tmp, **p;
1142
1143 vm->flags = flags;
1144 vm->addr = (void *)va->va_start;
1145 vm->size = va->va_end - va->va_start;
1146 vm->caller = caller;
1147 va->private = vm;
1148 va->flags |= VM_VM_AREA;
1149
1150 write_lock(&vmlist_lock);
1151 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1152 if (tmp->addr >= vm->addr)
1153 break;
1154 }
1155 vm->next = *p;
1156 *p = vm;
1157 write_unlock(&vmlist_lock);
1158}
1159
1125static struct vm_struct *__get_vm_area_node(unsigned long size, 1160static struct vm_struct *__get_vm_area_node(unsigned long size,
1126 unsigned long flags, unsigned long start, unsigned long end, 1161 unsigned long flags, unsigned long start, unsigned long end,
1127 int node, gfp_t gfp_mask, void *caller) 1162 int node, gfp_t gfp_mask, void *caller)
1128{ 1163{
1129 static struct vmap_area *va; 1164 static struct vmap_area *va;
1130 struct vm_struct *area; 1165 struct vm_struct *area;
1131 struct vm_struct *tmp, **p;
1132 unsigned long align = 1; 1166 unsigned long align = 1;
1133 1167
1134 BUG_ON(in_interrupt()); 1168 BUG_ON(in_interrupt());
@@ -1147,7 +1181,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1147 if (unlikely(!size)) 1181 if (unlikely(!size))
1148 return NULL; 1182 return NULL;
1149 1183
1150 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1184 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1151 if (unlikely(!area)) 1185 if (unlikely(!area))
1152 return NULL; 1186 return NULL;
1153 1187
@@ -1162,25 +1196,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1162 return NULL; 1196 return NULL;
1163 } 1197 }
1164 1198
1165 area->flags = flags; 1199 insert_vmalloc_vm(area, va, flags, caller);
1166 area->addr = (void *)va->va_start;
1167 area->size = size;
1168 area->pages = NULL;
1169 area->nr_pages = 0;
1170 area->phys_addr = 0;
1171 area->caller = caller;
1172 va->private = area;
1173 va->flags |= VM_VM_AREA;
1174
1175 write_lock(&vmlist_lock);
1176 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1177 if (tmp->addr >= area->addr)
1178 break;
1179 }
1180 area->next = *p;
1181 *p = area;
1182 write_unlock(&vmlist_lock);
1183
1184 return area; 1200 return area;
1185} 1201}
1186 1202
@@ -1818,6 +1834,286 @@ void free_vm_area(struct vm_struct *area)
1818} 1834}
1819EXPORT_SYMBOL_GPL(free_vm_area); 1835EXPORT_SYMBOL_GPL(free_vm_area);
1820 1836
1837static struct vmap_area *node_to_va(struct rb_node *n)
1838{
1839 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
1840}
1841
1842/**
1843 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
1844 * @end: target address
1845 * @pnext: out arg for the next vmap_area
1846 * @pprev: out arg for the previous vmap_area
1847 *
1848 * Returns: %true if either or both of next and prev are found,
1849 * %false if no vmap_area exists
1850 *
1851 * Find vmap_areas end addresses of which enclose @end. ie. if not
1852 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
1853 */
1854static bool pvm_find_next_prev(unsigned long end,
1855 struct vmap_area **pnext,
1856 struct vmap_area **pprev)
1857{
1858 struct rb_node *n = vmap_area_root.rb_node;
1859 struct vmap_area *va = NULL;
1860
1861 while (n) {
1862 va = rb_entry(n, struct vmap_area, rb_node);
1863 if (end < va->va_end)
1864 n = n->rb_left;
1865 else if (end > va->va_end)
1866 n = n->rb_right;
1867 else
1868 break;
1869 }
1870
1871 if (!va)
1872 return false;
1873
1874 if (va->va_end > end) {
1875 *pnext = va;
1876 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
1877 } else {
1878 *pprev = va;
1879 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
1880 }
1881 return true;
1882}
1883
1884/**
1885 * pvm_determine_end - find the highest aligned address between two vmap_areas
1886 * @pnext: in/out arg for the next vmap_area
1887 * @pprev: in/out arg for the previous vmap_area
1888 * @align: alignment
1889 *
1890 * Returns: determined end address
1891 *
1892 * Find the highest aligned address between *@pnext and *@pprev below
1893 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
1894 * down address is between the end addresses of the two vmap_areas.
1895 *
1896 * Please note that the address returned by this function may fall
1897 * inside *@pnext vmap_area. The caller is responsible for checking
1898 * that.
1899 */
1900static unsigned long pvm_determine_end(struct vmap_area **pnext,
1901 struct vmap_area **pprev,
1902 unsigned long align)
1903{
1904 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
1905 unsigned long addr;
1906
1907 if (*pnext)
1908 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
1909 else
1910 addr = vmalloc_end;
1911
1912 while (*pprev && (*pprev)->va_end > addr) {
1913 *pnext = *pprev;
1914 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
1915 }
1916
1917 return addr;
1918}
1919
1920/**
1921 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
1922 * @offsets: array containing offset of each area
1923 * @sizes: array containing size of each area
1924 * @nr_vms: the number of areas to allocate
1925 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
1926 * @gfp_mask: allocation mask
1927 *
1928 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
1929 * vm_structs on success, %NULL on failure
1930 *
1931 * Percpu allocator wants to use congruent vm areas so that it can
1932 * maintain the offsets among percpu areas. This function allocates
1933 * congruent vmalloc areas for it. These areas tend to be scattered
1934 * pretty far, distance between two areas easily going up to
1935 * gigabytes. To avoid interacting with regular vmallocs, these areas
1936 * are allocated from top.
1937 *
1938 * Despite its complicated look, this allocator is rather simple. It
1939 * does everything top-down and scans areas from the end looking for
1940 * matching slot. While scanning, if any of the areas overlaps with
1941 * existing vmap_area, the base address is pulled down to fit the
1942 * area. Scanning is repeated till all the areas fit and then all
1943 * necessary data structres are inserted and the result is returned.
1944 */
1945struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
1946 const size_t *sizes, int nr_vms,
1947 size_t align, gfp_t gfp_mask)
1948{
1949 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
1950 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
1951 struct vmap_area **vas, *prev, *next;
1952 struct vm_struct **vms;
1953 int area, area2, last_area, term_area;
1954 unsigned long base, start, end, last_end;
1955 bool purged = false;
1956
1957 gfp_mask &= GFP_RECLAIM_MASK;
1958
1959 /* verify parameters and allocate data structures */
1960 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
1961 for (last_area = 0, area = 0; area < nr_vms; area++) {
1962 start = offsets[area];
1963 end = start + sizes[area];
1964
1965 /* is everything aligned properly? */
1966 BUG_ON(!IS_ALIGNED(offsets[area], align));
1967 BUG_ON(!IS_ALIGNED(sizes[area], align));
1968
1969 /* detect the area with the highest address */
1970 if (start > offsets[last_area])
1971 last_area = area;
1972
1973 for (area2 = 0; area2 < nr_vms; area2++) {
1974 unsigned long start2 = offsets[area2];
1975 unsigned long end2 = start2 + sizes[area2];
1976
1977 if (area2 == area)
1978 continue;
1979
1980 BUG_ON(start2 >= start && start2 < end);
1981 BUG_ON(end2 <= end && end2 > start);
1982 }
1983 }
1984 last_end = offsets[last_area] + sizes[last_area];
1985
1986 if (vmalloc_end - vmalloc_start < last_end) {
1987 WARN_ON(true);
1988 return NULL;
1989 }
1990
1991 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
1992 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
1993 if (!vas || !vms)
1994 goto err_free;
1995
1996 for (area = 0; area < nr_vms; area++) {
1997 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
1998 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
1999 if (!vas[area] || !vms[area])
2000 goto err_free;
2001 }
2002retry:
2003 spin_lock(&vmap_area_lock);
2004
2005 /* start scanning - we scan from the top, begin with the last area */
2006 area = term_area = last_area;
2007 start = offsets[area];
2008 end = start + sizes[area];
2009
2010 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2011 base = vmalloc_end - last_end;
2012 goto found;
2013 }
2014 base = pvm_determine_end(&next, &prev, align) - end;
2015
2016 while (true) {
2017 BUG_ON(next && next->va_end <= base + end);
2018 BUG_ON(prev && prev->va_end > base + end);
2019
2020 /*
2021 * base might have underflowed, add last_end before
2022 * comparing.
2023 */
2024 if (base + last_end < vmalloc_start + last_end) {
2025 spin_unlock(&vmap_area_lock);
2026 if (!purged) {
2027 purge_vmap_area_lazy();
2028 purged = true;
2029 goto retry;
2030 }
2031 goto err_free;
2032 }
2033
2034 /*
2035 * If next overlaps, move base downwards so that it's
2036 * right below next and then recheck.
2037 */
2038 if (next && next->va_start < base + end) {
2039 base = pvm_determine_end(&next, &prev, align) - end;
2040 term_area = area;
2041 continue;
2042 }
2043
2044 /*
2045 * If prev overlaps, shift down next and prev and move
2046 * base so that it's right below new next and then
2047 * recheck.
2048 */
2049 if (prev && prev->va_end > base + start) {
2050 next = prev;
2051 prev = node_to_va(rb_prev(&next->rb_node));
2052 base = pvm_determine_end(&next, &prev, align) - end;
2053 term_area = area;
2054 continue;
2055 }
2056
2057 /*
2058 * This area fits, move on to the previous one. If
2059 * the previous one is the terminal one, we're done.
2060 */
2061 area = (area + nr_vms - 1) % nr_vms;
2062 if (area == term_area)
2063 break;
2064 start = offsets[area];
2065 end = start + sizes[area];
2066 pvm_find_next_prev(base + end, &next, &prev);
2067 }
2068found:
2069 /* we've found a fitting base, insert all va's */
2070 for (area = 0; area < nr_vms; area++) {
2071 struct vmap_area *va = vas[area];
2072
2073 va->va_start = base + offsets[area];
2074 va->va_end = va->va_start + sizes[area];
2075 __insert_vmap_area(va);
2076 }
2077
2078 vmap_area_pcpu_hole = base + offsets[last_area];
2079
2080 spin_unlock(&vmap_area_lock);
2081
2082 /* insert all vm's */
2083 for (area = 0; area < nr_vms; area++)
2084 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2085 pcpu_get_vm_areas);
2086
2087 kfree(vas);
2088 return vms;
2089
2090err_free:
2091 for (area = 0; area < nr_vms; area++) {
2092 if (vas)
2093 kfree(vas[area]);
2094 if (vms)
2095 kfree(vms[area]);
2096 }
2097 kfree(vas);
2098 kfree(vms);
2099 return NULL;
2100}
2101
2102/**
2103 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2104 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2105 * @nr_vms: the number of allocated areas
2106 *
2107 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2108 */
2109void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2110{
2111 int i;
2112
2113 for (i = 0; i < nr_vms; i++)
2114 free_vm_area(vms[i]);
2115 kfree(vms);
2116}
1821 2117
1822#ifdef CONFIG_PROC_FS 2118#ifdef CONFIG_PROC_FS
1823static void *s_start(struct seq_file *m, loff_t *pos) 2119static void *s_start(struct seq_file *m, loff_t *pos)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index cd2b97f1b6e1..a6e0e077ac33 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -37,12 +37,13 @@ __initcall(init_syncookies);
37#define COOKIEBITS 24 /* Upper bits store count */ 37#define COOKIEBITS 24 /* Upper bits store count */
38#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) 38#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
39 39
40static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; 40static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
41 ipv4_cookie_scratch);
41 42
42static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, 43static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
43 u32 count, int c) 44 u32 count, int c)
44{ 45{
45 __u32 *tmp = __get_cpu_var(cookie_scratch); 46 __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
46 47
47 memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); 48 memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
48 tmp[0] = (__force u32)saddr; 49 tmp[0] = (__force u32)saddr;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 8c2513982b61..6b6ae913b5d4 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
74 return child; 74 return child;
75} 75}
76 76
77static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; 77static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
78 ipv6_cookie_scratch);
78 79
79static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, 80static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr,
80 __be16 sport, __be16 dport, u32 count, int c) 81 __be16 sport, __be16 dport, u32 count, int c)
81{ 82{
82 __u32 *tmp = __get_cpu_var(cookie_scratch); 83 __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch);
83 84
84 /* 85 /*
85 * we have 320 bits of information to hash, copy in the remaining 86 * we have 320 bits of information to hash, copy in the remaining
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 02e3e3d50d4a..301ae51ae409 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -37,7 +37,7 @@
37#include "rds.h" 37#include "rds.h"
38#include "ib.h" 38#include "ib.h"
39 39
40DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; 40DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
41 41
42static char *rds_ib_stat_names[] = { 42static char *rds_ib_stat_names[] = {
43 "ib_connect_raced", 43 "ib_connect_raced",
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
index ccc7e8f0bf0e..fafea3cc92d7 100644
--- a/net/rds/iw_stats.c
+++ b/net/rds/iw_stats.c
@@ -37,7 +37,7 @@
37#include "rds.h" 37#include "rds.h"
38#include "iw.h" 38#include "iw.h"
39 39
40DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; 40DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
41 41
42static char *rds_iw_stat_names[] = { 42static char *rds_iw_stat_names[] = {
43 "iw_connect_raced", 43 "iw_connect_raced",
diff --git a/net/rds/page.c b/net/rds/page.c
index c460743a89ad..de7bb84bcd78 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -39,7 +39,7 @@ struct rds_page_remainder {
39 unsigned long r_offset; 39 unsigned long r_offset;
40}; 40};
41 41
42DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; 42DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
43 43
44/* 44/*
45 * returns 0 on success or -errno on failure. 45 * returns 0 on success or -errno on failure.
diff --git a/scripts/module-common.lds b/scripts/module-common.lds
new file mode 100644
index 000000000000..47a1f9ae0ede
--- /dev/null
+++ b/scripts/module-common.lds
@@ -0,0 +1,8 @@
1/*
2 * Common module linker script, always used when linking a module.
3 * Archs are free to supply their own linker scripts. ld will
4 * combine them automatically.
5 */
6SECTIONS {
7 /DISCARD/ : { *(.discard) }
8}