 Makefile                                 |    2
 arch/alpha/include/asm/percpu.h          |  100
 arch/alpha/include/asm/tlbflush.h        |    1
 arch/alpha/kernel/vmlinux.lds.S          |    9
 arch/arm/kernel/vmlinux.lds.S            |    1
 arch/avr32/kernel/vmlinux.lds.S          |    9
 arch/blackfin/kernel/vmlinux.lds.S       |    5
 arch/blackfin/mm/sram-alloc.c            |    6
 arch/cris/include/asm/mmu_context.h      |    3
 arch/cris/kernel/vmlinux.lds.S           |    9
 arch/cris/mm/fault.c                     |    2
 arch/frv/kernel/vmlinux.lds.S            |    2
 arch/h8300/kernel/vmlinux.lds.S          |    5
 arch/ia64/Kconfig                        |    3
 arch/ia64/kernel/setup.c                 |    6
 arch/ia64/kernel/smp.c                   |    3
 arch/ia64/kernel/vmlinux.lds.S           |   16
 arch/ia64/sn/kernel/setup.c              |    2
 arch/m32r/kernel/vmlinux.lds.S           |   10
 arch/m68k/kernel/vmlinux-std.lds         |   10
 arch/m68k/kernel/vmlinux-sun3.lds        |    9
 arch/m68knommu/kernel/vmlinux.lds.S      |    7
 arch/microblaze/kernel/vmlinux.lds.S     |    6
 arch/mips/kernel/vmlinux.lds.S           |   21
 arch/mn10300/kernel/vmlinux.lds.S        |    8
 arch/parisc/kernel/vmlinux.lds.S         |    8
 arch/powerpc/Kconfig                     |    3
 arch/powerpc/kernel/vmlinux.lds.S        |    9
 arch/powerpc/mm/stab.c                   |    2
 arch/powerpc/platforms/ps3/smp.c         |    2
 arch/s390/include/asm/percpu.h           |   32
 arch/s390/kernel/vmlinux.lds.S           |    9
 arch/sh/kernel/vmlinux.lds.S             |   10
 arch/sparc/Kconfig                       |    3
 arch/sparc/kernel/smp_64.c               |   42
 arch/sparc/kernel/vmlinux.lds.S          |    8
 arch/um/include/asm/common.lds.S         |    5
 arch/um/kernel/dyn.lds.S                 |    2
 arch/um/kernel/uml.lds.S                 |    2
 arch/x86/Kconfig                         |    3
 arch/x86/include/asm/percpu.h            |    9
 arch/x86/kernel/cpu/cpu_debug.c          |    4
 arch/x86/kernel/cpu/mcheck/mce.c         |    8
 arch/x86/kernel/cpu/mcheck/mce_amd.c     |    2
 arch/x86/kernel/cpu/perf_counter.c       |   14
 arch/x86/kernel/setup_percpu.c           |  297
 arch/x86/kernel/vmlinux.lds.S            |   11
 arch/x86/mm/pageattr.c                   |    1
 arch/xtensa/kernel/vmlinux.lds.S         |   13
 block/as-iosched.c                       |   10
 block/cfq-iosched.c                      |   10
 drivers/cpufreq/cpufreq_conservative.c   |   12
 drivers/cpufreq/cpufreq_ondemand.c       |   15
 drivers/xen/events.c                     |   13
 include/asm-generic/vmlinux.lds.h        |   24
 include/linux/percpu-defs.h              |   66
 include/linux/percpu.h                   |   85
 init/main.c                              |   24
 kernel/module.c                          |    6
 kernel/perf_counter.c                    |    6
 kernel/sched.c                           |    4
 kernel/trace/trace_events.c              |    6
 lib/Kconfig.debug                        |   15
 mm/Makefile                              |    2
 mm/allocpercpu.c                         |   28
 mm/kmemleak-test.c                       |    6
 mm/page-writeback.c                      |    5
 mm/percpu.c                              | 1318
 mm/quicklist.c                           |    2
 mm/slub.c                                |    4
 net/ipv4/syncookies.c                    |    5
 net/ipv6/syncookies.c                    |    5
 net/rds/ib_stats.c                       |    2
 net/rds/iw_stats.c                       |    2
 net/rds/page.c                           |    2
 scripts/module-common.lds                |    8
 76 files changed, 1511 insertions(+), 928 deletions(-)
diff --git a/Makefile b/Makefile
index abcfa85f8f82..e1e7a71355d8 100644
--- a/Makefile
+++ b/Makefile
@@ -325,7 +325,7 @@ CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
325MODFLAGS = -DMODULE 325MODFLAGS = -DMODULE
326CFLAGS_MODULE = $(MODFLAGS) 326CFLAGS_MODULE = $(MODFLAGS)
327AFLAGS_MODULE = $(MODFLAGS) 327AFLAGS_MODULE = $(MODFLAGS)
328LDFLAGS_MODULE = 328LDFLAGS_MODULE = -T $(srctree)/scripts/module-common.lds
329CFLAGS_KERNEL = 329CFLAGS_KERNEL =
330AFLAGS_KERNEL = 330AFLAGS_KERNEL =
331CFLAGS_GCOV = -fprofile-arcs -ftest-coverage 331CFLAGS_GCOV = -fprofile-arcs -ftest-coverage
diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h
index b663f1f10b6a..2c12378e3aa9 100644
--- a/arch/alpha/include/asm/percpu.h
+++ b/arch/alpha/include/asm/percpu.h
@@ -1,102 +1,18 @@
1#ifndef __ALPHA_PERCPU_H 1#ifndef __ALPHA_PERCPU_H
2#define __ALPHA_PERCPU_H 2#define __ALPHA_PERCPU_H
3 3
4#include <linux/compiler.h>
5#include <linux/threads.h>
6#include <linux/percpu-defs.h>
7
8/*
9 * Determine the real variable name from the name visible in the
10 * kernel sources.
11 */
12#define per_cpu_var(var) per_cpu__##var
13
14#ifdef CONFIG_SMP
15
16/*
17 * per_cpu_offset() is the offset that has to be added to a
18 * percpu variable to get to the instance for a certain processor.
19 */
20extern unsigned long __per_cpu_offset[NR_CPUS];
21
22#define per_cpu_offset(x) (__per_cpu_offset[x])
23
24#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id())
25#ifdef CONFIG_DEBUG_PREEMPT
26#define my_cpu_offset per_cpu_offset(smp_processor_id())
27#else
28#define my_cpu_offset __my_cpu_offset
29#endif
30
31#ifndef MODULE
32#define SHIFT_PERCPU_PTR(var, offset) RELOC_HIDE(&per_cpu_var(var), (offset))
33#define PER_CPU_DEF_ATTRIBUTES
34#else
35/* 4/*
36 * To calculate addresses of locally defined variables, GCC uses 32-bit 5 * To calculate addresses of locally defined variables, GCC uses
37 * displacement from the GP. Which doesn't work for per cpu variables in 6 * 32-bit displacement from the GP. Which doesn't work for per cpu
38 * modules, as an offset to the kernel per cpu area is way above 4G. 7 * variables in modules, as an offset to the kernel per cpu area is
8 * way above 4G.
39 * 9 *
40 * This forces allocation of a GOT entry for per cpu variable using 10 * Always use weak definitions for percpu variables in modules.
41 * ldq instruction with a 'literal' relocation.
42 */
43#define SHIFT_PERCPU_PTR(var, offset) ({ \
44 extern int simple_identifier_##var(void); \
45 unsigned long __ptr, tmp_gp; \
46 asm ( "br %1, 1f \n\
47 1: ldgp %1, 0(%1) \n\
48 ldq %0, per_cpu__" #var"(%1)\t!literal" \
49 : "=&r"(__ptr), "=&r"(tmp_gp)); \
50 (typeof(&per_cpu_var(var)))(__ptr + (offset)); })
51
52#define PER_CPU_DEF_ATTRIBUTES __used
53
54#endif /* MODULE */
55
56/*
57 * A percpu variable may point to a discarded regions. The following are
58 * established ways to produce a usable pointer from the percpu variable
59 * offset.
60 */ 11 */
61#define per_cpu(var, cpu) \ 12#if defined(MODULE) && defined(CONFIG_SMP)
62 (*SHIFT_PERCPU_PTR(var, per_cpu_offset(cpu))) 13#define ARCH_NEEDS_WEAK_PER_CPU
63#define __get_cpu_var(var) \
64 (*SHIFT_PERCPU_PTR(var, my_cpu_offset))
65#define __raw_get_cpu_var(var) \
66 (*SHIFT_PERCPU_PTR(var, __my_cpu_offset))
67
68#else /* ! SMP */
69
70#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var)))
71#define __get_cpu_var(var) per_cpu_var(var)
72#define __raw_get_cpu_var(var) per_cpu_var(var)
73
74#define PER_CPU_DEF_ATTRIBUTES
75
76#endif /* SMP */
77
78#ifdef CONFIG_SMP
79#define PER_CPU_BASE_SECTION ".data.percpu"
80#else
81#define PER_CPU_BASE_SECTION ".data"
82#endif
83
84#ifdef CONFIG_SMP
85
86#ifdef MODULE
87#define PER_CPU_SHARED_ALIGNED_SECTION ""
88#else
89#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
90#endif
91#define PER_CPU_FIRST_SECTION ".first"
92
93#else
94
95#define PER_CPU_SHARED_ALIGNED_SECTION ""
96#define PER_CPU_FIRST_SECTION ""
97
98#endif 14#endif
99 15
100#define PER_CPU_ATTRIBUTES 16#include <asm-generic/percpu.h>
101 17
102#endif /* __ALPHA_PERCPU_H */ 18#endif /* __ALPHA_PERCPU_H */
diff --git a/arch/alpha/include/asm/tlbflush.h b/arch/alpha/include/asm/tlbflush.h
index 9d87aaa08c0d..e89e0c2e15b1 100644
--- a/arch/alpha/include/asm/tlbflush.h
+++ b/arch/alpha/include/asm/tlbflush.h
@@ -2,6 +2,7 @@
2#define _ALPHA_TLBFLUSH_H 2#define _ALPHA_TLBFLUSH_H
3 3
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/sched.h>
5#include <asm/compiler.h> 6#include <asm/compiler.h>
6#include <asm/pgalloc.h> 7#include <asm/pgalloc.h>
7 8
diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index b9d6568e5f7f..6dc03c35caa0 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -134,13 +134,6 @@ SECTIONS
134 __bss_stop = .; 134 __bss_stop = .;
135 _end = .; 135 _end = .;
136 136
137 /* Sections to be discarded */
138 /DISCARD/ : {
139 EXIT_TEXT
140 EXIT_DATA
141 *(.exitcall.exit)
142 }
143
144 .mdebug 0 : { 137 .mdebug 0 : {
145 *(.mdebug) 138 *(.mdebug)
146 } 139 }
@@ -150,4 +143,6 @@ SECTIONS
150 143
151 STABS_DEBUG 144 STABS_DEBUG
152 DWARF_DEBUG 145 DWARF_DEBUG
146
147 DISCARDS
153} 148}
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 69371028a202..5cc4812c9763 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -83,6 +83,7 @@ SECTIONS
83 EXIT_TEXT 83 EXIT_TEXT
84 EXIT_DATA 84 EXIT_DATA
85 *(.exitcall.exit) 85 *(.exitcall.exit)
86 *(.discard)
86 *(.ARM.exidx.exit.text) 87 *(.ARM.exidx.exit.text)
87 *(.ARM.extab.exit.text) 88 *(.ARM.extab.exit.text)
88#ifndef CONFIG_HOTPLUG_CPU 89#ifndef CONFIG_HOTPLUG_CPU
diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S
index 7910d41eb886..c4b56654349a 100644
--- a/arch/avr32/kernel/vmlinux.lds.S
+++ b/arch/avr32/kernel/vmlinux.lds.S
@@ -124,14 +124,11 @@ SECTIONS
124 _end = .; 124 _end = .;
125 } 125 }
126 126
127 DWARF_DEBUG
128
127 /* When something in the kernel is NOT compiled as a module, the module 129 /* When something in the kernel is NOT compiled as a module, the module
128 * cleanup code and data are put into these segments. Both can then be 130 * cleanup code and data are put into these segments. Both can then be
129 * thrown away, as cleanup code is never called unless it's a module. 131 * thrown away, as cleanup code is never called unless it's a module.
130 */ 132 */
131 /DISCARD/ : { 133 DISCARDS
132 EXIT_DATA
133 *(.exitcall.exit)
134 }
135
136 DWARF_DEBUG
137} 134}
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index 6ac307ca0d80..d7ffe299b979 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -277,8 +277,5 @@ SECTIONS
277 277
278 DWARF_DEBUG 278 DWARF_DEBUG
279 279
280 /DISCARD/ : 280 DISCARDS
281 {
282 *(.exitcall.exit)
283 }
284} 281}
diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c
index 0bc3c4ef0aad..99e4dbb1dfd1 100644
--- a/arch/blackfin/mm/sram-alloc.c
+++ b/arch/blackfin/mm/sram-alloc.c
@@ -42,9 +42,9 @@
42#include <asm/mem_map.h> 42#include <asm/mem_map.h>
43#include "blackfin_sram.h" 43#include "blackfin_sram.h"
44 44
45static DEFINE_PER_CPU(spinlock_t, l1sram_lock) ____cacheline_aligned_in_smp; 45static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1sram_lock);
46static DEFINE_PER_CPU(spinlock_t, l1_data_sram_lock) ____cacheline_aligned_in_smp; 46static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_data_sram_lock);
47static DEFINE_PER_CPU(spinlock_t, l1_inst_sram_lock) ____cacheline_aligned_in_smp; 47static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_inst_sram_lock);
48static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp; 48static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp;
49 49
50/* the data structure for L1 scratchpad and DATA SRAM */ 50/* the data structure for L1 scratchpad and DATA SRAM */
diff --git a/arch/cris/include/asm/mmu_context.h b/arch/cris/include/asm/mmu_context.h
index 72ba08dcfd18..1d45fd6365b7 100644
--- a/arch/cris/include/asm/mmu_context.h
+++ b/arch/cris/include/asm/mmu_context.h
@@ -17,7 +17,8 @@ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
17 * registers like cr3 on the i386 17 * registers like cr3 on the i386
18 */ 18 */
19 19
20extern volatile DEFINE_PER_CPU(pgd_t *,current_pgd); /* defined in arch/cris/mm/fault.c */ 20/* defined in arch/cris/mm/fault.c */
21DECLARE_PER_CPU(pgd_t *, current_pgd);
21 22
22static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 23static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
23{ 24{
diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S
index 0d2adfc794d4..6c81836b9229 100644
--- a/arch/cris/kernel/vmlinux.lds.S
+++ b/arch/cris/kernel/vmlinux.lds.S
@@ -140,12 +140,7 @@ SECTIONS
140 _end = .; 140 _end = .;
141 __end = .; 141 __end = .;
142 142
143 /* Sections to be discarded */
144 /DISCARD/ : {
145 EXIT_TEXT
146 EXIT_DATA
147 *(.exitcall.exit)
148 }
149
150 dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024; 143 dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024;
144
145 DISCARDS
151} 146}
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index f925115e3250..4a7cdd9ea1ee 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -29,7 +29,7 @@ extern void die_if_kernel(const char *, struct pt_regs *, long);
29 29
30/* current active page directory */ 30/* current active page directory */
31 31
32volatile DEFINE_PER_CPU(pgd_t *,current_pgd); 32DEFINE_PER_CPU(pgd_t *, current_pgd);
33unsigned long cris_signal_return_page; 33unsigned long cris_signal_return_page;
34 34
35/* 35/*
diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S
index 22d9787406ed..7dbf41f68b52 100644
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -177,6 +177,8 @@ SECTIONS
177 .debug_ranges 0 : { *(.debug_ranges) } 177 .debug_ranges 0 : { *(.debug_ranges) }
178 178
179 .comment 0 : { *(.comment) } 179 .comment 0 : { *(.comment) }
180
181 DISCARDS
180} 182}
181 183
182__kernel_image_size_no_bss = __bss_start - __kernel_image_start; 184__kernel_image_size_no_bss = __bss_start - __kernel_image_start;
diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 43a87b9085b6..662b02ecb86e 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -152,9 +152,6 @@ SECTIONS
152 __end = . ; 152 __end = . ;
153 __ramstart = .; 153 __ramstart = .;
154 } 154 }
155 /DISCARD/ : {
156 *(.exitcall.exit)
157 }
158 .romfs : 155 .romfs :
159 { 156 {
160 *(.romfs*) 157 *(.romfs*)
@@ -165,4 +162,6 @@ SECTIONS
165 COMMAND_START = . - 0x200 ; 162 COMMAND_START = . - 0x200 ;
166 __ramend = . ; 163 __ramend = . ;
167 } 164 }
165
166 DISCARDS
168} 167}
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 170042b420d4..328d2f8b8c3f 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL
89 bool 89 bool
90 default y 90 default y
91 91
92config HAVE_LEGACY_PER_CPU_AREA
93 def_bool y
94
92config HAVE_SETUP_PER_CPU_AREA 95config HAVE_SETUP_PER_CPU_AREA
93 def_bool y 96 def_bool y
94 97
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 1b23ec126b63..1de86c96801d 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -855,11 +855,17 @@ identify_cpu (struct cpuinfo_ia64 *c)
855 c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); 855 c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1));
856} 856}
857 857
858/*
859 * In UP configuration, setup_per_cpu_areas() is defined in
860 * include/linux/percpu.h
861 */
862#ifdef CONFIG_SMP
858void __init 863void __init
859setup_per_cpu_areas (void) 864setup_per_cpu_areas (void)
860{ 865{
861 /* start_kernel() requires this... */ 866 /* start_kernel() requires this... */
862} 867}
868#endif
863 869
864/* 870/*
865 * Do the following calculations: 871 * Do the following calculations:
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index f0c521b0ba4c..93ebfea43c6c 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -58,7 +58,8 @@ static struct local_tlb_flush_counts {
58 unsigned int count; 58 unsigned int count;
59} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; 59} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS];
60 60
61static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned; 61static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS],
62 shadow_flush_counts);
62 63
63#define IPI_CALL_FUNC 0 64#define IPI_CALL_FUNC 0
64#define IPI_CPU_STOP 1 65#define IPI_CPU_STOP 1
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 4a95e86b9ac2..eb4214d1c5af 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -24,14 +24,14 @@ PHDRS {
24} 24}
25SECTIONS 25SECTIONS
26{ 26{
27 /* Sections to be discarded */ 27 /* unwind exit sections must be discarded before the rest of the
28 sections get included. */
28 /DISCARD/ : { 29 /DISCARD/ : {
29 EXIT_TEXT
30 EXIT_DATA
31 *(.exitcall.exit)
32 *(.IA_64.unwind.exit.text) 30 *(.IA_64.unwind.exit.text)
33 *(.IA_64.unwind_info.exit.text) 31 *(.IA_64.unwind_info.exit.text)
34 } 32 *(.comment)
33 *(.note)
34 }
35 35
36 v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ 36 v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */
37 phys_start = _start - LOAD_OFFSET; 37 phys_start = _start - LOAD_OFFSET;
@@ -316,7 +316,7 @@ SECTIONS
316 .debug_funcnames 0 : { *(.debug_funcnames) } 316 .debug_funcnames 0 : { *(.debug_funcnames) }
317 .debug_typenames 0 : { *(.debug_typenames) } 317 .debug_typenames 0 : { *(.debug_typenames) }
318 .debug_varnames 0 : { *(.debug_varnames) } 318 .debug_varnames 0 : { *(.debug_varnames) }
319 /* These must appear regardless of . */ 319
320 /DISCARD/ : { *(.comment) } 320 /* Default discards */
321 /DISCARD/ : { *(.note) } 321 DISCARDS
322} 322}
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index e456f062f241..ece1bf994499 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -71,7 +71,7 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
71DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info); 71DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info);
72EXPORT_PER_CPU_SYMBOL(__sn_hub_info); 72EXPORT_PER_CPU_SYMBOL(__sn_hub_info);
73 73
74DEFINE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]); 74DEFINE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid);
75EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid); 75EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid);
76 76
77DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda); 77DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda);
diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S
index 4179adf6c624..de5e21cca6a5 100644
--- a/arch/m32r/kernel/vmlinux.lds.S
+++ b/arch/m32r/kernel/vmlinux.lds.S
@@ -120,13 +120,6 @@ SECTIONS
120 120
121 _end = . ; 121 _end = . ;
122 122
123 /* Sections to be discarded */
124 /DISCARD/ : {
125 EXIT_TEXT
126 EXIT_DATA
127 *(.exitcall.exit)
128 }
129
130 /* Stabs debugging sections. */ 123 /* Stabs debugging sections. */
131 .stab 0 : { *(.stab) } 124 .stab 0 : { *(.stab) }
132 .stabstr 0 : { *(.stabstr) } 125 .stabstr 0 : { *(.stabstr) }
@@ -135,4 +128,7 @@ SECTIONS
135 .stab.index 0 : { *(.stab.index) } 128 .stab.index 0 : { *(.stab.index) }
136 .stab.indexstr 0 : { *(.stab.indexstr) } 129 .stab.indexstr 0 : { *(.stab.indexstr) }
137 .comment 0 : { *(.comment) } 130 .comment 0 : { *(.comment) }
131
132 /* Sections to be discarded */
133 DISCARDS
138} 134}
diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds
index 01d212bb05a6..47eac19e8f61 100644
--- a/arch/m68k/kernel/vmlinux-std.lds
+++ b/arch/m68k/kernel/vmlinux-std.lds
@@ -82,13 +82,6 @@ SECTIONS
82 82
83 _end = . ; 83 _end = . ;
84 84
85 /* Sections to be discarded */
86 /DISCARD/ : {
87 EXIT_TEXT
88 EXIT_DATA
89 *(.exitcall.exit)
90 }
91
92 /* Stabs debugging sections. */ 85 /* Stabs debugging sections. */
93 .stab 0 : { *(.stab) } 86 .stab 0 : { *(.stab) }
94 .stabstr 0 : { *(.stabstr) } 87 .stabstr 0 : { *(.stabstr) }
@@ -97,4 +90,7 @@ SECTIONS
97 .stab.index 0 : { *(.stab.index) } 90 .stab.index 0 : { *(.stab.index) }
98 .stab.indexstr 0 : { *(.stab.indexstr) } 91 .stab.indexstr 0 : { *(.stab.indexstr) }
99 .comment 0 : { *(.comment) } 92 .comment 0 : { *(.comment) }
93
94 /* Sections to be discarded */
95 DISCARDS
100} 96}
diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds
index c192f773db96..03efaf04d7d7 100644
--- a/arch/m68k/kernel/vmlinux-sun3.lds
+++ b/arch/m68k/kernel/vmlinux-sun3.lds
@@ -77,13 +77,6 @@ __init_begin = .;
77 77
78 _end = . ; 78 _end = . ;
79 79
80 /* Sections to be discarded */
81 /DISCARD/ : {
82 EXIT_TEXT
83 EXIT_DATA
84 *(.exitcall.exit)
85 }
86
87 .crap : { 80 .crap : {
88 /* Stabs debugging sections. */ 81 /* Stabs debugging sections. */
89 *(.stab) 82 *(.stab)
@@ -96,4 +89,6 @@ __init_begin = .;
96 *(.note) 89 *(.note)
97 } 90 }
98 91
92 /* Sections to be discarded */
93 DISCARDS
99} 94}
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index b7fe505e358d..2736a5e309c0 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -184,12 +184,6 @@ SECTIONS {
184 __init_end = .; 184 __init_end = .;
185 } > INIT 185 } > INIT
186 186
187 /DISCARD/ : {
188 EXIT_TEXT
189 EXIT_DATA
190 *(.exitcall.exit)
191 }
192
193 .bss : { 187 .bss : {
194 . = ALIGN(4); 188 . = ALIGN(4);
195 _sbss = . ; 189 _sbss = . ;
@@ -200,5 +194,6 @@ SECTIONS {
200 _end = . ; 194 _end = . ;
201 } > BSS 195 } > BSS
202 196
197 DISCARDS
203} 198}
204 199
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index d34d38dcd12c..ec5fa91a48d8 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -23,8 +23,8 @@ SECTIONS {
23 _stext = . ; 23 _stext = . ;
24 *(.text .text.*) 24 *(.text .text.*)
25 *(.fixup) 25 *(.fixup)
26 26 EXIT_TEXT
27 *(.exitcall.exit) 27 EXIT_CALL
28 SCHED_TEXT 28 SCHED_TEXT
29 LOCK_TEXT 29 LOCK_TEXT
30 KPROBES_TEXT 30 KPROBES_TEXT
@@ -162,4 +162,6 @@ SECTIONS {
162 } 162 }
163 . = ALIGN(4096); 163 . = ALIGN(4096);
164 _end = .; 164 _end = .;
165
166 DISCARDS
165} 167}
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 58738c8d754f..1474c18fb777 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -176,17 +176,6 @@ SECTIONS
176 176
177 _end = . ; 177 _end = . ;
178 178
179 /* Sections to be discarded */
180 /DISCARD/ : {
181 *(.exitcall.exit)
182
183 /* ABI crap starts here */
184 *(.MIPS.options)
185 *(.options)
186 *(.pdr)
187 *(.reginfo)
188 }
189
190 /* These mark the ABI of the kernel for debuggers. */ 179 /* These mark the ABI of the kernel for debuggers. */
191 .mdebug.abi32 : { 180 .mdebug.abi32 : {
192 KEEP(*(.mdebug.abi32)) 181 KEEP(*(.mdebug.abi32))
@@ -212,4 +201,14 @@ SECTIONS
212 *(.gptab.bss) 201 *(.gptab.bss)
213 *(.gptab.sbss) 202 *(.gptab.sbss)
214 } 203 }
204
205 /* Sections to be discarded */
206 DISCARDS
207 /DISCARD/ : {
208 /* ABI crap starts here */
209 *(.MIPS.options)
210 *(.options)
211 *(.pdr)
212 *(.reginfo)
213 }
215} 214}
diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S
index f4aa07934654..76f41bdb79c4 100644
--- a/arch/mn10300/kernel/vmlinux.lds.S
+++ b/arch/mn10300/kernel/vmlinux.lds.S
@@ -115,12 +115,10 @@ SECTIONS
115 . = ALIGN(PAGE_SIZE); 115 . = ALIGN(PAGE_SIZE);
116 pg0 = .; 116 pg0 = .;
117 117
118 /* Sections to be discarded */
119 /DISCARD/ : {
120 EXIT_CALL
121 }
122
123 STABS_DEBUG 118 STABS_DEBUG
124 119
125 DWARF_DEBUG 120 DWARF_DEBUG
121
122 /* Sections to be discarded */
123 DISCARDS
126} 124}
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index fd2cc4fd2b65..aea1784edbd1 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -237,9 +237,12 @@ SECTIONS
237 /* freed after init ends here */ 237 /* freed after init ends here */
238 _end = . ; 238 _end = . ;
239 239
240 STABS_DEBUG
241 .note 0 : { *(.note) }
242
240 /* Sections to be discarded */ 243 /* Sections to be discarded */
244 DISCARDS
241 /DISCARD/ : { 245 /DISCARD/ : {
242 *(.exitcall.exit)
243#ifdef CONFIG_64BIT 246#ifdef CONFIG_64BIT
244 /* temporary hack until binutils is fixed to not emit these 247 /* temporary hack until binutils is fixed to not emit these
245 * for static binaries 248 * for static binaries
@@ -252,7 +255,4 @@ SECTIONS
252 *(.gnu.hash) 255 *(.gnu.hash)
253#endif 256#endif
254 } 257 }
255
256 STABS_DEBUG
257 .note 0 : { *(.note) }
258} 258}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d00131ca0835..61bbffa2fe60 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -46,6 +46,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
46 bool 46 bool
47 default y 47 default y
48 48
49config HAVE_LEGACY_PER_CPU_AREA
50 def_bool PPC64
51
49config HAVE_SETUP_PER_CPU_AREA 52config HAVE_SETUP_PER_CPU_AREA
50 def_bool PPC64 53 def_bool PPC64
51 54
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 8ef8a14abc95..244e3658983c 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -37,12 +37,6 @@ jiffies = jiffies_64 + 4;
37#endif 37#endif
38SECTIONS 38SECTIONS
39{ 39{
40 /* Sections to be discarded. */
41 /DISCARD/ : {
42 *(.exitcall.exit)
43 EXIT_DATA
44 }
45
46 . = KERNELBASE; 40 . = KERNELBASE;
47 41
48/* 42/*
@@ -298,4 +292,7 @@ SECTIONS
298 . = ALIGN(PAGE_SIZE); 292 . = ALIGN(PAGE_SIZE);
299 _end = . ; 293 _end = . ;
300 PROVIDE32 (end = .); 294 PROVIDE32 (end = .);
295
296 /* Sections to be discarded. */
297 DISCARDS
301} 298}
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 98cd1dc2ae75..6e9b69c99856 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -31,7 +31,7 @@ struct stab_entry {
31 31
32#define NR_STAB_CACHE_ENTRIES 8 32#define NR_STAB_CACHE_ENTRIES 8
33static DEFINE_PER_CPU(long, stab_cache_ptr); 33static DEFINE_PER_CPU(long, stab_cache_ptr);
34static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); 34static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
35 35
36/* 36/*
37 * Create a segment table entry for the given esid/vsid pair. 37 * Create a segment table entry for the given esid/vsid pair.
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index f6e04bcc70ef..51ffde40af2b 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -37,7 +37,7 @@
37 */ 37 */
38 38
39#define MSG_COUNT 4 39#define MSG_COUNT 4
40static DEFINE_PER_CPU(unsigned int, ps3_ipi_virqs[MSG_COUNT]); 40static DEFINE_PER_CPU(unsigned int [MSG_COUNT], ps3_ipi_virqs);
41 41
42static void do_message_pass(int target, int msg) 42static void do_message_pass(int target, int msg)
43{ 43{
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 408d60b4f75b..f7ad8719d02d 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -1,37 +1,21 @@
1#ifndef __ARCH_S390_PERCPU__ 1#ifndef __ARCH_S390_PERCPU__
2#define __ARCH_S390_PERCPU__ 2#define __ARCH_S390_PERCPU__
3 3
4#include <linux/compiler.h>
5#include <asm/lowcore.h>
6
7/* 4/*
8 * s390 uses its own implementation for per cpu data, the offset of 5 * s390 uses its own implementation for per cpu data, the offset of
9 * the cpu local data area is cached in the cpu's lowcore memory. 6 * the cpu local data area is cached in the cpu's lowcore memory.
10 * For 64 bit module code s390 forces the use of a GOT slot for the
11 * address of the per cpu variable. This is needed because the module
12 * may be more than 4G above the per cpu area.
13 */ 7 */
14#if defined(__s390x__) && defined(MODULE) 8#define __my_cpu_offset S390_lowcore.percpu_offset
15
16#define SHIFT_PERCPU_PTR(ptr,offset) (({ \
17 extern int simple_identifier_##var(void); \
18 unsigned long *__ptr; \
19 asm ( "larl %0, %1@GOTENT" \
20 : "=a" (__ptr) : "X" (ptr) ); \
21 (typeof(ptr))((*__ptr) + (offset)); }))
22
23#else
24
25#define SHIFT_PERCPU_PTR(ptr, offset) (({ \
26 extern int simple_identifier_##var(void); \
27 unsigned long __ptr; \
28 asm ( "" : "=a" (__ptr) : "0" (ptr) ); \
29 (typeof(ptr)) (__ptr + (offset)); }))
30 9
10/*
11 * For 64 bit module code, the module may be more than 4G above the
12 * per cpu area, use weak definitions to force the compiler to
13 * generate external references.
14 */
15#if defined(CONFIG_SMP) && defined(__s390x__) && defined(MODULE)
16#define ARCH_NEEDS_WEAK_PER_CPU
31#endif 17#endif
32 18
33#define __my_cpu_offset S390_lowcore.percpu_offset
34
35#include <asm-generic/percpu.h> 19#include <asm-generic/percpu.h>
36 20
37#endif /* __ARCH_S390_PERCPU__ */ 21#endif /* __ARCH_S390_PERCPU__ */
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index a53db23ee092..82415c75b996 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -157,13 +157,10 @@ SECTIONS
157 157
158 _end = . ; 158 _end = . ;
159 159
160 /* Sections to be discarded */
161 /DISCARD/ : {
162 EXIT_DATA
163 *(.exitcall.exit)
164 }
165
166 /* Debugging sections. */ 160 /* Debugging sections. */
167 STABS_DEBUG 161 STABS_DEBUG
168 DWARF_DEBUG 162 DWARF_DEBUG
163
164 /* Sections to be discarded */
165 DISCARDS
169} 166}
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index f53c76acaede..0ce254bca92f 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -163,16 +163,14 @@ SECTIONS
163 _end = . ; 163 _end = . ;
164 } 164 }
165 165
166 STABS_DEBUG
167 DWARF_DEBUG
168
166 /* 169 /*
167 * When something in the kernel is NOT compiled as a module, the 170 * When something in the kernel is NOT compiled as a module, the
168 * module cleanup code and data are put into these segments. Both 171 * module cleanup code and data are put into these segments. Both
169 * can then be thrown away, as cleanup code is never called unless 172 * can then be thrown away, as cleanup code is never called unless
170 * it's a module. 173 * it's a module.
171 */ 174 */
172 /DISCARD/ : { 175 DISCARDS
173 *(.exitcall.exit)
174 }
175
176 STABS_DEBUG
177 DWARF_DEBUG
178} 176}
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 3f8b6a92eabd..4f6ed0f113f0 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -95,9 +95,6 @@ config AUDIT_ARCH
95config HAVE_SETUP_PER_CPU_AREA 95config HAVE_SETUP_PER_CPU_AREA
96 def_bool y if SPARC64 96 def_bool y if SPARC64
97 97
98config HAVE_DYNAMIC_PER_CPU_AREA
99 def_bool y if SPARC64
100
101config GENERIC_HARDIRQS_NO__DO_IRQ 98config GENERIC_HARDIRQS_NO__DO_IRQ
102 bool 99 bool
103 def_bool y if SPARC64 100 def_bool y if SPARC64
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 3691907a43b4..9856d866b77b 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1415,19 +1415,6 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
1415#endif 1415#endif
1416} 1416}
1417 1417
1418static size_t pcpur_size __initdata;
1419static void **pcpur_ptrs __initdata;
1420
1421static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
1422{
1423 size_t off = (size_t)pageno << PAGE_SHIFT;
1424
1425 if (off >= pcpur_size)
1426 return NULL;
1427
1428 return virt_to_page(pcpur_ptrs[cpu] + off);
1429}
1430
1431#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL) 1418#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
1432 1419
1433static void __init pcpu_map_range(unsigned long start, unsigned long end, 1420static void __init pcpu_map_range(unsigned long start, unsigned long end,
@@ -1491,25 +1478,26 @@ void __init setup_per_cpu_areas(void)
1491 size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; 1478 size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
1492 static struct vm_struct vm; 1479 static struct vm_struct vm;
1493 unsigned long delta, cpu; 1480 unsigned long delta, cpu;
1494 size_t pcpu_unit_size; 1481 size_t size_sum, pcpu_unit_size;
1495 size_t ptrs_size; 1482 size_t ptrs_size;
1483 void **ptrs;
1496 1484
1497 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + 1485 size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
1498 PERCPU_DYNAMIC_RESERVE); 1486 PERCPU_DYNAMIC_RESERVE);
1499 dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE; 1487 dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
1500 1488
1501 1489
1502 ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0])); 1490 ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
1503 pcpur_ptrs = alloc_bootmem(ptrs_size); 1491 ptrs = alloc_bootmem(ptrs_size);
1504 1492
1505 for_each_possible_cpu(cpu) { 1493 for_each_possible_cpu(cpu) {
1506 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, 1494 ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
1507 PCPU_CHUNK_SIZE); 1495 PCPU_CHUNK_SIZE);
1508 1496
1509 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), 1497 free_bootmem(__pa(ptrs[cpu] + size_sum),
1510 PCPU_CHUNK_SIZE - pcpur_size); 1498 PCPU_CHUNK_SIZE - size_sum);
1511 1499
1512 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); 1500 memcpy(ptrs[cpu], __per_cpu_load, static_size);
1513 } 1501 }
1514 1502
1515 /* allocate address and map */ 1503 /* allocate address and map */
@@ -1523,14 +1511,14 @@ void __init setup_per_cpu_areas(void)
1523 1511
1524 start += cpu * PCPU_CHUNK_SIZE; 1512 start += cpu * PCPU_CHUNK_SIZE;
1525 end = start + PCPU_CHUNK_SIZE; 1513 end = start + PCPU_CHUNK_SIZE;
1526 pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu])); 1514 pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
1527 } 1515 }
1528 1516
1529 pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size, 1517 pcpu_unit_size = pcpu_setup_first_chunk(static_size,
1530 PERCPU_MODULE_RESERVE, dyn_size, 1518 PERCPU_MODULE_RESERVE, dyn_size,
1531 PCPU_CHUNK_SIZE, vm.addr, NULL); 1519 PCPU_CHUNK_SIZE, vm.addr, NULL);
1532 1520
1533 free_bootmem(__pa(pcpur_ptrs), ptrs_size); 1521 free_bootmem(__pa(ptrs), ptrs_size);
1534 1522
1535 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 1523 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1536 for_each_possible_cpu(cpu) { 1524 for_each_possible_cpu(cpu) {
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index fcbbd000ec08..866390feb683 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -171,12 +171,8 @@ SECTIONS
171 } 171 }
172 _end = . ; 172 _end = . ;
173 173
174 /DISCARD/ : {
175 EXIT_TEXT
176 EXIT_DATA
177 *(.exitcall.exit)
178 }
179
180 STABS_DEBUG 174 STABS_DEBUG
181 DWARF_DEBUG 175 DWARF_DEBUG
176
177 DISCARDS
182} 178}
diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S
index cb0248616d49..37ecc5577a9a 100644
--- a/arch/um/include/asm/common.lds.S
+++ b/arch/um/include/asm/common.lds.S
@@ -123,8 +123,3 @@
123 __initramfs_end = .; 123 __initramfs_end = .;
124 } 124 }
125 125
126 /* Sections to be discarded */
127 /DISCARD/ : {
128 *(.exitcall.exit)
129 }
130
diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S
index 9975e1ab44fb..715a188c0472 100644
--- a/arch/um/kernel/dyn.lds.S
+++ b/arch/um/kernel/dyn.lds.S
@@ -156,4 +156,6 @@ SECTIONS
156 STABS_DEBUG 156 STABS_DEBUG
157 157
158 DWARF_DEBUG 158 DWARF_DEBUG
159
160 DISCARDS
159} 161}
diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S
index 11b835248b86..2ebd39765db8 100644
--- a/arch/um/kernel/uml.lds.S
+++ b/arch/um/kernel/uml.lds.S
@@ -100,4 +100,6 @@ SECTIONS
100 STABS_DEBUG 100 STABS_DEBUG
101 101
102 DWARF_DEBUG 102 DWARF_DEBUG
103
104 DISCARDS
103} 105}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5df37d7..e06b2eeff9f2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -150,9 +150,6 @@ config ARCH_HAS_CACHE_LINE_SIZE
150config HAVE_SETUP_PER_CPU_AREA 150config HAVE_SETUP_PER_CPU_AREA
151 def_bool y 151 def_bool y
152 152
153config HAVE_DYNAMIC_PER_CPU_AREA
154 def_bool y
155
156config HAVE_CPUMASK_OF_CPU_MAP 153config HAVE_CPUMASK_OF_CPU_MAP
157 def_bool X86_64_SMP 154 def_bool X86_64_SMP
158 155
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 103f1ddb0d85..a18c038a3079 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -156,15 +156,6 @@ do { \
156/* We can use this directly for local CPU (faster). */ 156/* We can use this directly for local CPU (faster). */
157DECLARE_PER_CPU(unsigned long, this_cpu_off); 157DECLARE_PER_CPU(unsigned long, this_cpu_off);
158 158
159#ifdef CONFIG_NEED_MULTIPLE_NODES
160void *pcpu_lpage_remapped(void *kaddr);
161#else
162static inline void *pcpu_lpage_remapped(void *kaddr)
163{
164 return NULL;
165}
166#endif
167
168#endif /* !__ASSEMBLY__ */ 159#endif /* !__ASSEMBLY__ */
169 160
170#ifdef CONFIG_SMP 161#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 6b2a52dd0403..dca325c03999 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,8 +30,8 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
36 36
37static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1cfb623ce11c..14ce5d49b2ad 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1091,7 +1091,7 @@ void mce_log_therm_throt_event(__u64 status)
1091 */ 1091 */
1092static int check_interval = 5 * 60; /* 5 minutes */ 1092static int check_interval = 5 * 60; /* 5 minutes */
1093 1093
1094static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1094static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1095static DEFINE_PER_CPU(struct timer_list, mce_timer); 1095static DEFINE_PER_CPU(struct timer_list, mce_timer);
1096 1096
1097static void mcheck_timer(unsigned long data) 1097static void mcheck_timer(unsigned long data)
@@ -1110,7 +1110,7 @@ static void mcheck_timer(unsigned long data)
1110 * Alert userspace if needed. If we logged an MCE, reduce the 1110 * Alert userspace if needed. If we logged an MCE, reduce the
1111 * polling interval, otherwise increase the polling interval. 1111 * polling interval, otherwise increase the polling interval.
1112 */ 1112 */
1113 n = &__get_cpu_var(next_interval); 1113 n = &__get_cpu_var(mce_next_interval);
1114 if (mce_notify_irq()) 1114 if (mce_notify_irq())
1115 *n = max(*n/2, HZ/100); 1115 *n = max(*n/2, HZ/100);
1116 else 1116 else
@@ -1311,7 +1311,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1311static void mce_init_timer(void) 1311static void mce_init_timer(void)
1312{ 1312{
1313 struct timer_list *t = &__get_cpu_var(mce_timer); 1313 struct timer_list *t = &__get_cpu_var(mce_timer);
1314 int *n = &__get_cpu_var(next_interval); 1314 int *n = &__get_cpu_var(mce_next_interval);
1315 1315
1316 if (mce_ignore_ce) 1316 if (mce_ignore_ce)
1317 return; 1317 return;
@@ -1912,7 +1912,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1912 case CPU_DOWN_FAILED: 1912 case CPU_DOWN_FAILED:
1913 case CPU_DOWN_FAILED_FROZEN: 1913 case CPU_DOWN_FAILED_FROZEN:
1914 t->expires = round_jiffies(jiffies + 1914 t->expires = round_jiffies(jiffies +
1915 __get_cpu_var(next_interval)); 1915 __get_cpu_var(mce_next_interval));
1916 add_timer_on(t, cpu); 1916 add_timer_on(t, cpu);
1917 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1917 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1918 break; 1918 break;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ddae21620bda..bd2a2fa84628 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -69,7 +69,7 @@ struct threshold_bank {
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
73 73
74#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
75static unsigned char shared_bank[NR_BANKS] = { 75static unsigned char shared_bank[NR_BANKS] = {
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 900332b800f8..3d4ebbd2e129 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -976,7 +976,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
976 x86_pmu_disable_counter(hwc, idx); 976 x86_pmu_disable_counter(hwc, idx);
977} 977}
978 978
979static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); 979static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
980 980
981/* 981/*
982 * Set the next IRQ period, based on the hwc->period_left value. 982 * Set the next IRQ period, based on the hwc->period_left value.
@@ -1015,7 +1015,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1015 if (left > x86_pmu.max_period) 1015 if (left > x86_pmu.max_period)
1016 left = x86_pmu.max_period; 1016 left = x86_pmu.max_period;
1017 1017
1018 per_cpu(prev_left[idx], smp_processor_id()) = left; 1018 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1019 1019
1020 /* 1020 /*
1021 * The hw counter starts counting from this counter offset, 1021 * The hw counter starts counting from this counter offset,
@@ -1211,7 +1211,7 @@ void perf_counter_print_debug(void)
1211 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1211 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1212 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1212 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1213 1213
1214 prev_left = per_cpu(prev_left[idx], cpu); 1214 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1215 1215
1216 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", 1216 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1217 cpu, idx, pmc_ctrl); 1217 cpu, idx, pmc_ctrl);
@@ -1798,8 +1798,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1798 entry->ip[entry->nr++] = ip; 1798 entry->ip[entry->nr++] = ip;
1799} 1799}
1800 1800
1801static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 1801static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
1802static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 1802static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
1803static DEFINE_PER_CPU(int, in_nmi_frame); 1803static DEFINE_PER_CPU(int, in_nmi_frame);
1804 1804
1805 1805
@@ -1952,9 +1952,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1952 struct perf_callchain_entry *entry; 1952 struct perf_callchain_entry *entry;
1953 1953
1954 if (in_nmi()) 1954 if (in_nmi())
1955 entry = &__get_cpu_var(nmi_entry); 1955 entry = &__get_cpu_var(pmc_nmi_entry);
1956 else 1956 else
1957 entry = &__get_cpu_var(irq_entry); 1957 entry = &__get_cpu_var(pmc_irq_entry);
1958 1958
1959 entry->nr = 0; 1959 entry->nr = 0;
1960 1960
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 07d81916f212..a26ff61e2fb0 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -124,60 +124,51 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 124}
125 125
126/* 126/*
127 * Large page remap allocator 127 * Helpers for first chunk memory allocation
128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 128 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 129static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size)
140struct pcpul_ent { 130{
141 unsigned int cpu; 131 return pcpu_alloc_bootmem(cpu, size, size);
142 void *ptr; 132}
143};
144 133
145static size_t pcpul_size; 134static void __init pcpu_fc_free(void *ptr, size_t size)
146static struct pcpul_ent *pcpul_map; 135{
147static struct vm_struct pcpul_vm; 136 free_bootmem(__pa(ptr), size);
137}
148 138
149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) 139/*
140 * Large page remapping allocator
141 */
142#ifdef CONFIG_NEED_MULTIPLE_NODES
143static void __init pcpul_map(void *ptr, size_t size, void *addr)
150{ 144{
151 size_t off = (size_t)pageno << PAGE_SHIFT; 145 pmd_t *pmd, pmd_v;
152 146
153 if (off >= pcpul_size) 147 pmd = populate_extra_pmd((unsigned long)addr);
154 return NULL; 148 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
149 set_pmd(pmd, pmd_v);
150}
155 151
156 return virt_to_page(pcpul_map[cpu].ptr + off); 152static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
153{
154 if (early_cpu_to_node(from) == early_cpu_to_node(to))
155 return LOCAL_DISTANCE;
156 else
157 return REMOTE_DISTANCE;
157} 158}
158 159
159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 160static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
160{ 161{
161 size_t map_size, dyn_size; 162 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
162 unsigned int cpu; 163 size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
163 int i, j; 164 size_t unit_map_size, unit_size;
165 int *unit_map;
166 int nr_units;
164 ssize_t ret; 167 ssize_t ret;
165 168
166 if (!chosen) { 169 /* on non-NUMA, embedding is better */
167 size_t vm_size = VMALLOC_END - VMALLOC_START; 170 if (!chosen && !pcpu_need_numa())
168 size_t tot_size = nr_cpu_ids * PMD_SIZE; 171 return -EINVAL;
169
170 /* on non-NUMA, embedding is better */
171 if (!pcpu_need_numa())
172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181 172
182 /* need PSE */ 173 /* need PSE */
183 if (!cpu_has_pse) { 174 if (!cpu_has_pse) {
@@ -185,134 +176,46 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
185 return -EINVAL; 176 return -EINVAL;
186 } 177 }
187 178
188 /* 179 /* allocate and build unit_map */
189 * Currently supports only single page. Supporting multiple 180 unit_map_size = nr_cpu_ids * sizeof(int);
190 * pages won't be too difficult if it ever becomes necessary. 181 unit_map = alloc_bootmem_nopanic(unit_map_size);
191 */ 182 if (!unit_map) {
192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + 183 pr_warning("PERCPU: failed to allocate unit_map\n");
193 PERCPU_DYNAMIC_RESERVE); 184 return -ENOMEM;
194 if (pcpul_size > PMD_SIZE) {
195 pr_warning("PERCPU: static data is larger than large page, "
196 "can't use large page\n");
197 return -EINVAL;
198 }
199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
200
201 /* allocate pointer array and alloc large pages */
202 map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
203 pcpul_map = alloc_bootmem(map_size);
204
205 for_each_possible_cpu(cpu) {
206 pcpul_map[cpu].cpu = cpu;
207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
212 goto enomem;
213 }
214
215 /*
216 * Only use pcpul_size bytes and give back the rest.
217 *
218 * Ingo: The 2MB up-rounding bootmem is needed to make
219 * sure the partial 2MB page is still fully RAM - it's
220 * not well-specified to have a PAT-incompatible area
221 * (unmapped RAM, device memory, etc.) in that hole.
222 */
223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
224 PMD_SIZE - pcpul_size);
225
226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
227 } 185 }
228 186
229 /* allocate address and map */ 187 ret = pcpu_lpage_build_unit_map(static_size,
230 pcpul_vm.flags = VM_ALLOC; 188 PERCPU_FIRST_CHUNK_RESERVE,
231 pcpul_vm.size = nr_cpu_ids * PMD_SIZE; 189 &dyn_size, &unit_size, PMD_SIZE,
232 vm_area_register_early(&pcpul_vm, PMD_SIZE); 190 unit_map, pcpu_lpage_cpu_distance);
233 191 if (ret < 0) {
234 for_each_possible_cpu(cpu) { 192 pr_warning("PERCPU: failed to build unit_map\n");
235 pmd_t *pmd, pmd_v; 193 goto out_free;
236
237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
238 cpu * PMD_SIZE);
239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
242 } 194 }
195 nr_units = ret;
243 196
244 /* we're ready, commit */ 197 /* do the parameters look okay? */
245 pr_info("PERCPU: Remapped at %p with large pages, static data " 198 if (!chosen) {
246 "%zu bytes\n", pcpul_vm.addr, static_size); 199 size_t vm_size = VMALLOC_END - VMALLOC_START;
247 200 size_t tot_size = nr_units * unit_size;
248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
250 PMD_SIZE, pcpul_vm.addr, NULL);
251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < nr_cpu_ids - 1; i++)
254 for (j = i + 1; j < nr_cpu_ids; j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
262
263enomem:
264 for_each_possible_cpu(cpu)
265 if (pcpul_map[cpu].ptr)
266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
267 free_bootmem(__pa(pcpul_map), map_size);
268 return -ENOMEM;
269}
270 201
271/** 202 /* don't consume more than 20% of vmalloc area */
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area 203 if (tot_size > vm_size / 5) {
273 * @kaddr: the kernel address in question 204 pr_info("PERCPU: too large chunk size %zuMB for "
274 * 205 "large page remap\n", tot_size >> 20);
275 * Determine whether @kaddr falls in the pcpul recycled area. This is 206 ret = -EINVAL;
276 * used by pageattr to detect VM aliases and break up the pcpu PMD 207 goto out_free;
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
291 int left = 0, right = nr_cpu_ids - 1;
292 int pos;
293
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 } 208 }
313 } 209 }
314 210
315 return NULL; 211 ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
212 dyn_size, unit_size, PMD_SIZE,
213 unit_map, nr_units,
214 pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
215out_free:
216 if (ret < 0)
217 free_bootmem(__pa(unit_map), unit_map_size);
218 return ret;
316} 219}
317#else 220#else
318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 221static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
@@ -342,26 +245,15 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
342 return -EINVAL; 245 return -EINVAL;
343 246
344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, 247 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
345 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); 248 reserve - PERCPU_FIRST_CHUNK_RESERVE);
346} 249}
347 250
348/* 251/*
349 * 4k page allocator 252 * 4k allocator
350 * 253 *
351 * This is the basic allocator. Static percpu area is allocated 254 * Boring fallback 4k allocator. This allocator puts more pressure on
352 * page-by-page and most of initialization is done by the generic 255 * PTE TLBs but other than that behaves nicely on both UMA and NUMA.
353 * setup function.
354 */ 256 */
355static struct page **pcpu4k_pages __initdata;
356static int pcpu4k_nr_static_pages __initdata;
357
358static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
359{
360 if (pageno < pcpu4k_nr_static_pages)
361 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
362 return NULL;
363}
364
365static void __init pcpu4k_populate_pte(unsigned long addr) 257static void __init pcpu4k_populate_pte(unsigned long addr)
366{ 258{
367 populate_extra_pte(addr); 259 populate_extra_pte(addr);
@@ -369,51 +261,9 @@ static void __init pcpu4k_populate_pte(unsigned long addr)
369 261
370static ssize_t __init setup_pcpu_4k(size_t static_size) 262static ssize_t __init setup_pcpu_4k(size_t static_size)
371{ 263{
372 size_t pages_size; 264 return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
373 unsigned int cpu; 265 pcpu_fc_alloc, pcpu_fc_free,
374 int i, j; 266 pcpu4k_populate_pte);
375 ssize_t ret;
376
377 pcpu4k_nr_static_pages = PFN_UP(static_size);
378
379 /* unaligned allocations can't be freed, round up to page size */
380 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
381 * sizeof(pcpu4k_pages[0]));
382 pcpu4k_pages = alloc_bootmem(pages_size);
383
384 /* allocate and copy */
385 j = 0;
386 for_each_possible_cpu(cpu)
387 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
388 void *ptr;
389
390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
394 goto enomem;
395 }
396
397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
398 pcpu4k_pages[j++] = virt_to_page(ptr);
399 }
400
401 /* we're ready, commit */
402 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
403 pcpu4k_nr_static_pages, static_size);
404
405 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
406 PERCPU_FIRST_CHUNK_RESERVE, -1,
407 -1, NULL, pcpu4k_populate_pte);
408 goto out_free_ar;
409
410enomem:
411 while (--j >= 0)
412 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
413 ret = -ENOMEM;
414out_free_ar:
415 free_bootmem(__pa(pcpu4k_pages), pages_size);
416 return ret;
417} 267}
418 268
419/* for explicit first chunk allocator selection */ 269/* for explicit first chunk allocator selection */
@@ -486,7 +336,8 @@ void __init setup_per_cpu_areas(void)
486 /* alrighty, percpu areas up and running */ 336 /* alrighty, percpu areas up and running */
487 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 337 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
488 for_each_possible_cpu(cpu) { 338 for_each_possible_cpu(cpu) {
489 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; 339 per_cpu_offset(cpu) =
340 delta + pcpu_unit_map[cpu] * pcpu_unit_size;
490 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 341 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
491 per_cpu(cpu_number, cpu) = cpu; 342 per_cpu(cpu_number, cpu) = cpu;
492 setup_percpu_segment(cpu); 343 setup_percpu_segment(cpu);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 78d185d797de..bbf4fd044d07 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -380,15 +380,12 @@ SECTIONS
380 _end = .; 380 _end = .;
381 } 381 }
382 382
383 /* Sections to be discarded */
384 /DISCARD/ : {
385 *(.exitcall.exit)
386 *(.eh_frame)
387 *(.discard)
388 }
389
390 STABS_DEBUG 383 STABS_DEBUG
391 DWARF_DEBUG 384 DWARF_DEBUG
385
386 /* Sections to be discarded */
387 DISCARDS
388 /DISCARD/ : { *(.eh_frame) }
392} 389}
393 390
394 391
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7e600c1962db..dce282f65700 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/pfn.h> 14#include <linux/pfn.h>
15#include <linux/percpu.h>
15 16
16#include <asm/e820.h> 17#include <asm/e820.h>
17#include <asm/processor.h> 18#include <asm/processor.h>
diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S
index 41c159cd872f..921b6ff3b645 100644
--- a/arch/xtensa/kernel/vmlinux.lds.S
+++ b/arch/xtensa/kernel/vmlinux.lds.S
@@ -280,15 +280,6 @@ SECTIONS
280 *(.ResetVector.text) 280 *(.ResetVector.text)
281 } 281 }
282 282
283 /* Sections to be discarded */
284 /DISCARD/ :
285 {
286 *(.exit.literal)
287 EXIT_TEXT
288 EXIT_DATA
289 *(.exitcall.exit)
290 }
291
292 .xt.lit : { *(.xt.lit) } 283 .xt.lit : { *(.xt.lit) }
293 .xt.prop : { *(.xt.prop) } 284 .xt.prop : { *(.xt.prop) }
294 285
@@ -321,4 +312,8 @@ SECTIONS
321 *(.xt.lit) 312 *(.xt.lit)
322 *(.gnu.linkonce.p*) 313 *(.gnu.linkonce.p*)
323 } 314 }
315
316 /* Sections to be discarded */
317 DISCARDS
318 /DISCARD/ : { *(.exit.literal) }
324} 319}
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 7a12cf6ee1d3..ce8ba57c6557 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -146,7 +146,7 @@ enum arq_state {
146#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) 146#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2)
147#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) 147#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state)
148 148
149static DEFINE_PER_CPU(unsigned long, ioc_count); 149static DEFINE_PER_CPU(unsigned long, as_ioc_count);
150static struct completion *ioc_gone; 150static struct completion *ioc_gone;
151static DEFINE_SPINLOCK(ioc_gone_lock); 151static DEFINE_SPINLOCK(ioc_gone_lock);
152 152
@@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad);
161static void free_as_io_context(struct as_io_context *aic) 161static void free_as_io_context(struct as_io_context *aic)
162{ 162{
163 kfree(aic); 163 kfree(aic);
164 elv_ioc_count_dec(ioc_count); 164 elv_ioc_count_dec(as_ioc_count);
165 if (ioc_gone) { 165 if (ioc_gone) {
166 /* 166 /*
167 * AS scheduler is exiting, grab exit lock and check 167 * AS scheduler is exiting, grab exit lock and check
@@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic)
169 * complete ioc_gone and set it back to NULL. 169 * complete ioc_gone and set it back to NULL.
170 */ 170 */
171 spin_lock(&ioc_gone_lock); 171 spin_lock(&ioc_gone_lock);
172 if (ioc_gone && !elv_ioc_count_read(ioc_count)) { 172 if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) {
173 complete(ioc_gone); 173 complete(ioc_gone);
174 ioc_gone = NULL; 174 ioc_gone = NULL;
175 } 175 }
@@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void)
211 ret->seek_total = 0; 211 ret->seek_total = 0;
212 ret->seek_samples = 0; 212 ret->seek_samples = 0;
213 ret->seek_mean = 0; 213 ret->seek_mean = 0;
214 elv_ioc_count_inc(ioc_count); 214 elv_ioc_count_inc(as_ioc_count);
215 } 215 }
216 216
217 return ret; 217 return ret;
@@ -1507,7 +1507,7 @@ static void __exit as_exit(void)
1507 ioc_gone = &all_gone; 1507 ioc_gone = &all_gone;
1508 /* ioc_gone's update must be visible before reading ioc_count */ 1508 /* ioc_gone's update must be visible before reading ioc_count */
1509 smp_wmb(); 1509 smp_wmb();
1510 if (elv_ioc_count_read(ioc_count)) 1510 if (elv_ioc_count_read(as_ioc_count))
1511 wait_for_completion(&all_gone); 1511 wait_for_completion(&all_gone);
1512 synchronize_rcu(); 1512 synchronize_rcu();
1513} 1513}
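The rename from ioc_count to as_ioc_count (and to cfq_ioc_count in the next file) is preparation for the weak-percpu scheme added in include/linux/percpu-defs.h later in this diff: once percpu variables can be emitted as weak symbols, two files that each define a static DEFINE_PER_CPU(..., ioc_count) would quietly end up sharing a single slot. A hedged illustration of the collision the renames avoid (file and variable names hypothetical):

/* foo-sched.c */
static DEFINE_PER_CPU(unsigned long, ioc_count);	/* emitted weak */

/* bar-sched.c */
static DEFINE_PER_CPU(unsigned long, ioc_count);	/* same weak symbol */

/*
 * With ordinary statics these are two independent counters.  Once
 * per_cpu__ioc_count is weak, the linker keeps one definition and both
 * files update the same storage.  The __pcpu_unique_##name dummy added
 * in percpu-defs.h turns such a clash into a link failure, and giving
 * the variables unique prefixes avoids it altogether.
 */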
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fd7080ed7935..1b2d12cda43e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125;
48static struct kmem_cache *cfq_pool; 48static struct kmem_cache *cfq_pool;
49static struct kmem_cache *cfq_ioc_pool; 49static struct kmem_cache *cfq_ioc_pool;
50 50
51static DEFINE_PER_CPU(unsigned long, ioc_count); 51static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
52static struct completion *ioc_gone; 52static struct completion *ioc_gone;
53static DEFINE_SPINLOCK(ioc_gone_lock); 53static DEFINE_SPINLOCK(ioc_gone_lock);
54 54
@@ -1427,7 +1427,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head)
1427 cic = container_of(head, struct cfq_io_context, rcu_head); 1427 cic = container_of(head, struct cfq_io_context, rcu_head);
1428 1428
1429 kmem_cache_free(cfq_ioc_pool, cic); 1429 kmem_cache_free(cfq_ioc_pool, cic);
1430 elv_ioc_count_dec(ioc_count); 1430 elv_ioc_count_dec(cfq_ioc_count);
1431 1431
1432 if (ioc_gone) { 1432 if (ioc_gone) {
1433 /* 1433 /*
@@ -1436,7 +1436,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head)
1436 * complete ioc_gone and set it back to NULL 1436 * complete ioc_gone and set it back to NULL
1437 */ 1437 */
1438 spin_lock(&ioc_gone_lock); 1438 spin_lock(&ioc_gone_lock);
1439 if (ioc_gone && !elv_ioc_count_read(ioc_count)) { 1439 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
1440 complete(ioc_gone); 1440 complete(ioc_gone);
1441 ioc_gone = NULL; 1441 ioc_gone = NULL;
1442 } 1442 }
@@ -1562,7 +1562,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1562 INIT_HLIST_NODE(&cic->cic_list); 1562 INIT_HLIST_NODE(&cic->cic_list);
1563 cic->dtor = cfq_free_io_context; 1563 cic->dtor = cfq_free_io_context;
1564 cic->exit = cfq_exit_io_context; 1564 cic->exit = cfq_exit_io_context;
1565 elv_ioc_count_inc(ioc_count); 1565 elv_ioc_count_inc(cfq_ioc_count);
1566 } 1566 }
1567 1567
1568 return cic; 1568 return cic;
@@ -2668,7 +2668,7 @@ static void __exit cfq_exit(void)
2668 * this also protects us from entering cfq_slab_kill() with 2668 * this also protects us from entering cfq_slab_kill() with
2669 * pending RCU callbacks 2669 * pending RCU callbacks
2670 */ 2670 */
2671 if (elv_ioc_count_read(ioc_count)) 2671 if (elv_ioc_count_read(cfq_ioc_count))
2672 wait_for_completion(&all_gone); 2672 wait_for_completion(&all_gone);
2673 cfq_slab_kill(); 2673 cfq_slab_kill();
2674} 2674}
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index bdea7e2f94ba..bc33ddc9c97c 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -71,7 +71,7 @@ struct cpu_dbs_info_s {
71 */ 71 */
72 struct mutex timer_mutex; 72 struct mutex timer_mutex;
73}; 73};
74static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); 74static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);
75 75
76static unsigned int dbs_enable; /* number of CPUs using this policy */ 76static unsigned int dbs_enable; /* number of CPUs using this policy */
77 77
@@ -137,7 +137,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
137 void *data) 137 void *data)
138{ 138{
139 struct cpufreq_freqs *freq = data; 139 struct cpufreq_freqs *freq = data;
140 struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, 140 struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info,
141 freq->cpu); 141 freq->cpu);
142 142
143 struct cpufreq_policy *policy; 143 struct cpufreq_policy *policy;
@@ -297,7 +297,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
297 /* we need to re-evaluate prev_cpu_idle */ 297 /* we need to re-evaluate prev_cpu_idle */
298 for_each_online_cpu(j) { 298 for_each_online_cpu(j) {
299 struct cpu_dbs_info_s *dbs_info; 299 struct cpu_dbs_info_s *dbs_info;
300 dbs_info = &per_cpu(cpu_dbs_info, j); 300 dbs_info = &per_cpu(cs_cpu_dbs_info, j);
301 dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 301 dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
302 &dbs_info->prev_cpu_wall); 302 &dbs_info->prev_cpu_wall);
303 if (dbs_tuners_ins.ignore_nice) 303 if (dbs_tuners_ins.ignore_nice)
@@ -387,7 +387,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
387 cputime64_t cur_wall_time, cur_idle_time; 387 cputime64_t cur_wall_time, cur_idle_time;
388 unsigned int idle_time, wall_time; 388 unsigned int idle_time, wall_time;
389 389
390 j_dbs_info = &per_cpu(cpu_dbs_info, j); 390 j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
391 391
392 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); 392 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
393 393
@@ -521,7 +521,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
521 unsigned int j; 521 unsigned int j;
522 int rc; 522 int rc;
523 523
524 this_dbs_info = &per_cpu(cpu_dbs_info, cpu); 524 this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu);
525 525
526 switch (event) { 526 switch (event) {
527 case CPUFREQ_GOV_START: 527 case CPUFREQ_GOV_START:
@@ -538,7 +538,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
538 538
539 for_each_cpu(j, policy->cpus) { 539 for_each_cpu(j, policy->cpus) {
540 struct cpu_dbs_info_s *j_dbs_info; 540 struct cpu_dbs_info_s *j_dbs_info;
541 j_dbs_info = &per_cpu(cpu_dbs_info, j); 541 j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
542 j_dbs_info->cur_policy = policy; 542 j_dbs_info->cur_policy = policy;
543 543
544 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 544 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index d6ba14276bb1..d7a528c80de8 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -78,7 +78,7 @@ struct cpu_dbs_info_s {
78 */ 78 */
79 struct mutex timer_mutex; 79 struct mutex timer_mutex;
80}; 80};
81static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); 81static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);
82 82
83static unsigned int dbs_enable; /* number of CPUs using this policy */ 83static unsigned int dbs_enable; /* number of CPUs using this policy */
84 84
@@ -149,7 +149,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
149 unsigned int freq_hi, freq_lo; 149 unsigned int freq_hi, freq_lo;
150 unsigned int index = 0; 150 unsigned int index = 0;
151 unsigned int jiffies_total, jiffies_hi, jiffies_lo; 151 unsigned int jiffies_total, jiffies_hi, jiffies_lo;
152 struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu); 152 struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
153 policy->cpu);
153 154
154 if (!dbs_info->freq_table) { 155 if (!dbs_info->freq_table) {
155 dbs_info->freq_lo = 0; 156 dbs_info->freq_lo = 0;
@@ -192,7 +193,7 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
192 193
193static void ondemand_powersave_bias_init_cpu(int cpu) 194static void ondemand_powersave_bias_init_cpu(int cpu)
194{ 195{
195 struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu); 196 struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
196 dbs_info->freq_table = cpufreq_frequency_get_table(cpu); 197 dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
197 dbs_info->freq_lo = 0; 198 dbs_info->freq_lo = 0;
198} 199}
@@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
297 /* we need to re-evaluate prev_cpu_idle */ 298 /* we need to re-evaluate prev_cpu_idle */
298 for_each_online_cpu(j) { 299 for_each_online_cpu(j) {
299 struct cpu_dbs_info_s *dbs_info; 300 struct cpu_dbs_info_s *dbs_info;
300 dbs_info = &per_cpu(cpu_dbs_info, j); 301 dbs_info = &per_cpu(od_cpu_dbs_info, j);
301 dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 302 dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
302 &dbs_info->prev_cpu_wall); 303 &dbs_info->prev_cpu_wall);
303 if (dbs_tuners_ins.ignore_nice) 304 if (dbs_tuners_ins.ignore_nice)
@@ -388,7 +389,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
388 unsigned int load, load_freq; 389 unsigned int load, load_freq;
389 int freq_avg; 390 int freq_avg;
390 391
391 j_dbs_info = &per_cpu(cpu_dbs_info, j); 392 j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
392 393
393 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); 394 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
394 395
@@ -535,7 +536,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
535 unsigned int j; 536 unsigned int j;
536 int rc; 537 int rc;
537 538
538 this_dbs_info = &per_cpu(cpu_dbs_info, cpu); 539 this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
539 540
540 switch (event) { 541 switch (event) {
541 case CPUFREQ_GOV_START: 542 case CPUFREQ_GOV_START:
@@ -553,7 +554,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
553 dbs_enable++; 554 dbs_enable++;
554 for_each_cpu(j, policy->cpus) { 555 for_each_cpu(j, policy->cpus) {
555 struct cpu_dbs_info_s *j_dbs_info; 556 struct cpu_dbs_info_s *j_dbs_info;
556 j_dbs_info = &per_cpu(cpu_dbs_info, j); 557 j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
557 j_dbs_info->cur_policy = policy; 558 j_dbs_info->cur_policy = policy;
558 559
559 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 560 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index abad71b1632b..2f57276e87a2 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -47,10 +47,10 @@
47static DEFINE_SPINLOCK(irq_mapping_update_lock); 47static DEFINE_SPINLOCK(irq_mapping_update_lock);
48 48
49/* IRQ <-> VIRQ mapping. */ 49/* IRQ <-> VIRQ mapping. */
50static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; 50static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
51 51
52/* IRQ <-> IPI mapping */ 52/* IRQ <-> IPI mapping */
53static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; 53static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
54 54
55/* Interrupt types. */ 55/* Interrupt types. */
56enum xen_irq_type { 56enum xen_irq_type {
@@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
602 return IRQ_HANDLED; 602 return IRQ_HANDLED;
603} 603}
604 604
605static DEFINE_PER_CPU(unsigned, xed_nesting_count);
606
605/* 607/*
606 * Search the CPUs pending events bitmasks. For each one found, map 608 * Search the CPUs pending events bitmasks. For each one found, map
607 * the event number to an irq, and feed it into do_IRQ() for 609 * the event number to an irq, and feed it into do_IRQ() for
@@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
617 struct pt_regs *old_regs = set_irq_regs(regs); 619 struct pt_regs *old_regs = set_irq_regs(regs);
618 struct shared_info *s = HYPERVISOR_shared_info; 620 struct shared_info *s = HYPERVISOR_shared_info;
619 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); 621 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
620 static DEFINE_PER_CPU(unsigned, nesting_count);
621 unsigned count; 622 unsigned count;
622 623
623 exit_idle(); 624 exit_idle();
@@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
628 629
629 vcpu_info->evtchn_upcall_pending = 0; 630 vcpu_info->evtchn_upcall_pending = 0;
630 631
631 if (__get_cpu_var(nesting_count)++) 632 if (__get_cpu_var(xed_nesting_count)++)
632 goto out; 633 goto out;
633 634
634#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ 635#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
@@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
653 654
654 BUG_ON(!irqs_disabled()); 655 BUG_ON(!irqs_disabled());
655 656
656 count = __get_cpu_var(nesting_count); 657 count = __get_cpu_var(xed_nesting_count);
657 __get_cpu_var(nesting_count) = 0; 658 __get_cpu_var(xed_nesting_count) = 0;
658 } while(count != 1); 659 } while(count != 1);
659 660
660out: 661out:
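Two of the new percpu restrictions show up concretely in this file: the array bound moves into the type argument because the variable name is now pasted into extra helper symbols, and the function-local static counter moves to file scope because static percpu variables may no longer be defined inside a function. Restated as plain definitions (the comments, not the code, are the addition here):

/*
 * Array percpu variables: the name must be a plain identifier so that
 * __pcpu_scope_virq_to_irq and friends can be formed, hence the array
 * dimension belongs to the type.  The old spelling was
 * DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]).
 */
static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {
	[0 ... NR_VIRQS - 1] = -1
};

/*
 * A function-local "static DEFINE_PER_CPU(unsigned, nesting_count)" is
 * no longer allowed, so the counter becomes a file-scope definition
 * with a globally unique name:
 */
static DEFINE_PER_CPU(unsigned, xed_nesting_count);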
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 6ad76bf5fb40..a43223af98b6 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -33,13 +33,10 @@
33 * BSS_SECTION(0, 0, 0) 33 * BSS_SECTION(0, 0, 0)
34 * _end = .; 34 * _end = .;
35 * 35 *
36 * /DISCARD/ : {
37 * EXIT_TEXT
38 * EXIT_DATA
39 * EXIT_CALL
40 * }
41 * STABS_DEBUG 36 * STABS_DEBUG
42 * DWARF_DEBUG 37 * DWARF_DEBUG
38 *
39 * DISCARDS // must be the last
43 * } 40 * }
44 * 41 *
45 * [__init_begin, __init_end] is the init section that may be freed after init 42 * [__init_begin, __init_end] is the init section that may be freed after init
@@ -626,6 +623,23 @@
626#define INIT_RAM_FS 623#define INIT_RAM_FS
627#endif 624#endif
628 625
626/*
627 * Default discarded sections.
628 *
629 * Some archs want to discard exit text/data at runtime rather than
630 * link time due to cross-section references such as alt instructions,
631 * bug table, eh_frame, etc. DISCARDS must be the last of output
632 * section definitions so that such archs put those in earlier section
633 * definitions.
634 */
635#define DISCARDS \
636 /DISCARD/ : { \
637 EXIT_TEXT \
638 EXIT_DATA \
639 EXIT_CALL \
640 *(.discard) \
641 }
642
629/** 643/**
630 * PERCPU_VADDR - define output section for percpu area 644 * PERCPU_VADDR - define output section for percpu area
631 * @vaddr: explicit base address (optional) 645 * @vaddr: explicit base address (optional)
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 68438e18fff4..aefc2f12b48c 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -10,22 +10,70 @@
10/* 10/*
11 * Base implementations of per-CPU variable declarations and definitions, where 11 * Base implementations of per-CPU variable declarations and definitions, where
12 * the section in which the variable is to be placed is provided by the 12 * the section in which the variable is to be placed is provided by the
13 * 'section' argument. This may be used to affect the parameters governing the 13 * 'sec' argument. This may be used to affect the parameters governing the
14 * variable's storage. 14 * variable's storage.
15 * 15 *
16 * NOTE! The sections for the DECLARE and for the DEFINE must match, lest 16 * NOTE! The sections for the DECLARE and for the DEFINE must match, lest
17 * linkage errors occur due the compiler generating the wrong code to access 17 * linkage errors occur due the compiler generating the wrong code to access
18 * that section. 18 * that section.
19 */ 19 */
20#define DECLARE_PER_CPU_SECTION(type, name, section) \ 20#define __PCPU_ATTRS(sec) \
21 extern \ 21 __attribute__((section(PER_CPU_BASE_SECTION sec))) \
22 __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ 22 PER_CPU_ATTRIBUTES
23 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name 23
24 24#define __PCPU_DUMMY_ATTRS \
25#define DEFINE_PER_CPU_SECTION(type, name, section) \ 25 __attribute__((section(".discard"), unused))
26 __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ 26
27 PER_CPU_ATTRIBUTES PER_CPU_DEF_ATTRIBUTES \ 27/*
28 * s390 and alpha modules require percpu variables to be defined as
29 * weak to force the compiler to generate GOT based external
30 * references for them. This is necessary because percpu sections
31 * will be located outside of the usually addressable area.
32 *
33 * This definition puts the following two extra restrictions when
34 * defining percpu variables.
35 *
36 * 1. The symbol must be globally unique, even the static ones.
37 * 2. Static percpu variables cannot be defined inside a function.
38 *
39 * Archs which need weak percpu definitions should define
40 * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary.
41 *
42 * To ensure that the generic code observes the above two
43 * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak
44 * definition is used for all cases.
45 */
46#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU)
47/*
48 * __pcpu_scope_* dummy variable is used to enforce scope. It
49 * receives the static modifier when it's used in front of
50 * DEFINE_PER_CPU() and will trigger build failure if
51 * DECLARE_PER_CPU() is used for the same variable.
52 *
53 * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness
54 * such that hidden weak symbol collision, which will cause unrelated
55 * variables to share the same address, can be detected during build.
56 */
57#define DECLARE_PER_CPU_SECTION(type, name, sec) \
58 extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \
59 extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
60
61#define DEFINE_PER_CPU_SECTION(type, name, sec) \
62 __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \
63 __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \
64 __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES __weak \
65 __typeof__(type) per_cpu__##name
66#else
67/*
68 * Normal declaration and definition macros.
69 */
70#define DECLARE_PER_CPU_SECTION(type, name, sec) \
71 extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
72
73#define DEFINE_PER_CPU_SECTION(type, name, sec) \
74 __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES \
28 __typeof__(type) per_cpu__##name 75 __typeof__(type) per_cpu__##name
76#endif
29 77
30/* 78/*
31 * Variant on the per-CPU variable declaration/definition theme used for 79 * Variant on the per-CPU variable declaration/definition theme used for
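For reference, a rough expansion of DECLARE_PER_CPU(int, foo) and DEFINE_PER_CPU(int, foo) under the weak-percpu branch above, assuming PER_CPU_BASE_SECTION is ".data.percpu" and that PER_CPU_ATTRIBUTES / PER_CPU_DEF_ATTRIBUTES are empty on the architecture in question (both assumptions; the section name is arch-specific):

/* DECLARE_PER_CPU(int, foo) roughly becomes: */
extern __attribute__((section(".discard"), unused)) char __pcpu_scope_foo;
extern __attribute__((section(".data.percpu"))) __typeof__(int) per_cpu__foo;

/* DEFINE_PER_CPU(int, foo) roughly becomes: */
__attribute__((section(".discard"), unused)) char __pcpu_scope_foo;
__attribute__((section(".discard"), unused)) char __pcpu_unique_foo;
__attribute__((section(".data.percpu"))) __weak __typeof__(int) per_cpu__foo;

/*
 * A "static DEFINE_PER_CPU(int, foo)" elsewhere applies the static to
 * __pcpu_scope_foo only; __pcpu_unique_foo stays global, so a second
 * definition of the same name in another file fails the link instead
 * of silently sharing storage.
 */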
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 26fd9d12f050..e134c8229631 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -34,7 +34,7 @@
34 34
35#ifdef CONFIG_SMP 35#ifdef CONFIG_SMP
36 36
37#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 37#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
38 38
39/* minimum unit size, also is the maximum supported allocation size */ 39/* minimum unit size, also is the maximum supported allocation size */
40#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) 40#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10)
@@ -57,19 +57,73 @@
57#endif 57#endif
58 58
59extern void *pcpu_base_addr; 59extern void *pcpu_base_addr;
60extern const int *pcpu_unit_map;
60 61
61typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); 62typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
62typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); 63typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
64typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
65typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
66typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
63 67
64extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 68extern size_t __init pcpu_setup_first_chunk(
65 size_t static_size, size_t reserved_size, 69 size_t static_size, size_t reserved_size,
66 ssize_t dyn_size, ssize_t unit_size, 70 ssize_t dyn_size, size_t unit_size,
67 void *base_addr, 71 void *base_addr, const int *unit_map);
68 pcpu_populate_pte_fn_t populate_pte_fn);
69 72
70extern ssize_t __init pcpu_embed_first_chunk( 73extern ssize_t __init pcpu_embed_first_chunk(
71 size_t static_size, size_t reserved_size, 74 size_t static_size, size_t reserved_size,
72 ssize_t dyn_size, ssize_t unit_size); 75 ssize_t dyn_size);
76
77extern ssize_t __init pcpu_4k_first_chunk(
78 size_t static_size, size_t reserved_size,
79 pcpu_fc_alloc_fn_t alloc_fn,
80 pcpu_fc_free_fn_t free_fn,
81 pcpu_fc_populate_pte_fn_t populate_pte_fn);
82
83#ifdef CONFIG_NEED_MULTIPLE_NODES
84extern int __init pcpu_lpage_build_unit_map(
85 size_t static_size, size_t reserved_size,
86 ssize_t *dyn_sizep, size_t *unit_sizep,
87 size_t lpage_size, int *unit_map,
88 pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
89
90extern ssize_t __init pcpu_lpage_first_chunk(
91 size_t static_size, size_t reserved_size,
92 size_t dyn_size, size_t unit_size,
93 size_t lpage_size, const int *unit_map,
94 int nr_units,
95 pcpu_fc_alloc_fn_t alloc_fn,
96 pcpu_fc_free_fn_t free_fn,
97 pcpu_fc_map_fn_t map_fn);
98
99extern void *pcpu_lpage_remapped(void *kaddr);
100#else
101static inline int pcpu_lpage_build_unit_map(
102 size_t static_size, size_t reserved_size,
103 ssize_t *dyn_sizep, size_t *unit_sizep,
104 size_t lpage_size, int *unit_map,
105 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
106{
107 return -EINVAL;
108}
109
110static inline ssize_t __init pcpu_lpage_first_chunk(
111 size_t static_size, size_t reserved_size,
112 size_t dyn_size, size_t unit_size,
113 size_t lpage_size, const int *unit_map,
114 int nr_units,
115 pcpu_fc_alloc_fn_t alloc_fn,
116 pcpu_fc_free_fn_t free_fn,
117 pcpu_fc_map_fn_t map_fn)
118{
119 return -EINVAL;
120}
121
122static inline void *pcpu_lpage_remapped(void *kaddr)
123{
124 return NULL;
125}
126#endif
73 127
74/* 128/*
75 * Use this to get to a cpu's version of the per-cpu object 129 * Use this to get to a cpu's version of the per-cpu object
@@ -80,7 +134,7 @@ extern ssize_t __init pcpu_embed_first_chunk(
80 134
81extern void *__alloc_reserved_percpu(size_t size, size_t align); 135extern void *__alloc_reserved_percpu(size_t size, size_t align);
82 136
83#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 137#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
84 138
85struct percpu_data { 139struct percpu_data {
86 void *ptrs[1]; 140 void *ptrs[1];
@@ -99,11 +153,15 @@ struct percpu_data {
99 (__typeof__(ptr))__p->ptrs[(cpu)]; \ 153 (__typeof__(ptr))__p->ptrs[(cpu)]; \
100}) 154})
101 155
102#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 156#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
103 157
104extern void *__alloc_percpu(size_t size, size_t align); 158extern void *__alloc_percpu(size_t size, size_t align);
105extern void free_percpu(void *__pdata); 159extern void free_percpu(void *__pdata);
106 160
161#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
162extern void __init setup_per_cpu_areas(void);
163#endif
164
107#else /* CONFIG_SMP */ 165#else /* CONFIG_SMP */
108 166
109#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) 167#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
@@ -124,6 +182,13 @@ static inline void free_percpu(void *p)
124 kfree(p); 182 kfree(p);
125} 183}
126 184
185static inline void __init setup_per_cpu_areas(void) { }
186
187static inline void *pcpu_lpage_remapped(void *kaddr)
188{
189 return NULL;
190}
191
127#endif /* CONFIG_SMP */ 192#endif /* CONFIG_SMP */
128 193
129#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ 194#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \
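The first-chunk helpers now take allocation hooks instead of a get_page callback, so the arch side only supplies memory and PTE population. A minimal sketch of a caller built on bootmem, assuming PERCPU_MODULE_RESERVE as the reserved size and leaving the PTE hook as an arch stub (the helper names here are illustrative, not part of the patch):

static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size)
{
	/* a NUMA arch would allocate node-local to @cpu here */
	return alloc_bootmem_pages(size);
}

static void __init pcpu_fc_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

static void __init pcpu_fc_populate_pte(unsigned long addr)
{
	/* arch-specific: make sure a kernel pte exists for @addr */
}

void __init setup_per_cpu_areas(void)
{
	size_t static_size = __per_cpu_end - __per_cpu_start;
	ssize_t ret;

	ret = pcpu_4k_first_chunk(static_size, PERCPU_MODULE_RESERVE,
				  pcpu_fc_alloc, pcpu_fc_free,
				  pcpu_fc_populate_pte);
	if (ret < 0)
		panic("cannot initialize percpu first chunk");
	/* per_cpu_offset() for each cpu is then filled in as in the
	 * arch/x86 hunk near the top of this diff */
}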
diff --git a/init/main.c b/init/main.c
index 2d9d6bdfe7c9..2f9544d8435a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -353,7 +353,6 @@ static void __init smp_init(void)
353#define smp_init() do { } while (0) 353#define smp_init() do { } while (0)
354#endif 354#endif
355 355
356static inline void setup_per_cpu_areas(void) { }
357static inline void setup_nr_cpu_ids(void) { } 356static inline void setup_nr_cpu_ids(void) { }
358static inline void smp_prepare_cpus(unsigned int maxcpus) { } 357static inline void smp_prepare_cpus(unsigned int maxcpus) { }
359 358
@@ -374,29 +373,6 @@ static void __init setup_nr_cpu_ids(void)
374 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; 373 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
375} 374}
376 375
377#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
378unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
379
380EXPORT_SYMBOL(__per_cpu_offset);
381
382static void __init setup_per_cpu_areas(void)
383{
384 unsigned long size, i;
385 char *ptr;
386 unsigned long nr_possible_cpus = num_possible_cpus();
387
388 /* Copy section for each CPU (we discard the original) */
389 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
390 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
391
392 for_each_possible_cpu(i) {
393 __per_cpu_offset[i] = ptr - __per_cpu_start;
394 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
395 ptr += size;
396 }
397}
398#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
399
400/* Called by boot processor to activate the rest. */ 376/* Called by boot processor to activate the rest. */
401static void __init smp_init(void) 377static void __init smp_init(void)
402{ 378{
diff --git a/kernel/module.c b/kernel/module.c
index fd1411403558..3a4db71ea494 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module);
364 364
365#ifdef CONFIG_SMP 365#ifdef CONFIG_SMP
366 366
367#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 367#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
368 368
369static void *percpu_modalloc(unsigned long size, unsigned long align, 369static void *percpu_modalloc(unsigned long size, unsigned long align,
370 const char *name) 370 const char *name)
@@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme)
389 free_percpu(freeme); 389 free_percpu(freeme);
390} 390}
391 391
392#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 392#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
393 393
394/* Number of blocks used and allocated. */ 394/* Number of blocks used and allocated. */
395static unsigned int pcpu_num_used, pcpu_num_allocated; 395static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -535,7 +535,7 @@ static int percpu_modinit(void)
535} 535}
536__initcall(percpu_modinit); 536__initcall(percpu_modinit);
537 537
538#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 538#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
539 539
540static unsigned int find_pcpusec(Elf_Ehdr *hdr, 540static unsigned int find_pcpusec(Elf_Ehdr *hdr,
541 Elf_Shdr *sechdrs, 541 Elf_Shdr *sechdrs,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 534e20d14d63..b0bdb36ccfc8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -100,16 +100,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader,
100 100
101void __weak perf_counter_print_debug(void) { } 101void __weak perf_counter_print_debug(void) { }
102 102
103static DEFINE_PER_CPU(int, disable_count); 103static DEFINE_PER_CPU(int, perf_disable_count);
104 104
105void __perf_disable(void) 105void __perf_disable(void)
106{ 106{
107 __get_cpu_var(disable_count)++; 107 __get_cpu_var(perf_disable_count)++;
108} 108}
109 109
110bool __perf_enable(void) 110bool __perf_enable(void)
111{ 111{
112 return !--__get_cpu_var(disable_count); 112 return !--__get_cpu_var(perf_disable_count);
113} 113}
114 114
115void perf_disable(void) 115void perf_disable(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..d3d7e7694da6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -318,12 +318,12 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 318/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 320/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 321static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq);
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 322#endif /* CONFIG_FAIR_GROUP_SCHED */
323 323
324#ifdef CONFIG_RT_GROUP_SCHED 324#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 326static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
327#endif /* CONFIG_RT_GROUP_SCHED */ 327#endif /* CONFIG_RT_GROUP_SCHED */
328#else /* !CONFIG_USER_SCHED */ 328#else /* !CONFIG_USER_SCHED */
329#define root_task_group init_task_group 329#define root_task_group init_task_group
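DEFINE_PER_CPU_SHARED_ALIGNED() folds the cacheline alignment and the dedicated shared-aligned percpu sub-section into the definition itself instead of appending ____cacheline_aligned_in_smp by hand. As a rough sketch of what the helper expands to, recalled from the same era's percpu-defs.h rather than quoted from this diff, so treat the exact spelling as an assumption:

/* roughly: */
#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)			\
	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
	____cacheline_aligned_in_smp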
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e75276a49cf5..0db0a41e0079 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1334,7 +1334,7 @@ static __init void event_trace_self_tests(void)
1334 1334
1335#ifdef CONFIG_FUNCTION_TRACER 1335#ifdef CONFIG_FUNCTION_TRACER
1336 1336
1337static DEFINE_PER_CPU(atomic_t, test_event_disable); 1337static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1338 1338
1339static void 1339static void
1340function_test_events_call(unsigned long ip, unsigned long parent_ip) 1340function_test_events_call(unsigned long ip, unsigned long parent_ip)
@@ -1350,7 +1350,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1350 pc = preempt_count(); 1350 pc = preempt_count();
1351 resched = ftrace_preempt_disable(); 1351 resched = ftrace_preempt_disable();
1352 cpu = raw_smp_processor_id(); 1352 cpu = raw_smp_processor_id();
1353 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1353 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1354 1354
1355 if (disabled != 1) 1355 if (disabled != 1)
1356 goto out; 1356 goto out;
@@ -1368,7 +1368,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1368 trace_nowake_buffer_unlock_commit(event, flags, pc); 1368 trace_nowake_buffer_unlock_commit(event, flags, pc);
1369 1369
1370 out: 1370 out:
1371 atomic_dec(&per_cpu(test_event_disable, cpu)); 1371 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1372 ftrace_preempt_enable(resched); 1372 ftrace_preempt_enable(resched);
1373} 1373}
1374 1374
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 12327b2bb785..43173c4e0ade 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -790,6 +790,21 @@ config DEBUG_BLOCK_EXT_DEVT
790 790
791 Say N if you are unsure. 791 Say N if you are unsure.
792 792
793config DEBUG_FORCE_WEAK_PER_CPU
794 bool "Force weak per-cpu definitions"
795 depends on DEBUG_KERNEL
796 help
797 s390 and alpha require percpu variables in modules to be
798 defined weak to work around addressing range issue which
799 puts the following two restrictions on percpu variable
800 definitions.
801
802 1. percpu symbols must be unique whether static or not
803 2. percpu variables can't be defined inside a function
804
805 To ensure that generic code follows the above rules, this
806 option forces all percpu variables to be defined as weak.
807
793config LKDTM 808config LKDTM
794 tristate "Linux Kernel Dump Test Tool Module" 809 tristate "Linux Kernel Dump Test Tool Module"
795 depends on DEBUG_KERNEL 810 depends on DEBUG_KERNEL
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..c77c6487552f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
34obj-$(CONFIG_FS_XIP) += filemap_xip.o 34obj-$(CONFIG_FS_XIP) += filemap_xip.o
35obj-$(CONFIG_MIGRATION) += migrate.o 35obj-$(CONFIG_MIGRATION) += migrate.o
36ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 36ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
37obj-$(CONFIG_SMP) += percpu.o 37obj-$(CONFIG_SMP) += percpu.o
38else 38else
39obj-$(CONFIG_SMP) += allocpercpu.o 39obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index dfdee6a47359..df34ceae0c67 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -5,6 +5,8 @@
5 */ 5 */
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/bootmem.h>
9#include <asm/sections.h>
8 10
9#ifndef cache_line_size 11#ifndef cache_line_size
10#define cache_line_size() L1_CACHE_BYTES 12#define cache_line_size() L1_CACHE_BYTES
@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
147 kfree(__percpu_disguise(__pdata)); 149 kfree(__percpu_disguise(__pdata));
148} 150}
149EXPORT_SYMBOL_GPL(free_percpu); 151EXPORT_SYMBOL_GPL(free_percpu);
152
153/*
154 * Generic percpu area setup.
155 */
156#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
157unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
158
159EXPORT_SYMBOL(__per_cpu_offset);
160
161void __init setup_per_cpu_areas(void)
162{
163 unsigned long size, i;
164 char *ptr;
165 unsigned long nr_possible_cpus = num_possible_cpus();
166
167 /* Copy section for each CPU (we discard the original) */
168 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
169 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
170
171 for_each_possible_cpu(i) {
172 __per_cpu_offset[i] = ptr - __per_cpu_start;
173 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
174 ptr += size;
175 }
176}
177#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
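The generic setup_per_cpu_areas() that moved here only fills in __per_cpu_offset[]; the accessors then add that offset to the address of the static per_cpu__ symbol. Conceptually, a simplified sketch of the usual per_cpu_ptr() plumbing (not code from this patch; the real version goes through RELOC_HIDE()):

/* simplified: turn an offset produced above into cpu's copy */
#define my_per_cpu_ptr(ptr, cpu)					\
	((__typeof__(ptr))((char *)(ptr) + __per_cpu_offset[(cpu)]))

/*
 * So per_cpu(foo, 3) is roughly *my_per_cpu_ptr(&per_cpu__foo, 3): the
 * bootmem copy made for cpu 3 lives __per_cpu_offset[3] bytes past the
 * original .data.percpu image.
 */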
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index d5292fc6f523..177a5169bbde 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -36,7 +36,7 @@ struct test_node {
36}; 36};
37 37
38static LIST_HEAD(test_list); 38static LIST_HEAD(test_list);
39static DEFINE_PER_CPU(void *, test_pointer); 39static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
40 40
41/* 41/*
42 * Some very simple testing. This function needs to be extended for 42 * Some very simple testing. This function needs to be extended for
@@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void)
86 } 86 }
87 87
88 for_each_possible_cpu(i) { 88 for_each_possible_cpu(i) {
89 per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); 89 per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
90 pr_info("kmemleak: kmalloc(129) = %p\n", 90 pr_info("kmemleak: kmalloc(129) = %p\n",
91 per_cpu(test_pointer, i)); 91 per_cpu(kmemleak_test_pointer, i));
92 } 92 }
93 93
94 return 0; 94 return 0;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..997186c0b519 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -610,6 +610,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
610 } 610 }
611} 611}
612 612
613static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
614
613/** 615/**
614 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 616 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
615 * @mapping: address_space which was dirtied 617 * @mapping: address_space which was dirtied
@@ -627,7 +629,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
627void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 629void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
628 unsigned long nr_pages_dirtied) 630 unsigned long nr_pages_dirtied)
629{ 631{
630 static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
631 unsigned long ratelimit; 632 unsigned long ratelimit;
632 unsigned long *p; 633 unsigned long *p;
633 634
@@ -640,7 +641,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
640 * tasks in balance_dirty_pages(). Period. 641 * tasks in balance_dirty_pages(). Period.
641 */ 642 */
642 preempt_disable(); 643 preempt_disable();
643 p = &__get_cpu_var(ratelimits); 644 p = &__get_cpu_var(bdp_ratelimits);
644 *p += nr_pages_dirtied; 645 *p += nr_pages_dirtied;
645 if (unlikely(*p >= ratelimit)) { 646 if (unlikely(*p >= ratelimit)) {
646 *p = 0; 647 *p = 0;
diff --git a/mm/percpu.c b/mm/percpu.c
index 5fe37842e0ea..3f9f182f9b44 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk is consisted of nr_cpu_ids units and the first chunk is used 11 * chunk is consisted of boot-time determined number of units and the
12 * for static percpu variables in the kernel image (special boot time 12 * first chunk is used for static percpu variables in the kernel image
13 * alloc/init handling necessary as these areas need to be brought up 13 * (special boot time alloc/init handling necessary as these areas
14 * before allocation services are running). Unit grows as necessary 14 * need to be brought up before allocation services are running).
15 * and all units grow or shrink in unison. When a chunk is filled up, 15 * Unit grows as necessary and all units grow or shrink in unison.
16 * another chunk is allocated. ie. in vmalloc area 16 * When a chunk is filled up, another chunk is allocated. ie. in
17 * vmalloc area
17 * 18 *
18 * c0 c1 c2 19 * c0 c1 c2
19 * ------------------- ------------------- ------------ 20 * ------------------- ------------------- ------------
@@ -22,11 +23,13 @@
22 * 23 *
23 * Allocation is done in offset-size areas of single unit space. Ie, 24 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 25 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 26 * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
26 * percpu base registers pcpu_unit_size apart. 27 * cpus. On NUMA, the mapping can be non-linear and even sparse.
28 * Percpu access can be done by configuring percpu base registers
29 * according to cpu to unit mapping and pcpu_unit_size.
27 * 30 *
28 * There are usually many small percpu allocations many of them as 31 * There are usually many small percpu allocations many of them being
29 * small as 4 bytes. The allocator organizes chunks into lists 32 * as small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one. 33 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is 34 * Each chunk keeps the maximum contiguous area size hint which is
 32 * guaranteed to be equal to or larger than the maximum contiguous 35 * guaranteed to be equal to or larger than the maximum contiguous
@@ -43,7 +46,7 @@
43 * 46 *
 44 * To use this allocator, arch code should do the following. 47 * To use this allocator, arch code should do the following.
45 * 48 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 49 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
47 * 50 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 51 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back if they need to be 52 * regular address to percpu pointer and back if they need to be
@@ -56,6 +59,7 @@
56#include <linux/bitmap.h> 59#include <linux/bitmap.h>
57#include <linux/bootmem.h> 60#include <linux/bootmem.h>
58#include <linux/list.h> 61#include <linux/list.h>
62#include <linux/log2.h>
59#include <linux/mm.h> 63#include <linux/mm.h>
60#include <linux/module.h> 64#include <linux/module.h>
61#include <linux/mutex.h> 65#include <linux/mutex.h>
@@ -94,20 +98,27 @@ struct pcpu_chunk {
94 int map_alloc; /* # of map entries allocated */ 98 int map_alloc; /* # of map entries allocated */
95 int *map; /* allocation map */ 99 int *map; /* allocation map */
96 bool immutable; /* no [de]population allowed */ 100 bool immutable; /* no [de]population allowed */
97 struct page **page; /* points to page array */ 101 unsigned long populated[]; /* populated bitmap */
98 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
99}; 102};
100 103
101static int pcpu_unit_pages __read_mostly; 104static int pcpu_unit_pages __read_mostly;
102static int pcpu_unit_size __read_mostly; 105static int pcpu_unit_size __read_mostly;
106static int pcpu_nr_units __read_mostly;
103static int pcpu_chunk_size __read_mostly; 107static int pcpu_chunk_size __read_mostly;
104static int pcpu_nr_slots __read_mostly; 108static int pcpu_nr_slots __read_mostly;
105static size_t pcpu_chunk_struct_size __read_mostly; 109static size_t pcpu_chunk_struct_size __read_mostly;
106 110
111/* cpus with the lowest and highest unit numbers */
112static unsigned int pcpu_first_unit_cpu __read_mostly;
113static unsigned int pcpu_last_unit_cpu __read_mostly;
114
107/* the address of the first chunk which starts with the kernel static area */ 115/* the address of the first chunk which starts with the kernel static area */
108void *pcpu_base_addr __read_mostly; 116void *pcpu_base_addr __read_mostly;
109EXPORT_SYMBOL_GPL(pcpu_base_addr); 117EXPORT_SYMBOL_GPL(pcpu_base_addr);
110 118
119/* cpu -> unit map */
120const int *pcpu_unit_map __read_mostly;
121
111/* 122/*
112 * The first chunk which always exists. Note that unlike other 123 * The first chunk which always exists. Note that unlike other
113 * chunks, this one can be allocated and mapped in several different 124 * chunks, this one can be allocated and mapped in several different
@@ -129,9 +140,9 @@ static int pcpu_reserved_chunk_limit;
129 * Synchronization rules. 140 * Synchronization rules.
130 * 141 *
131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 142 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
132 * protects allocation/reclaim paths, chunks and chunk->page arrays. 143 * protects allocation/reclaim paths, chunks, populated bitmap and
133 * The latter is a spinlock and protects the index data structures - 144 * vmalloc mapping. The latter is a spinlock and protects the index
134 * chunk slots, chunks and area maps in chunks. 145 * data structures - chunk slots, chunks and area maps in chunks.
135 * 146 *
136 * During allocation, pcpu_alloc_mutex is kept locked all the time and 147 * During allocation, pcpu_alloc_mutex is kept locked all the time and
137 * pcpu_lock is grabbed and released as necessary. All actual memory 148 * pcpu_lock is grabbed and released as necessary. All actual memory
@@ -178,13 +189,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
178 189
179static int pcpu_page_idx(unsigned int cpu, int page_idx) 190static int pcpu_page_idx(unsigned int cpu, int page_idx)
180{ 191{
181 return cpu * pcpu_unit_pages + page_idx; 192 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
182}
183
184static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
185 unsigned int cpu, int page_idx)
186{
187 return &chunk->page[pcpu_page_idx(cpu, page_idx)];
188} 193}
189 194
190static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 195static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
@@ -194,10 +199,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
194 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); 199 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
195} 200}
196 201
197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 202static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
198 int page_idx) 203 unsigned int cpu, int page_idx)
199{ 204{
200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 205 /* must not be used on pre-mapped chunk */
206 WARN_ON(chunk->immutable);
207
208 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
201} 209}
202 210
203/* set the pointer to a chunk in a page struct */ 211/* set the pointer to a chunk in a page struct */
@@ -212,6 +220,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
212 return (struct pcpu_chunk *)page->index; 220 return (struct pcpu_chunk *)page->index;
213} 221}
214 222
223static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
224{
225 *rs = find_next_zero_bit(chunk->populated, end, *rs);
226 *re = find_next_bit(chunk->populated, end, *rs + 1);
227}
228
229static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
230{
231 *rs = find_next_bit(chunk->populated, end, *rs);
232 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
233}
234
235/*
236 * (Un)populated page region iterators. Iterate over (un)populated
237 * page regions betwen @start and @end in @chunk. @rs and @re should
238 * be integer variables and will be set to start and end page index of
239 * the current region.
240 */
241#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
242 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
243 (rs) < (re); \
244 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
245
246#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
247 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
248 (rs) < (re); \
249 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
250
215/** 251/**
216 * pcpu_mem_alloc - allocate memory 252 * pcpu_mem_alloc - allocate memory
217 * @size: bytes to allocate 253 * @size: bytes to allocate
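The populated[] bitmap replaces the per-chunk page pointer array, and the iterators added above are how later code walks the holes in it. A hedged usage sketch of the unpopulated-region iterator (the surrounding function and populate_region() are hypothetical; the real callers appear further down in this file):

/* hypothetical: populate every hole in [page_start, page_end) */
static int populate_missing(struct pcpu_chunk *chunk,
			    int page_start, int page_end)
{
	int rs, re;	/* region start/end page indexes, set by the macro */

	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
		/* [rs, re) is a maximal run of unpopulated pages */
		int err = populate_region(chunk, rs, re);

		if (err)
			return err;
	}
	return 0;
}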
@@ -290,13 +326,21 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
290 void *first_start = pcpu_first_chunk->vm->addr; 326 void *first_start = pcpu_first_chunk->vm->addr;
291 327
292 /* is it in the first chunk? */ 328 /* is it in the first chunk? */
293 if (addr >= first_start && addr < first_start + pcpu_chunk_size) { 329 if (addr >= first_start && addr < first_start + pcpu_unit_size) {
294 /* is it in the reserved area? */ 330 /* is it in the reserved area? */
295 if (addr < first_start + pcpu_reserved_chunk_limit) 331 if (addr < first_start + pcpu_reserved_chunk_limit)
296 return pcpu_reserved_chunk; 332 return pcpu_reserved_chunk;
297 return pcpu_first_chunk; 333 return pcpu_first_chunk;
298 } 334 }
299 335
336 /*
337 * The address is relative to unit0 which might be unused and
338 * thus unmapped. Offset the address to the unit space of the
339 * current processor before looking it up in the vmalloc
340 * space. Note that any possible cpu id can be used here, so
341 * there's no need to worry about preemption or cpu hotplug.
342 */
343 addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
300 return pcpu_get_page_chunk(vmalloc_to_page(addr)); 344 return pcpu_get_page_chunk(vmalloc_to_page(addr));
301} 345}
302 346
@@ -545,125 +589,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
545} 589}
546 590
547/** 591/**
548 * pcpu_unmap - unmap pages out of a pcpu_chunk 592 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
549 * @chunk: chunk of interest 593 * @chunk: chunk of interest
550 * @page_start: page index of the first page to unmap 594 * @bitmapp: output parameter for bitmap
551 * @page_end: page index of the last page to unmap + 1 595 * @may_alloc: may allocate the array
552 * @flush_tlb: whether to flush tlb or not
553 * 596 *
554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 597 * Returns pointer to array of pointers to struct page and bitmap,
555 * If @flush is true, vcache is flushed before unmapping and tlb 598 * both of which can be indexed with pcpu_page_idx(). The returned
556 * after. 599 * array is cleared to zero and *@bitmapp is copied from
600 * @chunk->populated. Note that there is only one array and bitmap
601 * and access exclusion is the caller's responsibility.
602 *
603 * CONTEXT:
604 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
605 * Otherwise, don't care.
606 *
607 * RETURNS:
608 * Pointer to temp pages array on success, NULL on failure.
557 */ 609 */
558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 610static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
559 bool flush_tlb) 611 unsigned long **bitmapp,
612 bool may_alloc)
560{ 613{
561 unsigned int last = nr_cpu_ids - 1; 614 static struct page **pages;
562 unsigned int cpu; 615 static unsigned long *bitmap;
616 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
617 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
618 sizeof(unsigned long);
619
620 if (!pages || !bitmap) {
621 if (may_alloc && !pages)
622 pages = pcpu_mem_alloc(pages_size);
623 if (may_alloc && !bitmap)
624 bitmap = pcpu_mem_alloc(bitmap_size);
625 if (!pages || !bitmap)
626 return NULL;
627 }
563 628
564 /* unmap must not be done on immutable chunk */ 629 memset(pages, 0, pages_size);
565 WARN_ON(chunk->immutable); 630 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
566 631
567 /* 632 *bitmapp = bitmap;
568 * Each flushing trial can be very expensive, issue flush on 633 return pages;
569 * the whole region at once rather than doing it for each cpu. 634}
570 * This could be an overkill but is more scalable.
571 */
572 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
573 pcpu_chunk_addr(chunk, last, page_end));
574 635
575 for_each_possible_cpu(cpu) 636/**
576 unmap_kernel_range_noflush( 637 * pcpu_free_pages - free pages which were allocated for @chunk
577 pcpu_chunk_addr(chunk, cpu, page_start), 638 * @chunk: chunk pages were allocated for
578 (page_end - page_start) << PAGE_SHIFT); 639 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
579 640 * @populated: populated bitmap
580 /* ditto as flush_cache_vunmap() */ 641 * @page_start: page index of the first page to be freed
581 if (flush_tlb) 642 * @page_end: page index of the last page to be freed + 1
582 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 643 *
583 pcpu_chunk_addr(chunk, last, page_end)); 644 * Free pages [@page_start and @page_end) in @pages for all units.
645 * The pages were allocated for @chunk.
646 */
647static void pcpu_free_pages(struct pcpu_chunk *chunk,
648 struct page **pages, unsigned long *populated,
649 int page_start, int page_end)
650{
651 unsigned int cpu;
652 int i;
653
654 for_each_possible_cpu(cpu) {
655 for (i = page_start; i < page_end; i++) {
656 struct page *page = pages[pcpu_page_idx(cpu, i)];
657
658 if (page)
659 __free_page(page);
660 }
661 }
584} 662}
585 663
586/** 664/**
587 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 665 * pcpu_alloc_pages - allocates pages for @chunk
588 * @chunk: chunk to depopulate 666 * @chunk: target chunk
589 * @off: offset to the area to depopulate 667 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
590 * @size: size of the area to depopulate in bytes 668 * @populated: populated bitmap
591 * @flush: whether to flush cache and tlb or not 669 * @page_start: page index of the first page to be allocated
592 * 670 * @page_end: page index of the last page to be allocated + 1
593 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 671 *
594 * from @chunk. If @flush is true, vcache is flushed before unmapping 672 * Allocate pages [@page_start,@page_end) into @pages for all units.
595 * and tlb after. 673 * The allocation is for @chunk. Percpu core doesn't care about the
596 * 674 * content of @pages and will pass it verbatim to pcpu_map_pages().
597 * CONTEXT:
598 * pcpu_alloc_mutex.
599 */ 675 */
600static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, 676static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
601 bool flush) 677 struct page **pages, unsigned long *populated,
678 int page_start, int page_end)
602{ 679{
603 int page_start = PFN_DOWN(off); 680 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
604 int page_end = PFN_UP(off + size);
605 int unmap_start = -1;
606 int uninitialized_var(unmap_end);
607 unsigned int cpu; 681 unsigned int cpu;
608 int i; 682 int i;
609 683
610 for (i = page_start; i < page_end; i++) { 684 for_each_possible_cpu(cpu) {
611 for_each_possible_cpu(cpu) { 685 for (i = page_start; i < page_end; i++) {
612 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 686 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
687
688 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
689 if (!*pagep) {
690 pcpu_free_pages(chunk, pages, populated,
691 page_start, page_end);
692 return -ENOMEM;
693 }
694 }
695 }
696 return 0;
697}
613 698
614 if (!*pagep) 699/**
615 continue; 700 * pcpu_pre_unmap_flush - flush cache prior to unmapping
701 * @chunk: chunk the regions to be flushed belongs to
702 * @page_start: page index of the first page to be flushed
703 * @page_end: page index of the last page to be flushed + 1
704 *
705 * Pages in [@page_start,@page_end) of @chunk are about to be
706 * unmapped. Flush cache. As each flushing trial can be very
707 * expensive, issue flush on the whole region at once rather than
708 * doing it for each cpu. This could be an overkill but is more
709 * scalable.
710 */
711static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
712 int page_start, int page_end)
713{
714 flush_cache_vunmap(
715 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
716 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
717}
616 718
617 __free_page(*pagep); 719static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
720{
721 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
722}
618 723
619 /* 724/**
620 * If it's partial depopulation, it might get 725 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
621 * populated or depopulated again. Mark the 726 * @chunk: chunk of interest
622 * page gone. 727 * @pages: pages array which can be used to pass information to free
623 */ 728 * @populated: populated bitmap
624 *pagep = NULL; 729 * @page_start: page index of the first page to unmap
730 * @page_end: page index of the last page to unmap + 1
731 *
732 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
733 * Corresponding elements in @pages were cleared by the caller and can
734 * be used to carry information to pcpu_free_pages() which will be
735 * called after all unmaps are finished. The caller should call
736 * proper pre/post flush functions.
737 */
738static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
739 struct page **pages, unsigned long *populated,
740 int page_start, int page_end)
741{
742 unsigned int cpu;
743 int i;
625 744
626 unmap_start = unmap_start < 0 ? i : unmap_start; 745 for_each_possible_cpu(cpu) {
627 unmap_end = i + 1; 746 for (i = page_start; i < page_end; i++) {
747 struct page *page;
748
749 page = pcpu_chunk_page(chunk, cpu, i);
750 WARN_ON(!page);
751 pages[pcpu_page_idx(cpu, i)] = page;
628 } 752 }
753 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
754 page_end - page_start);
629 } 755 }
630 756
631 if (unmap_start >= 0) 757 for (i = page_start; i < page_end; i++)
632 pcpu_unmap(chunk, unmap_start, unmap_end, flush); 758 __clear_bit(i, populated);
633} 759}
634 760
635/** 761/**
636 * pcpu_map - map pages into a pcpu_chunk 762 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
763 * @chunk: pcpu_chunk the regions to be flushed belong to
764 * @page_start: page index of the first page to be flushed
765 * @page_end: page index of the last page to be flushed + 1
766 *
767 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
768 * TLB for the regions. This can be skipped if the area is to be
769 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
770 *
771 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
772 * for the whole region.
773 */
774static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
775 int page_start, int page_end)
776{
777 flush_tlb_kernel_range(
778 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
779 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
780}
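
Taken together, the helpers above form one unmap cycle: flush the cache once for the whole affected range, unmap without flushing, then flush the TLB once (or skip it entirely when the range is being handed back to vmalloc, which flushes lazily). A hedged kernel-style sketch of that sequence for a single contiguous range, using only calls that already appear in this patch; unmap_range() itself is hypothetical:

/* sketch only: unmap 'nr' pages starting at 'addr' with batched flushes */
static void unmap_range(unsigned long addr, int nr, bool lazy_tlb)
{
	unsigned long end = addr + ((unsigned long)nr << PAGE_SHIFT);

	flush_cache_vunmap(addr, end);		/* one cache flush up front */
	unmap_kernel_range_noflush(addr, end - addr);
	if (!lazy_tlb)
		flush_tlb_kernel_range(addr, end);  /* skipped when vmalloc will flush lazily */
}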
781
782static int __pcpu_map_pages(unsigned long addr, struct page **pages,
783 int nr_pages)
784{
785 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
786 PAGE_KERNEL, pages);
787}
788
789/**
790 * pcpu_map_pages - map pages into a pcpu_chunk
637 * @chunk: chunk of interest 791 * @chunk: chunk of interest
792 * @pages: pages array containing pages to be mapped
793 * @populated: populated bitmap
638 * @page_start: page index of the first page to map 794 * @page_start: page index of the first page to map
639 * @page_end: page index of the last page to map + 1 795 * @page_end: page index of the last page to map + 1
640 * 796 *
641 * For each cpu, map pages [@page_start,@page_end) into @chunk. 797 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
642 * vcache is flushed afterwards. 798 * caller is responsible for calling pcpu_post_map_flush() after all
799 * mappings are complete.
800 *
801 * This function is responsible for setting corresponding bits in
802 * @chunk->populated bitmap and whatever is necessary for reverse
803 * lookup (addr -> chunk).
643 */ 804 */
644static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 805static int pcpu_map_pages(struct pcpu_chunk *chunk,
806 struct page **pages, unsigned long *populated,
807 int page_start, int page_end)
645{ 808{
646 unsigned int last = nr_cpu_ids - 1; 809 unsigned int cpu, tcpu;
647 unsigned int cpu; 810 int i, err;
648 int err;
649
650 /* map must not be done on immutable chunk */
651 WARN_ON(chunk->immutable);
652 811
653 for_each_possible_cpu(cpu) { 812 for_each_possible_cpu(cpu) {
654 err = map_kernel_range_noflush( 813 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
655 pcpu_chunk_addr(chunk, cpu, page_start), 814 &pages[pcpu_page_idx(cpu, page_start)],
656 (page_end - page_start) << PAGE_SHIFT, 815 page_end - page_start);
657 PAGE_KERNEL,
658 pcpu_chunk_pagep(chunk, cpu, page_start));
659 if (err < 0) 816 if (err < 0)
660 return err; 817 goto err;
818 }
819
820 /* mapping successful, link chunk and mark populated */
821 for (i = page_start; i < page_end; i++) {
822 for_each_possible_cpu(cpu)
823 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
824 chunk);
825 __set_bit(i, populated);
661 } 826 }
662 827
663 /* flush at once, please read comments in pcpu_unmap() */
664 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
665 pcpu_chunk_addr(chunk, last, page_end));
666 return 0; 828 return 0;
829
830err:
831 for_each_possible_cpu(tcpu) {
832 if (tcpu == cpu)
833 break;
834 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
835 page_end - page_start);
836 }
837 return err;
838}
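
Note how the error path of pcpu_map_pages() only unwinds CPUs that were fully mapped before the failure: the tcpu loop stops as soon as it reaches the failing cpu. A tiny user-space sketch of that partial-unwind idiom (illustrative only; do_one() and undo_one() are hypothetical helpers, declarations only):

static int do_one(int cpu);	/* hypothetical per-cpu step */
static void undo_one(int cpu);	/* hypothetical inverse step */

static int do_all(int nr_cpus)
{
	int cpu, tcpu, err;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		err = do_one(cpu);
		if (err)
			goto fail;
	}
	return 0;
fail:
	/* undo only the cpus completed before the failing one */
	for (tcpu = 0; tcpu < cpu; tcpu++)
		undo_one(tcpu);
	return err;
}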
839
840/**
841 * pcpu_post_map_flush - flush cache after mapping
842 * @chunk: pcpu_chunk the regions to be flushed belong to
843 * @page_start: page index of the first page to be flushed
844 * @page_end: page index of the last page to be flushed + 1
845 *
846 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
847 * cache.
848 *
 849 * As with pcpu_pre_unmap_flush(), the cache flush is done at once
 850 * for the whole region.
851 */
852static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
853 int page_start, int page_end)
854{
855 flush_cache_vmap(
856 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
857 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
858}
859
860/**
861 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
862 * @chunk: chunk to depopulate
863 * @off: offset to the area to depopulate
864 * @size: size of the area to depopulate in bytes
 865 *
 866 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 867 * from @chunk.  The cache is flushed before unmapping; no TLB flush
 868 * is issued here since vmalloc handles that lazily.  Immutable
 869 * chunks must not be depopulated.
870 *
871 * CONTEXT:
872 * pcpu_alloc_mutex.
873 */
874static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
875{
876 int page_start = PFN_DOWN(off);
877 int page_end = PFN_UP(off + size);
878 struct page **pages;
879 unsigned long *populated;
880 int rs, re;
881
882 /* quick path, check whether it's empty already */
883 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
884 if (rs == page_start && re == page_end)
885 return;
886 break;
887 }
888
889 /* immutable chunks can't be depopulated */
890 WARN_ON(chunk->immutable);
891
892 /*
893 * If control reaches here, there must have been at least one
894 * successful population attempt so the temp pages array must
895 * be available now.
896 */
897 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
898 BUG_ON(!pages);
899
900 /* unmap and free */
901 pcpu_pre_unmap_flush(chunk, page_start, page_end);
902
903 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
904 pcpu_unmap_pages(chunk, pages, populated, rs, re);
905
906 /* no need to flush tlb, vmalloc will handle it lazily */
907
908 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
909 pcpu_free_pages(chunk, pages, populated, rs, re);
910
911 /* commit new bitmap */
912 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
667} 913}
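
pcpu_for_each_pop_region() and pcpu_for_each_unpop_region(), used above, walk maximal runs of set (populated) or clear (unpopulated) bits in the chunk's populated bitmap; their definitions are earlier in the patch and not shown in this hunk. A small user-space model of the idea (illustrative only, using a plain unsigned long as the bitmap):

#include <stdio.h>

/* report each maximal run [rs, re) of set bits in 'map' below 'nbits' */
static void for_each_pop_region(unsigned long map, int nbits)
{
	int rs, re;

	for (rs = 0; rs < nbits; rs = re) {
		while (rs < nbits && !(map & (1UL << rs)))
			rs++;				/* find next set bit */
		re = rs;
		while (re < nbits && (map & (1UL << re)))
			re++;				/* extend the run */
		if (rs < nbits)
			printf("populated region [%d, %d)\n", rs, re);
	}
}

int main(void)
{
	for_each_pop_region(0x3cUL, 8);	/* bits 2..5 set, prints [2, 6) */
	return 0;
}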
668 914
669/** 915/**
@@ -680,50 +926,60 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
680 */ 926 */
681static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 927static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
682{ 928{
683 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
684 int page_start = PFN_DOWN(off); 929 int page_start = PFN_DOWN(off);
685 int page_end = PFN_UP(off + size); 930 int page_end = PFN_UP(off + size);
686 int map_start = -1; 931 int free_end = page_start, unmap_end = page_start;
687 int uninitialized_var(map_end); 932 struct page **pages;
933 unsigned long *populated;
688 unsigned int cpu; 934 unsigned int cpu;
689 int i; 935 int rs, re, rc;
690 936
691 for (i = page_start; i < page_end; i++) { 937 /* quick path, check whether all pages are already there */
692 if (pcpu_chunk_page_occupied(chunk, i)) { 938 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
693 if (map_start >= 0) { 939 if (rs == page_start && re == page_end)
694 if (pcpu_map(chunk, map_start, map_end)) 940 goto clear;
695 goto err; 941 break;
696 map_start = -1; 942 }
697 }
698 continue;
699 }
700 943
701 map_start = map_start < 0 ? i : map_start; 944 /* need to allocate and map pages, this chunk can't be immutable */
702 map_end = i + 1; 945 WARN_ON(chunk->immutable);
703 946
704 for_each_possible_cpu(cpu) { 947 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
705 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 948 if (!pages)
949 return -ENOMEM;
706 950
707 *pagep = alloc_pages_node(cpu_to_node(cpu), 951 /* alloc and map */
708 alloc_mask, 0); 952 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
709 if (!*pagep) 953 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
710 goto err; 954 if (rc)
711 pcpu_set_page_chunk(*pagep, chunk); 955 goto err_free;
712 } 956 free_end = re;
713 } 957 }
714 958
715 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) 959 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
716 goto err; 960 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
961 if (rc)
962 goto err_unmap;
963 unmap_end = re;
964 }
965 pcpu_post_map_flush(chunk, page_start, page_end);
717 966
967 /* commit new bitmap */
968 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
969clear:
718 for_each_possible_cpu(cpu) 970 for_each_possible_cpu(cpu)
719 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 971 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
720 size);
721
722 return 0; 972 return 0;
723err: 973
724 /* likely under heavy memory pressure, give memory back */ 974err_unmap:
725 pcpu_depopulate_chunk(chunk, off, size, true); 975 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
726 return -ENOMEM; 976 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
977 pcpu_unmap_pages(chunk, pages, populated, rs, re);
978 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
979err_free:
980 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
981 pcpu_free_pages(chunk, pages, populated, rs, re);
982 return rc;
727} 983}
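
The populate path above keeps two watermarks, free_end and unmap_end, so that on error it only unwinds regions that were actually allocated or mapped before committing the new populated bitmap. A compact user-space sketch of that staged-unwind pattern (illustrative only; the stage functions are hypothetical, declarations only):

static int  alloc_stage(int start, int end);	/* hypothetical */
static int  map_stage(int start, int end);	/* hypothetical */
static void unmap_stage(int start, int end);	/* hypothetical */
static void free_stage(int start, int end);	/* hypothetical */

static int populate(int start, int end)
{
	int free_end = start, unmap_end = start;
	int rc;

	rc = alloc_stage(start, end);
	if (rc)
		goto err_free;
	free_end = end;			/* [start, end) is now allocated */

	rc = map_stage(start, end);
	if (rc)
		goto err_unmap;
	unmap_end = end;		/* [start, end) is now mapped */

	return 0;

err_unmap:
	unmap_stage(start, unmap_end);	/* only what was really mapped */
err_free:
	free_stage(start, free_end);	/* only what was really allocated */
	return rc;
}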
728 984
729static void free_pcpu_chunk(struct pcpu_chunk *chunk) 985static void free_pcpu_chunk(struct pcpu_chunk *chunk)
@@ -747,7 +1003,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
747 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); 1003 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
748 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 1004 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
749 chunk->map[chunk->map_used++] = pcpu_unit_size; 1005 chunk->map[chunk->map_used++] = pcpu_unit_size;
750 chunk->page = chunk->page_ar;
751 1006
752 chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); 1007 chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
753 if (!chunk->vm) { 1008 if (!chunk->vm) {
@@ -847,6 +1102,7 @@ area_found:
847 1102
848 mutex_unlock(&pcpu_alloc_mutex); 1103 mutex_unlock(&pcpu_alloc_mutex);
849 1104
1105 /* return address relative to unit0 */
850 return __addr_to_pcpu_ptr(chunk->vm->addr + off); 1106 return __addr_to_pcpu_ptr(chunk->vm->addr + off);
851 1107
852fail_unlock: 1108fail_unlock:
@@ -928,7 +1184,7 @@ static void pcpu_reclaim(struct work_struct *work)
928 mutex_unlock(&pcpu_alloc_mutex); 1184 mutex_unlock(&pcpu_alloc_mutex);
929 1185
930 list_for_each_entry_safe(chunk, next, &todo, list) { 1186 list_for_each_entry_safe(chunk, next, &todo, list) {
931 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); 1187 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
932 free_pcpu_chunk(chunk); 1188 free_pcpu_chunk(chunk);
933 } 1189 }
934} 1190}
@@ -976,26 +1232,16 @@ EXPORT_SYMBOL_GPL(free_percpu);
976 1232
977/** 1233/**
978 * pcpu_setup_first_chunk - initialize the first percpu chunk 1234 * pcpu_setup_first_chunk - initialize the first percpu chunk
979 * @get_page_fn: callback to fetch page pointer
980 * @static_size: the size of static percpu area in bytes 1235 * @static_size: the size of static percpu area in bytes
981 * @reserved_size: the size of reserved percpu area in bytes 1236 * @reserved_size: the size of reserved percpu area in bytes, 0 for none
982 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1237 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
983 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1238 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
984 * @base_addr: mapped address, NULL for auto 1239 * @base_addr: mapped address
985 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary 1240 * @unit_map: cpu -> unit map, NULL for sequential mapping
986 * 1241 *
987 * Initialize the first percpu chunk which contains the kernel static 1242 * Initialize the first percpu chunk which contains the kernel static
 988 * percpu area. This function is to be called from arch percpu area 1243 * percpu area. This function is to be called from arch percpu area
989 * setup path. The first two parameters are mandatory. The rest are 1244 * setup path.
990 * optional.
991 *
992 * @get_page_fn() should return pointer to percpu page given cpu
993 * number and page number. It should at least return enough pages to
994 * cover the static area. The returned pages for static area should
995 * have been initialized with valid data. If @unit_size is specified,
996 * it can also return pages after the static area. NULL return
997 * indicates end of pages for the cpu. Note that @get_page_fn() must
998 * return the same number of pages for all cpus.
999 * 1245 *
1000 * @reserved_size, if non-zero, specifies the amount of bytes to 1246 * @reserved_size, if non-zero, specifies the amount of bytes to
1001 * reserve after the static area in the first chunk. This reserves 1247 * reserve after the static area in the first chunk. This reserves
@@ -1010,17 +1256,12 @@ EXPORT_SYMBOL_GPL(free_percpu);
1010 * non-negative value makes percpu leave alone the area beyond 1256 * non-negative value makes percpu leave alone the area beyond
1011 * @static_size + @reserved_size + @dyn_size. 1257 * @static_size + @reserved_size + @dyn_size.
1012 * 1258 *
1013 * @unit_size, if non-negative, specifies unit size and must be 1259 * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
1014 * aligned to PAGE_SIZE and equal to or larger than @static_size + 1260 * equal to or larger than @static_size + @reserved_size +
1015 * @reserved_size + if non-negative, @dyn_size. 1261 * @dyn_size (when @dyn_size is non-negative).
1016 *
1017 * Non-null @base_addr means that the caller already allocated virtual
1018 * region for the first chunk and mapped it. percpu must not mess
1019 * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
1020 * @populate_pte_fn doesn't make any sense.
1021 * 1262 *
1022 * @populate_pte_fn is used to populate the pagetable. NULL means the 1263 * The caller should have mapped the first chunk at @base_addr and
1023 * caller already populated the pagetable. 1264 * copied static data to each unit.
1024 * 1265 *
1025 * If the first chunk ends up with both reserved and dynamic areas, it 1266 * If the first chunk ends up with both reserved and dynamic areas, it
1026 * is served by two chunks - one to serve the core static and reserved 1267 * is served by two chunks - one to serve the core static and reserved
@@ -1033,47 +1274,83 @@ EXPORT_SYMBOL_GPL(free_percpu);
1033 * The determined pcpu_unit_size which can be used to initialize 1274 * The determined pcpu_unit_size which can be used to initialize
1034 * percpu access. 1275 * percpu access.
1035 */ 1276 */
1036size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1277size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
1037 size_t static_size, size_t reserved_size, 1278 ssize_t dyn_size, size_t unit_size,
1038 ssize_t dyn_size, ssize_t unit_size, 1279 void *base_addr, const int *unit_map)
1039 void *base_addr,
1040 pcpu_populate_pte_fn_t populate_pte_fn)
1041{ 1280{
1042 static struct vm_struct first_vm; 1281 static struct vm_struct first_vm;
1043 static int smap[2], dmap[2]; 1282 static int smap[2], dmap[2];
1044 size_t size_sum = static_size + reserved_size + 1283 size_t size_sum = static_size + reserved_size +
1045 (dyn_size >= 0 ? dyn_size : 0); 1284 (dyn_size >= 0 ? dyn_size : 0);
1046 struct pcpu_chunk *schunk, *dchunk = NULL; 1285 struct pcpu_chunk *schunk, *dchunk = NULL;
1047 unsigned int cpu; 1286 unsigned int cpu, tcpu;
1048 int nr_pages; 1287 int i;
1049 int err, i;
1050 1288
1051 /* santiy checks */ 1289 /* sanity checks */
1052 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1290 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1053 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1291 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1054 BUG_ON(!static_size); 1292 BUG_ON(!static_size);
1055 if (unit_size >= 0) { 1293 BUG_ON(!base_addr);
1056 BUG_ON(unit_size < size_sum); 1294 BUG_ON(unit_size < size_sum);
1057 BUG_ON(unit_size & ~PAGE_MASK); 1295 BUG_ON(unit_size & ~PAGE_MASK);
1058 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); 1296 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
1059 } else 1297
1060 BUG_ON(base_addr); 1298 /* determine number of units and verify and initialize pcpu_unit_map */
1061 BUG_ON(base_addr && populate_pte_fn); 1299 if (unit_map) {
1062 1300 int first_unit = INT_MAX, last_unit = INT_MIN;
1063 if (unit_size >= 0) 1301
1064 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1302 for_each_possible_cpu(cpu) {
1065 else 1303 int unit = unit_map[cpu];
1066 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, 1304
1067 PFN_UP(size_sum)); 1305 BUG_ON(unit < 0);
1306 for_each_possible_cpu(tcpu) {
1307 if (tcpu == cpu)
1308 break;
1309 /* the mapping should be one-to-one */
1310 BUG_ON(unit_map[tcpu] == unit);
1311 }
1312
1313 if (unit < first_unit) {
1314 pcpu_first_unit_cpu = cpu;
1315 first_unit = unit;
1316 }
1317 if (unit > last_unit) {
1318 pcpu_last_unit_cpu = cpu;
1319 last_unit = unit;
1320 }
1321 }
1322 pcpu_nr_units = last_unit + 1;
1323 pcpu_unit_map = unit_map;
1324 } else {
1325 int *identity_map;
1326
1327 /* #units == #cpus, identity mapped */
1328 identity_map = alloc_bootmem(nr_cpu_ids *
1329 sizeof(identity_map[0]));
1068 1330
1331 for_each_possible_cpu(cpu)
1332 identity_map[cpu] = cpu;
1333
1334 pcpu_first_unit_cpu = 0;
 1335 pcpu_nr_units = nr_cpu_ids;
 1336 pcpu_last_unit_cpu = pcpu_nr_units - 1;
1337 pcpu_unit_map = identity_map;
1338 }
1339
1340 /* determine basic parameters */
1341 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
1069 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1342 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1070 pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; 1343 pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
1071 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) 1344 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1072 + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); 1345 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1073 1346
1074 if (dyn_size < 0) 1347 if (dyn_size < 0)
1075 dyn_size = pcpu_unit_size - static_size - reserved_size; 1348 dyn_size = pcpu_unit_size - static_size - reserved_size;
1076 1349
1350 first_vm.flags = VM_ALLOC;
1351 first_vm.size = pcpu_chunk_size;
1352 first_vm.addr = base_addr;
1353
1077 /* 1354 /*
1078 * Allocate chunk slots. The additional last slot is for 1355 * Allocate chunk slots. The additional last slot is for
1079 * empty chunks. 1356 * empty chunks.
@@ -1095,7 +1372,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1095 schunk->vm = &first_vm; 1372 schunk->vm = &first_vm;
1096 schunk->map = smap; 1373 schunk->map = smap;
1097 schunk->map_alloc = ARRAY_SIZE(smap); 1374 schunk->map_alloc = ARRAY_SIZE(smap);
1098 schunk->page = schunk->page_ar; 1375 schunk->immutable = true;
1376 bitmap_fill(schunk->populated, pcpu_unit_pages);
1099 1377
1100 if (reserved_size) { 1378 if (reserved_size) {
1101 schunk->free_size = reserved_size; 1379 schunk->free_size = reserved_size;
@@ -1113,93 +1391,39 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1113 1391
1114 /* init dynamic chunk if necessary */ 1392 /* init dynamic chunk if necessary */
1115 if (dyn_size) { 1393 if (dyn_size) {
1116 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1394 dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1117 INIT_LIST_HEAD(&dchunk->list); 1395 INIT_LIST_HEAD(&dchunk->list);
1118 dchunk->vm = &first_vm; 1396 dchunk->vm = &first_vm;
1119 dchunk->map = dmap; 1397 dchunk->map = dmap;
1120 dchunk->map_alloc = ARRAY_SIZE(dmap); 1398 dchunk->map_alloc = ARRAY_SIZE(dmap);
1121 dchunk->page = schunk->page_ar; /* share page map with schunk */ 1399 dchunk->immutable = true;
1400 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1122 1401
1123 dchunk->contig_hint = dchunk->free_size = dyn_size; 1402 dchunk->contig_hint = dchunk->free_size = dyn_size;
1124 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1403 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1125 dchunk->map[dchunk->map_used++] = dchunk->free_size; 1404 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1126 } 1405 }
1127 1406
1128 /* allocate vm address */
1129 first_vm.flags = VM_ALLOC;
1130 first_vm.size = pcpu_chunk_size;
1131
1132 if (!base_addr)
1133 vm_area_register_early(&first_vm, PAGE_SIZE);
1134 else {
1135 /*
1136 * Pages already mapped. No need to remap into
1137 * vmalloc area. In this case the first chunks can't
1138 * be mapped or unmapped by percpu and are marked
1139 * immutable.
1140 */
1141 first_vm.addr = base_addr;
1142 schunk->immutable = true;
1143 if (dchunk)
1144 dchunk->immutable = true;
1145 }
1146
1147 /* assign pages */
1148 nr_pages = -1;
1149 for_each_possible_cpu(cpu) {
1150 for (i = 0; i < pcpu_unit_pages; i++) {
1151 struct page *page = get_page_fn(cpu, i);
1152
1153 if (!page)
1154 break;
1155 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1156 }
1157
1158 BUG_ON(i < PFN_UP(static_size));
1159
1160 if (nr_pages < 0)
1161 nr_pages = i;
1162 else
1163 BUG_ON(nr_pages != i);
1164 }
1165
1166 /* map them */
1167 if (populate_pte_fn) {
1168 for_each_possible_cpu(cpu)
1169 for (i = 0; i < nr_pages; i++)
1170 populate_pte_fn(pcpu_chunk_addr(schunk,
1171 cpu, i));
1172
1173 err = pcpu_map(schunk, 0, nr_pages);
1174 if (err)
1175 panic("failed to setup static percpu area, err=%d\n",
1176 err);
1177 }
1178
1179 /* link the first chunk in */ 1407 /* link the first chunk in */
1180 pcpu_first_chunk = dchunk ?: schunk; 1408 pcpu_first_chunk = dchunk ?: schunk;
1181 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1409 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1182 1410
1183 /* we're done */ 1411 /* we're done */
1184 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1412 pcpu_base_addr = schunk->vm->addr;
1185 return pcpu_unit_size; 1413 return pcpu_unit_size;
1186} 1414}
1187 1415
1188/* 1416static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
1189 * Embedding first chunk setup helper. 1417 ssize_t *dyn_sizep)
1190 */
1191static void *pcpue_ptr __initdata;
1192static size_t pcpue_size __initdata;
1193static size_t pcpue_unit_size __initdata;
1194
1195static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1196{ 1418{
1197 size_t off = (size_t)pageno << PAGE_SHIFT; 1419 size_t size_sum;
1198 1420
1199 if (off >= pcpue_size) 1421 size_sum = PFN_ALIGN(static_size + reserved_size +
1200 return NULL; 1422 (*dyn_sizep >= 0 ? *dyn_sizep : 0));
1423 if (*dyn_sizep != 0)
1424 *dyn_sizep = size_sum - static_size - reserved_size;
1201 1425
1202 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); 1426 return size_sum;
1203} 1427}
1204 1428
1205/** 1429/**
@@ -1207,7 +1431,6 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1207 * @static_size: the size of static percpu area in bytes 1431 * @static_size: the size of static percpu area in bytes
1208 * @reserved_size: the size of reserved percpu area in bytes 1432 * @reserved_size: the size of reserved percpu area in bytes
1209 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1433 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1210 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
1211 * 1434 *
1212 * This is a helper to ease setting up embedded first percpu chunk and 1435 * This is a helper to ease setting up embedded first percpu chunk and
1213 * can be called where pcpu_setup_first_chunk() is expected. 1436 * can be called where pcpu_setup_first_chunk() is expected.
@@ -1219,9 +1442,9 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1219 * page size. 1442 * page size.
1220 * 1443 *
1221 * When @dyn_size is positive, dynamic area might be larger than 1444 * When @dyn_size is positive, dynamic area might be larger than
1222 * specified to fill page alignment. Also, when @dyn_size is auto, 1445 * specified to fill page alignment. When @dyn_size is auto,
1223 * @dyn_size does not fill the whole first chunk but only what's 1446 * @dyn_size is just big enough to fill page alignment after static
1224 * necessary for page alignment after static and reserved areas. 1447 * and reserved areas.
1225 * 1448 *
1226 * If the needed size is smaller than the minimum or specified unit 1449 * If the needed size is smaller than the minimum or specified unit
1227 * size, the leftover is returned to the bootmem allocator. 1450 * size, the leftover is returned to the bootmem allocator.
@@ -1231,28 +1454,21 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1231 * percpu access on success, -errno on failure. 1454 * percpu access on success, -errno on failure.
1232 */ 1455 */
1233ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1456ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1234 ssize_t dyn_size, ssize_t unit_size) 1457 ssize_t dyn_size)
1235{ 1458{
1236 size_t chunk_size; 1459 size_t size_sum, unit_size, chunk_size;
1460 void *base;
1237 unsigned int cpu; 1461 unsigned int cpu;
1238 1462
1239 /* determine parameters and allocate */ 1463 /* determine parameters and allocate */
1240 pcpue_size = PFN_ALIGN(static_size + reserved_size + 1464 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1241 (dyn_size >= 0 ? dyn_size : 0)); 1465
1242 if (dyn_size != 0) 1466 unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1243 dyn_size = pcpue_size - static_size - reserved_size; 1467 chunk_size = unit_size * nr_cpu_ids;
1244 1468
1245 if (unit_size >= 0) { 1469 base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
1246 BUG_ON(unit_size < pcpue_size); 1470 __pa(MAX_DMA_ADDRESS));
1247 pcpue_unit_size = unit_size; 1471 if (!base) {
1248 } else
1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
1250
1251 chunk_size = pcpue_unit_size * nr_cpu_ids;
1252
1253 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
1254 __pa(MAX_DMA_ADDRESS));
1255 if (!pcpue_ptr) {
1256 pr_warning("PERCPU: failed to allocate %zu bytes for " 1472 pr_warning("PERCPU: failed to allocate %zu bytes for "
1257 "embedding\n", chunk_size); 1473 "embedding\n", chunk_size);
1258 return -ENOMEM; 1474 return -ENOMEM;
@@ -1260,21 +1476,543 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1260 1476
1261 /* return the leftover and copy */ 1477 /* return the leftover and copy */
1262 for (cpu = 0; cpu < nr_cpu_ids; cpu++) { 1478 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
1263 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1479 void *ptr = base + cpu * unit_size;
1264 1480
1265 if (cpu_possible(cpu)) { 1481 if (cpu_possible(cpu)) {
1266 free_bootmem(__pa(ptr + pcpue_size), 1482 free_bootmem(__pa(ptr + size_sum),
1267 pcpue_unit_size - pcpue_size); 1483 unit_size - size_sum);
1268 memcpy(ptr, __per_cpu_load, static_size); 1484 memcpy(ptr, __per_cpu_load, static_size);
1269 } else 1485 } else
1270 free_bootmem(__pa(ptr), pcpue_unit_size); 1486 free_bootmem(__pa(ptr), unit_size);
1271 } 1487 }
1272 1488
1273 /* we're ready, commit */ 1489 /* we're ready, commit */
1274 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", 1490 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
1275 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); 1491 size_sum >> PAGE_SHIFT, base, static_size);
1492
1493 return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1494 unit_size, base, NULL);
1495}
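
A hypothetical worked example of the sizing done by pcpu_calc_fc_sizes() and pcpu_embed_first_chunk() above (4 KiB pages; all numbers made up for illustration):

	static_size = 20000, reserved_size = 8192, dyn_size = -1 (auto)
	size_sum   = PFN_ALIGN(20000 + 8192)          = 28672  (7 pages)
	dyn_size   = 28672 - 20000 - 8192             = 480    (page-alignment slack only)
	unit_size  = max(size_sum, PCPU_MIN_UNIT_SIZE)
	chunk_size = unit_size * nr_cpu_ids

If PCPU_MIN_UNIT_SIZE were larger than size_sum (say 64 KiB for the sake of the example), unit_size would be rounded up to it and the per-cpu leftover, unit_size - size_sum, handed back to the bootmem allocator, exactly as the loop over nr_cpu_ids above does for possible CPUs.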
1496
1497/**
1498 * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
1499 * @static_size: the size of static percpu area in bytes
1500 * @reserved_size: the size of reserved percpu area in bytes
1501 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 1502 * @free_fn: function to free percpu page, always called with PAGE_SIZE
1503 * @populate_pte_fn: function to populate pte
1504 *
 1505 * This is a helper to ease setting up a page-mapped first percpu chunk and
1506 * can be called where pcpu_setup_first_chunk() is expected.
1507 *
1508 * This is the basic allocator. Static percpu area is allocated
1509 * page-by-page into vmalloc area.
1510 *
1511 * RETURNS:
1512 * The determined pcpu_unit_size which can be used to initialize
1513 * percpu access on success, -errno on failure.
1514 */
1515ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
1516 pcpu_fc_alloc_fn_t alloc_fn,
1517 pcpu_fc_free_fn_t free_fn,
1518 pcpu_fc_populate_pte_fn_t populate_pte_fn)
1519{
1520 static struct vm_struct vm;
1521 int unit_pages;
1522 size_t pages_size;
1523 struct page **pages;
1524 unsigned int cpu;
1525 int i, j;
1526 ssize_t ret;
1527
1528 unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
1529 PCPU_MIN_UNIT_SIZE));
1530
1531 /* unaligned allocations can't be freed, round up to page size */
1532 pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
1533 pages = alloc_bootmem(pages_size);
1534
1535 /* allocate pages */
1536 j = 0;
1537 for_each_possible_cpu(cpu)
1538 for (i = 0; i < unit_pages; i++) {
1539 void *ptr;
1540
1541 ptr = alloc_fn(cpu, PAGE_SIZE);
1542 if (!ptr) {
1543 pr_warning("PERCPU: failed to allocate "
1544 "4k page for cpu%u\n", cpu);
1545 goto enomem;
1546 }
1547 pages[j++] = virt_to_page(ptr);
1548 }
1549
1550 /* allocate vm area, map the pages and copy static data */
1551 vm.flags = VM_ALLOC;
1552 vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
1553 vm_area_register_early(&vm, PAGE_SIZE);
1554
1555 for_each_possible_cpu(cpu) {
1556 unsigned long unit_addr = (unsigned long)vm.addr +
1557 (cpu * unit_pages << PAGE_SHIFT);
1558
1559 for (i = 0; i < unit_pages; i++)
1560 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
1561
1562 /* pte already populated, the following shouldn't fail */
1563 ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
1564 unit_pages);
1565 if (ret < 0)
1566 panic("failed to map percpu area, err=%zd\n", ret);
1567
1568 /*
1569 * FIXME: Archs with virtual cache should flush local
1570 * cache for the linear mapping here - something
1571 * equivalent to flush_cache_vmap() on the local cpu.
1572 * flush_cache_vmap() can't be used as most supporting
1573 * data structures are not set up yet.
1574 */
1575
1576 /* copy static data */
1577 memcpy((void *)unit_addr, __per_cpu_load, static_size);
1578 }
1579
1580 /* we're ready, commit */
1581 pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
1582 unit_pages, static_size);
1583
1584 ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
1585 unit_pages << PAGE_SHIFT, vm.addr, NULL);
1586 goto out_free_ar;
1587
1588enomem:
1589 while (--j >= 0)
1590 free_fn(page_address(pages[j]), PAGE_SIZE);
1591 ret = -ENOMEM;
1592out_free_ar:
1593 free_bootmem(__pa(pages), pages_size);
1594 return ret;
1595}
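
A hedged sketch of the callbacks an architecture might hand to pcpu_4k_first_chunk(); this is hypothetical, NUMA-unaware glue (a real caller would typically allocate node-local memory for each cpu and populate its own page tables in the pte callback), using only allocator calls that already appear elsewhere in this patch:

/* hypothetical arch glue, sketch only */
static void * __init my_pcpu_alloc(unsigned int cpu, size_t size)
{
	/* node-blind bootmem allocation; a real arch would honor cpu_to_node(cpu) */
	return __alloc_bootmem_nopanic(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
}

static void __init my_pcpu_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

static void __init my_populate_pte(unsigned long addr)
{
	/* arch-specific: make sure page tables covering 'addr' exist */
}

static ssize_t __init my_setup_percpu(size_t static_size)
{
	return pcpu_4k_first_chunk(static_size, PERCPU_MODULE_RESERVE,
				   my_pcpu_alloc, my_pcpu_free,
				   my_populate_pte);
}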
1596
1597/*
1598 * Large page remapping first chunk setup helper
1599 */
1600#ifdef CONFIG_NEED_MULTIPLE_NODES
1601
1602/**
1603 * pcpu_lpage_build_unit_map - build unit_map for large page remapping
1604 * @static_size: the size of static percpu area in bytes
1605 * @reserved_size: the size of reserved percpu area in bytes
1606 * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
1607 * @unit_sizep: out parameter for unit size
1608 * @unit_map: unit_map to be filled
1609 * @cpu_distance_fn: callback to determine distance between cpus
1610 *
 1611 * This function builds the cpu -> unit map and determines other parameters
1612 * considering needed percpu size, large page size and distances
1613 * between CPUs in NUMA.
1614 *
1615 * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
1616 * may share units in the same large page. The returned configuration
1617 * is guaranteed to have CPUs on different nodes on different large
1618 * pages and >=75% usage of allocated virtual address space.
1619 *
1620 * RETURNS:
1621 * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
1622 * returns the number of units to be allocated. -errno on failure.
1623 */
1624int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
1625 ssize_t *dyn_sizep, size_t *unit_sizep,
1626 size_t lpage_size, int *unit_map,
1627 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1628{
1629 static int group_map[NR_CPUS] __initdata;
1630 static int group_cnt[NR_CPUS] __initdata;
1631 int group_cnt_max = 0;
1632 size_t size_sum, min_unit_size, alloc_size;
1633 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1634 int last_allocs;
1635 unsigned int cpu, tcpu;
1636 int group, unit;
1637
1638 /*
1639 * Determine min_unit_size, alloc_size and max_upa such that
 1640 * alloc_size is a multiple of lpage_size and is the smallest
 1641 * which can accommodate 4k aligned segments which are equal to
1642 * or larger than min_unit_size.
1643 */
1644 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
1645 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1646
1647 alloc_size = roundup(min_unit_size, lpage_size);
1648 upa = alloc_size / min_unit_size;
1649 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1650 upa--;
1651 max_upa = upa;
1652
1653 /* group cpus according to their proximity */
1654 for_each_possible_cpu(cpu) {
1655 group = 0;
1656 next_group:
1657 for_each_possible_cpu(tcpu) {
1658 if (cpu == tcpu)
1659 break;
1660 if (group_map[tcpu] == group &&
1661 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1662 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1663 group++;
1664 goto next_group;
1665 }
1666 }
1667 group_map[cpu] = group;
1668 group_cnt[group]++;
1669 group_cnt_max = max(group_cnt_max, group_cnt[group]);
1670 }
1671
1672 /*
1673 * Expand unit size until address space usage goes over 75%
1674 * and then as much as possible without using more address
1675 * space.
1676 */
1677 last_allocs = INT_MAX;
1678 for (upa = max_upa; upa; upa--) {
1679 int allocs = 0, wasted = 0;
1680
1681 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1682 continue;
1683
1684 for (group = 0; group_cnt[group]; group++) {
1685 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1686 allocs += this_allocs;
1687 wasted += this_allocs * upa - group_cnt[group];
1688 }
1689
1690 /*
1691 * Don't accept if wastage is over 25%. The
1692 * greater-than comparison ensures upa==1 always
1693 * passes the following check.
1694 */
1695 if (wasted > num_possible_cpus() / 3)
1696 continue;
1697
1698 /* and then don't consume more memory */
1699 if (allocs > last_allocs)
1700 break;
1701 last_allocs = allocs;
1702 best_upa = upa;
1703 }
1704 *unit_sizep = alloc_size / best_upa;
1276 1705
1277 return pcpu_setup_first_chunk(pcpue_get_page, static_size, 1706 /* assign units to cpus accordingly */
1278 reserved_size, dyn_size, 1707 unit = 0;
1279 pcpue_unit_size, pcpue_ptr, NULL); 1708 for (group = 0; group_cnt[group]; group++) {
1709 for_each_possible_cpu(cpu)
1710 if (group_map[cpu] == group)
1711 unit_map[cpu] = unit++;
1712 unit = roundup(unit, best_upa);
1713 }
1714
1715 return unit; /* unit contains aligned number of units */
1716}
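
To make the upa (units per allocation) computation above concrete, a hypothetical example with 2 MiB large pages and min_unit_size = 320 KiB:

	alloc_size  = roundup(320 KiB, 2 MiB) = 2 MiB
	initial upa = 2 MiB / 320 KiB         = 6
	upa = 6: 2 MiB % 6 != 0                        -> rejected
	upa = 5: 2 MiB % 5 != 0                        -> rejected
	upa = 4: 2 MiB % 4 == 0, 512 KiB is page-aligned -> max_upa = 4

So at most four 512 KiB units can share one 2 MiB page here; the selection loop above then settles on the largest acceptable upa that keeps address-space wastage at roughly 25% or less without consuming additional large-page allocations.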
1717
1718struct pcpul_ent {
1719 void *ptr;
1720 void *map_addr;
1721};
1722
1723static size_t pcpul_size;
1724static size_t pcpul_lpage_size;
1725static int pcpul_nr_lpages;
1726static struct pcpul_ent *pcpul_map;
1727
1728static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
1729 unsigned int *cpup)
1730{
1731 unsigned int cpu;
1732
1733 for_each_possible_cpu(cpu)
1734 if (unit_map[cpu] == unit) {
1735 if (cpup)
1736 *cpup = cpu;
1737 return true;
1738 }
1739
1740 return false;
1741}
1742
1743static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
1744 size_t reserved_size, size_t dyn_size,
1745 size_t unit_size, size_t lpage_size,
1746 const int *unit_map, int nr_units)
1747{
1748 int width = 1, v = nr_units;
1749 char empty_str[] = "--------";
1750 int upl, lpl; /* units per lpage, lpage per line */
1751 unsigned int cpu;
1752 int lpage, unit;
1753
1754 while (v /= 10)
1755 width++;
1756 empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
1757
1758 upl = max_t(int, lpage_size / unit_size, 1);
1759 lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
1760
1761 printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
1762 static_size, reserved_size, dyn_size, unit_size, lpage_size);
1763
1764 for (lpage = 0, unit = 0; unit < nr_units; unit++) {
1765 if (!(unit % upl)) {
1766 if (!(lpage++ % lpl)) {
1767 printk("\n");
1768 printk("%spcpu-lpage: ", lvl);
1769 } else
1770 printk("| ");
1771 }
1772 if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
1773 printk("%0*d ", width, cpu);
1774 else
1775 printk("%s ", empty_str);
1776 }
1777 printk("\n");
1778}
1779
1780/**
1781 * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
1782 * @static_size: the size of static percpu area in bytes
1783 * @reserved_size: the size of reserved percpu area in bytes
1784 * @dyn_size: free size for dynamic allocation in bytes
1785 * @unit_size: unit size in bytes
1786 * @lpage_size: the size of a large page
1787 * @unit_map: cpu -> unit mapping
1788 * @nr_units: the number of units
1789 * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
1790 * @free_fn: function to free percpu memory, @size <= lpage_size
1791 * @map_fn: function to map percpu lpage, always called with lpage_size
1792 *
1793 * This allocator uses large page to build and map the first chunk.
1794 * Unlike other helpers, the caller should always specify @dyn_size
1795 * and @unit_size. These parameters along with @unit_map and
1796 * @nr_units can be determined using pcpu_lpage_build_unit_map().
 1797 * This two-stage initialization is to allow arch code to evaluate the
1798 * parameters before committing to it.
1799 *
1800 * Large pages are allocated as directed by @unit_map and other
1801 * parameters and mapped to vmalloc space. Unused holes are returned
1802 * to the page allocator. Note that these holes end up being actively
 1803 * mapped twice - once via the kernel linear mapping and once via the
 1804 * vmalloc area of the first percpu chunk. Depending on the
 1805 * architecture, this might cause problems when changing page
 1806 * attributes of the returned area. These double mapped areas can be
 1807 * detected using pcpu_lpage_remapped().
1808 *
1809 * RETURNS:
1810 * The determined pcpu_unit_size which can be used to initialize
1811 * percpu access on success, -errno on failure.
1812 */
1813ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
1814 size_t dyn_size, size_t unit_size,
1815 size_t lpage_size, const int *unit_map,
1816 int nr_units,
1817 pcpu_fc_alloc_fn_t alloc_fn,
1818 pcpu_fc_free_fn_t free_fn,
1819 pcpu_fc_map_fn_t map_fn)
1820{
1821 static struct vm_struct vm;
1822 size_t chunk_size = unit_size * nr_units;
1823 size_t map_size;
1824 unsigned int cpu;
1825 ssize_t ret;
1826 int i, j, unit;
1827
1828 pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
1829 unit_size, lpage_size, unit_map, nr_units);
1830
1831 BUG_ON(chunk_size % lpage_size);
1832
1833 pcpul_size = static_size + reserved_size + dyn_size;
1834 pcpul_lpage_size = lpage_size;
1835 pcpul_nr_lpages = chunk_size / lpage_size;
1836
1837 /* allocate pointer array and alloc large pages */
1838 map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
1839 pcpul_map = alloc_bootmem(map_size);
1840
1841 /* allocate all pages */
1842 for (i = 0; i < pcpul_nr_lpages; i++) {
1843 size_t offset = i * lpage_size;
1844 int first_unit = offset / unit_size;
1845 int last_unit = (offset + lpage_size - 1) / unit_size;
1846 void *ptr;
1847
1848 /* find out which cpu is mapped to this unit */
1849 for (unit = first_unit; unit <= last_unit; unit++)
1850 if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
1851 goto found;
1852 continue;
1853 found:
1854 ptr = alloc_fn(cpu, lpage_size);
1855 if (!ptr) {
1856 pr_warning("PERCPU: failed to allocate large page "
1857 "for cpu%u\n", cpu);
1858 goto enomem;
1859 }
1860
1861 pcpul_map[i].ptr = ptr;
1862 }
1863
1864 /* return unused holes */
1865 for (unit = 0; unit < nr_units; unit++) {
1866 size_t start = unit * unit_size;
1867 size_t end = start + unit_size;
1868 size_t off, next;
1869
1870 /* don't free used part of occupied unit */
1871 if (pcpul_unit_to_cpu(unit, unit_map, NULL))
1872 start += pcpul_size;
1873
1874 /* unit can span more than one page, punch the holes */
1875 for (off = start; off < end; off = next) {
1876 void *ptr = pcpul_map[off / lpage_size].ptr;
1877 next = min(roundup(off + 1, lpage_size), end);
1878 if (ptr)
1879 free_fn(ptr + off % lpage_size, next - off);
1880 }
1881 }
1882
1883 /* allocate address, map and copy */
1884 vm.flags = VM_ALLOC;
1885 vm.size = chunk_size;
1886 vm_area_register_early(&vm, unit_size);
1887
1888 for (i = 0; i < pcpul_nr_lpages; i++) {
1889 if (!pcpul_map[i].ptr)
1890 continue;
1891 pcpul_map[i].map_addr = vm.addr + i * lpage_size;
1892 map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
1893 }
1894
1895 for_each_possible_cpu(cpu)
1896 memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
1897 static_size);
1898
1899 /* we're ready, commit */
1900 pr_info("PERCPU: Remapped at %p with large pages, static data "
1901 "%zu bytes\n", vm.addr, static_size);
1902
1903 ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1904 unit_size, vm.addr, unit_map);
1905
1906 /*
1907 * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped
1908 * lpages are pushed to the end and trimmed.
1909 */
1910 for (i = 0; i < pcpul_nr_lpages - 1; i++)
1911 for (j = i + 1; j < pcpul_nr_lpages; j++) {
1912 struct pcpul_ent tmp;
1913
1914 if (!pcpul_map[j].ptr)
1915 continue;
1916 if (pcpul_map[i].ptr &&
1917 pcpul_map[i].ptr < pcpul_map[j].ptr)
1918 continue;
1919
1920 tmp = pcpul_map[i];
1921 pcpul_map[i] = pcpul_map[j];
1922 pcpul_map[j] = tmp;
1923 }
1924
1925 while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
1926 pcpul_nr_lpages--;
1927
1928 return ret;
1929
1930enomem:
1931 for (i = 0; i < pcpul_nr_lpages; i++)
1932 if (pcpul_map[i].ptr)
1933 free_fn(pcpul_map[i].ptr, lpage_size);
1934 free_bootmem(__pa(pcpul_map), map_size);
1935 return -ENOMEM;
1936}
1937
1938/**
1939 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
1940 * @kaddr: the kernel address in question
1941 *
1942 * Determine whether @kaddr falls in the pcpul recycled area. This is
1943 * used by pageattr to detect VM aliases and break up the pcpu large
1944 * page mapping such that the same physical page is not mapped under
1945 * different attributes.
1946 *
1947 * The recycled area is always at the tail of a partially used large
1948 * page.
1949 *
1950 * RETURNS:
1951 * Address of corresponding remapped pcpu address if match is found;
1952 * otherwise, NULL.
1953 */
1954void *pcpu_lpage_remapped(void *kaddr)
1955{
1956 unsigned long lpage_mask = pcpul_lpage_size - 1;
1957 void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
1958 unsigned long offset = (unsigned long)kaddr & lpage_mask;
1959 int left = 0, right = pcpul_nr_lpages - 1;
1960 int pos;
1961
1962 /* pcpul in use at all? */
1963 if (!pcpul_map)
1964 return NULL;
1965
1966 /* okay, perform binary search */
1967 while (left <= right) {
1968 pos = (left + right) / 2;
1969
1970 if (pcpul_map[pos].ptr < lpage_addr)
1971 left = pos + 1;
1972 else if (pcpul_map[pos].ptr > lpage_addr)
1973 right = pos - 1;
1974 else
1975 return pcpul_map[pos].map_addr + offset;
1976 }
1977
1978 return NULL;
1979}
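
A hedged sketch of how a caller (such as the pageattr code mentioned in the comment above) might consult this helper; illustrative only and not lifted from the x86 changes in this series:

static void adjust_percpu_alias(void *kaddr)
{
	/* is the physical page backing 'kaddr' also visible in the first chunk? */
	void *alias = pcpu_lpage_remapped(kaddr);

	if (alias) {
		/*
		 * 'alias' is the corresponding address inside the percpu
		 * vmalloc area; an attribute change (or large-mapping
		 * split) would have to cover this alias as well so both
		 * mappings stay consistent.
		 */
	}
}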
1980#endif
1981
1982/*
1983 * Generic percpu area setup.
1984 *
1985 * The embedding helper is used because its behavior closely resembles
1986 * the original non-dynamic generic percpu area setup. This is
1987 * important because many archs have addressing restrictions and might
1988 * fail if the percpu area is located far away from the previous
1989 * location. As an added bonus, in non-NUMA cases, embedding is
1990 * generally a good idea TLB-wise because percpu area can piggy back
1991 * on the physical linear memory mapping which uses large page
1992 * mappings on applicable archs.
1993 */
1994#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1995unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
1996EXPORT_SYMBOL(__per_cpu_offset);
1997
1998void __init setup_per_cpu_areas(void)
1999{
2000 size_t static_size = __per_cpu_end - __per_cpu_start;
2001 ssize_t unit_size;
2002 unsigned long delta;
2003 unsigned int cpu;
2004
2005 /*
2006 * Always reserve area for module percpu variables. That's
2007 * what the legacy allocator did.
2008 */
2009 unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
2010 PERCPU_DYNAMIC_RESERVE);
2011 if (unit_size < 0)
 2012 panic("Failed to initialize percpu areas.");
2013
2014 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2015 for_each_possible_cpu(cpu)
2016 __per_cpu_offset[cpu] = delta + cpu * unit_size;
1280} 2017}
2018#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/quicklist.c b/mm/quicklist.c
index e66d07d1b4ff..6eedf7e473d1 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/quicklist.h> 20#include <linux/quicklist.h>
21 21
22DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; 22DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
23 23
24#define FRACTION_OF_NODE_MEM 16 24#define FRACTION_OF_NODE_MEM 16
25 25
diff --git a/mm/slub.c b/mm/slub.c
index b9f1491a58a1..dc9765bb49dc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2091,8 +2091,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2091 */ 2091 */
2092#define NR_KMEM_CACHE_CPU 100 2092#define NR_KMEM_CACHE_CPU 100
2093 2093
2094static DEFINE_PER_CPU(struct kmem_cache_cpu, 2094static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2095 kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; 2095 kmem_cache_cpu);
2096 2096
2097static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); 2097static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2098static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); 2098static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
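
The conversions above (quicklist, slub) and the syncookie scratch buffers further down all follow the same pattern: the array dimension moves from the variable name into the type argument of DEFINE_PER_CPU(), so the per-cpu object is declared with its full array type. A small hedged illustration with a hypothetical variable:

struct foo { int val; };

/* old style: dimension tacked onto the name
 *   static DEFINE_PER_CPU(struct foo, foo_array)[16];
 * new style: dimension is part of the type */
static DEFINE_PER_CPU(struct foo [16], foo_array);	/* hypothetical */

static struct foo *get_foo(unsigned int cpu, int idx)
{
	/* per_cpu() yields the whole array, which can then be indexed */
	return &per_cpu(foo_array, cpu)[idx];
}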
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index cd2b97f1b6e1..a6e0e077ac33 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -37,12 +37,13 @@ __initcall(init_syncookies);
37#define COOKIEBITS 24 /* Upper bits store count */ 37#define COOKIEBITS 24 /* Upper bits store count */
38#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) 38#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
39 39
40static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; 40static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
41 ipv4_cookie_scratch);
41 42
42static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, 43static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
43 u32 count, int c) 44 u32 count, int c)
44{ 45{
45 __u32 *tmp = __get_cpu_var(cookie_scratch); 46 __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
46 47
47 memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); 48 memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
48 tmp[0] = (__force u32)saddr; 49 tmp[0] = (__force u32)saddr;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 8c2513982b61..6b6ae913b5d4 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
74 return child; 74 return child;
75} 75}
76 76
77static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; 77static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
78 ipv6_cookie_scratch);
78 79
79static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, 80static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr,
80 __be16 sport, __be16 dport, u32 count, int c) 81 __be16 sport, __be16 dport, u32 count, int c)
81{ 82{
82 __u32 *tmp = __get_cpu_var(cookie_scratch); 83 __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch);
83 84
84 /* 85 /*
85 * we have 320 bits of information to hash, copy in the remaining 86 * we have 320 bits of information to hash, copy in the remaining
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 02e3e3d50d4a..301ae51ae409 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -37,7 +37,7 @@
37#include "rds.h" 37#include "rds.h"
38#include "ib.h" 38#include "ib.h"
39 39
40DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; 40DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
41 41
42static char *rds_ib_stat_names[] = { 42static char *rds_ib_stat_names[] = {
43 "ib_connect_raced", 43 "ib_connect_raced",
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
index ccc7e8f0bf0e..fafea3cc92d7 100644
--- a/net/rds/iw_stats.c
+++ b/net/rds/iw_stats.c
@@ -37,7 +37,7 @@
37#include "rds.h" 37#include "rds.h"
38#include "iw.h" 38#include "iw.h"
39 39
40DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; 40DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
41 41
42static char *rds_iw_stat_names[] = { 42static char *rds_iw_stat_names[] = {
43 "iw_connect_raced", 43 "iw_connect_raced",
diff --git a/net/rds/page.c b/net/rds/page.c
index c460743a89ad..de7bb84bcd78 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -39,7 +39,7 @@ struct rds_page_remainder {
39 unsigned long r_offset; 39 unsigned long r_offset;
40}; 40};
41 41
42DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; 42DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
43 43
44/* 44/*
45 * returns 0 on success or -errno on failure. 45 * returns 0 on success or -errno on failure.
diff --git a/scripts/module-common.lds b/scripts/module-common.lds
new file mode 100644
index 000000000000..47a1f9ae0ede
--- /dev/null
+++ b/scripts/module-common.lds
@@ -0,0 +1,8 @@
1/*
2 * Common module linker script, always used when linking a module.
3 * Archs are free to supply their own linker scripts. ld will
4 * combine them automatically.
5 */
6SECTIONS {
7 /DISCARD/ : { *(.discard) }
8}