about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2009-02-24 15:52:45 -0500
committer Ingo Molnar <mingo@elte.hu> 2009-02-24 15:52:45 -0500
commit 0edcf8d6926f4038443dbc24e319530177ca0353 (patch)
tree 6010af62f73d01ab673d5106f310eaf4f4228e32
parent 87b203079ed949de52f0d92aeae20e5e0116c12f (diff)
parent 40150d37be7f7949b2ec07d511244da856647d84 (diff)
Merge branch 'tj-percpu' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc into core/percpu
Conflicts: arch/x86/include/asm/pgtable.h
-rw-r--r-- arch/alpha/mm/init.c 20
-rw-r--r-- arch/avr32/Kconfig 2
-rw-r--r-- arch/x86/Kconfig 5
-rw-r--r-- arch/x86/include/asm/mmzone_32.h 43
-rw-r--r-- arch/x86/include/asm/percpu.h 8
-rw-r--r-- arch/x86/include/asm/pgtable.h 2
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c 2
-rw-r--r-- arch/x86/kernel/irq_32.c 29
-rw-r--r-- arch/x86/kernel/setup_percpu.c 365
-rw-r--r-- arch/x86/mm/init_32.c 17
-rw-r--r-- arch/x86/mm/init_64.c 72
-rw-r--r-- block/blktrace.c 2
-rw-r--r-- drivers/acpi/processor_perflib.c 4
-rw-r--r-- include/linux/bootmem.h 36
-rw-r--r-- include/linux/percpu.h 100
-rw-r--r-- include/linux/vmalloc.h 4
-rw-r--r-- kernel/module.c 64
-rw-r--r-- kernel/sched.c 6
-rw-r--r-- kernel/stop_machine.c 2
-rw-r--r-- mm/Makefile 4
-rw-r--r-- mm/allocpercpu.c 32
-rw-r--r-- mm/bootmem.c 14
-rw-r--r-- mm/percpu.c 979
-rw-r--r-- mm/vmalloc.c 94
-rw-r--r-- net/ipv4/af_inet.c 4
25 files changed, 1716 insertions, 194 deletions
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 5d7a16eab31..91eddd8505d 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -189,9 +189,21 @@ callback_init(void * kernel_end)
189 189
190 if (alpha_using_srm) { 190 if (alpha_using_srm) {
191 static struct vm_struct console_remap_vm; 191 static struct vm_struct console_remap_vm;
192 unsigned long vaddr = VMALLOC_START; 192 unsigned long nr_pages = 0;
193 unsigned long vaddr;
193 unsigned long i, j; 194 unsigned long i, j;
194 195
196 /* calculate needed size */
197 for (i = 0; i < crb->map_entries; ++i)
198 nr_pages += crb->map[i].count;
199
200 /* register the vm area */
201 console_remap_vm.flags = VM_ALLOC;
202 console_remap_vm.size = nr_pages << PAGE_SHIFT;
203 vm_area_register_early(&console_remap_vm, PAGE_SIZE);
204
205 vaddr = (unsigned long)console_remap_vm.addr;
206
195 /* Set up the third level PTEs and update the virtual 207 /* Set up the third level PTEs and update the virtual
196 addresses of the CRB entries. */ 208 addresses of the CRB entries. */
197 for (i = 0; i < crb->map_entries; ++i) { 209 for (i = 0; i < crb->map_entries; ++i) {
@@ -213,12 +225,6 @@ callback_init(void * kernel_end)
213 vaddr += PAGE_SIZE; 225 vaddr += PAGE_SIZE;
214 } 226 }
215 } 227 }
216
217 /* Let vmalloc know that we've allocated some space. */
218 console_remap_vm.flags = VM_ALLOC;
219 console_remap_vm.addr = (void *) VMALLOC_START;
220 console_remap_vm.size = vaddr - VMALLOC_START;
221 vmlist = &console_remap_vm;
222 } 228 }
223 229
224 callback_init_done = 1; 230 callback_init_done = 1;
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b189680d18b..05fe3053dca 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
181config QUICKLIST 181config QUICKLIST
182 def_bool y 182 def_bool y
183 183
184config HAVE_ARCH_BOOTMEM_NODE 184config HAVE_ARCH_BOOTMEM
185 def_bool n 185 def_bool n
186 186
187config ARCH_HAVE_MEMORY_PRESENT 187config ARCH_HAVE_MEMORY_PRESENT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5e2919c0ff9..8015641478b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
135config HAVE_SETUP_PER_CPU_AREA 135config HAVE_SETUP_PER_CPU_AREA
136 def_bool y 136 def_bool y
137 137
138config HAVE_DYNAMIC_PER_CPU_AREA
139 def_bool y
140
138config HAVE_CPUMASK_OF_CPU_MAP 141config HAVE_CPUMASK_OF_CPU_MAP
139 def_bool X86_64_SMP 142 def_bool X86_64_SMP
140 143
@@ -1122,7 +1125,7 @@ config NODES_SHIFT
1122 Specify the maximum number of NUMA Nodes available on the target 1125 Specify the maximum number of NUMA Nodes available on the target
1123 system. Increases memory reserved to accommodate various tables. 1126 system. Increases memory reserved to accommodate various tables.
1124 1127
1125config HAVE_ARCH_BOOTMEM_NODE 1128config HAVE_ARCH_BOOTMEM
1126 def_bool y 1129 def_bool y
1127 depends on X86_32 && NUMA 1130 depends on X86_32 && NUMA
1128 1131
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a063..eeacf67de49 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,45 +91,12 @@ static inline int pfn_valid(int pfn)
91#endif /* CONFIG_DISCONTIGMEM */ 91#endif /* CONFIG_DISCONTIGMEM */
92 92
93#ifdef CONFIG_NEED_MULTIPLE_NODES 93#ifdef CONFIG_NEED_MULTIPLE_NODES
94 94/* always use node 0 for bootmem on this numa platform */
95/* 95#define alloc_bootmem_core(__bdata, size, align, goal, limit) \
96 * Following are macros that are specific to this numa platform.
97 */
98#define reserve_bootmem(addr, size, flags) \
99 reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
100#define alloc_bootmem(x) \
101 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
102#define alloc_bootmem_nopanic(x) \
103 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
104 __pa(MAX_DMA_ADDRESS))
105#define alloc_bootmem_low(x) \
106 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
107#define alloc_bootmem_pages(x) \
108 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
109#define alloc_bootmem_pages_nopanic(x) \
110 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
111 __pa(MAX_DMA_ADDRESS))
112#define alloc_bootmem_low_pages(x) \
113 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
114#define alloc_bootmem_node(pgdat, x) \
115({ \
116 struct pglist_data __maybe_unused \
117 *__alloc_bootmem_node__pgdat = (pgdat); \
118 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
119 __pa(MAX_DMA_ADDRESS)); \
120})
121#define alloc_bootmem_pages_node(pgdat, x) \
122({ \
123 struct pglist_data __maybe_unused \
124 *__alloc_bootmem_node__pgdat = (pgdat); \
125 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
126 __pa(MAX_DMA_ADDRESS)); \
127})
128#define alloc_bootmem_low_pages_node(pgdat, x) \
129({ \ 96({ \
130 struct pglist_data __maybe_unused \ 97 bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata); \
131 *__alloc_bootmem_node__pgdat = (pgdat); \ 98 __alloc_bootmem_core(NODE_DATA(0)->bdata, \
132 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ 99 (size), (align), (goal), (limit)); \
133}) 100})
134#endif /* CONFIG_NEED_MULTIPLE_NODES */ 101#endif /* CONFIG_NEED_MULTIPLE_NODES */
135 102
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index aee103b26d0..8f1d2fbec1d 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -43,6 +43,14 @@
43#else /* ...!ASSEMBLY */ 43#else /* ...!ASSEMBLY */
44 44
45#include <linux/stringify.h> 45#include <linux/stringify.h>
46#include <asm/sections.h>
47
48#define __addr_to_pcpu_ptr(addr) \
49 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
50 + (unsigned long)__per_cpu_start)
51#define __pcpu_ptr_to_addr(ptr) \
52 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
53 - (unsigned long)__per_cpu_start)
46 54
47#ifdef CONFIG_SMP 55#ifdef CONFIG_SMP
48#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x 56#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a666..d0812e155f1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
288 return 1; 288 return 1;
289} 289}
290 290
291pmd_t *populate_extra_pmd(unsigned long vaddr);
292pte_t *populate_extra_pte(unsigned long vaddr);
291#endif /* __ASSEMBLY__ */ 293#endif /* __ASSEMBLY__ */
292 294
293#ifdef CONFIG_X86_32 295#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c..22590cf688a 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
601 if (!data) 601 if (!data)
602 return -ENOMEM; 602 return -ENOMEM;
603 603
604 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 604 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
605 per_cpu(drv_data, cpu) = data; 605 per_cpu(drv_data, cpu) = data;
606 606
607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9dc6b2b2427..3b09634a515 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h>
19 20
20#include <asm/apic.h> 21#include <asm/apic.h>
21 22
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
55union irq_ctx { 56union irq_ctx {
56 struct thread_info tinfo; 57 struct thread_info tinfo;
57 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
58}; 59} __attribute__((aligned(PAGE_SIZE)));
59 60
60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
62 63
63static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
64static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
65 66
66static void call_on_stack(void *func, void *stack) 67static void call_on_stack(void *func, void *stack)
67{ 68{
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
81 u32 *isp, arg1, arg2; 82 u32 *isp, arg1, arg2;
82 83
83 curctx = (union irq_ctx *) current_thread_info(); 84 curctx = (union irq_ctx *) current_thread_info();
84 irqctx = hardirq_ctx[smp_processor_id()]; 85 irqctx = __get_cpu_var(hardirq_ctx);
85 86
86 /* 87 /*
87 * this is where we switch to the IRQ stack. However, if we are 88 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
125{ 126{
126 union irq_ctx *irqctx; 127 union irq_ctx *irqctx;
127 128
128 if (hardirq_ctx[cpu]) 129 if (per_cpu(hardirq_ctx, cpu))
129 return; 130 return;
130 131
131 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; 132 irqctx = &per_cpu(hardirq_stack, cpu);
132 irqctx->tinfo.task = NULL; 133 irqctx->tinfo.task = NULL;
133 irqctx->tinfo.exec_domain = NULL; 134 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 135 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
137 138
138 hardirq_ctx[cpu] = irqctx; 139 per_cpu(hardirq_ctx, cpu) = irqctx;
139 140
140 irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; 141 irqctx = &per_cpu(softirq_stack, cpu);
141 irqctx->tinfo.task = NULL; 142 irqctx->tinfo.task = NULL;
142 irqctx->tinfo.exec_domain = NULL; 143 irqctx->tinfo.exec_domain = NULL;
143 irqctx->tinfo.cpu = cpu; 144 irqctx->tinfo.cpu = cpu;
144 irqctx->tinfo.preempt_count = 0; 145 irqctx->tinfo.preempt_count = 0;
145 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
146 147
147 softirq_ctx[cpu] = irqctx; 148 per_cpu(softirq_ctx, cpu) = irqctx;
148 149
149 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 150 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
150 cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); 151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
151} 152}
152 153
153void irq_ctx_exit(int cpu) 154void irq_ctx_exit(int cpu)
154{ 155{
155 hardirq_ctx[cpu] = NULL; 156 per_cpu(hardirq_ctx, cpu) = NULL;
156} 157}
157 158
158asmlinkage void do_softirq(void) 159asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
169 170
170 if (local_softirq_pending()) { 171 if (local_softirq_pending()) {
171 curctx = current_thread_info(); 172 curctx = current_thread_info();
172 irqctx = softirq_ctx[smp_processor_id()]; 173 irqctx = __get_cpu_var(softirq_ctx);
173 irqctx->tinfo.task = curctx->task; 174 irqctx->tinfo.task = curctx->task;
174 irqctx->tinfo.previous_esp = current_stack_pointer; 175 irqctx->tinfo.previous_esp = current_stack_pointer;
175 176
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6cff73..2d946a8f78b 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
7#include <linux/crash_dump.h> 7#include <linux/crash_dump.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/pfn.h>
10#include <asm/sections.h> 11#include <asm/sections.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/setup.h> 13#include <asm/setup.h>
@@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
41}; 42};
42EXPORT_SYMBOL(__per_cpu_offset); 43EXPORT_SYMBOL(__per_cpu_offset);
43 44
45/**
46 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
47 *
48 * If NUMA is not configured or there is only one NUMA node available,
49 * there is no reason to consider NUMA. This function determines
50 * whether percpu allocation should consider NUMA or not.
51 *
52 * RETURNS:
53 * true if NUMA should be considered; otherwise, false.
54 */
55static bool __init pcpu_need_numa(void)
56{
57#ifdef CONFIG_NEED_MULTIPLE_NODES
58 pg_data_t *last = NULL;
59 unsigned int cpu;
60
61 for_each_possible_cpu(cpu) {
62 int node = early_cpu_to_node(cpu);
63
64 if (node_online(node) && NODE_DATA(node) &&
65 last && last != NODE_DATA(node))
66 return true;
67
68 last = NODE_DATA(node);
69 }
70#endif
71 return false;
72}
73
74/**
75 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
76 * @cpu: cpu to allocate for
77 * @size: size allocation in bytes
78 * @align: alignment
79 *
80 * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
81 * does the right thing for NUMA regardless of the current
82 * configuration.
83 *
84 * RETURNS:
85 * Pointer to the allocated area on success, NULL on failure.
86 */
87static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
88 unsigned long align)
89{
90 const unsigned long goal = __pa(MAX_DMA_ADDRESS);
91#ifdef CONFIG_NEED_MULTIPLE_NODES
92 int node = early_cpu_to_node(cpu);
93 void *ptr;
94
95 if (!node_online(node) || !NODE_DATA(node)) {
96 ptr = __alloc_bootmem_nopanic(size, align, goal);
97 pr_info("cpu %d has no node %d or node-local memory\n",
98 cpu, node);
99 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
100 cpu, size, __pa(ptr));
101 } else {
102 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
103 size, align, goal);
104 pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
105 "%016lx\n", cpu, size, node, __pa(ptr));
106 }
107 return ptr;
108#else
109 return __alloc_bootmem_nopanic(size, align, goal);
110#endif
111}
112
113/*
114 * Remap allocator
115 *
116 * This allocator uses PMD page as unit. A PMD page is allocated for
117 * each cpu and each is remapped into vmalloc area using PMD mapping.
118 * As PMD page is quite large, only part of it is used for the first
119 * chunk. Unused part is returned to the bootmem allocator.
120 *
121 * So, the PMD pages are mapped twice - once to the physical mapping
122 * and to the vmalloc area for the first percpu chunk. The double
123 * mapping does add one more PMD TLB entry pressure but still is much
124 * better than only using 4k mappings while still being NUMA friendly.
125 */
126#ifdef CONFIG_NEED_MULTIPLE_NODES
127static size_t pcpur_size __initdata;
128static void **pcpur_ptrs __initdata;
129
130static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
131{
132 size_t off = (size_t)pageno << PAGE_SHIFT;
133
134 if (off >= pcpur_size)
135 return NULL;
136
137 return virt_to_page(pcpur_ptrs[cpu] + off);
138}
139
140static ssize_t __init setup_pcpu_remap(size_t static_size)
141{
142 static struct vm_struct vm;
143 pg_data_t *last;
144 size_t ptrs_size;
145 unsigned int cpu;
146 ssize_t ret;
147
148 /*
149 * If large page isn't supported, there's no benefit in doing
150 * this. Also, on non-NUMA, embedding is better.
151 */
152 if (!cpu_has_pse || pcpu_need_numa())
153 return -EINVAL;
154
155 last = NULL;
156 for_each_possible_cpu(cpu) {
157 int node = early_cpu_to_node(cpu);
158
159 if (node_online(node) && NODE_DATA(node) &&
160 last && last != NODE_DATA(node))
161 goto proceed;
162
163 last = NODE_DATA(node);
164 }
165 return -EINVAL;
166
167proceed:
168 /*
169 * Currently supports only single page. Supporting multiple
170 * pages won't be too difficult if it ever becomes necessary.
171 */
172 pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
173 if (pcpur_size > PMD_SIZE) {
174 pr_warning("PERCPU: static data is larger than large page, "
175 "can't use large page\n");
176 return -EINVAL;
177 }
178
179 /* allocate pointer array and alloc large pages */
180 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
181 pcpur_ptrs = alloc_bootmem(ptrs_size);
182
183 for_each_possible_cpu(cpu) {
184 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
185 if (!pcpur_ptrs[cpu])
186 goto enomem;
187
188 /*
189 * Only use pcpur_size bytes and give back the rest.
190 *
191 * Ingo: The 2MB up-rounding bootmem is needed to make
192 * sure the partial 2MB page is still fully RAM - it's
193 * not well-specified to have a PAT-incompatible area
194 * (unmapped RAM, device memory, etc.) in that hole.
195 */
196 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
197 PMD_SIZE - pcpur_size);
198
199 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
200 }
201
202 /* allocate address and map */
203 vm.flags = VM_ALLOC;
204 vm.size = num_possible_cpus() * PMD_SIZE;
205 vm_area_register_early(&vm, PMD_SIZE);
206
207 for_each_possible_cpu(cpu) {
208 pmd_t *pmd;
209
210 pmd = populate_extra_pmd((unsigned long)vm.addr
211 + cpu * PMD_SIZE);
212 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
213 PAGE_KERNEL_LARGE));
214 }
215
216 /* we're ready, commit */
217 pr_info("PERCPU: Remapped at %p with large pages, static data "
218 "%zu bytes\n", vm.addr, static_size);
219
220 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
221 pcpur_size - static_size, vm.addr, NULL);
222 goto out_free_ar;
223
224enomem:
225 for_each_possible_cpu(cpu)
226 if (pcpur_ptrs[cpu])
227 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
228 ret = -ENOMEM;
229out_free_ar:
230 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
231 return ret;
232}
233#else
234static ssize_t __init setup_pcpu_remap(size_t static_size)
235{
236 return -EINVAL;
237}
238#endif
239
240/*
241 * Embedding allocator
242 *
243 * The first chunk is sized to just contain the static area plus
244 * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
245 * bootmem allocator and used as-is without being mapped into vmalloc
246 * area. This enables the first chunk to piggy back on the linear
247 * physical PMD mapping and doesn't add any additional pressure to
248 * TLB.
249 */
250static void *pcpue_ptr __initdata;
251static size_t pcpue_unit_size __initdata;
252
253static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
254{
255 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size
256 + ((size_t)pageno << PAGE_SHIFT));
257}
258
259static ssize_t __init setup_pcpu_embed(size_t static_size)
260{
261 unsigned int cpu;
262
263 /*
264 * If large page isn't supported, there's no benefit in doing
265 * this. Also, embedding allocation doesn't play well with
266 * NUMA.
267 */
268 if (!cpu_has_pse || pcpu_need_numa())
269 return -EINVAL;
270
271 /* allocate and copy */
272 pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
273 pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
274 pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
275 PAGE_SIZE);
276 if (!pcpue_ptr)
277 return -ENOMEM;
278
279 for_each_possible_cpu(cpu)
280 memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
281 static_size);
282
283 /* we're ready, commit */
284 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
285 pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
286
287 return pcpu_setup_first_chunk(pcpue_get_page, static_size,
288 pcpue_unit_size,
289 pcpue_unit_size - static_size, pcpue_ptr,
290 NULL);
291}
292
293/*
294 * 4k page allocator
295 *
296 * This is the basic allocator. Static percpu area is allocated
297 * page-by-page and most of initialization is done by the generic
298 * setup function.
299 */
300static struct page **pcpu4k_pages __initdata;
301static int pcpu4k_nr_static_pages __initdata;
302
303static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
304{
305 if (pageno < pcpu4k_nr_static_pages)
306 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
307 return NULL;
308}
309
310static void __init pcpu4k_populate_pte(unsigned long addr)
311{
312 populate_extra_pte(addr);
313}
314
315static ssize_t __init setup_pcpu_4k(size_t static_size)
316{
317 size_t pages_size;
318 unsigned int cpu;
319 int i, j;
320 ssize_t ret;
321
322 pcpu4k_nr_static_pages = PFN_UP(static_size);
323
324 /* unaligned allocations can't be freed, round up to page size */
325 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
326 * sizeof(pcpu4k_pages[0]));
327 pcpu4k_pages = alloc_bootmem(pages_size);
328
329 /* allocate and copy */
330 j = 0;
331 for_each_possible_cpu(cpu)
332 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
333 void *ptr;
334
335 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
336 if (!ptr)
337 goto enomem;
338
339 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
340 pcpu4k_pages[j++] = virt_to_page(ptr);
341 }
342
343 /* we're ready, commit */
344 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
345 pcpu4k_nr_static_pages, static_size);
346
347 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
348 pcpu4k_populate_pte);
349 goto out_free_ar;
350
351enomem:
352 while (--j >= 0)
353 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
354 ret = -ENOMEM;
355out_free_ar:
356 free_bootmem(__pa(pcpu4k_pages), pages_size);
357 return ret;
358}
359
44static inline void setup_percpu_segment(int cpu) 360static inline void setup_percpu_segment(int cpu)
45{ 361{
46#ifdef CONFIG_X86_32 362#ifdef CONFIG_X86_32
@@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu)
61 */ 377 */
62void __init setup_per_cpu_areas(void) 378void __init setup_per_cpu_areas(void)
63{ 379{
64 ssize_t size; 380 size_t static_size = __per_cpu_end - __per_cpu_start;
65 char *ptr; 381 unsigned int cpu;
66 int cpu; 382 unsigned long delta;
67 383 size_t pcpu_unit_size;
68 /* Copy section for each CPU (we discard the original) */ 384 ssize_t ret;
69 size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
70 385
71 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 386 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
72 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 387 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
73 388
74 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); 389 /*
390 * Allocate percpu area. If PSE is supported, try to make use
391 * of large page mappings. Please read comments on top of
392 * each allocator for details.
393 */
394 ret = setup_pcpu_remap(static_size);
395 if (ret < 0)
396 ret = setup_pcpu_embed(static_size);
397 if (ret < 0)
398 ret = setup_pcpu_4k(static_size);
399 if (ret < 0)
400 panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
401 static_size, ret);
75 402
76 for_each_possible_cpu(cpu) { 403 pcpu_unit_size = ret;
77#ifndef CONFIG_NEED_MULTIPLE_NODES
78 ptr = alloc_bootmem_pages(size);
79#else
80 int node = early_cpu_to_node(cpu);
81 if (!node_online(node) || !NODE_DATA(node)) {
82 ptr = alloc_bootmem_pages(size);
83 pr_info("cpu %d has no node %d or node-local memory\n",
84 cpu, node);
85 pr_debug("per cpu data for cpu%d at %016lx\n",
86 cpu, __pa(ptr));
87 } else {
88 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
89 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
90 cpu, node, __pa(ptr));
91 }
92#endif
93 404
94 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); 405 /* alrighty, percpu areas up and running */
95 per_cpu_offset(cpu) = ptr - __per_cpu_start; 406 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
407 for_each_possible_cpu(cpu) {
408 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
96 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 409 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
97 per_cpu(cpu_number, cpu) = cpu; 410 per_cpu(cpu_number, cpu) = cpu;
98 setup_percpu_segment(cpu); 411 setup_percpu_segment(cpu);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 06708ee94aa..ef0bb941cdf 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -137,6 +137,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
137 return pte_offset_kernel(pmd, 0); 137 return pte_offset_kernel(pmd, 0);
138} 138}
139 139
140pmd_t * __init populate_extra_pmd(unsigned long vaddr)
141{
142 int pgd_idx = pgd_index(vaddr);
143 int pmd_idx = pmd_index(vaddr);
144
145 return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
146}
147
148pte_t * __init populate_extra_pte(unsigned long vaddr)
149{
150 int pte_idx = pte_index(vaddr);
151 pmd_t *pmd;
152
153 pmd = populate_extra_pmd(vaddr);
154 return one_page_table_init(pmd) + pte_idx;
155}
156
140static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 157static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
141 unsigned long vaddr, pte_t *lastpte) 158 unsigned long vaddr, pte_t *lastpte)
142{ 159{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e6d36b49025..7d4e76da336 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -168,34 +168,51 @@ static __ref void *spp_getpage(void)
168 return ptr; 168 return ptr;
169} 169}
170 170
171void 171static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
173{ 172{
174 pud_t *pud; 173 if (pgd_none(*pgd)) {
175 pmd_t *pmd; 174 pud_t *pud = (pud_t *)spp_getpage();
176 pte_t *pte; 175 pgd_populate(&init_mm, pgd, pud);
176 if (pud != pud_offset(pgd, 0))
177 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
178 pud, pud_offset(pgd, 0));
179 }
180 return pud_offset(pgd, vaddr);
181}
177 182
178 pud = pud_page + pud_index(vaddr); 183static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
184{
179 if (pud_none(*pud)) { 185 if (pud_none(*pud)) {
180 pmd = (pmd_t *) spp_getpage(); 186 pmd_t *pmd = (pmd_t *) spp_getpage();
181 pud_populate(&init_mm, pud, pmd); 187 pud_populate(&init_mm, pud, pmd);
182 if (pmd != pmd_offset(pud, 0)) { 188 if (pmd != pmd_offset(pud, 0))
183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 189 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
184 pmd, pmd_offset(pud, 0)); 190 pmd, pmd_offset(pud, 0));
185 return;
186 }
187 } 191 }
188 pmd = pmd_offset(pud, vaddr); 192 return pmd_offset(pud, vaddr);
193}
194
195static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr)
196{
189 if (pmd_none(*pmd)) { 197 if (pmd_none(*pmd)) {
190 pte = (pte_t *) spp_getpage(); 198 pte_t *pte = (pte_t *) spp_getpage();
191 pmd_populate_kernel(&init_mm, pmd, pte); 199 pmd_populate_kernel(&init_mm, pmd, pte);
192 if (pte != pte_offset_kernel(pmd, 0)) { 200 if (pte != pte_offset_kernel(pmd, 0))
193 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 201 printk(KERN_ERR "PAGETABLE BUG #02!\n");
194 return;
195 }
196 } 202 }
203 return pte_offset_kernel(pmd, vaddr);
204}
205
206void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
207{
208 pud_t *pud;
209 pmd_t *pmd;
210 pte_t *pte;
211
212 pud = pud_page + pud_index(vaddr);
213 pmd = fill_pmd(pud, vaddr);
214 pte = fill_pte(pmd, vaddr);
197 215
198 pte = pte_offset_kernel(pmd, vaddr);
199 set_pte(pte, new_pte); 216 set_pte(pte, new_pte);
200 217
201 /* 218 /*
@@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
205 __flush_tlb_one(vaddr); 222 __flush_tlb_one(vaddr);
206} 223}
207 224
208void 225void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
209set_pte_vaddr(unsigned long vaddr, pte_t pteval)
210{ 226{
211 pgd_t *pgd; 227 pgd_t *pgd;
212 pud_t *pud_page; 228 pud_t *pud_page;
@@ -223,6 +239,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
223 set_pte_vaddr_pud(pud_page, vaddr, pteval); 239 set_pte_vaddr_pud(pud_page, vaddr, pteval);
224} 240}
225 241
242pmd_t * __init populate_extra_pmd(unsigned long vaddr)
243{
244 pgd_t *pgd;
245 pud_t *pud;
246
247 pgd = pgd_offset_k(vaddr);
248 pud = fill_pud(pgd, vaddr);
249 return fill_pmd(pud, vaddr);
250}
251
252pte_t * __init populate_extra_pte(unsigned long vaddr)
253{
254 pmd_t *pmd;
255
256 pmd = populate_extra_pmd(vaddr);
257 return fill_pte(pmd, vaddr);
258}
259
226/* 260/*
227 * Create large page table mappings for a range of physical addresses. 261 * Create large page table mappings for a range of physical addresses.
228 */ 262 */
diff --git a/block/blktrace.c b/block/blktrace.c
index 7cf9d1ff45a..028120a0965 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
363 if (!bt->sequence) 363 if (!bt->sequence)
364 goto err; 364 goto err;
365 365
366 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); 366 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
367 if (!bt->msg_data) 367 if (!bt->msg_data)
368 goto err; 368 goto err;
369 369
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 9cc769b587f..68fd3d29279 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
516 continue; 516 continue;
517 } 517 }
518 518
519 if (!performance || !percpu_ptr(performance, i)) { 519 if (!performance || !per_cpu_ptr(performance, i)) {
520 retval = -EINVAL; 520 retval = -EINVAL;
521 continue; 521 continue;
522 } 522 }
523 523
524 pr->performance = percpu_ptr(performance, i); 524 pr->performance = per_cpu_ptr(performance, i);
525 cpumask_set_cpu(i, pr->performance->shared_cpu_map); 525 cpumask_set_cpu(i, pr->performance->shared_cpu_map);
526 if (acpi_processor_get_psd(pr)) { 526 if (acpi_processor_get_psd(pr)) {
527 retval = -EINVAL; 527 retval = -EINVAL;
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 95837bfb525..455d83219fa 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -65,23 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size);
65#define BOOTMEM_DEFAULT 0 65#define BOOTMEM_DEFAULT 0
66#define BOOTMEM_EXCLUSIVE (1<<0) 66#define BOOTMEM_EXCLUSIVE (1<<0)
67 67
68extern int reserve_bootmem(unsigned long addr,
69 unsigned long size,
70 int flags);
68extern int reserve_bootmem_node(pg_data_t *pgdat, 71extern int reserve_bootmem_node(pg_data_t *pgdat,
69 unsigned long physaddr, 72 unsigned long physaddr,
70 unsigned long size, 73 unsigned long size,
71 int flags); 74 int flags);
72#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
73extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
74#endif
75 75
76extern void *__alloc_bootmem_nopanic(unsigned long size, 76extern void *__alloc_bootmem(unsigned long size,
77 unsigned long align, 77 unsigned long align,
78 unsigned long goal); 78 unsigned long goal);
79extern void *__alloc_bootmem(unsigned long size, 79extern void *__alloc_bootmem_nopanic(unsigned long size,
80 unsigned long align, 80 unsigned long align,
81 unsigned long goal); 81 unsigned long goal);
82extern void *__alloc_bootmem_low(unsigned long size,
83 unsigned long align,
84 unsigned long goal);
85extern void *__alloc_bootmem_node(pg_data_t *pgdat, 82extern void *__alloc_bootmem_node(pg_data_t *pgdat,
86 unsigned long size, 83 unsigned long size,
87 unsigned long align, 84 unsigned long align,
@@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
90 unsigned long size, 87 unsigned long size,
91 unsigned long align, 88 unsigned long align,
92 unsigned long goal); 89 unsigned long goal);
90extern void *__alloc_bootmem_low(unsigned long size,
91 unsigned long align,
92 unsigned long goal);
93extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, 93extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
94 unsigned long size, 94 unsigned long size,
95 unsigned long align, 95 unsigned long align,
96 unsigned long goal); 96 unsigned long goal);
97#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 97
98#define alloc_bootmem(x) \ 98#define alloc_bootmem(x) \
99 __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 99 __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
100#define alloc_bootmem_nopanic(x) \ 100#define alloc_bootmem_nopanic(x) \
101 __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 101 __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
102#define alloc_bootmem_low(x) \
103 __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
104#define alloc_bootmem_pages(x) \ 102#define alloc_bootmem_pages(x) \
105 __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 103 __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
106#define alloc_bootmem_pages_nopanic(x) \ 104#define alloc_bootmem_pages_nopanic(x) \
107 __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 105 __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
108#define alloc_bootmem_low_pages(x) \
109 __alloc_bootmem_low(x, PAGE_SIZE, 0)
110#define alloc_bootmem_node(pgdat, x) \ 106#define alloc_bootmem_node(pgdat, x) \
111 __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 107 __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
112#define alloc_bootmem_pages_node(pgdat, x) \ 108#define alloc_bootmem_pages_node(pgdat, x) \
113 __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 109 __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
110#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
111 __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
112
113#define alloc_bootmem_low(x) \
114 __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
115#define alloc_bootmem_low_pages(x) \
116 __alloc_bootmem_low(x, PAGE_SIZE, 0)
114#define alloc_bootmem_low_pages_node(pgdat, x) \ 117#define alloc_bootmem_low_pages_node(pgdat, x) \
115 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) 118 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
116#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
117 119
118extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, 120extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
119 int flags); 121 int flags);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 3577ffd90d4..910beb0abea 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -76,52 +76,98 @@
76 76
77#ifdef CONFIG_SMP 77#ifdef CONFIG_SMP
78 78
79#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
80
81/* minimum unit size, also is the maximum supported allocation size */
82#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT)
83
84/*
85 * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
86 * back on the first chunk if arch is manually allocating and mapping
87 * it for faster access (as a part of large page mapping for example).
88 * Note that dynamic percpu allocator covers both static and dynamic
89 * areas, so these values are bigger than PERCPU_MODULE_RESERVE.
90 *
91 * On typical configuration with modules, the following values leave
92 * about 8k of free space on the first chunk after boot on both x86_32
93 * and 64 when module support is enabled. When module support is
94 * disabled, it's much tighter.
95 */
96#ifndef PERCPU_DYNAMIC_RESERVE
97# if BITS_PER_LONG > 32
98# ifdef CONFIG_MODULES
99# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT)
100# else
101# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT)
102# endif
103# else
104# ifdef CONFIG_MODULES
105# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT)
106# else
107# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT)
108# endif
109# endif
110#endif /* PERCPU_DYNAMIC_RESERVE */
111
112extern void *pcpu_base_addr;
113
114typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
115typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
116
117extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
118 size_t static_size, size_t unit_size,
119 size_t free_size, void *base_addr,
120 pcpu_populate_pte_fn_t populate_pte_fn);
121
122/*
123 * Use this to get to a cpu's version of the per-cpu object
124 * dynamically allocated. Non-atomic access to the current CPU's
125 * version should probably be combined with get_cpu()/put_cpu().
126 */
127#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
128
129#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
130
79struct percpu_data { 131struct percpu_data {
80 void *ptrs[1]; 132 void *ptrs[1];
81}; 133};
82 134
83#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) 135#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
84/* 136
85 * Use this to get to a cpu's version of the per-cpu object dynamically 137#define per_cpu_ptr(ptr, cpu) \
86 * allocated. Non-atomic access to the current CPU's version should 138({ \
87 * probably be combined with get_cpu()/put_cpu(). 139 struct percpu_data *__p = __percpu_disguise(ptr); \
88 */ 140 (__typeof__(ptr))__p->ptrs[(cpu)]; \
89#define percpu_ptr(ptr, cpu) \
90({ \
91 struct percpu_data *__p = __percpu_disguise(ptr); \
92 (__typeof__(ptr))__p->ptrs[(cpu)]; \
93}) 141})
94 142
95extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); 143#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
96extern void percpu_free(void *__pdata); 144
145extern void *__alloc_percpu(size_t size, size_t align);
146extern void free_percpu(void *__pdata);
97 147
98#else /* CONFIG_SMP */ 148#else /* CONFIG_SMP */
99 149
100#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) 150#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
101 151
102static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) 152static inline void *__alloc_percpu(size_t size, size_t align)
103{ 153{
154 /*
155 * Can't easily make larger alignment work with kmalloc. WARN
156 * on it. Larger alignment should only be used for module
157 * percpu sections on SMP for which this path isn't used.
158 */
159 WARN_ON_ONCE(align > __alignof__(unsigned long long));
104 return kzalloc(size, gfp); 160 return kzalloc(size, gfp);
105} 161}
106 162
107static inline void percpu_free(void *__pdata) 163static inline void free_percpu(void *p)
108{ 164{
109 kfree(__pdata); 165 kfree(p);
110} 166}
111 167
112#endif /* CONFIG_SMP */ 168#endif /* CONFIG_SMP */
113 169
114#define percpu_alloc_mask(size, gfp, mask) \ 170#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \
115 __percpu_alloc_mask((size), (gfp), &(mask)) 171 __alignof__(type))
116
117#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
118
119/* (legacy) interface for use without CPU hotplug handling */
120
121#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \
122 cpu_possible_map)
123#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type))
124#define free_percpu(ptr) percpu_free((ptr))
125#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu))
126 172
127#endif /* __LINUX_PERCPU_H */ 173#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 9c0890c7a06..a43ebec3a7b 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
95 95
96extern int map_vm_area(struct vm_struct *area, pgprot_t prot, 96extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
97 struct page ***pages); 97 struct page ***pages);
98extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
99 pgprot_t prot, struct page **pages);
100extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
98extern void unmap_kernel_range(unsigned long addr, unsigned long size); 101extern void unmap_kernel_range(unsigned long addr, unsigned long size);
99 102
100/* Allocate/destroy a 'vmalloc' VM area. */ 103/* Allocate/destroy a 'vmalloc' VM area. */
@@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
110 */ 113 */
111extern rwlock_t vmlist_lock; 114extern rwlock_t vmlist_lock;
112extern struct vm_struct *vmlist; 115extern struct vm_struct *vmlist;
116extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
113 117
114#endif /* _LINUX_VMALLOC_H */ 118#endif /* _LINUX_VMALLOC_H */
diff --git a/kernel/module.c b/kernel/module.c
index ba22484a987..1f0657ae555 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -51,6 +51,7 @@
51#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
52#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h> 53#include <linux/async.h>
54#include <linux/percpu.h>
54 55
55#if 0 56#if 0
56#define DEBUGP printk 57#define DEBUGP printk
@@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
366} 367}
367 368
368#ifdef CONFIG_SMP 369#ifdef CONFIG_SMP
370
371#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
372
373static void *percpu_modalloc(unsigned long size, unsigned long align,
374 const char *name)
375{
376 void *ptr;
377
378 if (align > PAGE_SIZE) {
379 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
380 name, align, PAGE_SIZE);
381 align = PAGE_SIZE;
382 }
383
384 ptr = __alloc_percpu(size, align);
385 if (!ptr)
386 printk(KERN_WARNING
387 "Could not allocate %lu bytes percpu data\n", size);
388 return ptr;
389}
390
391static void percpu_modfree(void *freeme)
392{
393 free_percpu(freeme);
394}
395
396#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
397
369/* Number of blocks used and allocated. */ 398/* Number of blocks used and allocated. */
370static unsigned int pcpu_num_used, pcpu_num_allocated; 399static unsigned int pcpu_num_used, pcpu_num_allocated;
371/* Size of each block. -ve means used. */ 400/* Size of each block. -ve means used. */
@@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme)
480 } 509 }
481} 510}
482 511
483static unsigned int find_pcpusec(Elf_Ehdr *hdr,
484 Elf_Shdr *sechdrs,
485 const char *secstrings)
486{
487 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
488}
489
490static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
491{
492 int cpu;
493
494 for_each_possible_cpu(cpu)
495 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
496}
497
498static int percpu_modinit(void) 512static int percpu_modinit(void)
499{ 513{
500 pcpu_num_used = 2; 514 pcpu_num_used = 2;
@@ -513,7 +527,26 @@ static int percpu_modinit(void)
513 return 0; 527 return 0;
514} 528}
515__initcall(percpu_modinit); 529__initcall(percpu_modinit);
530
531#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
532
533static unsigned int find_pcpusec(Elf_Ehdr *hdr,
534 Elf_Shdr *sechdrs,
535 const char *secstrings)
536{
537 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
538}
539
540static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
541{
542 int cpu;
543
544 for_each_possible_cpu(cpu)
545 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
546}
547
516#else /* ... !CONFIG_SMP */ 548#else /* ... !CONFIG_SMP */
549
517static inline void *percpu_modalloc(unsigned long size, unsigned long align, 550static inline void *percpu_modalloc(unsigned long size, unsigned long align,
518 const char *name) 551 const char *name)
519{ 552{
@@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
535 /* pcpusec should be 0, and size of that section should be 0. */ 568 /* pcpusec should be 0, and size of that section should be 0. */
536 BUG_ON(size != 0); 569 BUG_ON(size != 0);
537} 570}
571
538#endif /* CONFIG_SMP */ 572#endif /* CONFIG_SMP */
539 573
540#define MODINFO_ATTR(field) \ 574#define MODINFO_ATTR(field) \
diff --git a/kernel/sched.c b/kernel/sched.c
index 7d97ff7c447..0e5c38e1c8b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9476,7 +9476,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9476 9476
9477static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 9477static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9478{ 9478{
9479 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9479 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9480 u64 data; 9480 u64 data;
9481 9481
9482#ifndef CONFIG_64BIT 9482#ifndef CONFIG_64BIT
@@ -9495,7 +9495,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9495 9495
9496static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 9496static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9497{ 9497{
9498 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9498 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9499 9499
9500#ifndef CONFIG_64BIT 9500#ifndef CONFIG_64BIT
9501 /* 9501 /*
@@ -9591,7 +9591,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9591 ca = task_ca(tsk); 9591 ca = task_ca(tsk);
9592 9592
9593 for (; ca; ca = ca->parent) { 9593 for (; ca; ca = ca->parent) {
9594 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9594 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9595 *cpuusage += cputime; 9595 *cpuusage += cputime;
9596 } 9596 }
9597} 9597}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0cd415ee62a..74541ca4953 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
170 * doesn't hit this CPU until we're ready. */ 170 * doesn't hit this CPU until we're ready. */
171 get_cpu(); 171 get_cpu();
172 for_each_online_cpu(i) { 172 for_each_online_cpu(i) {
173 sm_work = percpu_ptr(stop_machine_work, i); 173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu); 174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work); 175 queue_work_on(i, stop_machine_wq, sm_work);
176 } 176 }
diff --git a/mm/Makefile b/mm/Makefile
index 72255be57f8..818569b68f4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
31obj-$(CONFIG_FS_XIP) += filemap_xip.o 31obj-$(CONFIG_FS_XIP) += filemap_xip.o
32obj-$(CONFIG_MIGRATION) += migrate.o 32obj-$(CONFIG_MIGRATION) += migrate.o
33ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
34obj-$(CONFIG_SMP) += percpu.o
35else
33obj-$(CONFIG_SMP) += allocpercpu.o 36obj-$(CONFIG_SMP) += allocpercpu.o
37endif
34obj-$(CONFIG_QUICKLIST) += quicklist.o 38obj-$(CONFIG_QUICKLIST) += quicklist.o
35obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 39obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 4297bc41bfd..3653c570232 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) 99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
100 100
101/** 101/**
102 * percpu_alloc_mask - initial setup of per-cpu data 102 * alloc_percpu - initial setup of per-cpu data
103 * @size: size of per-cpu object 103 * @size: size of per-cpu object
104 * @gfp: may sleep or not etc. 104 * @align: alignment
105 * @mask: populate per-data for cpu's selected through mask bits
106 * 105 *
107 * Populating per-cpu data for all online cpu's would be a typical use case, 106 * Allocate dynamic percpu area. Percpu objects are populated with
108 * which is simplified by the percpu_alloc() wrapper. 107 * zeroed buffers.
109 * Per-cpu objects are populated with zeroed buffers.
110 */ 108 */
111void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) 109void *__alloc_percpu(size_t size, size_t align)
112{ 110{
113 /* 111 /*
114 * We allocate whole cache lines to avoid false sharing 112 * We allocate whole cache lines to avoid false sharing
115 */ 113 */
116 size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); 114 size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
117 void *pdata = kzalloc(sz, gfp); 115 void *pdata = kzalloc(sz, GFP_KERNEL);
118 void *__pdata = __percpu_disguise(pdata); 116 void *__pdata = __percpu_disguise(pdata);
119 117
118 /*
119 * Can't easily make larger alignment work with kmalloc. WARN
120 * on it. Larger alignment should only be used for module
121 * percpu sections on SMP for which this path isn't used.
122 */
123 WARN_ON_ONCE(align > __alignof__(unsigned long long));
124
120 if (unlikely(!pdata)) 125 if (unlikely(!pdata))
121 return NULL; 126 return NULL;
122 if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) 127 if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
128 &cpu_possible_map)))
123 return __pdata; 129 return __pdata;
124 kfree(pdata); 130 kfree(pdata);
125 return NULL; 131 return NULL;
126} 132}
127EXPORT_SYMBOL_GPL(__percpu_alloc_mask); 133EXPORT_SYMBOL_GPL(__alloc_percpu);
128 134
129/** 135/**
130 * percpu_free - final cleanup of per-cpu data 136 * free_percpu - final cleanup of per-cpu data
131 * @__pdata: object to clean up 137 * @__pdata: object to clean up
132 * 138 *
133 * We simply clean up any per-cpu object left. No need for the client to 139 * We simply clean up any per-cpu object left. No need for the client to
134 * track and specify through a bis mask which per-cpu objects are to free. 140 * track and specify through a bis mask which per-cpu objects are to free.
135 */ 141 */
136void percpu_free(void *__pdata) 142void free_percpu(void *__pdata)
137{ 143{
138 if (unlikely(!__pdata)) 144 if (unlikely(!__pdata))
139 return; 145 return;
140 __percpu_depopulate_mask(__pdata, &cpu_possible_map); 146 __percpu_depopulate_mask(__pdata, &cpu_possible_map);
141 kfree(__percpu_disguise(__pdata)); 147 kfree(__percpu_disguise(__pdata));
142} 148}
143EXPORT_SYMBOL_GPL(percpu_free); 149EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 51a0ccf61e0..d7140c008ba 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
37 37
38static int bootmem_debug; 38static int bootmem_debug;
39 39
40/*
41 * If an arch needs to apply workarounds to bootmem allocation, it can
42 * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around
43 * __alloc_bootmem_core().
44 */
45#ifndef CONFIG_HAVE_ARCH_BOOTMEM
46#define alloc_bootmem_core(bdata, size, align, goal, limit) \
47 __alloc_bootmem_core((bdata), (size), (align), (goal), (limit))
48#endif
49
40static int __init bootmem_debug_setup(char *buf) 50static int __init bootmem_debug_setup(char *buf)
41{ 51{
42 bootmem_debug = 1; 52 bootmem_debug = 1;
@@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 392 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
383} 393}
384 394
385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386/** 395/**
387 * reserve_bootmem - mark a page range as usable 396 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range 397 * @addr: starting address of the range
@@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
403 412
404 return mark_bootmem(start, end, 1, flags); 413 return mark_bootmem(start, end, 1, flags);
405} 414}
406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
407 415
408static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, 416static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
409 unsigned long step) 417 unsigned long step)
@@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
428 return ALIGN(base + off, align) - base; 436 return ALIGN(base + off, align) - base;
429} 437}
430 438
431static void * __init alloc_bootmem_core(struct bootmem_data *bdata, 439static void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
432 unsigned long size, unsigned long align, 440 unsigned long size, unsigned long align,
433 unsigned long goal, unsigned long limit) 441 unsigned long goal, unsigned long limit)
434{ 442{
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 00000000000..5954e7a9eb1
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,979 @@
1/*
2 * linux/mm/percpu.c - percpu memory allocator
3 *
4 * Copyright (C) 2009 SUSE Linux Products GmbH
5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk consists of num_possible_cpus() units and the first chunk
12 * is used for static percpu variables in the kernel image (special
13 * boot time alloc/init handling necessary as these areas need to be
14 * brought up before allocation services are running). Unit grows as
15 * necessary and all units grow or shrink in unison. When a chunk is
16 * filled up, another chunk is allocated. ie. in vmalloc area
17 *
18 * c0 c1 c2
19 * ------------------- ------------------- ------------
20 * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
21 * ------------------- ...... ------------------- .... ------------
22 *
23 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
26 * percpu base registers UNIT_SIZE apart.
27 *
28 * There are usually many small percpu allocations, many of them as
29 * small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is
32 * guaranteed to be equal to or larger than the maximum contiguous
33 * area in the chunk. This helps the allocator not to iterate the
34 * chunk maps unnecessarily.
35 *
36 * Allocation state in each chunk is kept using an array of integers
37 * on chunk->map. A positive value in the map represents a free
38 * region and negative allocated. Allocation inside a chunk is done
39 * by scanning this map sequentially and serving the first matching
40 * entry. This is mostly copied from the percpu_modalloc() allocator.
41 * Chunks are also linked into a rb tree to ease address to chunk
42 * mapping during free.
43 *
44 * To use this allocator, arch code should do the following.
45 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
47 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back
50 *
51 * - use pcpu_setup_first_chunk() during percpu area initialization to
52 * setup the first chunk containing the kernel static percpu area
53 */
54
55#include <linux/bitmap.h>
56#include <linux/bootmem.h>
57#include <linux/list.h>
58#include <linux/mm.h>
59#include <linux/module.h>
60#include <linux/mutex.h>
61#include <linux/percpu.h>
62#include <linux/pfn.h>
63#include <linux/rbtree.h>
64#include <linux/slab.h>
65#include <linux/vmalloc.h>
66
67#include <asm/cacheflush.h>
68#include <asm/tlbflush.h>
69
70#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
71#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
72
73struct pcpu_chunk {
74 struct list_head list; /* linked to pcpu_slot lists */
75 struct rb_node rb_node; /* key is chunk->vm->addr */
76 int free_size; /* free bytes in the chunk */
77 int contig_hint; /* max contiguous size hint */
78 struct vm_struct *vm; /* mapped vmalloc region */
79 int map_used; /* # of map entries used */
80 int map_alloc; /* # of map entries allocated */
81 int *map; /* allocation map */
82 bool immutable; /* no [de]population allowed */
83 struct page *page[]; /* #cpus * UNIT_PAGES */
84};
85
86static int pcpu_unit_pages __read_mostly;
87static int pcpu_unit_size __read_mostly;
88static int pcpu_chunk_size __read_mostly;
89static int pcpu_nr_slots __read_mostly;
90static size_t pcpu_chunk_struct_size __read_mostly;
91
92/* the address of the first chunk which starts with the kernel static area */
93void *pcpu_base_addr __read_mostly;
94EXPORT_SYMBOL_GPL(pcpu_base_addr);
95
96/* the size of kernel static area */
97static int pcpu_static_size __read_mostly;
98
99/*
100 * One mutex to rule them all.
101 *
102 * The following mutex is grabbed in the outermost public alloc/free
103 * interface functions and released only when the operation is
104 * complete. As such, every function in this file other than the
105 * outermost functions are called under pcpu_mutex.
106 *
107 * It can easily be switched to use spinlock such that only the area
108 * allocation and page population commit are protected with it doing
109 * actual [de]allocation without holding any lock. However, given
110 * what this allocator does, I think it's better to let them run
111 * sequentially.
112 */
113static DEFINE_MUTEX(pcpu_mutex);
114
115static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
116static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
117
118static int __pcpu_size_to_slot(int size)
119{
120 int highbit = fls(size); /* size is in bytes */
121 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
122}
123
124static int pcpu_size_to_slot(int size)
125{
126 if (size == pcpu_unit_size)
127 return pcpu_nr_slots - 1;
128 return __pcpu_size_to_slot(size);
129}
130
131static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
132{
133 if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
134 return 0;
135
136 return pcpu_size_to_slot(chunk->free_size);
137}
138
139static int pcpu_page_idx(unsigned int cpu, int page_idx)
140{
141 return cpu * pcpu_unit_pages + page_idx;
142}
143
144static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
145 unsigned int cpu, int page_idx)
146{
147 return &chunk->page[pcpu_page_idx(cpu, page_idx)];
148}
149
150static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
151 unsigned int cpu, int page_idx)
152{
153 return (unsigned long)chunk->vm->addr +
154 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
155}
156
157static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
158 int page_idx)
159{
160 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
161}
162
163/**
164 * pcpu_realloc - versatile realloc
165 * @p: the current pointer (can be NULL for new allocations)
166 * @size: the current size in bytes (can be 0 for new allocations)
167 * @new_size: the wanted new size in bytes (can be 0 for free)
168 *
169 * More robust realloc which can be used to allocate, resize or free a
170 * memory area of arbitrary size. If the needed size goes over
171 * PAGE_SIZE, kernel VM is used.
172 *
173 * RETURNS:
174 * The new pointer on success, NULL on failure.
175 */
176static void *pcpu_realloc(void *p, size_t size, size_t new_size)
177{
178 void *new;
179
180 if (new_size <= PAGE_SIZE)
181 new = kmalloc(new_size, GFP_KERNEL);
182 else
183 new = vmalloc(new_size);
184 if (new_size && !new)
185 return NULL;
186
187 memcpy(new, p, min(size, new_size));
188 if (new_size > size)
189 memset(new + size, 0, new_size - size);
190
191 if (size <= PAGE_SIZE)
192 kfree(p);
193 else
194 vfree(p);
195
196 return new;
197}
198
199/**
200 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
201 * @chunk: chunk of interest
202 * @oslot: the previous slot it was on
203 *
204 * This function is called after an allocation or free changed @chunk.
205 * New slot according to the changed state is determined and @chunk is
206 * moved to the slot.
207 */
208static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
209{
210 int nslot = pcpu_chunk_slot(chunk);
211
212 if (oslot != nslot) {
213 if (oslot < nslot)
214 list_move(&chunk->list, &pcpu_slot[nslot]);
215 else
216 list_move_tail(&chunk->list, &pcpu_slot[nslot]);
217 }
218}
219
220static struct rb_node **pcpu_chunk_rb_search(void *addr,
221 struct rb_node **parentp)
222{
223 struct rb_node **p = &pcpu_addr_root.rb_node;
224 struct rb_node *parent = NULL;
225 struct pcpu_chunk *chunk;
226
227 while (*p) {
228 parent = *p;
229 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
230
231 if (addr < chunk->vm->addr)
232 p = &(*p)->rb_left;
233 else if (addr > chunk->vm->addr)
234 p = &(*p)->rb_right;
235 else
236 break;
237 }
238
239 if (parentp)
240 *parentp = parent;
241 return p;
242}
243
244/**
245 * pcpu_chunk_addr_search - search for chunk containing specified address
246 * @addr: address to search for
247 *
248 * Look for chunk which might contain @addr. More specifically, it
249 * searches for the chunk with the highest start address which isn't
250 * beyond @addr.
251 *
252 * RETURNS:
253 * The address of the found chunk.
254 */
255static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
256{
257 struct rb_node *n, *parent;
258 struct pcpu_chunk *chunk;
259
260 n = *pcpu_chunk_rb_search(addr, &parent);
261 if (!n) {
262 /* no exactly matching chunk, the parent is the closest */
263 n = parent;
264 BUG_ON(!n);
265 }
266 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
267
268 if (addr < chunk->vm->addr) {
269 /* the parent was the next one, look for the previous one */
270 n = rb_prev(n);
271 BUG_ON(!n);
272 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
273 }
274
275 return chunk;
276}
277
278/**
279 * pcpu_chunk_addr_insert - insert chunk into address rb tree
280 * @new: chunk to insert
281 *
282 * Insert @new into address rb tree.
283 */
284static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
285{
286 struct rb_node **p, *parent;
287
288 p = pcpu_chunk_rb_search(new->vm->addr, &parent);
289 BUG_ON(*p);
290 rb_link_node(&new->rb_node, parent, p);
291 rb_insert_color(&new->rb_node, &pcpu_addr_root);
292}
293
294/**
295 * pcpu_split_block - split a map block
296 * @chunk: chunk of interest
297 * @i: index of map block to split
298 * @head: head size in bytes (can be 0)
299 * @tail: tail size in bytes (can be 0)
300 *
301 * Split the @i'th map block into two or three blocks. If @head is
302 * non-zero, @head bytes block is inserted before block @i moving it
303 * to @i+1 and reducing its size by @head bytes.
304 *
305 * If @tail is non-zero, the target block, which can be @i or @i+1
306 * depending on @head, is reduced by @tail bytes and @tail byte block
307 * is inserted after the target block.
308 *
309 * RETURNS:
310 * 0 on success, -errno on failure.
311 */
312static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
313{
314 int nr_extra = !!head + !!tail;
315 int target = chunk->map_used + nr_extra;
316
317 /* reallocation required? */
318 if (chunk->map_alloc < target) {
319 int new_alloc = chunk->map_alloc;
320 int *new;
321
322 while (new_alloc < target)
323 new_alloc *= 2;
324
325 new = pcpu_realloc(chunk->map,
326 chunk->map_alloc * sizeof(new[0]),
327 new_alloc * sizeof(new[0]));
328 if (!new)
329 return -ENOMEM;
330
331 chunk->map_alloc = new_alloc;
332 chunk->map = new;
333 }
334
335 /* insert a new subblock */
336 memmove(&chunk->map[i + nr_extra], &chunk->map[i],
337 sizeof(chunk->map[0]) * (chunk->map_used - i));
338 chunk->map_used += nr_extra;
339
340 if (head) {
341 chunk->map[i + 1] = chunk->map[i] - head;
342 chunk->map[i++] = head;
343 }
344 if (tail) {
345 chunk->map[i++] -= tail;
346 chunk->map[i] = tail;
347 }
348 return 0;
349}
350
351/**
352 * pcpu_alloc_area - allocate area from a pcpu_chunk
353 * @chunk: chunk of interest
354 * @size: wanted size in bytes
355 * @align: wanted align
356 *
357 * Try to allocate @size bytes area aligned at @align from @chunk.
358 * Note that this function only allocates the offset. It doesn't
359 * populate or map the area.
360 *
361 * RETURNS:
362 * Allocated offset in @chunk on success, -errno on failure.
363 */
364static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
365{
366 int oslot = pcpu_chunk_slot(chunk);
367 int max_contig = 0;
368 int i, off;
369
370 /*
371 * The static chunk initially doesn't have map attached
372 * because kmalloc wasn't available during init. Give it one.
373 */
374 if (unlikely(!chunk->map)) {
375 chunk->map = pcpu_realloc(NULL, 0,
376 PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
377 if (!chunk->map)
378 return -ENOMEM;
379
380 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
381 chunk->map[chunk->map_used++] = -pcpu_static_size;
382 if (chunk->free_size)
383 chunk->map[chunk->map_used++] = chunk->free_size;
384 }
385
386 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
387 bool is_last = i + 1 == chunk->map_used;
388 int head, tail;
389
390 /* extra for alignment requirement */
391 head = ALIGN(off, align) - off;
392 BUG_ON(i == 0 && head != 0);
393
394 if (chunk->map[i] < 0)
395 continue;
396 if (chunk->map[i] < head + size) {
397 max_contig = max(chunk->map[i], max_contig);
398 continue;
399 }
400
401 /*
402 * If head is small or the previous block is free,
403 * merge'em. Note that 'small' is defined as smaller
404 * than sizeof(int), which is very small but isn't too
405 * uncommon for percpu allocations.
406 */
407 if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
408 if (chunk->map[i - 1] > 0)
409 chunk->map[i - 1] += head;
410 else {
411 chunk->map[i - 1] -= head;
412 chunk->free_size -= head;
413 }
414 chunk->map[i] -= head;
415 off += head;
416 head = 0;
417 }
418
419 /* if tail is small, just keep it around */
420 tail = chunk->map[i] - head - size;
421 if (tail < sizeof(int))
422 tail = 0;
423
424 /* split if warranted */
425 if (head || tail) {
426 if (pcpu_split_block(chunk, i, head, tail))
427 return -ENOMEM;
428 if (head) {
429 i++;
430 off += head;
431 max_contig = max(chunk->map[i - 1], max_contig);
432 }
433 if (tail)
434 max_contig = max(chunk->map[i + 1], max_contig);
435 }
436
437 /* update hint and mark allocated */
438 if (is_last)
439 chunk->contig_hint = max_contig; /* fully scanned */
440 else
441 chunk->contig_hint = max(chunk->contig_hint,
442 max_contig);
443
444 chunk->free_size -= chunk->map[i];
445 chunk->map[i] = -chunk->map[i];
446
447 pcpu_chunk_relocate(chunk, oslot);
448 return off;
449 }
450
451 chunk->contig_hint = max_contig; /* fully scanned */
452 pcpu_chunk_relocate(chunk, oslot);
453
454 /*
455 * Tell the upper layer that this chunk has no area left.
456 * Note that this is not an error condition but a notification
457 * to upper layer that it needs to look at other chunks.
458 * -ENOSPC is chosen as it isn't used in memory subsystem and
459 * matches the meaning in a way.
460 */
461 return -ENOSPC;
462}
463
464/**
465 * pcpu_free_area - free area to a pcpu_chunk
466 * @chunk: chunk of interest
467 * @freeme: offset of area to free
468 *
469 * Free area starting from @freeme to @chunk. Note that this function
470 * only modifies the allocation map. It doesn't depopulate or unmap
471 * the area.
472 */
473static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
474{
475 int oslot = pcpu_chunk_slot(chunk);
476 int i, off;
477
478 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
479 if (off == freeme)
480 break;
481 BUG_ON(off != freeme);
482 BUG_ON(chunk->map[i] > 0);
483
484 chunk->map[i] = -chunk->map[i];
485 chunk->free_size += chunk->map[i];
486
487 /* merge with previous? */
488 if (i > 0 && chunk->map[i - 1] >= 0) {
489 chunk->map[i - 1] += chunk->map[i];
490 chunk->map_used--;
491 memmove(&chunk->map[i], &chunk->map[i + 1],
492 (chunk->map_used - i) * sizeof(chunk->map[0]));
493 i--;
494 }
495 /* merge with next? */
496 if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
497 chunk->map[i] += chunk->map[i + 1];
498 chunk->map_used--;
499 memmove(&chunk->map[i + 1], &chunk->map[i + 2],
500 (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
501 }
502
503 chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
504 pcpu_chunk_relocate(chunk, oslot);
505}
506
507/**
508 * pcpu_unmap - unmap pages out of a pcpu_chunk
509 * @chunk: chunk of interest
510 * @page_start: page index of the first page to unmap
511 * @page_end: page index of the last page to unmap + 1
512 * @flush: whether to flush cache and tlb or not
513 *
514 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
515 * If @flush is true, vcache is flushed before unmapping and tlb
516 * after.
517 */
518static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
519 bool flush)
520{
521 unsigned int last = num_possible_cpus() - 1;
522 unsigned int cpu;
523
524 /* unmap must not be done on immutable chunk */
525 WARN_ON(chunk->immutable);
526
527 /*
528 * Each flushing trial can be very expensive, issue flush on
529 * the whole region at once rather than doing it for each cpu.
530 * This could be an overkill but is more scalable.
531 */
532 if (flush)
533 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
534 pcpu_chunk_addr(chunk, last, page_end));
535
536 for_each_possible_cpu(cpu)
537 unmap_kernel_range_noflush(
538 pcpu_chunk_addr(chunk, cpu, page_start),
539 (page_end - page_start) << PAGE_SHIFT);
540
541 /* ditto as flush_cache_vunmap() */
542 if (flush)
543 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
544 pcpu_chunk_addr(chunk, last, page_end));
545}
546
547/**
548 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
549 * @chunk: chunk to depopulate
550 * @off: offset to the area to depopulate
551 * @size: size of the area to depopulate in bytes
552 * @flush: whether to flush cache and tlb or not
553 *
554 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
555 * from @chunk. If @flush is true, vcache is flushed before unmapping
556 * and tlb after.
557 */
558static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
559 bool flush)
560{
561 int page_start = PFN_DOWN(off);
562 int page_end = PFN_UP(off + size);
563 int unmap_start = -1;
564 int uninitialized_var(unmap_end);
565 unsigned int cpu;
566 int i;
567
568 for (i = page_start; i < page_end; i++) {
569 for_each_possible_cpu(cpu) {
570 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
571
572 if (!*pagep)
573 continue;
574
575 __free_page(*pagep);
576
577 /*
578 * If it's partial depopulation, it might get
579 * populated or depopulated again. Mark the
580 * page gone.
581 */
582 *pagep = NULL;
583
584 unmap_start = unmap_start < 0 ? i : unmap_start;
585 unmap_end = i + 1;
586 }
587 }
588
589 if (unmap_start >= 0)
590 pcpu_unmap(chunk, unmap_start, unmap_end, flush);
591}
592
593/**
594 * pcpu_map - map pages into a pcpu_chunk
595 * @chunk: chunk of interest
596 * @page_start: page index of the first page to map
597 * @page_end: page index of the last page to map + 1
598 *
599 * For each cpu, map pages [@page_start,@page_end) into @chunk.
600 * vcache is flushed afterwards.
601 */
602static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
603{
604 unsigned int last = num_possible_cpus() - 1;
605 unsigned int cpu;
606 int err;
607
608 /* map must not be done on immutable chunk */
609 WARN_ON(chunk->immutable);
610
611 for_each_possible_cpu(cpu) {
612 err = map_kernel_range_noflush(
613 pcpu_chunk_addr(chunk, cpu, page_start),
614 (page_end - page_start) << PAGE_SHIFT,
615 PAGE_KERNEL,
616 pcpu_chunk_pagep(chunk, cpu, page_start));
617 if (err < 0)
618 return err;
619 }
620
621 /* flush at once, please read comments in pcpu_unmap() */
622 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
623 pcpu_chunk_addr(chunk, last, page_end));
624 return 0;
625}
626
627/**
628 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
629 * @chunk: chunk of interest
630 * @off: offset to the area to populate
631 * @size: size of the area to populate in bytes
632 *
633 * For each cpu, populate and map pages [@page_start,@page_end) into
634 * @chunk. The area is cleared on return.
635 */
636static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
637{
638 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
639 int page_start = PFN_DOWN(off);
640 int page_end = PFN_UP(off + size);
641 int map_start = -1;
642 int map_end;
643 unsigned int cpu;
644 int i;
645
646 for (i = page_start; i < page_end; i++) {
647 if (pcpu_chunk_page_occupied(chunk, i)) {
648 if (map_start >= 0) {
649 if (pcpu_map(chunk, map_start, map_end))
650 goto err;
651 map_start = -1;
652 }
653 continue;
654 }
655
656 map_start = map_start < 0 ? i : map_start;
657 map_end = i + 1;
658
659 for_each_possible_cpu(cpu) {
660 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
661
662 *pagep = alloc_pages_node(cpu_to_node(cpu),
663 alloc_mask, 0);
664 if (!*pagep)
665 goto err;
666 }
667 }
668
669 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
670 goto err;
671
672 for_each_possible_cpu(cpu)
673 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
674 size);
675
676 return 0;
677err:
678 /* likely under heavy memory pressure, give memory back */
679 pcpu_depopulate_chunk(chunk, off, size, true);
680 return -ENOMEM;
681}
682
683static void free_pcpu_chunk(struct pcpu_chunk *chunk)
684{
685 if (!chunk)
686 return;
687 if (chunk->vm)
688 free_vm_area(chunk->vm);
689 pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
690 kfree(chunk);
691}
692
693static struct pcpu_chunk *alloc_pcpu_chunk(void)
694{
695 struct pcpu_chunk *chunk;
696
697 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
698 if (!chunk)
699 return NULL;
700
701 chunk->map = pcpu_realloc(NULL, 0,
702 PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
703 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
704 chunk->map[chunk->map_used++] = pcpu_unit_size;
705
706 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
707 if (!chunk->vm) {
708 free_pcpu_chunk(chunk);
709 return NULL;
710 }
711
712 INIT_LIST_HEAD(&chunk->list);
713 chunk->free_size = pcpu_unit_size;
714 chunk->contig_hint = pcpu_unit_size;
715
716 return chunk;
717}
718
719/**
720 * __alloc_percpu - allocate percpu area
721 * @size: size of area to allocate in bytes
722 * @align: alignment of area (max PAGE_SIZE)
723 *
724 * Allocate percpu area of @size bytes aligned at @align. Might
725 * sleep. Might trigger writeouts.
726 *
727 * RETURNS:
728 * Percpu pointer to the allocated area on success, NULL on failure.
729 */
730void *__alloc_percpu(size_t size, size_t align)
731{
732 void *ptr = NULL;
733 struct pcpu_chunk *chunk;
734 int slot, off;
735
736 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
737 WARN(true, "illegal size (%zu) or align (%zu) for "
738 "percpu allocation\n", size, align);
739 return NULL;
740 }
741
742 mutex_lock(&pcpu_mutex);
743
744 /* allocate area */
745 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
746 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
747 if (size > chunk->contig_hint)
748 continue;
749 off = pcpu_alloc_area(chunk, size, align);
750 if (off >= 0)
751 goto area_found;
752 if (off != -ENOSPC)
753 goto out_unlock;
754 }
755 }
756
757 /* hmmm... no space left, create a new chunk */
758 chunk = alloc_pcpu_chunk();
759 if (!chunk)
760 goto out_unlock;
761 pcpu_chunk_relocate(chunk, -1);
762 pcpu_chunk_addr_insert(chunk);
763
764 off = pcpu_alloc_area(chunk, size, align);
765 if (off < 0)
766 goto out_unlock;
767
768area_found:
769 /* populate, map and clear the area */
770 if (pcpu_populate_chunk(chunk, off, size)) {
771 pcpu_free_area(chunk, off);
772 goto out_unlock;
773 }
774
775 ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
776out_unlock:
777 mutex_unlock(&pcpu_mutex);
778 return ptr;
779}
780EXPORT_SYMBOL_GPL(__alloc_percpu);
781
782static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
783{
784 WARN_ON(chunk->immutable);
785 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
786 list_del(&chunk->list);
787 rb_erase(&chunk->rb_node, &pcpu_addr_root);
788 free_pcpu_chunk(chunk);
789}
790
791/**
792 * free_percpu - free percpu area
793 * @ptr: pointer to area to free
794 *
795 * Free percpu area @ptr. Might sleep.
796 */
797void free_percpu(void *ptr)
798{
799 void *addr = __pcpu_ptr_to_addr(ptr);
800 struct pcpu_chunk *chunk;
801 int off;
802
803 if (!ptr)
804 return;
805
806 mutex_lock(&pcpu_mutex);
807
808 chunk = pcpu_chunk_addr_search(addr);
809 off = addr - chunk->vm->addr;
810
811 pcpu_free_area(chunk, off);
812
813 /* the chunk became fully free, kill one if there are other free ones */
814 if (chunk->free_size == pcpu_unit_size) {
815 struct pcpu_chunk *pos;
816
817 list_for_each_entry(pos,
818 &pcpu_slot[pcpu_chunk_slot(chunk)], list)
819 if (pos != chunk) {
820 pcpu_kill_chunk(pos);
821 break;
822 }
823 }
824
825 mutex_unlock(&pcpu_mutex);
826}
827EXPORT_SYMBOL_GPL(free_percpu);
828
829/**
830 * pcpu_setup_first_chunk - initialize the first percpu chunk
831 * @get_page_fn: callback to fetch page pointer
832 * @static_size: the size of static percpu area in bytes
833 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
834 * @free_size: free size in bytes, 0 for auto
835 * @base_addr: mapped address, NULL for auto
836 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
837 *
838 * Initialize the first percpu chunk which contains the kernel static
839 * perpcu area. This function is to be called from arch percpu area
840 * setup path. The first two parameters are mandatory. The rest are
841 * optional.
842 *
843 * @get_page_fn() should return pointer to percpu page given cpu
844 * number and page number. It should at least return enough pages to
845 * cover the static area. The returned pages for static area should
846 * have been initialized with valid data. If @unit_size is specified,
847 * it can also return pages after the static area. NULL return
848 * indicates end of pages for the cpu. Note that @get_page_fn() must
849 * return the same number of pages for all cpus.
850 *
851 * @unit_size, if non-zero, determines unit size and must be aligned
852 * to PAGE_SIZE and equal to or larger than @static_size + @free_size.
853 *
854 * @free_size determines the number of free bytes after the static
855 * area in the first chunk. If zero, whatever left is available.
856 * Specifying non-zero value make percpu leave the area after
857 * @static_size + @free_size alone.
858 *
859 * Non-null @base_addr means that the caller already allocated virtual
860 * region for the first chunk and mapped it. percpu must not mess
861 * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
862 * @populate_pte_fn doesn't make any sense.
863 *
864 * @populate_pte_fn is used to populate the pagetable. NULL means the
865 * caller already populated the pagetable.
866 *
867 * RETURNS:
868 * The determined pcpu_unit_size which can be used to initialize
869 * percpu access.
870 */
871size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
872 size_t static_size, size_t unit_size,
873 size_t free_size, void *base_addr,
874 pcpu_populate_pte_fn_t populate_pte_fn)
875{
876 static struct vm_struct static_vm;
877 struct pcpu_chunk *static_chunk;
878 unsigned int cpu;
879 int nr_pages;
880 int err, i;
881
882 /* santiy checks */
883 BUG_ON(!static_size);
884 BUG_ON(!unit_size && free_size);
885 BUG_ON(unit_size && unit_size < static_size + free_size);
886 BUG_ON(unit_size & ~PAGE_MASK);
887 BUG_ON(base_addr && !unit_size);
888 BUG_ON(base_addr && populate_pte_fn);
889
890 if (unit_size)
891 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
892 else
893 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
894 PFN_UP(static_size));
895
896 pcpu_static_size = static_size;
897 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
898 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
899 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
900 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
901
902 /*
903 * Allocate chunk slots. The additional last slot is for
904 * empty chunks.
905 */
906 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
907 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
908 for (i = 0; i < pcpu_nr_slots; i++)
909 INIT_LIST_HEAD(&pcpu_slot[i]);
910
911 /* init static_chunk */
912 static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
913 INIT_LIST_HEAD(&static_chunk->list);
914 static_chunk->vm = &static_vm;
915
916 if (free_size)
917 static_chunk->free_size = free_size;
918 else
919 static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
920
921 static_chunk->contig_hint = static_chunk->free_size;
922
923 /* allocate vm address */
924 static_vm.flags = VM_ALLOC;
925 static_vm.size = pcpu_chunk_size;
926
927 if (!base_addr)
928 vm_area_register_early(&static_vm, PAGE_SIZE);
929 else {
930 /*
931 * Pages already mapped. No need to remap into
932 * vmalloc area. In this case the static chunk can't
933 * be mapped or unmapped by percpu and is marked
934 * immutable.
935 */
936 static_vm.addr = base_addr;
937 static_chunk->immutable = true;
938 }
939
940 /* assign pages */
941 nr_pages = -1;
942 for_each_possible_cpu(cpu) {
943 for (i = 0; i < pcpu_unit_pages; i++) {
944 struct page *page = get_page_fn(cpu, i);
945
946 if (!page)
947 break;
948 *pcpu_chunk_pagep(static_chunk, cpu, i) = page;
949 }
950
951 BUG_ON(i < PFN_UP(pcpu_static_size));
952
953 if (nr_pages < 0)
954 nr_pages = i;
955 else
956 BUG_ON(nr_pages != i);
957 }
958
959 /* map them */
960 if (populate_pte_fn) {
961 for_each_possible_cpu(cpu)
962 for (i = 0; i < nr_pages; i++)
963 populate_pte_fn(pcpu_chunk_addr(static_chunk,
964 cpu, i));
965
966 err = pcpu_map(static_chunk, 0, nr_pages);
967 if (err)
968 panic("failed to setup static percpu area, err=%d\n",
969 err);
970 }
971
972 /* link static_chunk in */
973 pcpu_chunk_relocate(static_chunk, -1);
974 pcpu_chunk_addr_insert(static_chunk);
975
976 /* we're done */
977 pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
978 return pcpu_unit_size;
979}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 903cad46e79..fb6f59935fb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,7 @@
24#include <linux/radix-tree.h> 24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/bootmem.h> 26#include <linux/bootmem.h>
27#include <linux/pfn.h>
27 28
28#include <asm/atomic.h> 29#include <asm/atomic.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
152 * 153 *
153 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] 154 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
154 */ 155 */
155static int vmap_page_range(unsigned long start, unsigned long end, 156static int vmap_page_range_noflush(unsigned long start, unsigned long end,
156 pgprot_t prot, struct page **pages) 157 pgprot_t prot, struct page **pages)
157{ 158{
158 pgd_t *pgd; 159 pgd_t *pgd;
159 unsigned long next; 160 unsigned long next;
@@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
169 if (err) 170 if (err)
170 break; 171 break;
171 } while (pgd++, addr = next, addr != end); 172 } while (pgd++, addr = next, addr != end);
172 flush_cache_vmap(start, end);
173 173
174 if (unlikely(err)) 174 if (unlikely(err))
175 return err; 175 return err;
176 return nr; 176 return nr;
177} 177}
178 178
179static int vmap_page_range(unsigned long start, unsigned long end,
180 pgprot_t prot, struct page **pages)
181{
182 int ret;
183
184 ret = vmap_page_range_noflush(start, end, prot, pages);
185 flush_cache_vmap(start, end);
186 return ret;
187}
188
179static inline int is_vmalloc_or_module_addr(const void *x) 189static inline int is_vmalloc_or_module_addr(const void *x)
180{ 190{
181 /* 191 /*
@@ -982,6 +992,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
982} 992}
983EXPORT_SYMBOL(vm_map_ram); 993EXPORT_SYMBOL(vm_map_ram);
984 994
995/**
996 * vm_area_register_early - register vmap area early during boot
997 * @vm: vm_struct to register
998 * @align: requested alignment
999 *
1000 * This function is used to register kernel vm area before
1001 * vmalloc_init() is called. @vm->size and @vm->flags should contain
1002 * proper values on entry and other fields should be zero. On return,
1003 * vm->addr contains the allocated address.
1004 *
1005 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1006 */
1007void __init vm_area_register_early(struct vm_struct *vm, size_t align)
1008{
1009 static size_t vm_init_off __initdata;
1010 unsigned long addr;
1011
1012 addr = ALIGN(VMALLOC_START + vm_init_off, align);
1013 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
1014
1015 vm->addr = (void *)addr;
1016
1017 vm->next = vmlist;
1018 vmlist = vm;
1019}
1020
985void __init vmalloc_init(void) 1021void __init vmalloc_init(void)
986{ 1022{
987 struct vmap_area *va; 1023 struct vmap_area *va;
@@ -1009,6 +1045,58 @@ void __init vmalloc_init(void)
1009 vmap_initialized = true; 1045 vmap_initialized = true;
1010} 1046}
1011 1047
1048/**
1049 * map_kernel_range_noflush - map kernel VM area with the specified pages
1050 * @addr: start of the VM area to map
1051 * @size: size of the VM area to map
1052 * @prot: page protection flags to use
1053 * @pages: pages to map
1054 *
1055 * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
1056 * specify should have been allocated using get_vm_area() and its
1057 * friends.
1058 *
1059 * NOTE:
1060 * This function does NOT do any cache flushing. The caller is
1061 * responsible for calling flush_cache_vmap() on to-be-mapped areas
1062 * before calling this function.
1063 *
1064 * RETURNS:
1065 * The number of pages mapped on success, -errno on failure.
1066 */
1067int map_kernel_range_noflush(unsigned long addr, unsigned long size,
1068 pgprot_t prot, struct page **pages)
1069{
1070 return vmap_page_range_noflush(addr, addr + size, prot, pages);
1071}
1072
/**
 * unmap_kernel_range_noflush - unmap kernel VM area
 * @addr: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size
 * specify should have been allocated using get_vm_area() and its
 * friends.
 *
 * NOTE:
 * This function does NOT do any cache or TLB flushing.  The caller
 * is responsible for calling flush_cache_vunmap() on the
 * to-be-unmapped area before calling this function and
 * flush_tlb_kernel_range() after.
 */
void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
{
	unsigned long end = addr + size;

	vunmap_page_range(addr, end);
}
1091
1092/**
1093 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
1094 * @addr: start of the VM area to unmap
1095 * @size: size of the VM area to unmap
1096 *
1097 * Similar to unmap_kernel_range_noflush() but flushes vcache before
1098 * the unmapping and tlb after.
1099 */
1012void unmap_kernel_range(unsigned long addr, unsigned long size) 1100void unmap_kernel_range(unsigned long addr, unsigned long size)
1013{ 1101{
1014 unsigned long end = addr + size; 1102 unsigned long end = addr + size;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 743f5542d65..3a3dad80135 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field);
1375int snmp_mib_init(void *ptr[2], size_t mibsize) 1375int snmp_mib_init(void *ptr[2], size_t mibsize)
1376{ 1376{
1377 BUG_ON(ptr == NULL); 1377 BUG_ON(ptr == NULL);
1378 ptr[0] = __alloc_percpu(mibsize); 1378 ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
1379 if (!ptr[0]) 1379 if (!ptr[0])
1380 goto err0; 1380 goto err0;
1381 ptr[1] = __alloc_percpu(mibsize); 1381 ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
1382 if (!ptr[1]) 1382 if (!ptr[1])
1383 goto err1; 1383 goto err1;
1384 return 0; 1384 return 0;