aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/setup_percpu.c
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2009-02-23 21:57:22 -0500
committerTejun Heo <tj@kernel.org>2009-02-23 21:57:22 -0500
commit8ac837571491e239e64bd87863c1679d8002e8a2 (patch)
treeb001c6513609d86b8ebd1d1a2192054ff14c0443 /arch/x86/kernel/setup_percpu.c
parent89c9215165ca609096e845926d9a18f1306176a4 (diff)
x86: add remapping percpu first chunk allocator
Impact: add better first percpu allocation for NUMA On NUMA, embedding allocator can't be used as different units can't be made to fall in the correct NUMA nodes. To use large page mapping, each unit needs to be remapped. However, percpu areas are usually much smaller than large page size and unused space hurts a lot as the number of cpus grow. This allocator remaps large pages for each chunk but gives back unused part to the bootmem allocator making the large pages mapped twice. This adds slightly to the TLB pressure but is much better than using 4k mappings while still being NUMA-friendly. Ingo suggested that this would be the correct approach for NUMA. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel/setup_percpu.c')
-rw-r--r--arch/x86/kernel/setup_percpu.c137
1 files changed, 135 insertions, 2 deletions
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index fd4c399675d..2d946a8f78b 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -111,6 +111,133 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
111} 111}
112 112
113/* 113/*
114 * Remap allocator
115 *
116 * This allocator uses PMD page as unit. A PMD page is allocated for
117 * each cpu and each is remapped into vmalloc area using PMD mapping.
118 * As PMD page is quite large, only part of it is used for the first
119 * chunk. Unused part is returned to the bootmem allocator.
120 *
121 * So, the PMD pages are mapped twice - once to the physical mapping
122 * and to the vmalloc area for the first percpu chunk. The double
123 * mapping does add one more PMD TLB entry pressure but still is much
124 * better than only using 4k mappings while still being NUMA friendly.
125 */
126#ifdef CONFIG_NEED_MULTIPLE_NODES
127static size_t pcpur_size __initdata;
128static void **pcpur_ptrs __initdata;
129
130static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
131{
132 size_t off = (size_t)pageno << PAGE_SHIFT;
133
134 if (off >= pcpur_size)
135 return NULL;
136
137 return virt_to_page(pcpur_ptrs[cpu] + off);
138}
139
140static ssize_t __init setup_pcpu_remap(size_t static_size)
141{
142 static struct vm_struct vm;
143 pg_data_t *last;
144 size_t ptrs_size;
145 unsigned int cpu;
146 ssize_t ret;
147
148 /*
149 * If large page isn't supported, there's no benefit in doing
150 * this. Also, on non-NUMA, embedding is better.
151 */
152 if (!cpu_has_pse || pcpu_need_numa())
153 return -EINVAL;
154
155 last = NULL;
156 for_each_possible_cpu(cpu) {
157 int node = early_cpu_to_node(cpu);
158
159 if (node_online(node) && NODE_DATA(node) &&
160 last && last != NODE_DATA(node))
161 goto proceed;
162
163 last = NODE_DATA(node);
164 }
165 return -EINVAL;
166
167proceed:
168 /*
169 * Currently supports only single page. Supporting multiple
170 * pages won't be too difficult if it ever becomes necessary.
171 */
172 pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
173 if (pcpur_size > PMD_SIZE) {
174 pr_warning("PERCPU: static data is larger than large page, "
175 "can't use large page\n");
176 return -EINVAL;
177 }
178
179 /* allocate pointer array and alloc large pages */
180 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
181 pcpur_ptrs = alloc_bootmem(ptrs_size);
182
183 for_each_possible_cpu(cpu) {
184 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
185 if (!pcpur_ptrs[cpu])
186 goto enomem;
187
188 /*
189 * Only use pcpur_size bytes and give back the rest.
190 *
191 * Ingo: The 2MB up-rounding bootmem is needed to make
192 * sure the partial 2MB page is still fully RAM - it's
193 * not well-specified to have a PAT-incompatible area
194 * (unmapped RAM, device memory, etc.) in that hole.
195 */
196 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
197 PMD_SIZE - pcpur_size);
198
199 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
200 }
201
202 /* allocate address and map */
203 vm.flags = VM_ALLOC;
204 vm.size = num_possible_cpus() * PMD_SIZE;
205 vm_area_register_early(&vm, PMD_SIZE);
206
207 for_each_possible_cpu(cpu) {
208 pmd_t *pmd;
209
210 pmd = populate_extra_pmd((unsigned long)vm.addr
211 + cpu * PMD_SIZE);
212 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
213 PAGE_KERNEL_LARGE));
214 }
215
216 /* we're ready, commit */
217 pr_info("PERCPU: Remapped at %p with large pages, static data "
218 "%zu bytes\n", vm.addr, static_size);
219
220 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
221 pcpur_size - static_size, vm.addr, NULL);
222 goto out_free_ar;
223
224enomem:
225 for_each_possible_cpu(cpu)
226 if (pcpur_ptrs[cpu])
227 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
228 ret = -ENOMEM;
229out_free_ar:
230 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
231 return ret;
232}
233#else
234static ssize_t __init setup_pcpu_remap(size_t static_size)
235{
236 return -EINVAL;
237}
238#endif
239
240/*
114 * Embedding allocator 241 * Embedding allocator
115 * 242 *
116 * The first chunk is sized to just contain the static area plus 243 * The first chunk is sized to just contain the static area plus
@@ -259,8 +386,14 @@ void __init setup_per_cpu_areas(void)
259 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 386 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
260 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 387 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
261 388
262 /* allocate percpu area */ 389 /*
263 ret = setup_pcpu_embed(static_size); 390 * Allocate percpu area. If PSE is supported, try to make use
391 * of large page mappings. Please read comments on top of
392 * each allocator for details.
393 */
394 ret = setup_pcpu_remap(static_size);
395 if (ret < 0)
396 ret = setup_pcpu_embed(static_size);
264 if (ret < 0) 397 if (ret < 0)
265 ret = setup_pcpu_4k(static_size); 398 ret = setup_pcpu_4k(static_size);
266 if (ret < 0) 399 if (ret < 0)