author		Tejun Heo <tj@kernel.org>	2009-08-14 02:00:52 -0400
committer	Tejun Heo <tj@kernel.org>	2009-08-14 02:00:52 -0400
commit		4518e6a0c038b98be4c480e6f4481e8676bd15dd (patch)
tree		81b222a74951186aa85b5dfddae1fcae3b7e8ea9 /arch
parent		c8826dd538602d730ed2c18c6753f1bbfa6c4933 (diff)
x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA
The embedding percpu first chunk allocator can now handle very sparse
unit mappings.  Use the embedding allocator instead of lpage for 64bit
NUMA.  This removes extra TLB pressure and the need to do complex and
fragile dancing when changing page attributes.

For 32bit, using a very sparse unit mapping isn't a good idea because
the vmalloc space is very constrained.  32bit NUMA machines aren't
exactly the focus of optimization, and it isn't very clear whether lpage
performs better than page.  Use the page first chunk allocator for
32bit NUMA.

As this leaves the setup_pcpu_*() functions pretty much empty, fold them
into setup_per_cpu_areas().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@firstfloor.org>
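The NUMA awareness that lets embedding replace lpage comes from the
pcpu_cpu_distance() callback introduced below: pcpu_build_alloc_info()
clusters CPUs that report LOCAL_DISTANCE to each other into the same
allocation group.  The following userspace toy illustrates that grouping
effect; it is not kernel code, and the cpu-to-node table plus the greedy
grouping loop are simplified stand-ins, not the actual mm/percpu.c
algorithm:

/*
 * Toy model: partition CPUs into allocation groups using a
 * cpu_distance callback, as pcpu_build_alloc_info() does.  The
 * topology table below is made up for illustration.
 */
#include <stdio.h>

#define LOCAL_DISTANCE	10
#define REMOTE_DISTANCE	20
#define NR_CPUS		8

/* hypothetical topology: CPUs 0-3 on node 0, CPUs 4-7 on node 1 */
static int cpu_to_node[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };

static int cpu_distance(unsigned int from, unsigned int to)
{
	return cpu_to_node[from] == cpu_to_node[to] ? LOCAL_DISTANCE
						    : REMOTE_DISTANCE;
}

int main(void)
{
	int group_of[NR_CPUS];
	int nr_groups = 0;

	for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++) {
		unsigned int g;

		/* join the first earlier CPU's group at LOCAL_DISTANCE */
		for (g = 0; g < cpu; g++)
			if (cpu_distance(cpu, g) == LOCAL_DISTANCE)
				break;
		group_of[cpu] = (g == cpu) ? nr_groups++ : group_of[g];
		printf("cpu%u -> group %d\n", cpu, group_of[cpu]);
	}
	printf("%d group(s)\n", nr_groups);
	return 0;
}

With one group per NUMA node, the embedding allocator can place each
group's units near its node while keeping a single first chunk.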
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/Kconfig	4
-rw-r--r--	arch/x86/kernel/setup_percpu.c	155
2 files changed, 28 insertions(+), 131 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f7ac27215512..869d7d301448 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -156,10 +156,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK
 config NEED_PER_CPU_PAGE_FIRST_CHUNK
 	def_bool y
 
-config NEED_PER_CPU_LPAGE_FIRST_CHUNK
-	def_bool y
-	depends on NEED_MULTIPLE_NODES
-
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
 
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 67f6314de9f1..d559af913e1f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
 #define PERCPU_FIRST_CHUNK_RESERVE	0
 #endif
 
+#ifdef CONFIG_X86_32
 /**
  * pcpu_need_numa - determine percpu allocation needs to consider NUMA
  *
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
 #endif
 	return false;
 }
+#endif
 
 /**
  * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -136,128 +138,23 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
 	free_bootmem(__pa(ptr), size);
 }
 
-/*
- * Large page remapping allocator
- */
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-static void __init pcpul_map(void *ptr, size_t size, void *addr)
-{
-	pmd_t *pmd, pmd_v;
-
-	pmd = populate_extra_pmd((unsigned long)addr);
-	pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
-	set_pmd(pmd, pmd_v);
-}
-
-static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
 {
+#ifdef CONFIG_NEED_MULTIPLE_NODES
 	if (early_cpu_to_node(from) == early_cpu_to_node(to))
 		return LOCAL_DISTANCE;
 	else
 		return REMOTE_DISTANCE;
-}
-
-static int __init setup_pcpu_lpage(bool chosen)
-{
-	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
-	size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
-	struct pcpu_alloc_info *ai;
-	int rc;
-
-	/* on non-NUMA, embedding is better */
-	if (!chosen && !pcpu_need_numa())
-		return -EINVAL;
-
-	/* need PSE */
-	if (!cpu_has_pse) {
-		pr_warning("PERCPU: lpage allocator requires PSE\n");
-		return -EINVAL;
-	}
-
-	/* allocate and build unit_map */
-	ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
-				   PMD_SIZE, pcpu_lpage_cpu_distance);
-	if (IS_ERR(ai)) {
-		pr_warning("PERCPU: failed to build unit_map (%ld)\n",
-			   PTR_ERR(ai));
-		return PTR_ERR(ai);
-	}
-
-	/* do the parameters look okay? */
-	if (!chosen) {
-		size_t vm_size = VMALLOC_END - VMALLOC_START;
-		size_t tot_size = 0;
-		int group;
-
-		for (group = 0; group < ai->nr_groups; group++)
-			tot_size += ai->unit_size * ai->groups[group].nr_units;
-
-		/* don't consume more than 20% of vmalloc area */
-		if (tot_size > vm_size / 5) {
-			pr_info("PERCPU: too large chunk size %zuMB for "
-				"large page remap\n", tot_size >> 20);
-			rc = -EINVAL;
-			goto out_free;
-		}
-	}
-
-	rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
-out_free:
-	pcpu_free_alloc_info(ai);
-	return rc;
-}
-#else
-static int __init setup_pcpu_lpage(bool chosen)
-{
-	return -EINVAL;
-}
+#else
+	return LOCAL_DISTANCE;
 #endif
-
-/*
- * Embedding allocator
- *
- * The first chunk is sized to just contain the static area plus
- * module and dynamic reserves and embedded into linear physical
- * mapping so that it can use PMD mapping without additional TLB
- * pressure.
- */
-static int __init setup_pcpu_embed(bool chosen)
-{
-	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
-
-	/*
-	 * If large page isn't supported, there's no benefit in doing
-	 * this.  Also, embedding allocation doesn't play well with
-	 * NUMA.
-	 */
-	if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
-		return -EINVAL;
-
-	return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
-				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
-				      PAGE_SIZE, NULL, pcpu_fc_alloc,
-				      pcpu_fc_free);
 }
 
-/*
- * Page allocator
- *
- * Boring fallback 4k page allocator.  This allocator puts more
- * pressure on PTE TLBs but other than that behaves nicely on both UMA
- * and NUMA.
- */
 static void __init pcpup_populate_pte(unsigned long addr)
 {
 	populate_extra_pte(addr);
 }
 
-static int __init setup_pcpu_page(void)
-{
-	return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
-				     pcpu_fc_alloc, pcpu_fc_free,
-				     pcpup_populate_pte);
-}
-
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -281,30 +178,34 @@ void __init setup_per_cpu_areas(void)
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
 	/*
-	 * Allocate percpu area.  If PSE is supported, try to make use
-	 * of large page mappings.  Please read comments on top of
-	 * each allocator for details.
+	 * Allocate percpu area.  Embedding allocator is our favorite;
+	 * however, on NUMA configurations, it can result in very
+	 * sparse unit mapping and vmalloc area isn't spacious enough
+	 * on 32bit.  Use page in that case.
 	 */
+#ifdef CONFIG_X86_32
+	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
+		pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
 	rc = -EINVAL;
-	if (pcpu_chosen_fc != PCPU_FC_AUTO) {
-		if (pcpu_chosen_fc != PCPU_FC_PAGE) {
-			if (pcpu_chosen_fc == PCPU_FC_LPAGE)
-				rc = setup_pcpu_lpage(true);
-			else
-				rc = setup_pcpu_embed(true);
-
-			if (rc < 0)
-				pr_warning("PERCPU: %s allocator failed (%d), "
-					   "falling back to page size\n",
-					   pcpu_fc_names[pcpu_chosen_fc], rc);
-		}
-	} else {
-		rc = setup_pcpu_lpage(false);
+	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
+		const size_t dyn_size = PERCPU_MODULE_RESERVE +
+			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+
+		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+					    dyn_size, atom_size,
+					    pcpu_cpu_distance,
+					    pcpu_fc_alloc, pcpu_fc_free);
 		if (rc < 0)
-			rc = setup_pcpu_embed(false);
+			pr_warning("PERCPU: %s allocator failed (%d), "
+				   "falling back to page size\n",
+				   pcpu_fc_names[pcpu_chosen_fc], rc);
 	}
 	if (rc < 0)
-		rc = setup_pcpu_page();
+		rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+					   pcpu_fc_alloc, pcpu_fc_free,
+					   pcpup_populate_pte);
 	if (rc < 0)
 		panic("cannot initialize percpu area (err=%d)", rc);
 
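Taken together, the rewritten setup_per_cpu_areas() reduces allocator
selection to a simple chain: force page for 32bit NUMA, otherwise try
embedding and fall back to page.  A standalone sketch of that flow in
plain userspace C; try_embed() and try_page() are made-up stubs standing
in for pcpu_embed_first_chunk() and pcpu_page_first_chunk():

/*
 * Sketch (not kernel code) of the post-patch selection flow in
 * setup_per_cpu_areas(): unless the page allocator was requested
 * explicitly -- or forced, as on 32bit NUMA -- try embedding first
 * and fall back to the page allocator.
 */
#include <stdio.h>
#include <stdlib.h>

enum fc { FC_AUTO, FC_EMBED, FC_PAGE };

/* stubs: return 0 on success, negative on failure */
static int try_embed(void) { return -1; /* pretend embedding failed */ }
static int try_page(void)  { return 0; }

static void setup(enum fc chosen, int numa_32bit)
{
	int rc = -1;

	/* 32bit NUMA: sparse unit maps don't fit vmalloc, force page */
	if (chosen == FC_AUTO && numa_32bit)
		chosen = FC_PAGE;

	if (chosen != FC_PAGE) {
		rc = try_embed();
		if (rc < 0)
			printf("embed failed (%d), falling back to page\n", rc);
	}
	if (rc < 0)
		rc = try_page();
	if (rc < 0) {
		fprintf(stderr, "cannot initialize percpu area\n");
		exit(1);
	}
	printf("percpu first chunk set up\n");
}

int main(void)
{
	setup(FC_AUTO, 0);	/* 64bit: try embed, fall back to page */
	setup(FC_AUTO, 1);	/* 32bit NUMA: go straight to page */
	return 0;
}

The explicit override still works as before: booting with
percpu_alloc=page sets pcpu_chosen_fc to PCPU_FC_PAGE and skips the
embedding attempt entirely.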