aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
author Tejun Heo <tj@kernel.org> 2009-07-03 19:11:00 -0400
committer Tejun Heo <tj@kernel.org> 2009-07-03 19:11:00 -0400
commit a530b7958612bafe2027e21359083dba84f0b3b4 (patch)
tree fecbfc0d23b7702a903e8b2539e04e6086ba4404 /arch/x86/kernel
parent 2f39e637ea240efb74cf807d31c93a71a0b89174 (diff)
percpu: teach large page allocator about NUMA
The large page first chunk allocator is primarily used for NUMA machines; however, its NUMA handling is extremely simplistic. Regardless of their proximity, each cpu is put into a separate large page, only to return most of the allocated space, wasting a large amount of vmalloc space and increasing cache footprint. This patch teaches NUMA details to the large page allocator. Given processor proximity information, pcpu_lpage_build_unit_map() will find a fitting cpu -> unit mapping in which cpus in LOCAL_DISTANCE share the same large page and not too much virtual address space is wasted. This greatly reduces the unit and thus chunk size, and wastes much less address space for the first chunk. For example, on a 4/4 NUMA machine, the original code occupied 16MB of virtual space for the first chunk while the new code only uses 4MB - one 2MB page for each node. [ Impact: much better space efficiency on NUMA machines ] Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: Jan Beulich <JBeulich@novell.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: David Miller <davem@davemloft.net>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- arch/x86/kernel/setup_percpu.c 72
1 files changed, 55 insertions, 17 deletions
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 4f2e0ac9130b..7501bb14bd51 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -149,36 +149,73 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr)
149 set_pmd(pmd, pmd_v); 149 set_pmd(pmd, pmd_v);
150} 150}
151 151
152static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
153{
154 if (early_cpu_to_node(from) == early_cpu_to_node(to))
155 return LOCAL_DISTANCE;
156 else
157 return REMOTE_DISTANCE;
158}
159
152static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 160static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
153{ 161{
154 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; 162 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
163 size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
164 size_t unit_map_size, unit_size;
165 int *unit_map;
166 int nr_units;
167 ssize_t ret;
168
169 /* on non-NUMA, embedding is better */
170 if (!chosen && !pcpu_need_numa())
171 return -EINVAL;
172
173 /* need PSE */
174 if (!cpu_has_pse) {
175 pr_warning("PERCPU: lpage allocator requires PSE\n");
176 return -EINVAL;
177 }
155 178
179 /* allocate and build unit_map */
180 unit_map_size = num_possible_cpus() * sizeof(int);
181 unit_map = alloc_bootmem_nopanic(unit_map_size);
182 if (!unit_map) {
183 pr_warning("PERCPU: failed to allocate unit_map\n");
184 return -ENOMEM;
185 }
186
187 ret = pcpu_lpage_build_unit_map(static_size,
188 PERCPU_FIRST_CHUNK_RESERVE,
189 &dyn_size, &unit_size, PMD_SIZE,
190 unit_map, pcpu_lpage_cpu_distance);
191 if (ret < 0) {
192 pr_warning("PERCPU: failed to build unit_map\n");
193 goto out_free;
194 }
195 nr_units = ret;
196
197 /* do the parameters look okay? */
156 if (!chosen) { 198 if (!chosen) {
157 size_t vm_size = VMALLOC_END - VMALLOC_START; 199 size_t vm_size = VMALLOC_END - VMALLOC_START;
158 size_t tot_size = num_possible_cpus() * PMD_SIZE; 200 size_t tot_size = nr_units * unit_size;
159
160 /* on non-NUMA, embedding is better */
161 if (!pcpu_need_numa())
162 return -EINVAL;
163 201
164 /* don't consume more than 20% of vmalloc area */ 202 /* don't consume more than 20% of vmalloc area */
165 if (tot_size > vm_size / 5) { 203 if (tot_size > vm_size / 5) {
166 pr_info("PERCPU: too large chunk size %zuMB for " 204 pr_info("PERCPU: too large chunk size %zuMB for "
167 "large page remap\n", tot_size >> 20); 205 "large page remap\n", tot_size >> 20);
168 return -EINVAL; 206 ret = -EINVAL;
207 goto out_free;
169 } 208 }
170 } 209 }
171 210
172 /* need PSE */ 211 ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
173 if (!cpu_has_pse) { 212 dyn_size, unit_size, PMD_SIZE,
174 pr_warning("PERCPU: lpage allocator requires PSE\n"); 213 unit_map, nr_units,
175 return -EINVAL; 214 pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
176 } 215out_free:
177 216 if (ret < 0)
178 return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, 217 free_bootmem(__pa(unit_map), unit_map_size);
179 reserve - PERCPU_FIRST_CHUNK_RESERVE, 218 return ret;
180 PMD_SIZE,
181 pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
182} 219}
183#else 220#else
184static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 221static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
@@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void)
299 /* alrighty, percpu areas up and running */ 336 /* alrighty, percpu areas up and running */
300 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 337 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
301 for_each_possible_cpu(cpu) { 338 for_each_possible_cpu(cpu) {
302 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; 339 per_cpu_offset(cpu) =
340 delta + pcpu_unit_map[cpu] * pcpu_unit_size;
303 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 341 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
304 per_cpu(cpu_number, cpu) = cpu; 342 per_cpu(cpu_number, cpu) = cpu;
305 setup_percpu_segment(cpu); 343 setup_percpu_segment(cpu);