diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/setup_percpu.c | 137 |
1 files changed, 135 insertions, 2 deletions
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index fd4c399675df..2d946a8f78b9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -111,6 +111,133 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, | |||
111 | } | 111 | } |
112 | 112 | ||
113 | /* | 113 | /* |
114 | * Remap allocator | ||
115 | * | ||
116 | * This allocator uses PMD page as unit. A PMD page is allocated for | ||
117 | * each cpu and each is remapped into vmalloc area using PMD mapping. | ||
118 | * As PMD page is quite large, only part of it is used for the first | ||
119 | * chunk. Unused part is returned to the bootmem allocator. | ||
120 | * | ||
121 | * So, the PMD pages are mapped twice - once in the physical mapping | |
122 | * and once in the vmalloc area for the first percpu chunk. The double | |
123 | * mapping adds pressure for one more PMD TLB entry but is still much | |
124 | * better than only using 4k mappings while still being NUMA friendly. | |
125 | */ | ||
126 | #ifdef CONFIG_NEED_MULTIPLE_NODES | ||
127 | static size_t pcpur_size __initdata; | ||
128 | static void **pcpur_ptrs __initdata; | ||
129 | |||
130 | static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) | ||
131 | { | ||
132 | size_t off = (size_t)pageno << PAGE_SHIFT; | ||
133 | |||
134 | if (off >= pcpur_size) | ||
135 | return NULL; | ||
136 | |||
137 | return virt_to_page(pcpur_ptrs[cpu] + off); | ||
138 | } | ||
139 | |||
140 | static ssize_t __init setup_pcpu_remap(size_t static_size) | ||
141 | { | ||
142 | static struct vm_struct vm; | ||
143 | pg_data_t *last; | ||
144 | size_t ptrs_size; | ||
145 | unsigned int cpu; | ||
146 | ssize_t ret; | ||
147 | |||
148 | /* | ||
149 | * If large page isn't supported, there's no benefit in doing | ||
150 | * this. Also, on non-NUMA, embedding is better. | ||
151 | */ | ||
152 | if (!cpu_has_pse || pcpu_need_numa()) | ||
153 | return -EINVAL; | ||
154 | |||
155 | last = NULL; | ||
156 | for_each_possible_cpu(cpu) { | ||
157 | int node = early_cpu_to_node(cpu); | ||
158 | |||
159 | if (node_online(node) && NODE_DATA(node) && | ||
160 | last && last != NODE_DATA(node)) | ||
161 | goto proceed; | ||
162 | |||
163 | last = NODE_DATA(node); | ||
164 | } | ||
165 | return -EINVAL; | ||
166 | |||
167 | proceed: | ||
168 | /* | ||
169 | * Currently supports only single page. Supporting multiple | ||
170 | * pages won't be too difficult if it ever becomes necessary. | ||
171 | */ | ||
172 | pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); | ||
173 | if (pcpur_size > PMD_SIZE) { | ||
174 | pr_warning("PERCPU: static data is larger than large page, " | ||
175 | "can't use large page\n"); | ||
176 | return -EINVAL; | ||
177 | } | ||
178 | |||
179 | /* allocate pointer array and alloc large pages */ | ||
180 | ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); | ||
181 | pcpur_ptrs = alloc_bootmem(ptrs_size); | ||
182 | |||
183 | for_each_possible_cpu(cpu) { | ||
184 | pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); | ||
185 | if (!pcpur_ptrs[cpu]) | ||
186 | goto enomem; | ||
187 | |||
188 | /* | ||
189 | * Only use pcpur_size bytes and give back the rest. | ||
190 | * | ||
191 | * Ingo: The 2MB up-rounding bootmem is needed to make | ||
192 | * sure the partial 2MB page is still fully RAM - it's | ||
193 | * not well-specified to have a PAT-incompatible area | ||
194 | * (unmapped RAM, device memory, etc.) in that hole. | ||
195 | */ | ||
196 | free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), | ||
197 | PMD_SIZE - pcpur_size); | ||
198 | |||
199 | memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); | ||
200 | } | ||
201 | |||
202 | /* allocate address and map */ | ||
203 | vm.flags = VM_ALLOC; | ||
204 | vm.size = num_possible_cpus() * PMD_SIZE; | ||
205 | vm_area_register_early(&vm, PMD_SIZE); | ||
206 | |||
207 | for_each_possible_cpu(cpu) { | ||
208 | pmd_t *pmd; | ||
209 | |||
210 | pmd = populate_extra_pmd((unsigned long)vm.addr | ||
211 | + cpu * PMD_SIZE); | ||
212 | set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), | ||
213 | PAGE_KERNEL_LARGE)); | ||
214 | } | ||
215 | |||
216 | /* we're ready, commit */ | ||
217 | pr_info("PERCPU: Remapped at %p with large pages, static data " | ||
218 | "%zu bytes\n", vm.addr, static_size); | ||
219 | |||
220 | ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, | ||
221 | pcpur_size - static_size, vm.addr, NULL); | ||
222 | goto out_free_ar; | ||
223 | |||
224 | enomem: | ||
225 | for_each_possible_cpu(cpu) | ||
226 | if (pcpur_ptrs[cpu]) | ||
227 | free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); | ||
228 | ret = -ENOMEM; | ||
229 | out_free_ar: | ||
230 | free_bootmem(__pa(pcpur_ptrs), ptrs_size); | ||
231 | return ret; | ||
232 | } | ||
233 | #else | ||
234 | static ssize_t __init setup_pcpu_remap(size_t static_size) | ||
235 | { | ||
236 | return -EINVAL; | ||
237 | } | ||
238 | #endif | ||
239 | |||
240 | /* | ||
114 | * Embedding allocator | 241 | * Embedding allocator |
115 | * | 242 | * |
116 | * The first chunk is sized to just contain the static area plus | 243 | * The first chunk is sized to just contain the static area plus |
@@ -259,8 +386,14 @@ void __init setup_per_cpu_areas(void) | |||
259 | pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", | 386 | pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", |
260 | NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); | 387 | NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); |
261 | 388 | ||
262 | /* allocate percpu area */ | 389 | /* |
263 | ret = setup_pcpu_embed(static_size); | 390 | * Allocate percpu area. If PSE is supported, try to make use |
391 | * of large page mappings. Please read comments on top of | ||
392 | * each allocator for details. | ||
393 | */ | ||
394 | ret = setup_pcpu_remap(static_size); | ||
395 | if (ret < 0) | ||
396 | ret = setup_pcpu_embed(static_size); | ||
264 | if (ret < 0) | 397 | if (ret < 0) |
265 | ret = setup_pcpu_4k(static_size); | 398 | ret = setup_pcpu_4k(static_size); |
266 | if (ret < 0) | 399 | if (ret < 0) |