author     Tejun Heo <tj@kernel.org>  2009-07-03 19:10:59 -0400
committer  Tejun Heo <tj@kernel.org>  2009-07-03 19:10:59 -0400
commit     8c4bfc6e8801616ab2e01c38140b2159b388d2ff (patch)
tree       e29e8bbfae362362554b870371a6187b41f92d82
parent     8f05a6a65d944f2fed4eb384fb58aa8c8e5a9bab (diff)

x86,percpu: generalize lpage first chunk allocator

Generalize and move x86 setup_pcpu_lpage() into pcpu_lpage_first_chunk().
setup_pcpu_lpage() is now a simple wrapper around the generalized version.
Other than taking size parameters and using arch-supplied callbacks to
allocate/free/map memory, pcpu_lpage_first_chunk() is identical to the
original implementation.

This simplifies arch code and will help convert more archs to the dynamic
percpu allocator.

While at it, factor out pcpu_calc_fc_sizes(), which is common to
pcpu_embed_first_chunk() and pcpu_lpage_first_chunk().

[ Impact: code reorganization and generalization ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
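
For orientation, the sketch below condenses what the arch side looks like after
this change, using only names that appear in the x86 hunks of this patch: the
arch keeps a large-page map callback and a thin wrapper, while sizing, bootmem
give-back, remapping and the pcpu_setup_first_chunk() call move into the
generic pcpu_lpage_first_chunk(). This is a simplified illustration rather than
the full patch; it omits the auto-selection heuristics in setup_pcpu_lpage()
and assumes the existing x86 pcpu_fc_alloc()/pcpu_fc_free() bootmem helpers.

/* Sketch only: condensed from the x86 changes in this patch. */
static void __init pcpul_map(void *ptr, size_t size, void *addr)
{
	pmd_t *pmd, pmd_v;

	/* map this cpu's large page into the vmalloc area with a single PMD */
	pmd = populate_extra_pmd((unsigned long)addr);
	pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
	set_pmd(pmd, pmd_v);
}

static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
{
	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;

	/* the generic helper now handles sizing, allocation and remapping */
	return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
				      PMD_SIZE,
				      pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
}
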
Diffstat:
-rw-r--r--  arch/x86/include/asm/percpu.h   |   9
-rw-r--r--  arch/x86/kernel/setup_percpu.c  | 169
-rw-r--r--  arch/x86/mm/pageattr.c          |   1
-rw-r--r--  include/linux/percpu.h          |  27
-rw-r--r--  mm/percpu.c                     | 209
5 files changed, 244 insertions(+), 171 deletions(-)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 103f1ddb0d85..a18c038a3079 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -156,15 +156,6 @@ do { \
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-void *pcpu_lpage_remapped(void *kaddr);
-#else
-static inline void *pcpu_lpage_remapped(void *kaddr)
-{
-	return NULL;
-}
-#endif
-
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ab896b31e80b..4f2e0ac9130b 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,44 +137,21 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
 }
 
 /*
- * Large page remap allocator
- *
- * This allocator uses PMD page as unit. A PMD page is allocated for
- * each cpu and each is remapped into vmalloc area using PMD mapping.
- * As PMD page is quite large, only part of it is used for the first
- * chunk. Unused part is returned to the bootmem allocator.
- *
- * So, the PMD pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk. The double
- * mapping does add one more PMD TLB entry pressure but still is much
- * better than only using 4k mappings while still being NUMA friendly.
+ * Large page remapping allocator
  */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-struct pcpul_ent {
-	unsigned int cpu;
-	void *ptr;
-};
-
-static size_t pcpul_size;
-static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
-
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+static void __init pcpul_map(void *ptr, size_t size, void *addr)
 {
-	size_t off = (size_t)pageno << PAGE_SHIFT;
+	pmd_t *pmd, pmd_v;
 
-	if (off >= pcpul_size)
-		return NULL;
-
-	return virt_to_page(pcpul_map[cpu].ptr + off);
+	pmd = populate_extra_pmd((unsigned long)addr);
+	pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
+	set_pmd(pmd, pmd_v);
 }
 
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 {
-	size_t map_size, dyn_size;
-	unsigned int cpu;
-	int i, j;
-	ssize_t ret;
+	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 
 	if (!chosen) {
 		size_t vm_size = VMALLOC_END - VMALLOC_START;
@@ -198,134 +175,10 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 		return -EINVAL;
 	}
 
-	/*
-	 * Currently supports only single page. Supporting multiple
-	 * pages won't be too difficult if it ever becomes necessary.
-	 */
-	pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
-			       PERCPU_DYNAMIC_RESERVE);
-	if (pcpul_size > PMD_SIZE) {
-		pr_warning("PERCPU: static data is larger than large page, "
-			   "can't use large page\n");
-		return -EINVAL;
-	}
-	dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
-
-	/* allocate pointer array and alloc large pages */
-	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
-	pcpul_map = alloc_bootmem(map_size);
-
-	for_each_possible_cpu(cpu) {
-		pcpul_map[cpu].cpu = cpu;
-		pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
-							PMD_SIZE);
-		if (!pcpul_map[cpu].ptr) {
-			pr_warning("PERCPU: failed to allocate large page "
-				   "for cpu%u\n", cpu);
-			goto enomem;
-		}
-
-		/*
-		 * Only use pcpul_size bytes and give back the rest.
-		 *
-		 * Ingo: The 2MB up-rounding bootmem is needed to make
-		 * sure the partial 2MB page is still fully RAM - it's
-		 * not well-specified to have a PAT-incompatible area
-		 * (unmapped RAM, device memory, etc.) in that hole.
-		 */
-		free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
-			     PMD_SIZE - pcpul_size);
-
-		memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
-	}
-
-	/* allocate address and map */
-	pcpul_vm.flags = VM_ALLOC;
-	pcpul_vm.size = num_possible_cpus() * PMD_SIZE;
-	vm_area_register_early(&pcpul_vm, PMD_SIZE);
-
-	for_each_possible_cpu(cpu) {
-		pmd_t *pmd, pmd_v;
-
-		pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
-					 cpu * PMD_SIZE);
-		pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
-				PAGE_KERNEL_LARGE);
-		set_pmd(pmd, pmd_v);
-	}
-
-	/* we're ready, commit */
-	pr_info("PERCPU: Remapped at %p with large pages, static data "
-		"%zu bytes\n", pcpul_vm.addr, static_size);
-
-	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
-				     PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
-				     PMD_SIZE, pcpul_vm.addr, NULL);
-
-	/* sort pcpul_map array for pcpu_lpage_remapped() */
-	for (i = 0; i < num_possible_cpus() - 1; i++)
-		for (j = i + 1; j < num_possible_cpus(); j++)
-			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
-				struct pcpul_ent tmp = pcpul_map[i];
-				pcpul_map[i] = pcpul_map[j];
-				pcpul_map[j] = tmp;
-			}
-
-	return ret;
-
-enomem:
-	for_each_possible_cpu(cpu)
-		if (pcpul_map[cpu].ptr)
-			free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
-	free_bootmem(__pa(pcpul_map), map_size);
-	return -ENOMEM;
-}
-
-/**
- * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
- * @kaddr: the kernel address in question
- *
- * Determine whether @kaddr falls in the pcpul recycled area. This is
- * used by pageattr to detect VM aliases and break up the pcpu PMD
- * mapping such that the same physical page is not mapped under
- * different attributes.
- *
- * The recycled area is always at the tail of a partially used PMD
- * page.
- *
- * RETURNS:
- * Address of corresponding remapped pcpu address if match is found;
- * otherwise, NULL.
- */
-void *pcpu_lpage_remapped(void *kaddr)
-{
-	void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
-	unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
-	int left = 0, right = num_possible_cpus() - 1;
-	int pos;
-
-	/* pcpul in use at all? */
-	if (!pcpul_map)
-		return NULL;
-
-	/* okay, perform binary search */
-	while (left <= right) {
-		pos = (left + right) / 2;
-
-		if (pcpul_map[pos].ptr < pmd_addr)
-			left = pos + 1;
-		else if (pcpul_map[pos].ptr > pmd_addr)
-			right = pos - 1;
-		else {
-			/* it shouldn't be in the area for the first chunk */
-			WARN_ON(offset < pcpul_size);
-
-			return pcpul_vm.addr +
-				pcpul_map[pos].cpu * PMD_SIZE + offset;
-		}
-	}
-
-	return NULL;
+	return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
+				      PMD_SIZE,
+				      pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
 }
 #else
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1b734d7a8966..c106f7852424 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/pfn.h>
+#include <linux/percpu.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 41b5bfab4195..9f6bfd7d4b92 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -62,6 +62,7 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
+typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				size_t static_size, size_t reserved_size,
@@ -79,6 +80,32 @@ extern ssize_t __init pcpu_4k_first_chunk(
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+extern ssize_t __init pcpu_lpage_first_chunk(
+				size_t static_size, size_t reserved_size,
+				ssize_t dyn_size, size_t lpage_size,
+				pcpu_fc_alloc_fn_t alloc_fn,
+				pcpu_fc_free_fn_t free_fn,
+				pcpu_fc_map_fn_t map_fn);
+
+extern void *pcpu_lpage_remapped(void *kaddr);
+#else
+static inline ssize_t __init pcpu_lpage_first_chunk(
+				size_t static_size, size_t reserved_size,
+				ssize_t dyn_size, size_t lpage_size,
+				pcpu_fc_alloc_fn_t alloc_fn,
+				pcpu_fc_free_fn_t free_fn,
+				pcpu_fc_map_fn_t map_fn)
+{
+	return -EINVAL;
+}
+
+static inline void *pcpu_lpage_remapped(void *kaddr)
+{
+	return NULL;
+}
+#endif
+
 /*
  * Use this to get to a cpu's version of the per-cpu object
  * dynamically allocated. Non-atomic access to the current CPU's
diff --git a/mm/percpu.c b/mm/percpu.c
index f3fe7bc7378f..17db527ee2e2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	return pcpu_unit_size;
 }
 
+static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
+				 ssize_t *dyn_sizep)
+{
+	size_t size_sum;
+
+	size_sum = PFN_ALIGN(static_size + reserved_size +
+			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
+	if (*dyn_sizep != 0)
+		*dyn_sizep = size_sum - static_size - reserved_size;
+
+	return size_sum;
+}
+
 /*
  * Embedding first chunk setup helper.
  */
@@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	unsigned int cpu;
 
 	/* determine parameters and allocate */
-	pcpue_size = PFN_ALIGN(static_size + reserved_size +
-			       (dyn_size >= 0 ? dyn_size : 0));
-	if (dyn_size != 0)
-		dyn_size = pcpue_size - static_size - reserved_size;
+	pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 
 	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 	chunk_size = pcpue_unit_size * num_possible_cpus();
@@ -1391,6 +1401,197 @@ out_free_ar:
 }
 
 /*
+ * Large page remapping first chunk setup helper
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+struct pcpul_ent {
+	unsigned int cpu;
+	void *ptr;
+};
+
+static size_t pcpul_size;
+static size_t pcpul_unit_size;
+static struct pcpul_ent *pcpul_map;
+static struct vm_struct pcpul_vm;
+
+static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+{
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpul_size)
+		return NULL;
+
+	return virt_to_page(pcpul_map[cpu].ptr + off);
+}
+
+/**
+ * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @lpage_size: the size of a large page
+ * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
+ * @free_fn: function to free percpu memory, @size <= lpage_size
+ * @map_fn: function to map percpu lpage, always called with lpage_size
+ *
+ * This allocator uses large page as unit. A large page is allocated
+ * for each cpu and each is remapped into vmalloc area using large
+ * page mapping. As large page can be quite large, only part of it is
+ * used for the first chunk. Unused part is returned to the bootmem
+ * allocator.
+ *
+ * So, the large pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk. The double
+ * mapping does add one more large TLB entry pressure but still is
+ * much better than only using 4k mappings while still being NUMA
+ * friendly.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
+				      ssize_t dyn_size, size_t lpage_size,
+				      pcpu_fc_alloc_fn_t alloc_fn,
+				      pcpu_fc_free_fn_t free_fn,
+				      pcpu_fc_map_fn_t map_fn)
+{
+	size_t size_sum;
+	size_t map_size;
+	unsigned int cpu;
+	int i, j;
+	ssize_t ret;
+
+	/*
+	 * Currently supports only single page. Supporting multiple
+	 * pages won't be too difficult if it ever becomes necessary.
+	 */
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+
+	pcpul_unit_size = lpage_size;
+	pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+	if (pcpul_size > pcpul_unit_size) {
+		pr_warning("PERCPU: static data is larger than large page, "
+			   "can't use large page\n");
+		return -EINVAL;
+	}
+
+	/* allocate pointer array and alloc large pages */
+	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
+	pcpul_map = alloc_bootmem(map_size);
+
+	for_each_possible_cpu(cpu) {
+		void *ptr;
+
+		ptr = alloc_fn(cpu, lpage_size);
+		if (!ptr) {
+			pr_warning("PERCPU: failed to allocate large page "
+				   "for cpu%u\n", cpu);
+			goto enomem;
+		}
+
+		/*
+		 * Only use pcpul_size bytes and give back the rest.
+		 *
+		 * Ingo: The lpage_size up-rounding bootmem is needed
+		 * to make sure the partial lpage is still fully RAM -
+		 * it's not well-specified to have a incompatible area
+		 * (unmapped RAM, device memory, etc.) in that hole.
+		 */
+		free_fn(ptr + pcpul_size, lpage_size - pcpul_size);
+
+		pcpul_map[cpu].cpu = cpu;
+		pcpul_map[cpu].ptr = ptr;
+
+		memcpy(ptr, __per_cpu_load, static_size);
+	}
+
+	/* allocate address and map */
+	pcpul_vm.flags = VM_ALLOC;
+	pcpul_vm.size = num_possible_cpus() * pcpul_unit_size;
+	vm_area_register_early(&pcpul_vm, pcpul_unit_size);
+
+	for_each_possible_cpu(cpu)
+		map_fn(pcpul_map[cpu].ptr, pcpul_unit_size,
+		       pcpul_vm.addr + cpu * pcpul_unit_size);
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Remapped at %p with large pages, static data "
+		"%zu bytes\n", pcpul_vm.addr, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
+				     reserved_size, dyn_size, pcpul_unit_size,
+				     pcpul_vm.addr, NULL);
+
+	/* sort pcpul_map array for pcpu_lpage_remapped() */
+	for (i = 0; i < num_possible_cpus() - 1; i++)
+		for (j = i + 1; j < num_possible_cpus(); j++)
+			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
+				struct pcpul_ent tmp = pcpul_map[i];
+				pcpul_map[i] = pcpul_map[j];
+				pcpul_map[j] = tmp;
+			}
+
+	return ret;
+
+enomem:
+	for_each_possible_cpu(cpu)
+		if (pcpul_map[cpu].ptr)
+			free_fn(pcpul_map[cpu].ptr, pcpul_size);
+	free_bootmem(__pa(pcpul_map), map_size);
+	return -ENOMEM;
+}
+
+/**
+ * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
+ * @kaddr: the kernel address in question
+ *
+ * Determine whether @kaddr falls in the pcpul recycled area. This is
+ * used by pageattr to detect VM aliases and break up the pcpu large
+ * page mapping such that the same physical page is not mapped under
+ * different attributes.
+ *
+ * The recycled area is always at the tail of a partially used large
+ * page.
+ *
+ * RETURNS:
+ * Address of corresponding remapped pcpu address if match is found;
+ * otherwise, NULL.
+ */
+void *pcpu_lpage_remapped(void *kaddr)
+{
+	unsigned long unit_mask = pcpul_unit_size - 1;
+	void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask);
+	unsigned long offset = (unsigned long)kaddr & unit_mask;
+	int left = 0, right = num_possible_cpus() - 1;
+	int pos;
+
+	/* pcpul in use at all? */
+	if (!pcpul_map)
+		return NULL;
+
+	/* okay, perform binary search */
+	while (left <= right) {
+		pos = (left + right) / 2;
+
+		if (pcpul_map[pos].ptr < lpage_addr)
+			left = pos + 1;
+		else if (pcpul_map[pos].ptr > lpage_addr)
+			right = pos - 1;
+		else {
+			/* it shouldn't be in the area for the first chunk */
+			WARN_ON(offset < pcpul_size);
+
+			return pcpul_vm.addr +
+				pcpul_map[pos].cpu * pcpul_unit_size + offset;
+		}
+	}
+
+	return NULL;
+}
+#endif
+
+/*
  * Generic percpu area setup.
  *
  * The embedding helper is used because its behavior closely resembles