diff options
author | Tejun Heo <tj@kernel.org> | 2009-07-03 19:10:59 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2009-07-03 19:10:59 -0400 |
commit | 8c4bfc6e8801616ab2e01c38140b2159b388d2ff (patch) | |
tree | e29e8bbfae362362554b870371a6187b41f92d82 /mm | |
parent | 8f05a6a65d944f2fed4eb384fb58aa8c8e5a9bab (diff) |
x86,percpu: generalize lpage first chunk allocator
Generalize and move x86 setup_pcpu_lpage() into
pcpu_lpage_first_chunk(). setup_pcpu_lpage() now is a simple wrapper
around the generalized version. Other than taking size parameters and
using arch supplied callbacks to allocate/free/map memory,
pcpu_lpage_first_chunk() is identical to the original implementation.
This simplifies arch code and will help converting more archs to
dynamic percpu allocator.
While at it, factor out pcpu_calc_fc_sizes() which is common to
pcpu_embed_first_chunk() and pcpu_lpage_first_chunk().
[ Impact: code reorganization and generalization ]
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/percpu.c | 209 |
1 files changed, 205 insertions, 4 deletions
diff --git a/mm/percpu.c b/mm/percpu.c index f3fe7bc7378f..17db527ee2e2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | |||
1190 | return pcpu_unit_size; | 1190 | return pcpu_unit_size; |
1191 | } | 1191 | } |
1192 | 1192 | ||
1193 | static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, | ||
1194 | ssize_t *dyn_sizep) | ||
1195 | { | ||
1196 | size_t size_sum; | ||
1197 | |||
1198 | size_sum = PFN_ALIGN(static_size + reserved_size + | ||
1199 | (*dyn_sizep >= 0 ? *dyn_sizep : 0)); | ||
1200 | if (*dyn_sizep != 0) | ||
1201 | *dyn_sizep = size_sum - static_size - reserved_size; | ||
1202 | |||
1203 | return size_sum; | ||
1204 | } | ||
1205 | |||
1193 | /* | 1206 | /* |
1194 | * Embedding first chunk setup helper. | 1207 | * Embedding first chunk setup helper. |
1195 | */ | 1208 | */ |
@@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, | |||
1241 | unsigned int cpu; | 1254 | unsigned int cpu; |
1242 | 1255 | ||
1243 | /* determine parameters and allocate */ | 1256 | /* determine parameters and allocate */ |
1244 | pcpue_size = PFN_ALIGN(static_size + reserved_size + | 1257 | pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); |
1245 | (dyn_size >= 0 ? dyn_size : 0)); | ||
1246 | if (dyn_size != 0) | ||
1247 | dyn_size = pcpue_size - static_size - reserved_size; | ||
1248 | 1258 | ||
1249 | pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); | 1259 | pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); |
1250 | chunk_size = pcpue_unit_size * num_possible_cpus(); | 1260 | chunk_size = pcpue_unit_size * num_possible_cpus(); |
@@ -1391,6 +1401,197 @@ out_free_ar: | |||
1391 | } | 1401 | } |
1392 | 1402 | ||
1393 | /* | 1403 | /* |
1404 | * Large page remapping first chunk setup helper | ||
1405 | */ | ||
1406 | #ifdef CONFIG_NEED_MULTIPLE_NODES | ||
1407 | struct pcpul_ent { | ||
1408 | unsigned int cpu; | ||
1409 | void *ptr; | ||
1410 | }; | ||
1411 | |||
1412 | static size_t pcpul_size; | ||
1413 | static size_t pcpul_unit_size; | ||
1414 | static struct pcpul_ent *pcpul_map; | ||
1415 | static struct vm_struct pcpul_vm; | ||
1416 | |||
1417 | static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) | ||
1418 | { | ||
1419 | size_t off = (size_t)pageno << PAGE_SHIFT; | ||
1420 | |||
1421 | if (off >= pcpul_size) | ||
1422 | return NULL; | ||
1423 | |||
1424 | return virt_to_page(pcpul_map[cpu].ptr + off); | ||
1425 | } | ||
1426 | |||
1427 | /** | ||
1428 | * pcpu_lpage_first_chunk - remap the first percpu chunk using large page | ||
1429 | * @static_size: the size of static percpu area in bytes | ||
1430 | * @reserved_size: the size of reserved percpu area in bytes | ||
1431 | * @dyn_size: free size for dynamic allocation in bytes, -1 for auto | ||
1432 | * @lpage_size: the size of a large page | ||
1433 | * @alloc_fn: function to allocate percpu lpage, always called with lpage_size | ||
1434 | * @free_fn: function to free percpu memory, @size <= lpage_size | ||
1435 | * @map_fn: function to map percpu lpage, always called with lpage_size | ||
1436 | * | ||
1437 | * This allocator uses large page as unit. A large page is allocated | ||
1438 | * for each cpu and each is remapped into vmalloc area using large | ||
1439 | * page mapping. As large page can be quite large, only part of it is | ||
1440 | * used for the first chunk. Unused part is returned to the bootmem | ||
1441 | * allocator. | ||
1442 | * | ||
1443 | * So, the large pages are mapped twice - once to the physical mapping | |
1444 | * and once to the vmalloc area for the first percpu chunk. The double | |
1445 | * mapping does add one more large TLB entry pressure but still is | ||
1446 | * much better than only using 4k mappings while still being NUMA | ||
1447 | * friendly. | ||
1448 | * | ||
1449 | * RETURNS: | ||
1450 | * The determined pcpu_unit_size which can be used to initialize | ||
1451 | * percpu access on success, -errno on failure. | ||
1452 | */ | ||
1453 | ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, | ||
1454 | ssize_t dyn_size, size_t lpage_size, | ||
1455 | pcpu_fc_alloc_fn_t alloc_fn, | ||
1456 | pcpu_fc_free_fn_t free_fn, | ||
1457 | pcpu_fc_map_fn_t map_fn) | ||
1458 | { | ||
1459 | size_t size_sum; | ||
1460 | size_t map_size; | ||
1461 | unsigned int cpu; | ||
1462 | int i, j; | ||
1463 | ssize_t ret; | ||
1464 | |||
1465 | /* | ||
1466 | * Currently supports only single page. Supporting multiple | ||
1467 | * pages won't be too difficult if it ever becomes necessary. | ||
1468 | */ | ||
1469 | size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); | ||
1470 | |||
1471 | pcpul_unit_size = lpage_size; | ||
1472 | pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); | ||
1473 | if (pcpul_size > pcpul_unit_size) { | ||
1474 | pr_warning("PERCPU: static data is larger than large page, " | ||
1475 | "can't use large page\n"); | ||
1476 | return -EINVAL; | ||
1477 | } | ||
1478 | |||
1479 | /* allocate pointer array and alloc large pages */ | ||
1480 | map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); | ||
1481 | pcpul_map = alloc_bootmem(map_size); | ||
1482 | |||
1483 | for_each_possible_cpu(cpu) { | ||
1484 | void *ptr; | ||
1485 | |||
1486 | ptr = alloc_fn(cpu, lpage_size); | ||
1487 | if (!ptr) { | ||
1488 | pr_warning("PERCPU: failed to allocate large page " | ||
1489 | "for cpu%u\n", cpu); | ||
1490 | goto enomem; | ||
1491 | } | ||
1492 | |||
1493 | /* | ||
1494 | * Only use pcpul_size bytes and give back the rest. | ||
1495 | * | ||
1496 | * Ingo: The lpage_size up-rounding bootmem is needed | ||
1497 | * to make sure the partial lpage is still fully RAM - | ||
1498 | * it's not well-specified to have an incompatible area | |
1499 | * (unmapped RAM, device memory, etc.) in that hole. | ||
1500 | */ | ||
1501 | free_fn(ptr + pcpul_size, lpage_size - pcpul_size); | ||
1502 | |||
1503 | pcpul_map[cpu].cpu = cpu; | ||
1504 | pcpul_map[cpu].ptr = ptr; | ||
1505 | |||
1506 | memcpy(ptr, __per_cpu_load, static_size); | ||
1507 | } | ||
1508 | |||
1509 | /* allocate address and map */ | ||
1510 | pcpul_vm.flags = VM_ALLOC; | ||
1511 | pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; | ||
1512 | vm_area_register_early(&pcpul_vm, pcpul_unit_size); | ||
1513 | |||
1514 | for_each_possible_cpu(cpu) | ||
1515 | map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, | ||
1516 | pcpul_vm.addr + cpu * pcpul_unit_size); | ||
1517 | |||
1518 | /* we're ready, commit */ | ||
1519 | pr_info("PERCPU: Remapped at %p with large pages, static data " | ||
1520 | "%zu bytes\n", pcpul_vm.addr, static_size); | ||
1521 | |||
1522 | ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, | ||
1523 | reserved_size, dyn_size, pcpul_unit_size, | ||
1524 | pcpul_vm.addr, NULL); | ||
1525 | |||
1526 | /* sort pcpul_map array for pcpu_lpage_remapped() */ | ||
1527 | for (i = 0; i < num_possible_cpus() - 1; i++) | ||
1528 | for (j = i + 1; j < num_possible_cpus(); j++) | ||
1529 | if (pcpul_map[i].ptr > pcpul_map[j].ptr) { | ||
1530 | struct pcpul_ent tmp = pcpul_map[i]; | ||
1531 | pcpul_map[i] = pcpul_map[j]; | ||
1532 | pcpul_map[j] = tmp; | ||
1533 | } | ||
1534 | |||
1535 | return ret; | ||
1536 | |||
1537 | enomem: | ||
1538 | for_each_possible_cpu(cpu) | ||
1539 | if (pcpul_map[cpu].ptr) | ||
1540 | free_fn(pcpul_map[cpu].ptr, pcpul_size); | ||
1541 | free_bootmem(__pa(pcpul_map), map_size); | ||
1542 | return -ENOMEM; | ||
1543 | } | ||
1544 | |||
1545 | /** | ||
1546 | * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area | ||
1547 | * @kaddr: the kernel address in question | ||
1548 | * | ||
1549 | * Determine whether @kaddr falls in the pcpul recycled area. This is | ||
1550 | * used by pageattr to detect VM aliases and break up the pcpu large | ||
1551 | * page mapping such that the same physical page is not mapped under | ||
1552 | * different attributes. | ||
1553 | * | ||
1554 | * The recycled area is always at the tail of a partially used large | ||
1555 | * page. | ||
1556 | * | ||
1557 | * RETURNS: | ||
1558 | * Address of corresponding remapped pcpu address if match is found; | ||
1559 | * otherwise, NULL. | ||
1560 | */ | ||
1561 | void *pcpu_lpage_remapped(void *kaddr) | ||
1562 | { | ||
1563 | unsigned long unit_mask = pcpul_unit_size - 1; | ||
1564 | void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); | ||
1565 | unsigned long offset = (unsigned long)kaddr & unit_mask; | ||
1566 | int left = 0, right = num_possible_cpus() - 1; | ||
1567 | int pos; | ||
1568 | |||
1569 | /* pcpul in use at all? */ | ||
1570 | if (!pcpul_map) | ||
1571 | return NULL; | ||
1572 | |||
1573 | /* okay, perform binary search */ | ||
1574 | while (left <= right) { | ||
1575 | pos = (left + right) / 2; | ||
1576 | |||
1577 | if (pcpul_map[pos].ptr < lpage_addr) | ||
1578 | left = pos + 1; | ||
1579 | else if (pcpul_map[pos].ptr > lpage_addr) | ||
1580 | right = pos - 1; | ||
1581 | else { | ||
1582 | /* it shouldn't be in the area for the first chunk */ | ||
1583 | WARN_ON(offset < pcpul_size); | ||
1584 | |||
1585 | return pcpul_vm.addr + | ||
1586 | pcpul_map[pos].cpu * pcpul_unit_size + offset; | ||
1587 | } | ||
1588 | } | ||
1589 | |||
1590 | return NULL; | ||
1591 | } | ||
1592 | #endif | ||
1593 | |||
1594 | /* | ||
1394 | * Generic percpu area setup. | 1595 | * Generic percpu area setup. |
1395 | * | 1596 | * |
1396 | * The embedding helper is used because its behavior closely resembles | 1597 | * The embedding helper is used because its behavior closely resembles |