diff options
author | Tejun Heo <tj@kernel.org> | 2009-07-03 19:11:00 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2009-07-03 19:11:00 -0400 |
commit | 2f39e637ea240efb74cf807d31c93a71a0b89174 (patch) | |
tree | d26bd3ad962031c5b495a528b4115c2ed4ff7a80 /mm/percpu.c | |
parent | ce3141a277ff6cc37e51008b8888dc2cb7456ef1 (diff) |
percpu: allow non-linear / sparse cpu -> unit mapping
Currently cpu and unit are always identity mapped. To allow more
efficient large page support on NUMA and lazy allocation for possible
but offline cpus, cpu -> unit mapping needs to be non-linear and/or
sparse. This can be easily implemented by adding a cpu -> unit
mapping array and using it whenever looking up the matching unit for a
cpu.
The only unusual conversion is in pcpu_chunk_addr_search(). The passed
in address is unit0 based and unit0 might not be in use so it needs to
be converted to address of an in-use unit. This is easily done by
adding the unit offset for the current processor.
[ Impact: allows non-linear/sparse cpu -> unit mapping, no visible change yet ]
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Diffstat (limited to 'mm/percpu.c')
-rw-r--r-- | mm/percpu.c | 129 |
1 files changed, 94 insertions, 35 deletions
diff --git a/mm/percpu.c b/mm/percpu.c index 21756814d99f..2196fae24f00 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -8,12 +8,13 @@ | |||
8 | * | 8 | * |
9 | * This is percpu allocator which can handle both static and dynamic | 9 | * This is percpu allocator which can handle both static and dynamic |
10 | * areas. Percpu areas are allocated in chunks in vmalloc area. Each | 10 | * areas. Percpu areas are allocated in chunks in vmalloc area. Each |
11 | * chunk is consisted of num_possible_cpus() units and the first chunk | 11 | * chunk is consisted of boot-time determined number of units and the |
12 | * is used for static percpu variables in the kernel image (special | 12 | * first chunk is used for static percpu variables in the kernel image |
13 | * boot time alloc/init handling necessary as these areas need to be | 13 | * (special boot time alloc/init handling necessary as these areas |
14 | * brought up before allocation services are running). Unit grows as | 14 | * need to be brought up before allocation services are running). |
15 | * necessary and all units grow or shrink in unison. When a chunk is | 15 | * Unit grows as necessary and all units grow or shrink in unison. |
16 | * filled up, another chunk is allocated. ie. in vmalloc area | 16 | * When a chunk is filled up, another chunk is allocated. ie. in |
17 | * vmalloc area | ||
17 | * | 18 | * |
18 | * c0 c1 c2 | 19 | * c0 c1 c2 |
19 | * ------------------- ------------------- ------------ | 20 | * ------------------- ------------------- ------------ |
@@ -22,11 +23,13 @@ | |||
22 | * | 23 | * |
23 | * Allocation is done in offset-size areas of single unit space. Ie, | 24 | * Allocation is done in offset-size areas of single unit space. Ie, |
24 | * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, | 25 | * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, |
25 | * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring | 26 | * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to |
26 | * percpu base registers pcpu_unit_size apart. | 27 | * cpus. On NUMA, the mapping can be non-linear and even sparse. |
28 | * Percpu access can be done by configuring percpu base registers | ||
29 | * according to cpu to unit mapping and pcpu_unit_size. | ||
27 | * | 30 | * |
28 | * There are usually many small percpu allocations many of them as | 31 | * There are usually many small percpu allocations many of them being |
29 | * small as 4 bytes. The allocator organizes chunks into lists | 32 | * as small as 4 bytes. The allocator organizes chunks into lists |
30 | * according to free size and tries to allocate from the fullest one. | 33 | * according to free size and tries to allocate from the fullest one. |
31 | * Each chunk keeps the maximum contiguous area size hint which is | 34 | * Each chunk keeps the maximum contiguous area size hint which is |
32 | * guaranteed to be eqaul to or larger than the maximum contiguous | 35 | * guaranteed to be eqaul to or larger than the maximum contiguous |
@@ -99,14 +102,22 @@ struct pcpu_chunk { | |||
99 | 102 | ||
100 | static int pcpu_unit_pages __read_mostly; | 103 | static int pcpu_unit_pages __read_mostly; |
101 | static int pcpu_unit_size __read_mostly; | 104 | static int pcpu_unit_size __read_mostly; |
105 | static int pcpu_nr_units __read_mostly; | ||
102 | static int pcpu_chunk_size __read_mostly; | 106 | static int pcpu_chunk_size __read_mostly; |
103 | static int pcpu_nr_slots __read_mostly; | 107 | static int pcpu_nr_slots __read_mostly; |
104 | static size_t pcpu_chunk_struct_size __read_mostly; | 108 | static size_t pcpu_chunk_struct_size __read_mostly; |
105 | 109 | ||
110 | /* cpus with the lowest and highest unit numbers */ | ||
111 | static unsigned int pcpu_first_unit_cpu __read_mostly; | ||
112 | static unsigned int pcpu_last_unit_cpu __read_mostly; | ||
113 | |||
106 | /* the address of the first chunk which starts with the kernel static area */ | 114 | /* the address of the first chunk which starts with the kernel static area */ |
107 | void *pcpu_base_addr __read_mostly; | 115 | void *pcpu_base_addr __read_mostly; |
108 | EXPORT_SYMBOL_GPL(pcpu_base_addr); | 116 | EXPORT_SYMBOL_GPL(pcpu_base_addr); |
109 | 117 | ||
118 | /* cpu -> unit map */ | ||
119 | const int *pcpu_unit_map __read_mostly; | ||
120 | |||
110 | /* | 121 | /* |
111 | * The first chunk which always exists. Note that unlike other | 122 | * The first chunk which always exists. Note that unlike other |
112 | * chunks, this one can be allocated and mapped in several different | 123 | * chunks, this one can be allocated and mapped in several different |
@@ -177,7 +188,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) | |||
177 | 188 | ||
178 | static int pcpu_page_idx(unsigned int cpu, int page_idx) | 189 | static int pcpu_page_idx(unsigned int cpu, int page_idx) |
179 | { | 190 | { |
180 | return cpu * pcpu_unit_pages + page_idx; | 191 | return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; |
181 | } | 192 | } |
182 | 193 | ||
183 | static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, | 194 | static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, |
@@ -321,6 +332,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) | |||
321 | return pcpu_first_chunk; | 332 | return pcpu_first_chunk; |
322 | } | 333 | } |
323 | 334 | ||
335 | /* | ||
336 | * The address is relative to unit0 which might be unused and | ||
337 | * thus unmapped. Offset the address to the unit space of the | ||
338 | * current processor before looking it up in the vmalloc | ||
339 | * space. Note that any possible cpu id can be used here, so | ||
340 | * there's no need to worry about preemption or cpu hotplug. | ||
341 | */ | ||
342 | addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size; | ||
324 | return pcpu_get_page_chunk(vmalloc_to_page(addr)); | 343 | return pcpu_get_page_chunk(vmalloc_to_page(addr)); |
325 | } | 344 | } |
326 | 345 | ||
@@ -593,8 +612,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | |||
593 | { | 612 | { |
594 | static struct page **pages; | 613 | static struct page **pages; |
595 | static unsigned long *bitmap; | 614 | static unsigned long *bitmap; |
596 | size_t pages_size = num_possible_cpus() * pcpu_unit_pages * | 615 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); |
597 | sizeof(pages[0]); | ||
598 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * | 616 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * |
599 | sizeof(unsigned long); | 617 | sizeof(unsigned long); |
600 | 618 | ||
@@ -692,10 +710,9 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, | |||
692 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, | 710 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, |
693 | int page_start, int page_end) | 711 | int page_start, int page_end) |
694 | { | 712 | { |
695 | unsigned int last = num_possible_cpus() - 1; | 713 | flush_cache_vunmap( |
696 | 714 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | |
697 | flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), | 715 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); |
698 | pcpu_chunk_addr(chunk, last, page_end)); | ||
699 | } | 716 | } |
700 | 717 | ||
701 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | 718 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) |
@@ -756,10 +773,9 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | |||
756 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, | 773 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, |
757 | int page_start, int page_end) | 774 | int page_start, int page_end) |
758 | { | 775 | { |
759 | unsigned int last = num_possible_cpus() - 1; | 776 | flush_tlb_kernel_range( |
760 | 777 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | |
761 | flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), | 778 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); |
762 | pcpu_chunk_addr(chunk, last, page_end)); | ||
763 | } | 779 | } |
764 | 780 | ||
765 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, | 781 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, |
@@ -835,11 +851,9 @@ err: | |||
835 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | 851 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, |
836 | int page_start, int page_end) | 852 | int page_start, int page_end) |
837 | { | 853 | { |
838 | unsigned int last = num_possible_cpus() - 1; | 854 | flush_cache_vmap( |
839 | 855 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | |
840 | /* flush at once, please read comments in pcpu_unmap() */ | 856 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); |
841 | flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), | ||
842 | pcpu_chunk_addr(chunk, last, page_end)); | ||
843 | } | 857 | } |
844 | 858 | ||
845 | /** | 859 | /** |
@@ -953,8 +967,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
953 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | 967 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); |
954 | clear: | 968 | clear: |
955 | for_each_possible_cpu(cpu) | 969 | for_each_possible_cpu(cpu) |
956 | memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, | 970 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); |
957 | size); | ||
958 | return 0; | 971 | return 0; |
959 | 972 | ||
960 | err_unmap: | 973 | err_unmap: |
@@ -1088,6 +1101,7 @@ area_found: | |||
1088 | 1101 | ||
1089 | mutex_unlock(&pcpu_alloc_mutex); | 1102 | mutex_unlock(&pcpu_alloc_mutex); |
1090 | 1103 | ||
1104 | /* return address relative to unit0 */ | ||
1091 | return __addr_to_pcpu_ptr(chunk->vm->addr + off); | 1105 | return __addr_to_pcpu_ptr(chunk->vm->addr + off); |
1092 | 1106 | ||
1093 | fail_unlock: | 1107 | fail_unlock: |
@@ -1222,6 +1236,7 @@ EXPORT_SYMBOL_GPL(free_percpu); | |||
1222 | * @dyn_size: free size for dynamic allocation in bytes, -1 for auto | 1236 | * @dyn_size: free size for dynamic allocation in bytes, -1 for auto |
1223 | * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE | 1237 | * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE |
1224 | * @base_addr: mapped address | 1238 | * @base_addr: mapped address |
1239 | * @unit_map: cpu -> unit map, NULL for sequential mapping | ||
1225 | * | 1240 | * |
1226 | * Initialize the first percpu chunk which contains the kernel static | 1241 | * Initialize the first percpu chunk which contains the kernel static |
1227 | * perpcu area. This function is to be called from arch percpu area | 1242 | * perpcu area. This function is to be called from arch percpu area |
@@ -1260,16 +1275,17 @@ EXPORT_SYMBOL_GPL(free_percpu); | |||
1260 | */ | 1275 | */ |
1261 | size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, | 1276 | size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, |
1262 | ssize_t dyn_size, size_t unit_size, | 1277 | ssize_t dyn_size, size_t unit_size, |
1263 | void *base_addr) | 1278 | void *base_addr, const int *unit_map) |
1264 | { | 1279 | { |
1265 | static struct vm_struct first_vm; | 1280 | static struct vm_struct first_vm; |
1266 | static int smap[2], dmap[2]; | 1281 | static int smap[2], dmap[2]; |
1267 | size_t size_sum = static_size + reserved_size + | 1282 | size_t size_sum = static_size + reserved_size + |
1268 | (dyn_size >= 0 ? dyn_size : 0); | 1283 | (dyn_size >= 0 ? dyn_size : 0); |
1269 | struct pcpu_chunk *schunk, *dchunk = NULL; | 1284 | struct pcpu_chunk *schunk, *dchunk = NULL; |
1285 | unsigned int cpu, tcpu; | ||
1270 | int i; | 1286 | int i; |
1271 | 1287 | ||
1272 | /* santiy checks */ | 1288 | /* sanity checks */ |
1273 | BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || | 1289 | BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || |
1274 | ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); | 1290 | ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); |
1275 | BUG_ON(!static_size); | 1291 | BUG_ON(!static_size); |
@@ -1278,9 +1294,52 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, | |||
1278 | BUG_ON(unit_size & ~PAGE_MASK); | 1294 | BUG_ON(unit_size & ~PAGE_MASK); |
1279 | BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); | 1295 | BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); |
1280 | 1296 | ||
1297 | /* determine number of units and verify and initialize pcpu_unit_map */ | ||
1298 | if (unit_map) { | ||
1299 | int first_unit = INT_MAX, last_unit = INT_MIN; | ||
1300 | |||
1301 | for_each_possible_cpu(cpu) { | ||
1302 | int unit = unit_map[cpu]; | ||
1303 | |||
1304 | BUG_ON(unit < 0); | ||
1305 | for_each_possible_cpu(tcpu) { | ||
1306 | if (tcpu == cpu) | ||
1307 | break; | ||
1308 | /* the mapping should be one-to-one */ | ||
1309 | BUG_ON(unit_map[tcpu] == unit); | ||
1310 | } | ||
1311 | |||
1312 | if (unit < first_unit) { | ||
1313 | pcpu_first_unit_cpu = cpu; | ||
1314 | first_unit = unit; | ||
1315 | } | ||
1316 | if (unit > last_unit) { | ||
1317 | pcpu_last_unit_cpu = cpu; | ||
1318 | last_unit = unit; | ||
1319 | } | ||
1320 | } | ||
1321 | pcpu_nr_units = last_unit + 1; | ||
1322 | pcpu_unit_map = unit_map; | ||
1323 | } else { | ||
1324 | int *identity_map; | ||
1325 | |||
1326 | /* #units == #cpus, identity mapped */ | ||
1327 | identity_map = alloc_bootmem(num_possible_cpus() * | ||
1328 | sizeof(identity_map[0])); | ||
1329 | |||
1330 | for_each_possible_cpu(cpu) | ||
1331 | identity_map[cpu] = cpu; | ||
1332 | |||
1333 | pcpu_first_unit_cpu = 0; | ||
1334 | pcpu_last_unit_cpu = pcpu_nr_units - 1; | ||
1335 | pcpu_nr_units = num_possible_cpus(); | ||
1336 | pcpu_unit_map = identity_map; | ||
1337 | } | ||
1338 | |||
1339 | /* determine basic parameters */ | ||
1281 | pcpu_unit_pages = unit_size >> PAGE_SHIFT; | 1340 | pcpu_unit_pages = unit_size >> PAGE_SHIFT; |
1282 | pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; | 1341 | pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; |
1283 | pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; | 1342 | pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; |
1284 | pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + | 1343 | pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + |
1285 | BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); | 1344 | BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); |
1286 | 1345 | ||
@@ -1349,7 +1408,7 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, | |||
1349 | pcpu_chunk_relocate(pcpu_first_chunk, -1); | 1408 | pcpu_chunk_relocate(pcpu_first_chunk, -1); |
1350 | 1409 | ||
1351 | /* we're done */ | 1410 | /* we're done */ |
1352 | pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); | 1411 | pcpu_base_addr = schunk->vm->addr; |
1353 | return pcpu_unit_size; | 1412 | return pcpu_unit_size; |
1354 | } | 1413 | } |
1355 | 1414 | ||
@@ -1427,7 +1486,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, | |||
1427 | size_sum >> PAGE_SHIFT, base, static_size); | 1486 | size_sum >> PAGE_SHIFT, base, static_size); |
1428 | 1487 | ||
1429 | return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, | 1488 | return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, |
1430 | unit_size, base); | 1489 | unit_size, base, NULL); |
1431 | } | 1490 | } |
1432 | 1491 | ||
1433 | /** | 1492 | /** |
@@ -1519,7 +1578,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, | |||
1519 | unit_pages, static_size); | 1578 | unit_pages, static_size); |
1520 | 1579 | ||
1521 | ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, | 1580 | ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, |
1522 | unit_pages << PAGE_SHIFT, vm.addr); | 1581 | unit_pages << PAGE_SHIFT, vm.addr, NULL); |
1523 | goto out_free_ar; | 1582 | goto out_free_ar; |
1524 | 1583 | ||
1525 | enomem: | 1584 | enomem: |
@@ -1641,7 +1700,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, | |||
1641 | "%zu bytes\n", pcpul_vm.addr, static_size); | 1700 | "%zu bytes\n", pcpul_vm.addr, static_size); |
1642 | 1701 | ||
1643 | ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, | 1702 | ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, |
1644 | pcpul_unit_size, pcpul_vm.addr); | 1703 | pcpul_unit_size, pcpul_vm.addr, NULL); |
1645 | 1704 | ||
1646 | /* sort pcpul_map array for pcpu_lpage_remapped() */ | 1705 | /* sort pcpul_map array for pcpu_lpage_remapped() */ |
1647 | for (i = 0; i < num_possible_cpus() - 1; i++) | 1706 | for (i = 0; i < num_possible_cpus() - 1; i++) |