aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author: Tejun Heo <tj@kernel.org> 2009-07-03 19:11:00 -0400
committer: Tejun Heo <tj@kernel.org> 2009-07-03 19:11:00 -0400
commit: 2f39e637ea240efb74cf807d31c93a71a0b89174 (patch)
tree: d26bd3ad962031c5b495a528b4115c2ed4ff7a80 /mm
parent: ce3141a277ff6cc37e51008b8888dc2cb7456ef1 (diff)
percpu: allow non-linear / sparse cpu -> unit mapping
Currently cpu and unit are always identity mapped. To allow more efficient large page support on NUMA and lazy allocation for possible but offline cpus, cpu -> unit mapping needs to be non-linear and/or sparse. This can be easily implemented by adding a cpu -> unit mapping array and using it whenever looking up the matching unit for a cpu. The only unusual conversion is in pcpu_chunk_addr_search(). The passed-in address is unit0 based and unit0 might not be in use so it needs to be converted to the address of an in-use unit. This is easily done by adding the unit offset for the current processor. [ Impact: allows non-linear/sparse cpu -> unit mapping, no visible change yet ] Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: David Miller <davem@davemloft.net>
Diffstat (limited to 'mm')
-rw-r--r-- mm/percpu.c 129
1 files changed, 94 insertions, 35 deletions
diff --git a/mm/percpu.c b/mm/percpu.c
index 21756814d99f..2196fae24f00 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk is consisted of num_possible_cpus() units and the first chunk 11 * chunk is consisted of boot-time determined number of units and the
12 * is used for static percpu variables in the kernel image (special 12 * first chunk is used for static percpu variables in the kernel image
13 * boot time alloc/init handling necessary as these areas need to be 13 * (special boot time alloc/init handling necessary as these areas
14 * brought up before allocation services are running). Unit grows as 14 * need to be brought up before allocation services are running).
15 * necessary and all units grow or shrink in unison. When a chunk is 15 * Unit grows as necessary and all units grow or shrink in unison.
16 * filled up, another chunk is allocated. ie. in vmalloc area 16 * When a chunk is filled up, another chunk is allocated. ie. in
17 * vmalloc area
17 * 18 *
18 * c0 c1 c2 19 * c0 c1 c2
19 * ------------------- ------------------- ------------ 20 * ------------------- ------------------- ------------
@@ -22,11 +23,13 @@
22 * 23 *
23 * Allocation is done in offset-size areas of single unit space. Ie, 24 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 25 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 26 * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
26 * percpu base registers pcpu_unit_size apart. 27 * cpus. On NUMA, the mapping can be non-linear and even sparse.
28 * Percpu access can be done by configuring percpu base registers
29 * according to cpu to unit mapping and pcpu_unit_size.
27 * 30 *
28 * There are usually many small percpu allocations many of them as 31 * There are usually many small percpu allocations many of them being
29 * small as 4 bytes. The allocator organizes chunks into lists 32 * as small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one. 33 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is 34 * Each chunk keeps the maximum contiguous area size hint which is
32 * guaranteed to be eqaul to or larger than the maximum contiguous 35 * guaranteed to be eqaul to or larger than the maximum contiguous
@@ -99,14 +102,22 @@ struct pcpu_chunk {
99 102
100static int pcpu_unit_pages __read_mostly; 103static int pcpu_unit_pages __read_mostly;
101static int pcpu_unit_size __read_mostly; 104static int pcpu_unit_size __read_mostly;
105static int pcpu_nr_units __read_mostly;
102static int pcpu_chunk_size __read_mostly; 106static int pcpu_chunk_size __read_mostly;
103static int pcpu_nr_slots __read_mostly; 107static int pcpu_nr_slots __read_mostly;
104static size_t pcpu_chunk_struct_size __read_mostly; 108static size_t pcpu_chunk_struct_size __read_mostly;
105 109
110/* cpus with the lowest and highest unit numbers */
111static unsigned int pcpu_first_unit_cpu __read_mostly;
112static unsigned int pcpu_last_unit_cpu __read_mostly;
113
106/* the address of the first chunk which starts with the kernel static area */ 114/* the address of the first chunk which starts with the kernel static area */
107void *pcpu_base_addr __read_mostly; 115void *pcpu_base_addr __read_mostly;
108EXPORT_SYMBOL_GPL(pcpu_base_addr); 116EXPORT_SYMBOL_GPL(pcpu_base_addr);
109 117
118/* cpu -> unit map */
119const int *pcpu_unit_map __read_mostly;
120
110/* 121/*
111 * The first chunk which always exists. Note that unlike other 122 * The first chunk which always exists. Note that unlike other
112 * chunks, this one can be allocated and mapped in several different 123 * chunks, this one can be allocated and mapped in several different
@@ -177,7 +188,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
177 188
178static int pcpu_page_idx(unsigned int cpu, int page_idx) 189static int pcpu_page_idx(unsigned int cpu, int page_idx)
179{ 190{
180 return cpu * pcpu_unit_pages + page_idx; 191 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
181} 192}
182 193
183static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 194static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
@@ -321,6 +332,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
321 return pcpu_first_chunk; 332 return pcpu_first_chunk;
322 } 333 }
323 334
335 /*
336 * The address is relative to unit0 which might be unused and
337 * thus unmapped. Offset the address to the unit space of the
338 * current processor before looking it up in the vmalloc
339 * space. Note that any possible cpu id can be used here, so
340 * there's no need to worry about preemption or cpu hotplug.
341 */
342 addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
324 return pcpu_get_page_chunk(vmalloc_to_page(addr)); 343 return pcpu_get_page_chunk(vmalloc_to_page(addr));
325} 344}
326 345
@@ -593,8 +612,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
593{ 612{
594 static struct page **pages; 613 static struct page **pages;
595 static unsigned long *bitmap; 614 static unsigned long *bitmap;
596 size_t pages_size = num_possible_cpus() * pcpu_unit_pages * 615 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
597 sizeof(pages[0]);
598 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * 616 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
599 sizeof(unsigned long); 617 sizeof(unsigned long);
600 618
@@ -692,10 +710,9 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
692static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, 710static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
693 int page_start, int page_end) 711 int page_start, int page_end)
694{ 712{
695 unsigned int last = num_possible_cpus() - 1; 713 flush_cache_vunmap(
696 714 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
697 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), 715 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
698 pcpu_chunk_addr(chunk, last, page_end));
699} 716}
700 717
701static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) 718static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -756,10 +773,9 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
756static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, 773static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
757 int page_start, int page_end) 774 int page_start, int page_end)
758{ 775{
759 unsigned int last = num_possible_cpus() - 1; 776 flush_tlb_kernel_range(
760 777 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
761 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 778 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
762 pcpu_chunk_addr(chunk, last, page_end));
763} 779}
764 780
765static int __pcpu_map_pages(unsigned long addr, struct page **pages, 781static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -835,11 +851,9 @@ err:
835static void pcpu_post_map_flush(struct pcpu_chunk *chunk, 851static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
836 int page_start, int page_end) 852 int page_start, int page_end)
837{ 853{
838 unsigned int last = num_possible_cpus() - 1; 854 flush_cache_vmap(
839 855 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
840 /* flush at once, please read comments in pcpu_unmap() */ 856 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
841 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
842 pcpu_chunk_addr(chunk, last, page_end));
843} 857}
844 858
845/** 859/**
@@ -953,8 +967,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
953 bitmap_copy(chunk->populated, populated, pcpu_unit_pages); 967 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
954clear: 968clear:
955 for_each_possible_cpu(cpu) 969 for_each_possible_cpu(cpu)
956 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 970 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
957 size);
958 return 0; 971 return 0;
959 972
960err_unmap: 973err_unmap:
@@ -1088,6 +1101,7 @@ area_found:
1088 1101
1089 mutex_unlock(&pcpu_alloc_mutex); 1102 mutex_unlock(&pcpu_alloc_mutex);
1090 1103
1104 /* return address relative to unit0 */
1091 return __addr_to_pcpu_ptr(chunk->vm->addr + off); 1105 return __addr_to_pcpu_ptr(chunk->vm->addr + off);
1092 1106
1093fail_unlock: 1107fail_unlock:
@@ -1222,6 +1236,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
1222 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1236 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1223 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE 1237 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
1224 * @base_addr: mapped address 1238 * @base_addr: mapped address
1239 * @unit_map: cpu -> unit map, NULL for sequential mapping
1225 * 1240 *
1226 * Initialize the first percpu chunk which contains the kernel static 1241 * Initialize the first percpu chunk which contains the kernel static
1227 * perpcu area. This function is to be called from arch percpu area 1242 * perpcu area. This function is to be called from arch percpu area
@@ -1260,16 +1275,17 @@ EXPORT_SYMBOL_GPL(free_percpu);
1260 */ 1275 */
1261size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, 1276size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
1262 ssize_t dyn_size, size_t unit_size, 1277 ssize_t dyn_size, size_t unit_size,
1263 void *base_addr) 1278 void *base_addr, const int *unit_map)
1264{ 1279{
1265 static struct vm_struct first_vm; 1280 static struct vm_struct first_vm;
1266 static int smap[2], dmap[2]; 1281 static int smap[2], dmap[2];
1267 size_t size_sum = static_size + reserved_size + 1282 size_t size_sum = static_size + reserved_size +
1268 (dyn_size >= 0 ? dyn_size : 0); 1283 (dyn_size >= 0 ? dyn_size : 0);
1269 struct pcpu_chunk *schunk, *dchunk = NULL; 1284 struct pcpu_chunk *schunk, *dchunk = NULL;
1285 unsigned int cpu, tcpu;
1270 int i; 1286 int i;
1271 1287
1272 /* santiy checks */ 1288 /* sanity checks */
1273 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1289 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1274 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1290 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1275 BUG_ON(!static_size); 1291 BUG_ON(!static_size);
@@ -1278,9 +1294,52 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
1278 BUG_ON(unit_size & ~PAGE_MASK); 1294 BUG_ON(unit_size & ~PAGE_MASK);
1279 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); 1295 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
1280 1296
1297 /* determine number of units and verify and initialize pcpu_unit_map */
1298 if (unit_map) {
1299 int first_unit = INT_MAX, last_unit = INT_MIN;
1300
1301 for_each_possible_cpu(cpu) {
1302 int unit = unit_map[cpu];
1303
1304 BUG_ON(unit < 0);
1305 for_each_possible_cpu(tcpu) {
1306 if (tcpu == cpu)
1307 break;
1308 /* the mapping should be one-to-one */
1309 BUG_ON(unit_map[tcpu] == unit);
1310 }
1311
1312 if (unit < first_unit) {
1313 pcpu_first_unit_cpu = cpu;
1314 first_unit = unit;
1315 }
1316 if (unit > last_unit) {
1317 pcpu_last_unit_cpu = cpu;
1318 last_unit = unit;
1319 }
1320 }
1321 pcpu_nr_units = last_unit + 1;
1322 pcpu_unit_map = unit_map;
1323 } else {
1324 int *identity_map;
1325
1326 /* #units == #cpus, identity mapped */
1327 identity_map = alloc_bootmem(num_possible_cpus() *
1328 sizeof(identity_map[0]));
1329
1330 for_each_possible_cpu(cpu)
1331 identity_map[cpu] = cpu;
1332
1333 pcpu_first_unit_cpu = 0;
1334 pcpu_last_unit_cpu = pcpu_nr_units - 1;
1335 pcpu_nr_units = num_possible_cpus();
1336 pcpu_unit_map = identity_map;
1337 }
1338
1339 /* determine basic parameters */
1281 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1340 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
1282 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1341 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1283 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; 1342 pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
1284 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + 1343 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1285 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); 1344 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1286 1345
@@ -1349,7 +1408,7 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
1349 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1408 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1350 1409
1351 /* we're done */ 1410 /* we're done */
1352 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1411 pcpu_base_addr = schunk->vm->addr;
1353 return pcpu_unit_size; 1412 return pcpu_unit_size;
1354} 1413}
1355 1414
@@ -1427,7 +1486,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1427 size_sum >> PAGE_SHIFT, base, static_size); 1486 size_sum >> PAGE_SHIFT, base, static_size);
1428 1487
1429 return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, 1488 return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1430 unit_size, base); 1489 unit_size, base, NULL);
1431} 1490}
1432 1491
1433/** 1492/**
@@ -1519,7 +1578,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
1519 unit_pages, static_size); 1578 unit_pages, static_size);
1520 1579
1521 ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, 1580 ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
1522 unit_pages << PAGE_SHIFT, vm.addr); 1581 unit_pages << PAGE_SHIFT, vm.addr, NULL);
1523 goto out_free_ar; 1582 goto out_free_ar;
1524 1583
1525enomem: 1584enomem:
@@ -1641,7 +1700,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
1641 "%zu bytes\n", pcpul_vm.addr, static_size); 1700 "%zu bytes\n", pcpul_vm.addr, static_size);
1642 1701
1643 ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, 1702 ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1644 pcpul_unit_size, pcpul_vm.addr); 1703 pcpul_unit_size, pcpul_vm.addr, NULL);
1645 1704
1646 /* sort pcpul_map array for pcpu_lpage_remapped() */ 1705 /* sort pcpul_map array for pcpu_lpage_remapped() */
1647 for (i = 0; i < num_possible_cpus() - 1; i++) 1706 for (i = 0; i < num_possible_cpus() - 1; i++)