author    Linus Torvalds <torvalds@linux-foundation.org>  2017-09-07 00:33:12 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-09-07 00:33:12 -0400
commit    a7cbfd05f427f8f1164bc53866971e89a0cbe103 (patch)
tree      05d68a80d4d6635800a53e60e2638ebebc7a5761
parent    d34fc1adf01ff87026da85fb972dc259dc347540 (diff)
parent    5e81ee3e6a79cc9fa85af5c3db0f1f269709bbf1 (diff)
Merge branch 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
Pull percpu updates from Tejun Heo:
 "A lot of changes for percpu this time around. percpu inherited the
  same area allocator from the original pre-virtual-address-mapped
  implementation. This was from a time when the percpu allocator wasn't
  used all that much and the implementation was focused on simplicity,
  with the unfortunate computational complexity of O(number of areas
  allocated from the chunk) per alloc / free.

  With the increase in percpu usage, we're hitting cases where the lack
  of scalability is hurting. The most prominent one right now is bpf
  percpu map creation / destruction, which may allocate and free a lot
  of entries consecutively, and it's likely that the problem will become
  more prominent in the future.

  To address the issue, Dennis replaced the area allocator with a hinted
  bitmap allocator, which is more consistent. While the new allocator
  does perform a bit worse in some cases, it outperforms the old
  allocator by more than an order of magnitude in other, more common
  scenarios while staying mostly flat in CPU overhead and completely
  flat in memory consumption"

* 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (27 commits)
  percpu: update header to contain bitmap allocator explanation.
  percpu: update pcpu_find_block_fit to use an iterator
  percpu: use metadata blocks to update the chunk contig hint
  percpu: update free path to take advantage of contig hints
  percpu: update alloc path to only scan if contig hints are broken
  percpu: keep track of the best offset for contig hints
  percpu: skip chunks if the alloc does not fit in the contig hint
  percpu: add first_bit to keep track of the first free in the bitmap
  percpu: introduce bitmap metadata blocks
  percpu: replace area map allocator with bitmap
  percpu: generalize bitmap (un)populated iterators
  percpu: increase minimum percpu allocation size and align first regions
  percpu: introduce nr_empty_pop_pages to help empty page accounting
  percpu: change the number of pages marked in the first_chunk pop bitmap
  percpu: combine percpu address checks
  percpu: modify base_addr to be region specific
  percpu: setup_first_chunk rename schunk/dchunk to chunk
  percpu: end chunk area maps page aligned for the populated bitmap
  percpu: unify allocation of schunk and dchunk
  percpu: setup_first_chunk remove dyn_size and consolidate logic
  ...
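The core idea described above, keeping allocation state in a per-chunk bitmap and caching a "largest contiguous free run" hint so hopeless scans can be skipped, can be illustrated with a small user-space sketch. This is an illustrative toy with made-up names and sizes (toy_chunk, NR_BITS, MIN_ALLOC), not the kernel's implementation; the real allocator additionally splits the bitmap into per-page metadata blocks and maintains boundary and populated maps.

/*
 * Toy hinted bitmap allocator: one "bit" per MIN_ALLOC-byte fragment,
 * stored one byte per bit for simplicity, plus a cached contig hint.
 */
#include <stdio.h>

#define NR_BITS   256			/* fragments tracked by this toy chunk */
#define MIN_ALLOC 4			/* bytes represented by one bit */

struct toy_chunk {
	unsigned char alloc_map[NR_BITS];	/* 1 = fragment in use */
	int contig_hint;			/* >= largest free run, in bits */
};

/* recompute the largest contiguous free run (the "hint") */
static void refresh_hint(struct toy_chunk *c)
{
	int run = 0, best = 0;

	for (int i = 0; i < NR_BITS; i++) {
		run = c->alloc_map[i] ? 0 : run + 1;
		if (run > best)
			best = run;
	}
	c->contig_hint = best;
}

/* allocate @bytes, returning the byte offset into the chunk or -1 */
static int toy_alloc(struct toy_chunk *c, int bytes)
{
	int bits = (bytes + MIN_ALLOC - 1) / MIN_ALLOC;

	/* hint check: skip the scan entirely if the request cannot fit */
	if (bits > c->contig_hint)
		return -1;

	for (int start = 0; start + bits <= NR_BITS; start++) {
		int len = 0;

		while (len < bits && !c->alloc_map[start + len])
			len++;
		if (len == bits) {
			for (int i = 0; i < bits; i++)
				c->alloc_map[start + i] = 1;
			refresh_hint(c);
			return start * MIN_ALLOC;
		}
		start += len;		/* jump past the occupied fragment */
	}
	return -1;
}

int main(void)
{
	struct toy_chunk c = { .contig_hint = NR_BITS };

	printf("off=%d\n", toy_alloc(&c, 64));		/* 0 */
	printf("off=%d\n", toy_alloc(&c, 12));		/* 64 */
	printf("off=%d\n", toy_alloc(&c, 4096));	/* -1, rejected by the hint */
	return 0;
}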
-rw-r--r--  include/linux/percpu.h |   20
-rw-r--r--  init/main.c            |    1
-rw-r--r--  mm/percpu-internal.h   |   82
-rw-r--r--  mm/percpu-km.c         |    2
-rw-r--r--  mm/percpu-stats.c      |  111
-rw-r--r--  mm/percpu.c            | 1522
6 files changed, 1112 insertions, 626 deletions
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 491b3f5a5f8a..6a5fb939d3e5 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -21,6 +21,25 @@
 /* minimum unit size, also is the maximum supported allocation size */
 #define PCPU_MIN_UNIT_SIZE		PFN_ALIGN(32 << 10)
 
+/* minimum allocation size and shift in bytes */
+#define PCPU_MIN_ALLOC_SHIFT		2
+#define PCPU_MIN_ALLOC_SIZE		(1 << PCPU_MIN_ALLOC_SHIFT)
+
+/* number of bits per page, used to trigger a scan if blocks are > PAGE_SIZE */
+#define PCPU_BITS_PER_PAGE		(PAGE_SIZE >> PCPU_MIN_ALLOC_SHIFT)
+
+/*
+ * This determines the size of each metadata block.  There are several subtle
+ * constraints around this constant.  The reserved region must be a multiple of
+ * PCPU_BITMAP_BLOCK_SIZE.  Additionally, PCPU_BITMAP_BLOCK_SIZE must be a
+ * multiple of PAGE_SIZE or PAGE_SIZE must be a multiple of
+ * PCPU_BITMAP_BLOCK_SIZE to align with the populated page map.  The unit_size
+ * also has to be a multiple of PCPU_BITMAP_BLOCK_SIZE to ensure full blocks.
+ */
+#define PCPU_BITMAP_BLOCK_SIZE		PAGE_SIZE
+#define PCPU_BITMAP_BLOCK_BITS		(PCPU_BITMAP_BLOCK_SIZE >>	\
+					 PCPU_MIN_ALLOC_SHIFT)
+
 /*
  * Percpu allocator can serve percpu allocations before slab is
  * initialized which allows slab to depend on the percpu allocator.
@@ -116,7 +135,6 @@ extern bool is_kernel_percpu_address(unsigned long addr);
 #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
 extern void __init setup_per_cpu_areas(void);
 #endif
-extern void __init percpu_init_late(void);
 
 extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp);
 extern void __percpu *__alloc_percpu(size_t size, size_t align);
diff --git a/init/main.c b/init/main.c
index a21a1a8708a8..949306bb5b6a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -501,7 +501,6 @@ static void __init mm_init(void)
 	page_ext_init_flatmem();
 	mem_init();
 	kmem_cache_init();
-	percpu_init_late();
 	pgtable_init();
 	vmalloc_init();
 	ioremap_huge_init();
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index cd2442e13d8f..7065faf74b46 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -4,6 +4,22 @@
 #include <linux/types.h>
 #include <linux/percpu.h>
 
+/*
+ * pcpu_block_md is the metadata block struct.
+ * Each chunk's bitmap is split into a number of full blocks.
+ * All units are in terms of bits.
+ */
+struct pcpu_block_md {
+	int			contig_hint;	/* contig hint for block */
+	int			contig_hint_start; /* block relative starting
+						      position of the contig hint */
+	int			left_free;	/* size of free space along
+						   the left side of the block */
+	int			right_free;	/* size of free space along
+						   the right side of the block */
+	int			first_free;	/* block position of first free */
+};
+
 struct pcpu_chunk {
 #ifdef CONFIG_PERCPU_STATS
 	int			nr_alloc;	/* # of allocations */
@@ -11,24 +27,29 @@ struct pcpu_chunk {
 #endif
 
 	struct list_head	list;		/* linked to pcpu_slot lists */
-	int			free_size;	/* free bytes in the chunk */
-	int			contig_hint;	/* max contiguous size hint */
+	int			free_bytes;	/* free bytes in the chunk */
+	int			contig_bits;	/* max contiguous size hint */
+	int			contig_bits_start; /* contig_bits starting
+						      offset */
 	void			*base_addr;	/* base address of this chunk */
 
-	int			map_used;	/* # of map entries used before the sentry */
-	int			map_alloc;	/* # of map entries allocated */
-	int			*map;		/* allocation map */
-	struct list_head	map_extend_list;/* on pcpu_map_extend_chunks */
+	unsigned long		*alloc_map;	/* allocation map */
+	unsigned long		*bound_map;	/* boundary map */
+	struct pcpu_block_md	*md_blocks;	/* metadata blocks */
 
 	void			*data;		/* chunk data */
-	int			first_free;	/* no free below this */
+	int			first_bit;	/* no free below this */
 	bool			immutable;	/* no [de]population allowed */
-	bool			has_reserved;	/* Indicates if chunk has reserved space
-						   at the beginning. Reserved chunk will
-						   contain reservation for static chunk.
-						   Dynamic chunk will contain reservation
-						   for static and reserved chunks. */
+	int			start_offset;	/* the overlap with the previous
+						   region to have a page aligned
+						   base_addr */
+	int			end_offset;	/* additional area required to
+						   have the region end page
+						   aligned */
+
+	int			nr_pages;	/* # of pages served by this chunk */
 	int			nr_populated;	/* # of populated pages */
+	int			nr_empty_pop_pages; /* # of empty populated pages */
 	unsigned long		populated[];	/* populated bitmap */
 };
 
@@ -36,10 +57,47 @@ extern spinlock_t pcpu_lock;
 
 extern struct list_head *pcpu_slot;
 extern int pcpu_nr_slots;
+extern int pcpu_nr_empty_pop_pages;
 
 extern struct pcpu_chunk *pcpu_first_chunk;
 extern struct pcpu_chunk *pcpu_reserved_chunk;
 
+/**
+ * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bitmap blocks used.
+ */
+static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
+{
+	return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
+}
+
+/**
+ * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
+ * @pages: number of physical pages
+ *
+ * This conversion is from physical pages to the number of bits
+ * required in the bitmap.
+ */
+static inline int pcpu_nr_pages_to_map_bits(int pages)
+{
+	return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+}
+
+/**
+ * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bits in the bitmap.
+ */
+static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
+{
+	return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
+}
+
 #ifdef CONFIG_PERCPU_STATS
 
 #include <linux/spinlock.h>
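The constants added to include/linux/percpu.h and the conversion helpers above fix the granularity of the bitmap: one bit per PCPU_MIN_ALLOC_SIZE bytes, grouped into PAGE_SIZE-sized metadata blocks. The following is a small stand-alone sketch of that arithmetic, assuming a 4 KiB PAGE_SIZE; it mirrors the helpers but is not taken from the kernel tree.

/* Worked example of the byte/bit/block conversions (illustrative only). */
#include <stdio.h>

#define PAGE_SIZE		4096	/* assumed: x86-64 default */
#define PCPU_MIN_ALLOC_SHIFT	2
#define PCPU_MIN_ALLOC_SIZE	(1 << PCPU_MIN_ALLOC_SHIFT)	/* 4 bytes per bit */
#define PCPU_BITS_PER_PAGE	(PAGE_SIZE >> PCPU_MIN_ALLOC_SHIFT)
#define PCPU_BITMAP_BLOCK_SIZE	PAGE_SIZE
#define PCPU_BITMAP_BLOCK_BITS	(PCPU_BITMAP_BLOCK_SIZE >> PCPU_MIN_ALLOC_SHIFT)

int main(void)
{
	int nr_pages = 8;	/* a hypothetical chunk spanning 8 pages */

	printf("bits per page:   %d\n", PCPU_BITS_PER_PAGE);		/* 1024 */
	printf("bits per block:  %d\n", PCPU_BITMAP_BLOCK_BITS);	/* 1024 */
	printf("blocks in chunk: %d\n",
	       nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE);		/* 8, cf. pcpu_chunk_nr_blocks() */
	printf("bits in chunk:   %d\n",
	       nr_pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE);		/* 8192, cf. pcpu_chunk_map_bits() */
	return 0;
}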
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index eb58aa4c0997..d2a76642c4ae 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
 	chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
 
 	spin_lock_irq(&pcpu_lock);
-	pcpu_chunk_populated(chunk, 0, nr_pages);
+	pcpu_chunk_populated(chunk, 0, nr_pages, false);
 	spin_unlock_irq(&pcpu_lock);
 
 	pcpu_stats_chunk_alloc();
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 03524a56eeff..6142484e88f7 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -18,7 +18,7 @@
18#include "percpu-internal.h" 18#include "percpu-internal.h"
19 19
20#define P(X, Y) \ 20#define P(X, Y) \
21 seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y) 21 seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y)
22 22
23struct percpu_stats pcpu_stats; 23struct percpu_stats pcpu_stats;
24struct pcpu_alloc_info pcpu_stats_ai; 24struct pcpu_alloc_info pcpu_stats_ai;
@@ -29,64 +29,85 @@ static int cmpint(const void *a, const void *b)
29} 29}
30 30
31/* 31/*
32 * Iterates over all chunks to find the max # of map entries used. 32 * Iterates over all chunks to find the max nr_alloc entries.
33 */ 33 */
34static int find_max_map_used(void) 34static int find_max_nr_alloc(void)
35{ 35{
36 struct pcpu_chunk *chunk; 36 struct pcpu_chunk *chunk;
37 int slot, max_map_used; 37 int slot, max_nr_alloc;
38 38
39 max_map_used = 0; 39 max_nr_alloc = 0;
40 for (slot = 0; slot < pcpu_nr_slots; slot++) 40 for (slot = 0; slot < pcpu_nr_slots; slot++)
41 list_for_each_entry(chunk, &pcpu_slot[slot], list) 41 list_for_each_entry(chunk, &pcpu_slot[slot], list)
42 max_map_used = max(max_map_used, chunk->map_used); 42 max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
43 43
44 return max_map_used; 44 return max_nr_alloc;
45} 45}
46 46
47/* 47/*
48 * Prints out chunk state. Fragmentation is considered between 48 * Prints out chunk state. Fragmentation is considered between
49 * the beginning of the chunk to the last allocation. 49 * the beginning of the chunk to the last allocation.
50 *
51 * All statistics are in bytes unless stated otherwise.
50 */ 52 */
51static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, 53static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
52 void *buffer) 54 int *buffer)
53{ 55{
54 int i, s_index, last_alloc, alloc_sign, as_len; 56 int i, last_alloc, as_len, start, end;
55 int *alloc_sizes, *p; 57 int *alloc_sizes, *p;
56 /* statistics */ 58 /* statistics */
57 int sum_frag = 0, max_frag = 0; 59 int sum_frag = 0, max_frag = 0;
58 int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; 60 int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
59 61
60 alloc_sizes = buffer; 62 alloc_sizes = buffer;
61 s_index = chunk->has_reserved ? 1 : 0;
62
63 /* find last allocation */
64 last_alloc = -1;
65 for (i = chunk->map_used - 1; i >= s_index; i--) {
66 if (chunk->map[i] & 1) {
67 last_alloc = i;
68 break;
69 }
70 }
71 63
72 /* if the chunk is not empty - ignoring reserve */ 64 /*
73 if (last_alloc >= s_index) { 65 * find_last_bit returns the start value if nothing found.
74 as_len = last_alloc + 1 - s_index; 66 * Therefore, we must determine if it is a failure of find_last_bit
75 67 * and set the appropriate value.
76 /* 68 */
77 * Iterate through chunk map computing size info. 69 last_alloc = find_last_bit(chunk->alloc_map,
78 * The first bit is overloaded to be a used flag. 70 pcpu_chunk_map_bits(chunk) -
79 * negative = free space, positive = allocated 71 chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1);
80 */ 72 last_alloc = test_bit(last_alloc, chunk->alloc_map) ?
81 for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) { 73 last_alloc + 1 : 0;
82 alloc_sign = (*p & 1) ? 1 : -1; 74
83 alloc_sizes[i] = alloc_sign * 75 as_len = 0;
84 ((p[1] & ~1) - (p[0] & ~1)); 76 start = chunk->start_offset;
77
78 /*
79 * If a bit is set in the allocation map, the bound_map identifies
80 * where the allocation ends. If the allocation is not set, the
81 * bound_map does not identify free areas as it is only kept accurate
82 * on allocation, not free.
83 *
84 * Positive values are allocations and negative values are free
85 * fragments.
86 */
87 while (start < last_alloc) {
88 if (test_bit(start, chunk->alloc_map)) {
89 end = find_next_bit(chunk->bound_map, last_alloc,
90 start + 1);
91 alloc_sizes[as_len] = 1;
92 } else {
93 end = find_next_bit(chunk->alloc_map, last_alloc,
94 start + 1);
95 alloc_sizes[as_len] = -1;
85 } 96 }
86 97
87 sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL); 98 alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE;
99
100 start = end;
101 }
102
103 /*
104 * The negative values are free fragments and thus sorting gives the
105 * free fragments at the beginning in largest first order.
106 */
107 if (as_len > 0) {
108 sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL);
88 109
89 /* Iterate through the unallocated fragements. */ 110 /* iterate through the unallocated fragments */
90 for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) { 111 for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
91 sum_frag -= *p; 112 sum_frag -= *p;
92 max_frag = max(max_frag, -1 * (*p)); 113 max_frag = max(max_frag, -1 * (*p));
@@ -99,8 +120,10 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
99 120
100 P("nr_alloc", chunk->nr_alloc); 121 P("nr_alloc", chunk->nr_alloc);
101 P("max_alloc_size", chunk->max_alloc_size); 122 P("max_alloc_size", chunk->max_alloc_size);
102 P("free_size", chunk->free_size); 123 P("empty_pop_pages", chunk->nr_empty_pop_pages);
103 P("contig_hint", chunk->contig_hint); 124 P("first_bit", chunk->first_bit);
125 P("free_bytes", chunk->free_bytes);
126 P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE);
104 P("sum_frag", sum_frag); 127 P("sum_frag", sum_frag);
105 P("max_frag", max_frag); 128 P("max_frag", max_frag);
106 P("cur_min_alloc", cur_min_alloc); 129 P("cur_min_alloc", cur_min_alloc);
@@ -112,29 +135,30 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
112static int percpu_stats_show(struct seq_file *m, void *v) 135static int percpu_stats_show(struct seq_file *m, void *v)
113{ 136{
114 struct pcpu_chunk *chunk; 137 struct pcpu_chunk *chunk;
115 int slot, max_map_used; 138 int slot, max_nr_alloc;
116 void *buffer; 139 int *buffer;
117 140
118alloc_buffer: 141alloc_buffer:
119 spin_lock_irq(&pcpu_lock); 142 spin_lock_irq(&pcpu_lock);
120 max_map_used = find_max_map_used(); 143 max_nr_alloc = find_max_nr_alloc();
121 spin_unlock_irq(&pcpu_lock); 144 spin_unlock_irq(&pcpu_lock);
122 145
123 buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0])); 146 /* there can be at most this many free and allocated fragments */
147 buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int));
124 if (!buffer) 148 if (!buffer)
125 return -ENOMEM; 149 return -ENOMEM;
126 150
127 spin_lock_irq(&pcpu_lock); 151 spin_lock_irq(&pcpu_lock);
128 152
129 /* if the buffer allocated earlier is too small */ 153 /* if the buffer allocated earlier is too small */
130 if (max_map_used < find_max_map_used()) { 154 if (max_nr_alloc < find_max_nr_alloc()) {
131 spin_unlock_irq(&pcpu_lock); 155 spin_unlock_irq(&pcpu_lock);
132 vfree(buffer); 156 vfree(buffer);
133 goto alloc_buffer; 157 goto alloc_buffer;
134 } 158 }
135 159
136#define PL(X) \ 160#define PL(X) \
137 seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X) 161 seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
138 162
139 seq_printf(m, 163 seq_printf(m,
140 "Percpu Memory Statistics\n" 164 "Percpu Memory Statistics\n"
@@ -151,7 +175,7 @@ alloc_buffer:
151#undef PL 175#undef PL
152 176
153#define PU(X) \ 177#define PU(X) \
154 seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X) 178 seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X)
155 179
156 seq_printf(m, 180 seq_printf(m,
157 "Global Stats:\n" 181 "Global Stats:\n"
@@ -164,6 +188,7 @@ alloc_buffer:
164 PU(nr_max_chunks); 188 PU(nr_max_chunks);
165 PU(min_alloc_size); 189 PU(min_alloc_size);
166 PU(max_alloc_size); 190 PU(max_alloc_size);
191 P("empty_pop_pages", pcpu_nr_empty_pop_pages);
167 seq_putc(m, '\n'); 192 seq_putc(m, '\n');
168 193
169#undef PU 194#undef PU
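The stats walk in the mm/percpu-stats.c hunk above recovers allocation and free fragment sizes from two bitmaps: alloc_map marks every in-use fragment, while bound_map, per the comment in chunk_map_stats, is only kept accurate on allocation and marks allocation boundaries. A toy user-space sketch of that encoding and decoding follows; it assumes a boundary bit at the start of each allocation and at the fragment just past its end, which is what lets the walk find an allocation's end with a single "next set bit" lookup. All names here are invented.

/* Toy alloc_map/bound_map encoding, stored one int per bit for clarity. */
#include <stdio.h>

#define NR_BITS 32

static int alloc_map[NR_BITS];
static int bound_map[NR_BITS + 1];

/* mark @bits fragments starting at @off as one allocation */
static void mark_alloc(int off, int bits)
{
	for (int i = 0; i < bits; i++)
		alloc_map[off + i] = 1;
	bound_map[off] = 1;		/* start boundary */
	bound_map[off + bits] = 1;	/* one past the end */
}

static int next_set(const int *map, int limit, int start)
{
	for (int i = start; i < limit; i++)
		if (map[i])
			return i;
	return limit;
}

int main(void)
{
	int start = 0, end;

	mark_alloc(0, 4);	/* a 16-byte allocation at byte offset 0 */
	mark_alloc(8, 2);	/* an 8-byte allocation at byte offset 32 */

	/* walk the maps, printing allocated and free fragment runs */
	while (start < NR_BITS) {
		if (alloc_map[start])
			end = next_set(bound_map, NR_BITS, start + 1);
		else
			end = next_set(alloc_map, NR_BITS, start + 1);
		printf("%s run: %d fragments\n",
		       alloc_map[start] ? "alloc" : "free ", end - start);
		start = end;
	}
	return 0;
}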
diff --git a/mm/percpu.c b/mm/percpu.c
index bd4130a69bbc..59d44d61f5f1 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -4,44 +4,53 @@
4 * Copyright (C) 2009 SUSE Linux Products GmbH 4 * Copyright (C) 2009 SUSE Linux Products GmbH
5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org> 5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
6 * 6 *
7 * This file is released under the GPLv2. 7 * Copyright (C) 2017 Facebook Inc.
8 * Copyright (C) 2017 Dennis Zhou <dennisszhou@gmail.com>
8 * 9 *
9 * This is percpu allocator which can handle both static and dynamic 10 * This file is released under the GPLv2 license.
10 * areas. Percpu areas are allocated in chunks. Each chunk is 11 *
11 * consisted of boot-time determined number of units and the first 12 * The percpu allocator handles both static and dynamic areas. Percpu
12 * chunk is used for static percpu variables in the kernel image 13 * areas are allocated in chunks which are divided into units. There is
13 * (special boot time alloc/init handling necessary as these areas 14 * a 1-to-1 mapping for units to possible cpus. These units are grouped
14 * need to be brought up before allocation services are running). 15 * based on NUMA properties of the machine.
15 * Unit grows as necessary and all units grow or shrink in unison.
16 * When a chunk is filled up, another chunk is allocated.
17 * 16 *
18 * c0 c1 c2 17 * c0 c1 c2
19 * ------------------- ------------------- ------------ 18 * ------------------- ------------------- ------------
20 * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u 19 * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
21 * ------------------- ...... ------------------- .... ------------ 20 * ------------------- ...... ------------------- .... ------------
22 * 21 *
23 * Allocation is done in offset-size areas of single unit space. Ie, 22 * Allocation is done by offsets into a unit's address space. Ie., an
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 23 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
25 * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to 24 * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear
26 * cpus. On NUMA, the mapping can be non-linear and even sparse. 25 * and even sparse. Access is handled by configuring percpu base
27 * Percpu access can be done by configuring percpu base registers 26 * registers according to the cpu to unit mappings and offsetting the
28 * according to cpu to unit mapping and pcpu_unit_size. 27 * base address using pcpu_unit_size.
29 * 28 *
30 * There are usually many small percpu allocations many of them being 29 * There is special consideration for the first chunk which must handle
31 * as small as 4 bytes. The allocator organizes chunks into lists 30 * the static percpu variables in the kernel image as allocation services
32 * according to free size and tries to allocate from the fullest one. 31 * are not online yet. In short, the first chunk is structured like so:
33 * Each chunk keeps the maximum contiguous area size hint which is 32 *
34 * guaranteed to be equal to or larger than the maximum contiguous 33 * <Static | [Reserved] | Dynamic>
35 * area in the chunk. This helps the allocator not to iterate the 34 *
36 * chunk maps unnecessarily. 35 * The static data is copied from the original section managed by the
37 * 36 * linker. The reserved section, if non-zero, primarily manages static
38 * Allocation state in each chunk is kept using an array of integers 37 * percpu variables from kernel modules. Finally, the dynamic section
39 * on chunk->map. A positive value in the map represents a free 38 * takes care of normal allocations.
40 * region and negative allocated. Allocation inside a chunk is done 39 *
41 * by scanning this map sequentially and serving the first matching 40 * The allocator organizes chunks into lists according to free size and
42 * entry. This is mostly copied from the percpu_modalloc() allocator. 41 * tries to allocate from the fullest chunk first. Each chunk is managed
43 * Chunks can be determined from the address using the index field 42 * by a bitmap with metadata blocks. The allocation map is updated on
44 * in the page struct. The index field contains a pointer to the chunk. 43 * every allocation and free to reflect the current state while the boundary
44 * map is only updated on allocation. Each metadata block contains
45 * information to help mitigate the need to iterate over large portions
46 * of the bitmap. The reverse mapping from page to chunk is stored in
47 * the page's index. Lastly, units are lazily backed and grow in unison.
48 *
49 * There is a unique conversion that goes on here between bytes and bits.
50 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk
51 * tracks the number of pages it is responsible for in nr_pages. Helper
52 * functions are used to convert from between the bytes, bits, and blocks.
53 * All hints are managed in bits unless explicitly stated.
45 * 54 *
46 * To use this allocator, arch code should do the following: 55 * To use this allocator, arch code should do the following:
47 * 56 *
@@ -58,6 +67,7 @@
58#include <linux/bitmap.h> 67#include <linux/bitmap.h>
59#include <linux/bootmem.h> 68#include <linux/bootmem.h>
60#include <linux/err.h> 69#include <linux/err.h>
70#include <linux/lcm.h>
61#include <linux/list.h> 71#include <linux/list.h>
62#include <linux/log2.h> 72#include <linux/log2.h>
63#include <linux/mm.h> 73#include <linux/mm.h>
@@ -81,10 +91,9 @@
81 91
82#include "percpu-internal.h" 92#include "percpu-internal.h"
83 93
84#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 94/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
85#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 95#define PCPU_SLOT_BASE_SHIFT 5
86#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 96
87#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
88#define PCPU_EMPTY_POP_PAGES_LOW 2 97#define PCPU_EMPTY_POP_PAGES_LOW 2
89#define PCPU_EMPTY_POP_PAGES_HIGH 4 98#define PCPU_EMPTY_POP_PAGES_HIGH 4
90 99
@@ -140,13 +149,10 @@ struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
140 149
141/* 150/*
142 * Optional reserved chunk. This chunk reserves part of the first 151 * Optional reserved chunk. This chunk reserves part of the first
143 * chunk and serves it for reserved allocations. The amount of 152 * chunk and serves it for reserved allocations. When the reserved
144 * reserved offset is in pcpu_reserved_chunk_limit. When reserved 153 * region doesn't exist, the following variable is NULL.
145 * area doesn't exist, the following variables contain NULL and 0
146 * respectively.
147 */ 154 */
148struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; 155struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
149static int pcpu_reserved_chunk_limit __ro_after_init;
150 156
151DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ 157DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
152static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ 158static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
@@ -160,7 +166,7 @@ static LIST_HEAD(pcpu_map_extend_chunks);
160 * The number of empty populated pages, protected by pcpu_lock. The 166 * The number of empty populated pages, protected by pcpu_lock. The
161 * reserved chunk doesn't contribute to the count. 167 * reserved chunk doesn't contribute to the count.
162 */ 168 */
163static int pcpu_nr_empty_pop_pages; 169int pcpu_nr_empty_pop_pages;
164 170
165/* 171/*
166 * Balance work is used to populate or destroy chunks asynchronously. We 172 * Balance work is used to populate or destroy chunks asynchronously. We
@@ -179,19 +185,26 @@ static void pcpu_schedule_balance_work(void)
179 schedule_work(&pcpu_balance_work); 185 schedule_work(&pcpu_balance_work);
180} 186}
181 187
182static bool pcpu_addr_in_first_chunk(void *addr) 188/**
189 * pcpu_addr_in_chunk - check if the address is served from this chunk
190 * @chunk: chunk of interest
191 * @addr: percpu address
192 *
193 * RETURNS:
194 * True if the address is served from this chunk.
195 */
196static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
183{ 197{
184 void *first_start = pcpu_first_chunk->base_addr; 198 void *start_addr, *end_addr;
185 199
186 return addr >= first_start && addr < first_start + pcpu_unit_size; 200 if (!chunk)
187} 201 return false;
188 202
189static bool pcpu_addr_in_reserved_chunk(void *addr) 203 start_addr = chunk->base_addr + chunk->start_offset;
190{ 204 end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
191 void *first_start = pcpu_first_chunk->base_addr; 205 chunk->end_offset;
192 206
193 return addr >= first_start && 207 return addr >= start_addr && addr < end_addr;
194 addr < first_start + pcpu_reserved_chunk_limit;
195} 208}
196 209
197static int __pcpu_size_to_slot(int size) 210static int __pcpu_size_to_slot(int size)
@@ -209,10 +222,10 @@ static int pcpu_size_to_slot(int size)
209 222
210static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) 223static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
211{ 224{
212 if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) 225 if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
213 return 0; 226 return 0;
214 227
215 return pcpu_size_to_slot(chunk->free_size); 228 return pcpu_size_to_slot(chunk->free_bytes);
216} 229}
217 230
218/* set the pointer to a chunk in a page struct */ 231/* set the pointer to a chunk in a page struct */
@@ -232,42 +245,200 @@ static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
232 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; 245 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
233} 246}
234 247
248static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
249{
250 return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
251}
252
235static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 253static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
236 unsigned int cpu, int page_idx) 254 unsigned int cpu, int page_idx)
237{ 255{
238 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + 256 return (unsigned long)chunk->base_addr +
239 (page_idx << PAGE_SHIFT); 257 pcpu_unit_page_offset(cpu, page_idx);
240} 258}
241 259
242static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, 260static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
243 int *rs, int *re, int end)
244{ 261{
245 *rs = find_next_zero_bit(chunk->populated, end, *rs); 262 *rs = find_next_zero_bit(bitmap, end, *rs);
246 *re = find_next_bit(chunk->populated, end, *rs + 1); 263 *re = find_next_bit(bitmap, end, *rs + 1);
247} 264}
248 265
249static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, 266static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
250 int *rs, int *re, int end)
251{ 267{
252 *rs = find_next_bit(chunk->populated, end, *rs); 268 *rs = find_next_bit(bitmap, end, *rs);
253 *re = find_next_zero_bit(chunk->populated, end, *rs + 1); 269 *re = find_next_zero_bit(bitmap, end, *rs + 1);
254} 270}
255 271
256/* 272/*
257 * (Un)populated page region iterators. Iterate over (un)populated 273 * Bitmap region iterators. Iterates over the bitmap between
258 * page regions between @start and @end in @chunk. @rs and @re should 274 * [@start, @end) in @chunk. @rs and @re should be integer variables
259 * be integer variables and will be set to start and end page index of 275 * and will be set to start and end index of the current free region.
260 * the current region. 276 */
277#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \
278 for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
279 (rs) < (re); \
280 (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
281
282#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \
283 for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \
284 (rs) < (re); \
285 (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
286
287/*
288 * The following are helper functions to help access bitmaps and convert
289 * between bitmap offsets to address offsets.
290 */
291static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
292{
293 return chunk->alloc_map +
294 (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
295}
296
297static unsigned long pcpu_off_to_block_index(int off)
298{
299 return off / PCPU_BITMAP_BLOCK_BITS;
300}
301
302static unsigned long pcpu_off_to_block_off(int off)
303{
304 return off & (PCPU_BITMAP_BLOCK_BITS - 1);
305}
306
307static unsigned long pcpu_block_off_to_off(int index, int off)
308{
309 return index * PCPU_BITMAP_BLOCK_BITS + off;
310}
311
312/**
313 * pcpu_next_md_free_region - finds the next hint free area
314 * @chunk: chunk of interest
315 * @bit_off: chunk offset
316 * @bits: size of free area
317 *
318 * Helper function for pcpu_for_each_md_free_region. It checks
319 * block->contig_hint and performs aggregation across blocks to find the
320 * next hint. It modifies bit_off and bits in-place to be consumed in the
321 * loop.
322 */
323static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
324 int *bits)
325{
326 int i = pcpu_off_to_block_index(*bit_off);
327 int block_off = pcpu_off_to_block_off(*bit_off);
328 struct pcpu_block_md *block;
329
330 *bits = 0;
331 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
332 block++, i++) {
333 /* handles contig area across blocks */
334 if (*bits) {
335 *bits += block->left_free;
336 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
337 continue;
338 return;
339 }
340
341 /*
342 * This checks three things. First is there a contig_hint to
343 * check. Second, have we checked this hint before by
344 * comparing the block_off. Third, is this the same as the
345 * right contig hint. In the last case, it spills over into
346 * the next block and should be handled by the contig area
347 * across blocks code.
348 */
349 *bits = block->contig_hint;
350 if (*bits && block->contig_hint_start >= block_off &&
351 *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
352 *bit_off = pcpu_block_off_to_off(i,
353 block->contig_hint_start);
354 return;
355 }
356
357 *bits = block->right_free;
358 *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
359 }
360}
361
362/**
363 * pcpu_next_fit_region - finds fit areas for a given allocation request
364 * @chunk: chunk of interest
365 * @alloc_bits: size of allocation
366 * @align: alignment of area (max PAGE_SIZE)
367 * @bit_off: chunk offset
368 * @bits: size of free area
369 *
370 * Finds the next free region that is viable for use with a given size and
371 * alignment. This only returns if there is a valid area to be used for this
372 * allocation. block->first_free is returned if the allocation request fits
373 * within the block to see if the request can be fulfilled prior to the contig
374 * hint.
261 */ 375 */
262#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ 376static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
263 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ 377 int align, int *bit_off, int *bits)
264 (rs) < (re); \ 378{
265 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) 379 int i = pcpu_off_to_block_index(*bit_off);
380 int block_off = pcpu_off_to_block_off(*bit_off);
381 struct pcpu_block_md *block;
382
383 *bits = 0;
384 for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
385 block++, i++) {
386 /* handles contig area across blocks */
387 if (*bits) {
388 *bits += block->left_free;
389 if (*bits >= alloc_bits)
390 return;
391 if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
392 continue;
393 }
394
395 /* check block->contig_hint */
396 *bits = ALIGN(block->contig_hint_start, align) -
397 block->contig_hint_start;
398 /*
399 * This uses the block offset to determine if this has been
400 * checked in the prior iteration.
401 */
402 if (block->contig_hint &&
403 block->contig_hint_start >= block_off &&
404 block->contig_hint >= *bits + alloc_bits) {
405 *bits += alloc_bits + block->contig_hint_start -
406 block->first_free;
407 *bit_off = pcpu_block_off_to_off(i, block->first_free);
408 return;
409 }
410
411 *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
412 align);
413 *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
414 *bit_off = pcpu_block_off_to_off(i, *bit_off);
415 if (*bits >= alloc_bits)
416 return;
417 }
266 418
267#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ 419 /* no valid offsets were found - fail condition */
268 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ 420 *bit_off = pcpu_chunk_map_bits(chunk);
269 (rs) < (re); \ 421}
270 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) 422
423/*
424 * Metadata free area iterators. These perform aggregation of free areas
425 * based on the metadata blocks and return the offset @bit_off and size in
426 * bits of the free area @bits. pcpu_for_each_fit_region only returns when
427 * a fit is found for the allocation request.
428 */
429#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
430 for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
431 (bit_off) < pcpu_chunk_map_bits((chunk)); \
432 (bit_off) += (bits) + 1, \
433 pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
434
435#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
436 for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
437 &(bits)); \
438 (bit_off) < pcpu_chunk_map_bits((chunk)); \
439 (bit_off) += (bits), \
440 pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
441 &(bits)))
271 442
272/** 443/**
273 * pcpu_mem_zalloc - allocate memory 444 * pcpu_mem_zalloc - allocate memory
@@ -306,38 +477,6 @@ static void pcpu_mem_free(void *ptr)
306} 477}
307 478
308/** 479/**
309 * pcpu_count_occupied_pages - count the number of pages an area occupies
310 * @chunk: chunk of interest
311 * @i: index of the area in question
312 *
313 * Count the number of pages chunk's @i'th area occupies. When the area's
314 * start and/or end address isn't aligned to page boundary, the straddled
315 * page is included in the count iff the rest of the page is free.
316 */
317static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
318{
319 int off = chunk->map[i] & ~1;
320 int end = chunk->map[i + 1] & ~1;
321
322 if (!PAGE_ALIGNED(off) && i > 0) {
323 int prev = chunk->map[i - 1];
324
325 if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
326 off = round_down(off, PAGE_SIZE);
327 }
328
329 if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
330 int next = chunk->map[i + 1];
331 int nend = chunk->map[i + 2] & ~1;
332
333 if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
334 end = round_up(end, PAGE_SIZE);
335 }
336
337 return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
338}
339
340/**
341 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 480 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
342 * @chunk: chunk of interest 481 * @chunk: chunk of interest
343 * @oslot: the previous slot it was on 482 * @oslot: the previous slot it was on
@@ -363,383 +502,706 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
363} 502}
364 503
365/** 504/**
366 * pcpu_need_to_extend - determine whether chunk area map needs to be extended 505 * pcpu_cnt_pop_pages- counts populated backing pages in range
367 * @chunk: chunk of interest 506 * @chunk: chunk of interest
368 * @is_atomic: the allocation context 507 * @bit_off: start offset
508 * @bits: size of area to check
369 * 509 *
370 * Determine whether area map of @chunk needs to be extended. If 510 * Calculates the number of populated pages in the region
371 * @is_atomic, only the amount necessary for a new allocation is 511 * [page_start, page_end). This keeps track of how many empty populated
372 * considered; however, async extension is scheduled if the left amount is 512 * pages are available and decide if async work should be scheduled.
373 * low. If !@is_atomic, it aims for more empty space. Combined, this
374 * ensures that the map is likely to have enough available space to
375 * accomodate atomic allocations which can't extend maps directly.
376 *
377 * CONTEXT:
378 * pcpu_lock.
379 * 513 *
380 * RETURNS: 514 * RETURNS:
381 * New target map allocation length if extension is necessary, 0 515 * The nr of populated pages.
382 * otherwise.
383 */ 516 */
384static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) 517static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
518 int bits)
385{ 519{
386 int margin, new_alloc; 520 int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
387 521 int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
388 lockdep_assert_held(&pcpu_lock);
389
390 if (is_atomic) {
391 margin = 3;
392 522
393 if (chunk->map_alloc < 523 if (page_start >= page_end)
394 chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
395 if (list_empty(&chunk->map_extend_list)) {
396 list_add_tail(&chunk->map_extend_list,
397 &pcpu_map_extend_chunks);
398 pcpu_schedule_balance_work();
399 }
400 }
401 } else {
402 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
403 }
404
405 if (chunk->map_alloc >= chunk->map_used + margin)
406 return 0; 524 return 0;
407 525
408 new_alloc = PCPU_DFL_MAP_ALLOC; 526 /*
409 while (new_alloc < chunk->map_used + margin) 527 * bitmap_weight counts the number of bits set in a bitmap up to
410 new_alloc *= 2; 528 * the specified number of bits. This is counting the populated
411 529 * pages up to page_end and then subtracting the populated pages
412 return new_alloc; 530 * up to page_start to count the populated pages in
531 * [page_start, page_end).
532 */
533 return bitmap_weight(chunk->populated, page_end) -
534 bitmap_weight(chunk->populated, page_start);
413} 535}
414 536
415/** 537/**
416 * pcpu_extend_area_map - extend area map of a chunk 538 * pcpu_chunk_update - updates the chunk metadata given a free area
417 * @chunk: chunk of interest 539 * @chunk: chunk of interest
418 * @new_alloc: new target allocation length of the area map 540 * @bit_off: chunk offset
541 * @bits: size of free area
419 * 542 *
420 * Extend area map of @chunk to have @new_alloc entries. 543 * This updates the chunk's contig hint and starting offset given a free area.
544 * Choose the best starting offset if the contig hint is equal.
545 */
546static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
547{
548 if (bits > chunk->contig_bits) {
549 chunk->contig_bits_start = bit_off;
550 chunk->contig_bits = bits;
551 } else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
552 (!bit_off ||
553 __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
554 /* use the start with the best alignment */
555 chunk->contig_bits_start = bit_off;
556 }
557}
558
559/**
560 * pcpu_chunk_refresh_hint - updates metadata about a chunk
561 * @chunk: chunk of interest
421 * 562 *
422 * CONTEXT: 563 * Iterates over the metadata blocks to find the largest contig area.
423 * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock. 564 * It also counts the populated pages and uses the delta to update the
565 * global count.
424 * 566 *
425 * RETURNS: 567 * Updates:
426 * 0 on success, -errno on failure. 568 * chunk->contig_bits
569 * chunk->contig_bits_start
570 * nr_empty_pop_pages (chunk and global)
427 */ 571 */
428static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) 572static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
429{ 573{
430 int *old = NULL, *new = NULL; 574 int bit_off, bits, nr_empty_pop_pages;
431 size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
432 unsigned long flags;
433 575
434 lockdep_assert_held(&pcpu_alloc_mutex); 576 /* clear metadata */
577 chunk->contig_bits = 0;
435 578
436 new = pcpu_mem_zalloc(new_size); 579 bit_off = chunk->first_bit;
437 if (!new) 580 bits = nr_empty_pop_pages = 0;
438 return -ENOMEM; 581 pcpu_for_each_md_free_region(chunk, bit_off, bits) {
582 pcpu_chunk_update(chunk, bit_off, bits);
439 583
440 /* acquire pcpu_lock and switch to new area map */ 584 nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
441 spin_lock_irqsave(&pcpu_lock, flags); 585 }
442 586
443 if (new_alloc <= chunk->map_alloc) 587 /*
444 goto out_unlock; 588 * Keep track of nr_empty_pop_pages.
589 *
590 * The chunk maintains the previous number of free pages it held,
591 * so the delta is used to update the global counter. The reserved
592 * chunk is not part of the free page count as they are populated
593 * at init and are special to serving reserved allocations.
594 */
595 if (chunk != pcpu_reserved_chunk)
596 pcpu_nr_empty_pop_pages +=
597 (nr_empty_pop_pages - chunk->nr_empty_pop_pages);
445 598
446 old_size = chunk->map_alloc * sizeof(chunk->map[0]); 599 chunk->nr_empty_pop_pages = nr_empty_pop_pages;
447 old = chunk->map; 600}
448 601
449 memcpy(new, old, old_size); 602/**
603 * pcpu_block_update - updates a block given a free area
604 * @block: block of interest
605 * @start: start offset in block
606 * @end: end offset in block
607 *
608 * Updates a block given a known free area. The region [start, end) is
609 * expected to be the entirety of the free area within a block. Chooses
610 * the best starting offset if the contig hints are equal.
611 */
612static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
613{
614 int contig = end - start;
615
616 block->first_free = min(block->first_free, start);
617 if (start == 0)
618 block->left_free = contig;
619
620 if (end == PCPU_BITMAP_BLOCK_BITS)
621 block->right_free = contig;
622
623 if (contig > block->contig_hint) {
624 block->contig_hint_start = start;
625 block->contig_hint = contig;
626 } else if (block->contig_hint_start && contig == block->contig_hint &&
627 (!start || __ffs(start) > __ffs(block->contig_hint_start))) {
628 /* use the start with the best alignment */
629 block->contig_hint_start = start;
630 }
631}
450 632
451 chunk->map_alloc = new_alloc; 633/**
452 chunk->map = new; 634 * pcpu_block_refresh_hint
453 new = NULL; 635 * @chunk: chunk of interest
636 * @index: index of the metadata block
637 *
638 * Scans over the block beginning at first_free and updates the block
639 * metadata accordingly.
640 */
641static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
642{
643 struct pcpu_block_md *block = chunk->md_blocks + index;
644 unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
645 int rs, re; /* region start, region end */
646
647 /* clear hints */
648 block->contig_hint = 0;
649 block->left_free = block->right_free = 0;
650
651 /* iterate over free areas and update the contig hints */
652 pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
653 PCPU_BITMAP_BLOCK_BITS) {
654 pcpu_block_update(block, rs, re);
655 }
656}
454 657
455out_unlock: 658/**
456 spin_unlock_irqrestore(&pcpu_lock, flags); 659 * pcpu_block_update_hint_alloc - update hint on allocation path
660 * @chunk: chunk of interest
661 * @bit_off: chunk offset
662 * @bits: size of request
663 *
664 * Updates metadata for the allocation path. The metadata only has to be
665 * refreshed by a full scan iff the chunk's contig hint is broken. Block level
666 * scans are required if the block's contig hint is broken.
667 */
668static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
669 int bits)
670{
671 struct pcpu_block_md *s_block, *e_block, *block;
672 int s_index, e_index; /* block indexes of the freed allocation */
673 int s_off, e_off; /* block offsets of the freed allocation */
457 674
458 /* 675 /*
459 * pcpu_mem_free() might end up calling vfree() which uses 676 * Calculate per block offsets.
460 * IRQ-unsafe lock and thus can't be called under pcpu_lock. 677 * The calculation uses an inclusive range, but the resulting offsets
678 * are [start, end). e_index always points to the last block in the
679 * range.
461 */ 680 */
462 pcpu_mem_free(old); 681 s_index = pcpu_off_to_block_index(bit_off);
463 pcpu_mem_free(new); 682 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
683 s_off = pcpu_off_to_block_off(bit_off);
684 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
464 685
465 return 0; 686 s_block = chunk->md_blocks + s_index;
687 e_block = chunk->md_blocks + e_index;
688
689 /*
690 * Update s_block.
691 * block->first_free must be updated if the allocation takes its place.
692 * If the allocation breaks the contig_hint, a scan is required to
693 * restore this hint.
694 */
695 if (s_off == s_block->first_free)
696 s_block->first_free = find_next_zero_bit(
697 pcpu_index_alloc_map(chunk, s_index),
698 PCPU_BITMAP_BLOCK_BITS,
699 s_off + bits);
700
701 if (s_off >= s_block->contig_hint_start &&
702 s_off < s_block->contig_hint_start + s_block->contig_hint) {
703 /* block contig hint is broken - scan to fix it */
704 pcpu_block_refresh_hint(chunk, s_index);
705 } else {
706 /* update left and right contig manually */
707 s_block->left_free = min(s_block->left_free, s_off);
708 if (s_index == e_index)
709 s_block->right_free = min_t(int, s_block->right_free,
710 PCPU_BITMAP_BLOCK_BITS - e_off);
711 else
712 s_block->right_free = 0;
713 }
714
715 /*
716 * Update e_block.
717 */
718 if (s_index != e_index) {
719 /*
720 * When the allocation is across blocks, the end is along
721 * the left part of the e_block.
722 */
723 e_block->first_free = find_next_zero_bit(
724 pcpu_index_alloc_map(chunk, e_index),
725 PCPU_BITMAP_BLOCK_BITS, e_off);
726
727 if (e_off == PCPU_BITMAP_BLOCK_BITS) {
728 /* reset the block */
729 e_block++;
730 } else {
731 if (e_off > e_block->contig_hint_start) {
732 /* contig hint is broken - scan to fix it */
733 pcpu_block_refresh_hint(chunk, e_index);
734 } else {
735 e_block->left_free = 0;
736 e_block->right_free =
737 min_t(int, e_block->right_free,
738 PCPU_BITMAP_BLOCK_BITS - e_off);
739 }
740 }
741
742 /* update in-between md_blocks */
743 for (block = s_block + 1; block < e_block; block++) {
744 block->contig_hint = 0;
745 block->left_free = 0;
746 block->right_free = 0;
747 }
748 }
749
750 /*
751 * The only time a full chunk scan is required is if the chunk
752 * contig hint is broken. Otherwise, it means a smaller space
753 * was used and therefore the chunk contig hint is still correct.
754 */
755 if (bit_off >= chunk->contig_bits_start &&
756 bit_off < chunk->contig_bits_start + chunk->contig_bits)
757 pcpu_chunk_refresh_hint(chunk);
466} 758}
467 759
468/** 760/**
469 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area 761 * pcpu_block_update_hint_free - updates the block hints on the free path
470 * @chunk: chunk the candidate area belongs to 762 * @chunk: chunk of interest
471 * @off: the offset to the start of the candidate area 763 * @bit_off: chunk offset
472 * @this_size: the size of the candidate area 764 * @bits: size of request
473 * @size: the size of the target allocation 765 *
474 * @align: the alignment of the target allocation 766 * Updates metadata for the allocation path. This avoids a blind block
475 * @pop_only: only allocate from already populated region 767 * refresh by making use of the block contig hints. If this fails, it scans
476 * 768 * forward and backward to determine the extent of the free area. This is
477 * We're trying to allocate @size bytes aligned at @align. @chunk's area 769 * capped at the boundary of blocks.
478 * at @off sized @this_size is a candidate. This function determines 770 *
479 * whether the target allocation fits in the candidate area and returns the 771 * A chunk update is triggered if a page becomes free, a block becomes free,
480 * number of bytes to pad after @off. If the target area doesn't fit, -1 772 * or the free spans across blocks. This tradeoff is to minimize iterating
481 * is returned. 773 * over the block metadata to update chunk->contig_bits. chunk->contig_bits
482 * 774 * may be off by up to a page, but it will never be more than the available
483 * If @pop_only is %true, this function only considers the already 775 * space. If the contig hint is contained in one block, it will be accurate.
484 * populated part of the candidate area.
485 */ 776 */
486static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, 777static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
487 int size, int align, bool pop_only) 778 int bits)
488{ 779{
489 int cand_off = off; 780 struct pcpu_block_md *s_block, *e_block, *block;
490 781 int s_index, e_index; /* block indexes of the freed allocation */
491 while (true) { 782 int s_off, e_off; /* block offsets of the freed allocation */
492 int head = ALIGN(cand_off, align) - off; 783 int start, end; /* start and end of the whole free area */
493 int page_start, page_end, rs, re;
494 784
495 if (this_size < head + size) 785 /*
496 return -1; 786 * Calculate per block offsets.
787 * The calculation uses an inclusive range, but the resulting offsets
788 * are [start, end). e_index always points to the last block in the
789 * range.
790 */
791 s_index = pcpu_off_to_block_index(bit_off);
792 e_index = pcpu_off_to_block_index(bit_off + bits - 1);
793 s_off = pcpu_off_to_block_off(bit_off);
794 e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
497 795
498 if (!pop_only) 796 s_block = chunk->md_blocks + s_index;
499 return head; 797 e_block = chunk->md_blocks + e_index;
500 798
799 /*
800 * Check if the freed area aligns with the block->contig_hint.
801 * If it does, then the scan to find the beginning/end of the
802 * larger free area can be avoided.
803 *
804 * start and end refer to beginning and end of the free area
805 * within each their respective blocks. This is not necessarily
806 * the entire free area as it may span blocks past the beginning
807 * or end of the block.
808 */
809 start = s_off;
810 if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
811 start = s_block->contig_hint_start;
812 } else {
501 /* 813 /*
502 * If the first unpopulated page is beyond the end of the 814 * Scan backwards to find the extent of the free area.
503 * allocation, the whole allocation is populated; 815 * find_last_bit returns the starting bit, so if the start bit
504 * otherwise, retry from the end of the unpopulated area. 816 * is returned, that means there was no last bit and the
817 * remainder of the chunk is free.
505 */ 818 */
506 page_start = PFN_DOWN(head + off); 819 int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
507 page_end = PFN_UP(head + off + size); 820 start);
508 821 start = (start == l_bit) ? 0 : l_bit + 1;
509 rs = page_start; 822 }
510 pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); 823
511 if (rs >= page_end) 824 end = e_off;
512 return head; 825 if (e_off == e_block->contig_hint_start)
513 cand_off = re * PAGE_SIZE; 826 end = e_block->contig_hint_start + e_block->contig_hint;
827 else
828 end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
829 PCPU_BITMAP_BLOCK_BITS, end);
830
831 /* update s_block */
832 e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
833 pcpu_block_update(s_block, start, e_off);
834
835 /* freeing in the same block */
836 if (s_index != e_index) {
837 /* update e_block */
838 pcpu_block_update(e_block, 0, end);
839
840 /* reset md_blocks in the middle */
841 for (block = s_block + 1; block < e_block; block++) {
842 block->first_free = 0;
843 block->contig_hint_start = 0;
844 block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
845 block->left_free = PCPU_BITMAP_BLOCK_BITS;
846 block->right_free = PCPU_BITMAP_BLOCK_BITS;
847 }
514 } 848 }
849
850 /*
851 * Refresh chunk metadata when the free makes a page free, a block
852 * free, or spans across blocks. The contig hint may be off by up to
853 * a page, but if the hint is contained in a block, it will be accurate
854 * with the else condition below.
855 */
856 if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
857 ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
858 s_index != e_index)
859 pcpu_chunk_refresh_hint(chunk);
860 else
861 pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
862 s_block->contig_hint);
515} 863}
516 864
517/** 865/**
518 * pcpu_alloc_area - allocate area from a pcpu_chunk 866 * pcpu_is_populated - determines if the region is populated
519 * @chunk: chunk of interest 867 * @chunk: chunk of interest
520 * @size: wanted size in bytes 868 * @bit_off: chunk offset
521 * @align: wanted align 869 * @bits: size of area
522 * @pop_only: allocate only from the populated area 870 * @next_off: return value for the next offset to start searching
523 * @occ_pages_p: out param for the number of pages the area occupies
524 *
525 * Try to allocate @size bytes area aligned at @align from @chunk.
526 * Note that this function only allocates the offset. It doesn't
527 * populate or map the area.
528 * 871 *
529 * @chunk->map must have at least two free slots. 872 * For atomic allocations, check if the backing pages are populated.
530 * 873 *
531 * CONTEXT: 874 * RETURNS:
532 * pcpu_lock. 875 * Bool if the backing pages are populated.
876 * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
877 */
878static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
879 int *next_off)
880{
881 int page_start, page_end, rs, re;
882
883 page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
884 page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
885
886 rs = page_start;
887 pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
888 if (rs >= page_end)
889 return true;
890
891 *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
892 return false;
893}
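Roughly, pcpu_is_populated converts the request from allocation units back into a page range and asks the populated bitmap whether any page in that range is missing; if so, it hands back the first offset past the unpopulated run so the fit iterator can skip it. A minimal model of the unit-to-page conversion, with PAGE_SIZE and PCPU_MIN_ALLOC_SIZE hard-coded here purely for illustration:

    #include <stdio.h>

    #define PAGE_SIZE           4096        /* illustrative */
    #define PCPU_MIN_ALLOC_SIZE 4           /* 1 bit == 4 bytes, as in the patch */
    #define PFN_DOWN(x)         ((x) / PAGE_SIZE)
    #define PFN_UP(x)           (((x) + PAGE_SIZE - 1) / PAGE_SIZE)

    int main(void)
    {
            int bit_off = 1500, bits = 300; /* hypothetical request in allocation units */

            /* pages [page_start, page_end) must all be populated for an atomic alloc */
            int page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
            int page_end   = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

            printf("pages [%d, %d)\n", page_start, page_end);   /* pages [1, 2) */
            return 0;
    }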
894
895/**
896 * pcpu_find_block_fit - finds the block index to start searching
897 * @chunk: chunk of interest
898 * @alloc_bits: size of request in allocation units
899 * @align: alignment of area (max PAGE_SIZE bytes)
900 * @pop_only: use populated regions only
901 *
902 * Given a chunk and an allocation spec, find the offset to begin searching
903 * for a free region. This iterates over the bitmap metadata blocks to
904 * find an offset that will be guaranteed to fit the requirements. It is
905 * not quite first fit as if the allocation does not fit in the contig hint
906 * of a block or chunk, it is skipped. This errs on the side of caution
907 * to prevent excess iteration. Poor alignment can cause the allocator to
908 * skip over blocks and chunks that have valid free areas.
533 * 909 *
534 * RETURNS: 910 * RETURNS:
535 * Allocated offset in @chunk on success, -1 if no matching area is 911 * The offset in the bitmap to begin searching.
536 * found. 912 * -1 if no offset is found.
537 */ 913 */
538static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, 914static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
539 bool pop_only, int *occ_pages_p) 915 size_t align, bool pop_only)
540{ 916{
541 int oslot = pcpu_chunk_slot(chunk); 917 int bit_off, bits, next_off;
542 int max_contig = 0;
543 int i, off;
544 bool seen_free = false;
545 int *p;
546
547 for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
548 int head, tail;
549 int this_size;
550
551 off = *p;
552 if (off & 1)
553 continue;
554 918
555 this_size = (p[1] & ~1) - off; 919 /*
920 * Check to see if the allocation can fit in the chunk's contig hint.
921 * This is an optimization to prevent scanning by assuming if it
922 * cannot fit in the global hint, there is memory pressure and creating
923 * a new chunk would happen soon.
924 */
925 bit_off = ALIGN(chunk->contig_bits_start, align) -
926 chunk->contig_bits_start;
927 if (bit_off + alloc_bits > chunk->contig_bits)
928 return -1;
929
930 bit_off = chunk->first_bit;
931 bits = 0;
932 pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
933 if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
934 &next_off))
935 break;
556 936
557 head = pcpu_fit_in_area(chunk, off, this_size, size, align, 937 bit_off = next_off;
558 pop_only); 938 bits = 0;
559 if (head < 0) { 939 }
560 if (!seen_free) {
561 chunk->first_free = i;
562 seen_free = true;
563 }
564 max_contig = max(this_size, max_contig);
565 continue;
566 }
567 940
568 /* 941 if (bit_off == pcpu_chunk_map_bits(chunk))
569 * If head is small or the previous block is free, 942 return -1;
570 * merge'em. Note that 'small' is defined as smaller
571 * than sizeof(int), which is very small but isn't too
572 * uncommon for percpu allocations.
573 */
574 if (head && (head < sizeof(int) || !(p[-1] & 1))) {
575 *p = off += head;
576 if (p[-1] & 1)
577 chunk->free_size -= head;
578 else
579 max_contig = max(*p - p[-1], max_contig);
580 this_size -= head;
581 head = 0;
582 }
583 943
584 /* if tail is small, just keep it around */ 944 return bit_off;
585 tail = this_size - head - size; 945}
586 if (tail < sizeof(int)) {
587 tail = 0;
588 size = this_size - head;
589 }
590 946
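The early bail-out in pcpu_find_block_fit asks whether the request, once pushed forward to the first properly aligned offset inside the chunk's largest known free run, still fits in that run. A small sketch of that check with invented numbers (contig_start/contig_len are stand-ins for the chunk's contig hint fields):

    #include <stdbool.h>
    #include <stdio.h>

    #define ALIGN(x, a) ((((x) + (a) - 1) / (a)) * (a))

    /* can alloc_bits, aligned to align, fit in a free run [contig_start, +contig_len)? */
    static bool fits_contig_hint(int contig_start, int contig_len,
                                 int alloc_bits, int align)
    {
            int bit_off = ALIGN(contig_start, align) - contig_start;

            return bit_off + alloc_bits <= contig_len;
    }

    int main(void)
    {
            /* run of 16 units starting at offset 5, request of 14 units */
            printf("%d\n", fits_contig_hint(5, 16, 14, 8));  /* 0: alignment padding pushes it past the run */
            printf("%d\n", fits_contig_hint(5, 16, 14, 1));  /* 1: fits without padding */
            return 0;
    }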
591 /* split if warranted */ 947/**
592 if (head || tail) { 948 * pcpu_alloc_area - allocates an area from a pcpu_chunk
593 int nr_extra = !!head + !!tail; 949 * @chunk: chunk of interest
594 950 * @alloc_bits: size of request in allocation units
595 /* insert new subblocks */ 951 * @align: alignment of area (max PAGE_SIZE)
596 memmove(p + nr_extra + 1, p + 1, 952 * @start: bit_off to start searching
597 sizeof(chunk->map[0]) * (chunk->map_used - i)); 953 *
598 chunk->map_used += nr_extra; 954 * This function takes in a @start offset to begin searching to fit an
599 955 * allocation of @alloc_bits with alignment @align. It needs to scan
600 if (head) { 956 * the allocation map because if it fits within the block's contig hint,
601 if (!seen_free) { 957 * @start will be block->first_free. This is an attempt to fill the
602 chunk->first_free = i; 958 * allocation prior to breaking the contig hint. The allocation and
603 seen_free = true; 959 * boundary maps are updated accordingly if it confirms a valid
604 } 960 * free area.
605 *++p = off += head; 961 *
606 ++i; 962 * RETURNS:
607 max_contig = max(head, max_contig); 963 * Allocated addr offset in @chunk on success.
608 } 964 * -1 if no matching area is found.
609 if (tail) { 965 */
610 p[1] = off + size; 966static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
611 max_contig = max(tail, max_contig); 967 size_t align, int start)
612 } 968{
613 } 969 size_t align_mask = (align) ? (align - 1) : 0;
970 int bit_off, end, oslot;
614 971
615 if (!seen_free) 972 lockdep_assert_held(&pcpu_lock);
616 chunk->first_free = i + 1;
617 973
618 /* update hint and mark allocated */ 974 oslot = pcpu_chunk_slot(chunk);
619 if (i + 1 == chunk->map_used)
620 chunk->contig_hint = max_contig; /* fully scanned */
621 else
622 chunk->contig_hint = max(chunk->contig_hint,
623 max_contig);
624 975
625 chunk->free_size -= size; 976 /*
626 *p |= 1; 977 * Search to find a fit.
978 */
979 end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
980 bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
981 alloc_bits, align_mask);
982 if (bit_off >= end)
983 return -1;
627 984
628 *occ_pages_p = pcpu_count_occupied_pages(chunk, i); 985 /* update alloc map */
629 pcpu_chunk_relocate(chunk, oslot); 986 bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
630 return off; 987
631 } 988 /* update boundary map */
989 set_bit(bit_off, chunk->bound_map);
990 bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
991 set_bit(bit_off + alloc_bits, chunk->bound_map);
992
993 chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
994
995 /* update first free bit */
996 if (bit_off == chunk->first_bit)
997 chunk->first_bit = find_next_zero_bit(
998 chunk->alloc_map,
999 pcpu_chunk_map_bits(chunk),
1000 bit_off + alloc_bits);
1001
1002 pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
632 1003
633 chunk->contig_hint = max_contig; /* fully scanned */
634 pcpu_chunk_relocate(chunk, oslot); 1004 pcpu_chunk_relocate(chunk, oslot);
635 1005
636 /* tell the upper layer that this chunk has no matching area */ 1006 return bit_off * PCPU_MIN_ALLOC_SIZE;
637 return -1;
638} 1007}
639 1008
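The boundary map is what lets the allocator forget per-area sizes: each allocation sets the bit at its first unit and at the unit just past its end, and clears the interior, so the start bit of the next area doubles as the end marker of the previous one. A toy sketch of that bookkeeping on a small hand-rolled bitmap (the helpers are simplified stand-ins, not the kernel bitmap API):

    #include <stdio.h>

    #define MAP_BITS 32

    static int alloc_map[MAP_BITS];
    static int bound_map[MAP_BITS + 1];

    /* mirror of the alloc-side bookkeeping: sizes live only in bound_map */
    static void mark_alloc(int bit_off, int bits)
    {
            int i;

            for (i = bit_off; i < bit_off + bits; i++)
                    alloc_map[i] = 1;               /* units now in use */

            bound_map[bit_off] = 1;                 /* start of this area */
            for (i = bit_off + 1; i < bit_off + bits; i++)
                    bound_map[i] = 0;               /* interior carries no boundary */
            bound_map[bit_off + bits] = 1;          /* end; also the next area's start */
    }

    int main(void)
    {
            mark_alloc(2, 4);                       /* units [2, 6) */
            mark_alloc(6, 3);                       /* units [6, 9) share boundary bit 6 */

            for (int i = 0; i <= 10; i++)
                    printf("%d", bound_map[i]);     /* prints 00100010010 */
            printf("\n");
            return 0;
    }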
640/** 1009/**
641 * pcpu_free_area - free area to a pcpu_chunk 1010 * pcpu_free_area - frees the corresponding offset
642 * @chunk: chunk of interest 1011 * @chunk: chunk of interest
643 * @freeme: offset of area to free 1012 * @off: addr offset into chunk
644 * @occ_pages_p: out param for the number of pages the area occupies
645 *
646 * Free area starting from @freeme to @chunk. Note that this function
647 * only modifies the allocation map. It doesn't depopulate or unmap
648 * the area.
649 * 1013 *
650 * CONTEXT: 1014 * This function determines the size of an allocation to free using
651 * pcpu_lock. 1015 * the boundary bitmap and clears the allocation map.
652 */ 1016 */
653static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, 1017static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
654 int *occ_pages_p)
655{ 1018{
656 int oslot = pcpu_chunk_slot(chunk); 1019 int bit_off, bits, end, oslot;
657 int off = 0;
658 unsigned i, j;
659 int to_free = 0;
660 int *p;
661 1020
662 lockdep_assert_held(&pcpu_lock); 1021 lockdep_assert_held(&pcpu_lock);
663 pcpu_stats_area_dealloc(chunk); 1022 pcpu_stats_area_dealloc(chunk);
664 1023
665 freeme |= 1; /* we are searching for <given offset, in use> pair */ 1024 oslot = pcpu_chunk_slot(chunk);
666 1025
667 i = 0; 1026 bit_off = off / PCPU_MIN_ALLOC_SIZE;
668 j = chunk->map_used; 1027
669 while (i != j) { 1028 /* find end index */
670 unsigned k = (i + j) / 2; 1029 end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
671 off = chunk->map[k]; 1030 bit_off + 1);
672 if (off < freeme) 1031 bits = end - bit_off;
673 i = k + 1; 1032 bitmap_clear(chunk->alloc_map, bit_off, bits);
674 else if (off > freeme) 1033
675 j = k; 1034 /* update metadata */
676 else 1035 chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
677 i = j = k; 1036
1037 /* update first free bit */
1038 chunk->first_bit = min(chunk->first_bit, bit_off);
1039
1040 pcpu_block_update_hint_free(chunk, bit_off, bits);
1041
1042 pcpu_chunk_relocate(chunk, oslot);
1043}
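Freeing only gets a byte offset, so the size has to be rediscovered: the code scans bound_map for the next set bit after the area's start, and the distance is the number of units to clear. A self-contained sketch of that recovery on a tiny invented boundary map (the scan loop is a simplified analogue of find_next_bit):

    #include <stdio.h>

    /* toy boundary map: areas start at bit 2 (4 units) and bit 6 (3 units) */
    static const int bound_map[] = { 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0 };

    /* size of the area beginning at bit_off, by scanning for the next boundary */
    static int area_size(int bit_off, int map_bits)
    {
            int end = bit_off + 1;

            while (end < map_bits && !bound_map[end])   /* find_next_bit analogue */
                    end++;
            return end - bit_off;
    }

    int main(void)
    {
            printf("%d %d\n", area_size(2, 11), area_size(6, 11));  /* 4 3 */
            return 0;
    }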
1044
1045static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
1046{
1047 struct pcpu_block_md *md_block;
1048
1049 for (md_block = chunk->md_blocks;
1050 md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
1051 md_block++) {
1052 md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
1053 md_block->left_free = PCPU_BITMAP_BLOCK_BITS;
1054 md_block->right_free = PCPU_BITMAP_BLOCK_BITS;
678 } 1055 }
679 BUG_ON(off != freeme); 1056}
680 1057
681 if (i < chunk->first_free) 1058/**
682 chunk->first_free = i; 1059 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
1060 * @tmp_addr: the start of the region served
1061 * @map_size: size of the region served
1062 *
1063 * This is responsible for creating the chunks that serve the first chunk. The
1064 * base_addr is page aligned down of @tmp_addr while the region end is page
1065 * aligned up. Offsets are kept track of to determine the region served. All
1066 * this is done to appease the bitmap allocator in avoiding partial blocks.
1067 *
1068 * RETURNS:
1069 * Chunk serving the region at @tmp_addr of @map_size.
1070 */
1071static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
1072 int map_size)
1073{
1074 struct pcpu_chunk *chunk;
1075 unsigned long aligned_addr, lcm_align;
1076 int start_offset, offset_bits, region_size, region_bits;
683 1077
684 p = chunk->map + i; 1078 /* region calculations */
685 *p = off &= ~1; 1079 aligned_addr = tmp_addr & PAGE_MASK;
686 chunk->free_size += (p[1] & ~1) - off;
687 1080
688 *occ_pages_p = pcpu_count_occupied_pages(chunk, i); 1081 start_offset = tmp_addr - aligned_addr;
689 1082
690 /* merge with next? */ 1083 /*
691 if (!(p[1] & 1)) 1084 * Align the end of the region with the LCM of PAGE_SIZE and
692 to_free++; 1085 * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of
693 /* merge with previous? */ 1086 * the other.
694 if (i > 0 && !(p[-1] & 1)) { 1087 */
695 to_free++; 1088 lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
696 i--; 1089 region_size = ALIGN(start_offset + map_size, lcm_align);
697 p--; 1090
1091 /* allocate chunk */
1092 chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
1093 BITS_TO_LONGS(region_size >> PAGE_SHIFT),
1094 0);
1095
1096 INIT_LIST_HEAD(&chunk->list);
1097
1098 chunk->base_addr = (void *)aligned_addr;
1099 chunk->start_offset = start_offset;
1100 chunk->end_offset = region_size - chunk->start_offset - map_size;
1101
1102 chunk->nr_pages = region_size >> PAGE_SHIFT;
1103 region_bits = pcpu_chunk_map_bits(chunk);
1104
1105 chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) *
1106 sizeof(chunk->alloc_map[0]), 0);
1107 chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) *
1108 sizeof(chunk->bound_map[0]), 0);
1109 chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) *
1110 sizeof(chunk->md_blocks[0]), 0);
1111 pcpu_init_md_blocks(chunk);
1112
1113 /* manage populated page bitmap */
1114 chunk->immutable = true;
1115 bitmap_fill(chunk->populated, chunk->nr_pages);
1116 chunk->nr_populated = chunk->nr_pages;
1117 chunk->nr_empty_pop_pages =
1118 pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
1119 map_size / PCPU_MIN_ALLOC_SIZE);
1120
1121 chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
1122 chunk->free_bytes = map_size;
1123
1124 if (chunk->start_offset) {
1125 /* hide the beginning of the bitmap */
1126 offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
1127 bitmap_set(chunk->alloc_map, 0, offset_bits);
1128 set_bit(0, chunk->bound_map);
1129 set_bit(offset_bits, chunk->bound_map);
1130
1131 chunk->first_bit = offset_bits;
1132
1133 pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
698 } 1134 }
699 if (to_free) { 1135
700 chunk->map_used -= to_free; 1136 if (chunk->end_offset) {
701 memmove(p + 1, p + 1 + to_free, 1137 /* hide the end of the bitmap */
702 (chunk->map_used - i) * sizeof(chunk->map[0])); 1138 offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
1139 bitmap_set(chunk->alloc_map,
1140 pcpu_chunk_map_bits(chunk) - offset_bits,
1141 offset_bits);
1142 set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
1143 chunk->bound_map);
1144 set_bit(region_bits, chunk->bound_map);
1145
1146 pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
1147 - offset_bits, offset_bits);
703 } 1148 }
704 1149
705 chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); 1150 return chunk;
706 pcpu_chunk_relocate(chunk, oslot);
707} 1151}
708 1152
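The first-chunk region math above can be pictured with plain numbers: the served address is pushed down to a page boundary, the end is pushed up to the least common multiple of the page size and the bitmap block size, and the padding that creates is hidden from the allocator through start_offset/end_offset. A worked sketch with invented sizes (these constants are placeholders, not the kernel's):

    #include <stdio.h>

    #define PAGE_SIZE   4096
    #define BLOCK_SIZE  4096                     /* pretend PCPU_BITMAP_BLOCK_SIZE == PAGE_SIZE */
    #define ALIGN(x, a) ((((x) + (a) - 1) / (a)) * (a))

    static unsigned long lcm_ul(unsigned long a, unsigned long b)
    {
            unsigned long x = a, y = b;

            while (y) {                          /* gcd by Euclid */
                    unsigned long t = x % y;
                    x = y;
                    y = t;
            }
            return a / x * b;
    }

    int main(void)
    {
            unsigned long tmp_addr = 0x10000a40; /* hypothetical served address */
            int map_size = 20000;                /* hypothetical served bytes */

            unsigned long aligned_addr = tmp_addr & ~(unsigned long)(PAGE_SIZE - 1);
            int start_offset = tmp_addr - aligned_addr;      /* 0xa40 = 2624 */
            int region_size  = ALIGN(start_offset + map_size,
                                     lcm_ul(PAGE_SIZE, BLOCK_SIZE));
            int end_offset   = region_size - start_offset - map_size;

            printf("region %d, hidden head %d, hidden tail %d\n",
                   region_size, start_offset, end_offset);   /* 24576, 2624, 1952 */
            return 0;
    }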
709static struct pcpu_chunk *pcpu_alloc_chunk(void) 1153static struct pcpu_chunk *pcpu_alloc_chunk(void)
710{ 1154{
711 struct pcpu_chunk *chunk; 1155 struct pcpu_chunk *chunk;
1156 int region_bits;
712 1157
713 chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); 1158 chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
714 if (!chunk) 1159 if (!chunk)
715 return NULL; 1160 return NULL;
716 1161
717 chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * 1162 INIT_LIST_HEAD(&chunk->list);
718 sizeof(chunk->map[0])); 1163 chunk->nr_pages = pcpu_unit_pages;
719 if (!chunk->map) { 1164 region_bits = pcpu_chunk_map_bits(chunk);
720 pcpu_mem_free(chunk);
721 return NULL;
722 }
723 1165
724 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 1166 chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
725 chunk->map[0] = 0; 1167 sizeof(chunk->alloc_map[0]));
726 chunk->map[1] = pcpu_unit_size | 1; 1168 if (!chunk->alloc_map)
727 chunk->map_used = 1; 1169 goto alloc_map_fail;
728 chunk->has_reserved = false;
729 1170
730 INIT_LIST_HEAD(&chunk->list); 1171 chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
731 INIT_LIST_HEAD(&chunk->map_extend_list); 1172 sizeof(chunk->bound_map[0]));
732 chunk->free_size = pcpu_unit_size; 1173 if (!chunk->bound_map)
733 chunk->contig_hint = pcpu_unit_size; 1174 goto bound_map_fail;
1175
1176 chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
1177 sizeof(chunk->md_blocks[0]));
1178 if (!chunk->md_blocks)
1179 goto md_blocks_fail;
1180
1181 pcpu_init_md_blocks(chunk);
1182
1183 /* init metadata */
1184 chunk->contig_bits = region_bits;
1185 chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
734 1186
735 return chunk; 1187 return chunk;
1188
1189md_blocks_fail:
1190 pcpu_mem_free(chunk->bound_map);
1191bound_map_fail:
1192 pcpu_mem_free(chunk->alloc_map);
1193alloc_map_fail:
1194 pcpu_mem_free(chunk);
1195
1196 return NULL;
736} 1197}
737 1198
738static void pcpu_free_chunk(struct pcpu_chunk *chunk) 1199static void pcpu_free_chunk(struct pcpu_chunk *chunk)
739{ 1200{
740 if (!chunk) 1201 if (!chunk)
741 return; 1202 return;
742 pcpu_mem_free(chunk->map); 1203 pcpu_mem_free(chunk->bound_map);
1204 pcpu_mem_free(chunk->alloc_map);
743 pcpu_mem_free(chunk); 1205 pcpu_mem_free(chunk);
744} 1206}
745 1207
@@ -748,13 +1210,17 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
748 * @chunk: pcpu_chunk which got populated 1210 * @chunk: pcpu_chunk which got populated
749 * @page_start: the start page 1211 * @page_start: the start page
750 * @page_end: the end page 1212 * @page_end: the end page
1213 * @for_alloc: if this is to populate for allocation
751 * 1214 *
752 * Pages in [@page_start,@page_end) have been populated to @chunk. Update 1215 * Pages in [@page_start,@page_end) have been populated to @chunk. Update
753 * the bookkeeping information accordingly. Must be called after each 1216 * the bookkeeping information accordingly. Must be called after each
754 * successful population. 1217 * successful population.
1218 *
1219 * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
1220 * is to serve an allocation in that area.
755 */ 1221 */
756static void pcpu_chunk_populated(struct pcpu_chunk *chunk, 1222static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
757 int page_start, int page_end) 1223 int page_end, bool for_alloc)
758{ 1224{
759 int nr = page_end - page_start; 1225 int nr = page_end - page_start;
760 1226
@@ -762,7 +1228,11 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
762 1228
763 bitmap_set(chunk->populated, page_start, nr); 1229 bitmap_set(chunk->populated, page_start, nr);
764 chunk->nr_populated += nr; 1230 chunk->nr_populated += nr;
765 pcpu_nr_empty_pop_pages += nr; 1231
1232 if (!for_alloc) {
1233 chunk->nr_empty_pop_pages += nr;
1234 pcpu_nr_empty_pop_pages += nr;
1235 }
766} 1236}
767 1237
768/** 1238/**
@@ -784,6 +1254,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
784 1254
785 bitmap_clear(chunk->populated, page_start, nr); 1255 bitmap_clear(chunk->populated, page_start, nr);
786 chunk->nr_populated -= nr; 1256 chunk->nr_populated -= nr;
1257 chunk->nr_empty_pop_pages -= nr;
787 pcpu_nr_empty_pop_pages -= nr; 1258 pcpu_nr_empty_pop_pages -= nr;
788} 1259}
789 1260
@@ -819,18 +1290,21 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
819 * pcpu_chunk_addr_search - determine chunk containing specified address 1290 * pcpu_chunk_addr_search - determine chunk containing specified address
820 * @addr: address for which the chunk needs to be determined. 1291 * @addr: address for which the chunk needs to be determined.
821 * 1292 *
1293 * This is an internal function that handles all but static allocations.
1294 * Static percpu address values should never be passed into the allocator.
1295 *
822 * RETURNS: 1296 * RETURNS:
823 * The address of the found chunk. 1297 * The address of the found chunk.
824 */ 1298 */
825static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 1299static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
826{ 1300{
827 /* is it in the first chunk? */ 1301 /* is it in the dynamic region (first chunk)? */
828 if (pcpu_addr_in_first_chunk(addr)) { 1302 if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
829 /* is it in the reserved area? */
830 if (pcpu_addr_in_reserved_chunk(addr))
831 return pcpu_reserved_chunk;
832 return pcpu_first_chunk; 1303 return pcpu_first_chunk;
833 } 1304
1305 /* is it in the reserved region? */
1306 if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
1307 return pcpu_reserved_chunk;
834 1308
835 /* 1309 /*
836 * The address is relative to unit0 which might be unused and 1310 * The address is relative to unit0 which might be unused and
@@ -863,19 +1337,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
863 struct pcpu_chunk *chunk; 1337 struct pcpu_chunk *chunk;
864 const char *err; 1338 const char *err;
865 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; 1339 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
866 int occ_pages = 0; 1340 int slot, off, cpu, ret;
867 int slot, off, new_alloc, cpu, ret;
868 unsigned long flags; 1341 unsigned long flags;
869 void __percpu *ptr; 1342 void __percpu *ptr;
1343 size_t bits, bit_align;
870 1344
871 /* 1345 /*
872 * We want the lowest bit of offset available for in-use/free 1346 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
873 * indicator, so force >= 16bit alignment and make size even. 1347 * therefore alignment must be a minimum of that many bytes.
1348 * An allocation may have internal fragmentation from rounding up
1349 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
874 */ 1350 */
875 if (unlikely(align < 2)) 1351 if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
876 align = 2; 1352 align = PCPU_MIN_ALLOC_SIZE;
877 1353
878 size = ALIGN(size, 2); 1354 size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
1355 bits = size >> PCPU_MIN_ALLOC_SHIFT;
1356 bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
879 1357
880 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || 1358 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
881 !is_power_of_2(align))) { 1359 !is_power_of_2(align))) {
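With a 4-byte minimum allocation unit, pcpu_alloc now rounds the request up and then works in allocation units rather than bytes. A small worked example of that conversion (the request values are picked arbitrarily):

    #include <stdio.h>

    #define PCPU_MIN_ALLOC_SHIFT 2
    #define PCPU_MIN_ALLOC_SIZE  (1 << PCPU_MIN_ALLOC_SHIFT)
    #define ALIGN(x, a)          ((((x) + (a) - 1) / (a)) * (a))

    int main(void)
    {
            size_t size = 13, align = 1;                      /* hypothetical request */

            if (align < PCPU_MIN_ALLOC_SIZE)
                    align = PCPU_MIN_ALLOC_SIZE;
            size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);          /* 13 -> 16 bytes */

            size_t bits = size >> PCPU_MIN_ALLOC_SHIFT;       /* 4 allocation units */
            size_t bit_align = align >> PCPU_MIN_ALLOC_SHIFT; /* 1 unit */

            printf("bits=%zu bit_align=%zu\n", bits, bit_align);
            return 0;
    }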
@@ -893,23 +1371,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
893 if (reserved && pcpu_reserved_chunk) { 1371 if (reserved && pcpu_reserved_chunk) {
894 chunk = pcpu_reserved_chunk; 1372 chunk = pcpu_reserved_chunk;
895 1373
896 if (size > chunk->contig_hint) { 1374 off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
1375 if (off < 0) {
897 err = "alloc from reserved chunk failed"; 1376 err = "alloc from reserved chunk failed";
898 goto fail_unlock; 1377 goto fail_unlock;
899 } 1378 }
900 1379
901 while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { 1380 off = pcpu_alloc_area(chunk, bits, bit_align, off);
902 spin_unlock_irqrestore(&pcpu_lock, flags);
903 if (is_atomic ||
904 pcpu_extend_area_map(chunk, new_alloc) < 0) {
905 err = "failed to extend area map of reserved chunk";
906 goto fail;
907 }
908 spin_lock_irqsave(&pcpu_lock, flags);
909 }
910
911 off = pcpu_alloc_area(chunk, size, align, is_atomic,
912 &occ_pages);
913 if (off >= 0) 1381 if (off >= 0)
914 goto area_found; 1382 goto area_found;
915 1383
@@ -921,31 +1389,15 @@ restart:
921 /* search through normal chunks */ 1389 /* search through normal chunks */
922 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { 1390 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
923 list_for_each_entry(chunk, &pcpu_slot[slot], list) { 1391 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
924 if (size > chunk->contig_hint) 1392 off = pcpu_find_block_fit(chunk, bits, bit_align,
1393 is_atomic);
1394 if (off < 0)
925 continue; 1395 continue;
926 1396
927 new_alloc = pcpu_need_to_extend(chunk, is_atomic); 1397 off = pcpu_alloc_area(chunk, bits, bit_align, off);
928 if (new_alloc) {
929 if (is_atomic)
930 continue;
931 spin_unlock_irqrestore(&pcpu_lock, flags);
932 if (pcpu_extend_area_map(chunk,
933 new_alloc) < 0) {
934 err = "failed to extend area map";
935 goto fail;
936 }
937 spin_lock_irqsave(&pcpu_lock, flags);
938 /*
939 * pcpu_lock has been dropped, need to
940 * restart cpu_slot list walking.
941 */
942 goto restart;
943 }
944
945 off = pcpu_alloc_area(chunk, size, align, is_atomic,
946 &occ_pages);
947 if (off >= 0) 1398 if (off >= 0)
948 goto area_found; 1399 goto area_found;
1400
949 } 1401 }
950 } 1402 }
951 1403
@@ -987,30 +1439,25 @@ area_found:
987 page_start = PFN_DOWN(off); 1439 page_start = PFN_DOWN(off);
988 page_end = PFN_UP(off + size); 1440 page_end = PFN_UP(off + size);
989 1441
990 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 1442 pcpu_for_each_unpop_region(chunk->populated, rs, re,
1443 page_start, page_end) {
991 WARN_ON(chunk->immutable); 1444 WARN_ON(chunk->immutable);
992 1445
993 ret = pcpu_populate_chunk(chunk, rs, re); 1446 ret = pcpu_populate_chunk(chunk, rs, re);
994 1447
995 spin_lock_irqsave(&pcpu_lock, flags); 1448 spin_lock_irqsave(&pcpu_lock, flags);
996 if (ret) { 1449 if (ret) {
997 pcpu_free_area(chunk, off, &occ_pages); 1450 pcpu_free_area(chunk, off);
998 err = "failed to populate"; 1451 err = "failed to populate";
999 goto fail_unlock; 1452 goto fail_unlock;
1000 } 1453 }
1001 pcpu_chunk_populated(chunk, rs, re); 1454 pcpu_chunk_populated(chunk, rs, re, true);
1002 spin_unlock_irqrestore(&pcpu_lock, flags); 1455 spin_unlock_irqrestore(&pcpu_lock, flags);
1003 } 1456 }
1004 1457
1005 mutex_unlock(&pcpu_alloc_mutex); 1458 mutex_unlock(&pcpu_alloc_mutex);
1006 } 1459 }
1007 1460
1008 if (chunk != pcpu_reserved_chunk) {
1009 spin_lock_irqsave(&pcpu_lock, flags);
1010 pcpu_nr_empty_pop_pages -= occ_pages;
1011 spin_unlock_irqrestore(&pcpu_lock, flags);
1012 }
1013
1014 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) 1461 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1015 pcpu_schedule_balance_work(); 1462 pcpu_schedule_balance_work();
1016 1463
@@ -1128,7 +1575,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
1128 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) 1575 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
1129 continue; 1576 continue;
1130 1577
1131 list_del_init(&chunk->map_extend_list);
1132 list_move(&chunk->list, &to_free); 1578 list_move(&chunk->list, &to_free);
1133 } 1579 }
1134 1580
@@ -1137,7 +1583,8 @@ static void pcpu_balance_workfn(struct work_struct *work)
1137 list_for_each_entry_safe(chunk, next, &to_free, list) { 1583 list_for_each_entry_safe(chunk, next, &to_free, list) {
1138 int rs, re; 1584 int rs, re;
1139 1585
1140 pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { 1586 pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
1587 chunk->nr_pages) {
1141 pcpu_depopulate_chunk(chunk, rs, re); 1588 pcpu_depopulate_chunk(chunk, rs, re);
1142 spin_lock_irq(&pcpu_lock); 1589 spin_lock_irq(&pcpu_lock);
1143 pcpu_chunk_depopulated(chunk, rs, re); 1590 pcpu_chunk_depopulated(chunk, rs, re);
@@ -1146,25 +1593,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
1146 pcpu_destroy_chunk(chunk); 1593 pcpu_destroy_chunk(chunk);
1147 } 1594 }
1148 1595
1149 /* service chunks which requested async area map extension */
1150 do {
1151 int new_alloc = 0;
1152
1153 spin_lock_irq(&pcpu_lock);
1154
1155 chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
1156 struct pcpu_chunk, map_extend_list);
1157 if (chunk) {
1158 list_del_init(&chunk->map_extend_list);
1159 new_alloc = pcpu_need_to_extend(chunk, false);
1160 }
1161
1162 spin_unlock_irq(&pcpu_lock);
1163
1164 if (new_alloc)
1165 pcpu_extend_area_map(chunk, new_alloc);
1166 } while (chunk);
1167
1168 /* 1596 /*
1169 * Ensure there are certain number of free populated pages for 1597 * Ensure there are certain number of free populated pages for
1170 * atomic allocs. Fill up from the most packed so that atomic 1598 * atomic allocs. Fill up from the most packed so that atomic
@@ -1194,7 +1622,7 @@ retry_pop:
1194 1622
1195 spin_lock_irq(&pcpu_lock); 1623 spin_lock_irq(&pcpu_lock);
1196 list_for_each_entry(chunk, &pcpu_slot[slot], list) { 1624 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
1197 nr_unpop = pcpu_unit_pages - chunk->nr_populated; 1625 nr_unpop = chunk->nr_pages - chunk->nr_populated;
1198 if (nr_unpop) 1626 if (nr_unpop)
1199 break; 1627 break;
1200 } 1628 }
@@ -1204,14 +1632,15 @@ retry_pop:
1204 continue; 1632 continue;
1205 1633
1206 /* @chunk can't go away while pcpu_alloc_mutex is held */ 1634 /* @chunk can't go away while pcpu_alloc_mutex is held */
1207 pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { 1635 pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
1636 chunk->nr_pages) {
1208 int nr = min(re - rs, nr_to_pop); 1637 int nr = min(re - rs, nr_to_pop);
1209 1638
1210 ret = pcpu_populate_chunk(chunk, rs, rs + nr); 1639 ret = pcpu_populate_chunk(chunk, rs, rs + nr);
1211 if (!ret) { 1640 if (!ret) {
1212 nr_to_pop -= nr; 1641 nr_to_pop -= nr;
1213 spin_lock_irq(&pcpu_lock); 1642 spin_lock_irq(&pcpu_lock);
1214 pcpu_chunk_populated(chunk, rs, rs + nr); 1643 pcpu_chunk_populated(chunk, rs, rs + nr, false);
1215 spin_unlock_irq(&pcpu_lock); 1644 spin_unlock_irq(&pcpu_lock);
1216 } else { 1645 } else {
1217 nr_to_pop = 0; 1646 nr_to_pop = 0;
@@ -1250,7 +1679,7 @@ void free_percpu(void __percpu *ptr)
1250 void *addr; 1679 void *addr;
1251 struct pcpu_chunk *chunk; 1680 struct pcpu_chunk *chunk;
1252 unsigned long flags; 1681 unsigned long flags;
1253 int off, occ_pages; 1682 int off;
1254 1683
1255 if (!ptr) 1684 if (!ptr)
1256 return; 1685 return;
@@ -1264,13 +1693,10 @@ void free_percpu(void __percpu *ptr)
1264 chunk = pcpu_chunk_addr_search(addr); 1693 chunk = pcpu_chunk_addr_search(addr);
1265 off = addr - chunk->base_addr; 1694 off = addr - chunk->base_addr;
1266 1695
1267 pcpu_free_area(chunk, off, &occ_pages); 1696 pcpu_free_area(chunk, off);
1268
1269 if (chunk != pcpu_reserved_chunk)
1270 pcpu_nr_empty_pop_pages += occ_pages;
1271 1697
1272 /* if there are more than one fully free chunks, wake up grim reaper */ 1698 /* if there are more than one fully free chunks, wake up grim reaper */
1273 if (chunk->free_size == pcpu_unit_size) { 1699 if (chunk->free_bytes == pcpu_unit_size) {
1274 struct pcpu_chunk *pos; 1700 struct pcpu_chunk *pos;
1275 1701
1276 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 1702 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
@@ -1361,10 +1787,16 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1361 * The following test on unit_low/high isn't strictly 1787 * The following test on unit_low/high isn't strictly
1362 * necessary but will speed up lookups of addresses which 1788 * necessary but will speed up lookups of addresses which
1363 * aren't in the first chunk. 1789 * aren't in the first chunk.
1790 *
1791 * The address check is against full chunk sizes. pcpu_base_addr
1792 * points to the beginning of the first chunk including the
1793 * static region. Assumes good intent as the first chunk may
1794 * not be full (ie. < pcpu_unit_pages in size).
1364 */ 1795 */
1365 first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); 1796 first_low = (unsigned long)pcpu_base_addr +
1366 first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, 1797 pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
1367 pcpu_unit_pages); 1798 first_high = (unsigned long)pcpu_base_addr +
1799 pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
1368 if ((unsigned long)addr >= first_low && 1800 if ((unsigned long)addr >= first_low &&
1369 (unsigned long)addr < first_high) { 1801 (unsigned long)addr < first_high) {
1370 for_each_possible_cpu(cpu) { 1802 for_each_possible_cpu(cpu) {
@@ -1546,12 +1978,13 @@ static void pcpu_dump_alloc_info(const char *lvl,
1546 * The caller should have mapped the first chunk at @base_addr and 1978 * The caller should have mapped the first chunk at @base_addr and
1547 * copied static data to each unit. 1979 * copied static data to each unit.
1548 * 1980 *
1549 * If the first chunk ends up with both reserved and dynamic areas, it 1981 * The first chunk will always contain a static and a dynamic region.
1550 * is served by two chunks - one to serve the core static and reserved 1982 * However, the static region is not managed by any chunk. If the first
1551 * areas and the other for the dynamic area. They share the same vm 1983 * chunk also contains a reserved region, it is served by two chunks -
1552 * and page map but uses different area allocation map to stay away 1984 * one for the reserved region and one for the dynamic region. They
1553 * from each other. The latter chunk is circulated in the chunk slots 1985 * share the same vm, but use offset regions in the area allocation map.
1554 * and available for dynamic allocation like any other chunks. 1986 * The chunk serving the dynamic region is circulated in the chunk slots
1987 * and available for dynamic allocation like any other chunk.
1555 * 1988 *
1556 * RETURNS: 1989 * RETURNS:
1557 * 0 on success, -errno on failure. 1990 * 0 on success, -errno on failure.
@@ -1559,17 +1992,17 @@ static void pcpu_dump_alloc_info(const char *lvl,
1559int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, 1992int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1560 void *base_addr) 1993 void *base_addr)
1561{ 1994{
1562 static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; 1995 size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1563 static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; 1996 size_t static_size, dyn_size;
1564 size_t dyn_size = ai->dyn_size; 1997 struct pcpu_chunk *chunk;
1565 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1566 struct pcpu_chunk *schunk, *dchunk = NULL;
1567 unsigned long *group_offsets; 1998 unsigned long *group_offsets;
1568 size_t *group_sizes; 1999 size_t *group_sizes;
1569 unsigned long *unit_off; 2000 unsigned long *unit_off;
1570 unsigned int cpu; 2001 unsigned int cpu;
1571 int *unit_map; 2002 int *unit_map;
1572 int group, unit, i; 2003 int group, unit, i;
2004 int map_size;
2005 unsigned long tmp_addr;
1573 2006
1574#define PCPU_SETUP_BUG_ON(cond) do { \ 2007#define PCPU_SETUP_BUG_ON(cond) do { \
1575 if (unlikely(cond)) { \ 2008 if (unlikely(cond)) { \
@@ -1592,7 +2025,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1592 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 2025 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1593 PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); 2026 PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
1594 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 2027 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
2028 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
1595 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); 2029 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
2030 PCPU_SETUP_BUG_ON(!ai->dyn_size);
2031 PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
2032 PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
2033 IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
1596 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); 2034 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1597 2035
1598 /* process group information and build config tables accordingly */ 2036 /* process group information and build config tables accordingly */
@@ -1671,64 +2109,41 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1671 INIT_LIST_HEAD(&pcpu_slot[i]); 2109 INIT_LIST_HEAD(&pcpu_slot[i]);
1672 2110
1673 /* 2111 /*
1674 * Initialize static chunk. If reserved_size is zero, the 2112 * The end of the static region needs to be aligned with the
1675 * static chunk covers static area + dynamic allocation area 2113 * minimum allocation size as this offsets the reserved and
1676 * in the first chunk. If reserved_size is not zero, it 2114 * dynamic region. The first chunk ends page aligned by
1677 * covers static area + reserved area (mostly used for module 2115 * expanding the dynamic region, therefore the dynamic region
1678 * static percpu allocation). 2116 * can be shrunk to compensate while still staying above the
2117 * configured sizes.
1679 */ 2118 */
1680 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 2119 static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
1681 INIT_LIST_HEAD(&schunk->list); 2120 dyn_size = ai->dyn_size - (static_size - ai->static_size);
1682 INIT_LIST_HEAD(&schunk->map_extend_list);
1683 schunk->base_addr = base_addr;
1684 schunk->map = smap;
1685 schunk->map_alloc = ARRAY_SIZE(smap);
1686 schunk->immutable = true;
1687 bitmap_fill(schunk->populated, pcpu_unit_pages);
1688 schunk->nr_populated = pcpu_unit_pages;
1689 2121
1690 if (ai->reserved_size) { 2122 /*
1691 schunk->free_size = ai->reserved_size; 2123 * Initialize first chunk.
1692 pcpu_reserved_chunk = schunk; 2124 * If the reserved_size is non-zero, this initializes the reserved
1693 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; 2125 * chunk. If the reserved_size is zero, the reserved chunk is NULL
1694 } else { 2126 * and the dynamic region is initialized here. The first chunk,
1695 schunk->free_size = dyn_size; 2127 * pcpu_first_chunk, will always point to the chunk that serves
1696 dyn_size = 0; /* dynamic area covered */ 2128 * the dynamic region.
1697 } 2129 */
1698 schunk->contig_hint = schunk->free_size; 2130 tmp_addr = (unsigned long)base_addr + static_size;
1699 2131 map_size = ai->reserved_size ?: dyn_size;
1700 schunk->map[0] = 1; 2132 chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
1701 schunk->map[1] = ai->static_size;
1702 schunk->map_used = 1;
1703 if (schunk->free_size)
1704 schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
1705 schunk->map[schunk->map_used] |= 1;
1706 schunk->has_reserved = true;
1707 2133
1708 /* init dynamic chunk if necessary */ 2134 /* init dynamic chunk if necessary */
1709 if (dyn_size) { 2135 if (ai->reserved_size) {
1710 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 2136 pcpu_reserved_chunk = chunk;
1711 INIT_LIST_HEAD(&dchunk->list); 2137
1712 INIT_LIST_HEAD(&dchunk->map_extend_list); 2138 tmp_addr = (unsigned long)base_addr + static_size +
1713 dchunk->base_addr = base_addr; 2139 ai->reserved_size;
1714 dchunk->map = dmap; 2140 map_size = dyn_size;
1715 dchunk->map_alloc = ARRAY_SIZE(dmap); 2141 chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
1716 dchunk->immutable = true;
1717 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1718 dchunk->nr_populated = pcpu_unit_pages;
1719
1720 dchunk->contig_hint = dchunk->free_size = dyn_size;
1721 dchunk->map[0] = 1;
1722 dchunk->map[1] = pcpu_reserved_chunk_limit;
1723 dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
1724 dchunk->map_used = 2;
1725 dchunk->has_reserved = true;
1726 } 2142 }
1727 2143
1728 /* link the first chunk in */ 2144 /* link the first chunk in */
1729 pcpu_first_chunk = dchunk ?: schunk; 2145 pcpu_first_chunk = chunk;
1730 pcpu_nr_empty_pop_pages += 2146 pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
1731 pcpu_count_occupied_pages(pcpu_first_chunk, 1);
1732 pcpu_chunk_relocate(pcpu_first_chunk, -1); 2147 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1733 2148
1734 pcpu_stats_chunk_alloc(); 2149 pcpu_stats_chunk_alloc();
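The split above can be followed with concrete numbers: the static region's end is rounded up to the minimum allocation unit, the dynamic region gives up the bytes consumed by that rounding, and the reserved region (when present) starts right after the aligned static end with the dynamic region behind it. A hedged sketch with made-up sizes:

    #include <stdio.h>

    #define PCPU_MIN_ALLOC_SIZE 4
    #define ALIGN(x, a) ((((x) + (a) - 1) / (a)) * (a))

    int main(void)
    {
            /* hypothetical first-chunk layout */
            unsigned long base_addr = 0x1000000;
            int ai_static_size = 10001, ai_reserved_size = 8192, ai_dyn_size = 20480;

            int static_size = ALIGN(ai_static_size, PCPU_MIN_ALLOC_SIZE);  /* 10004 */
            int dyn_size = ai_dyn_size - (static_size - ai_static_size);   /* 20477 */

            unsigned long reserved_start = base_addr + static_size;
            unsigned long dyn_start = reserved_start + ai_reserved_size;

            printf("reserved @+%lu for %d, dynamic @+%lu for %d\n",
                   reserved_start - base_addr, ai_reserved_size,
                   dyn_start - base_addr, dyn_size);
            return 0;
    }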
@@ -1842,6 +2257,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1842 */ 2257 */
1843 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); 2258 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1844 2259
2260 /* determine the maximum # of units that can fit in an allocation */
1845 alloc_size = roundup(min_unit_size, atom_size); 2261 alloc_size = roundup(min_unit_size, atom_size);
1846 upa = alloc_size / min_unit_size; 2262 upa = alloc_size / min_unit_size;
1847 while (alloc_size % upa || (offset_in_page(alloc_size / upa))) 2263 while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
@@ -1868,9 +2284,9 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1868 } 2284 }
1869 2285
1870 /* 2286 /*
1871 * Expand unit size until address space usage goes over 75% 2287 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
1872 * and then as much as possible without using more address 2288 * Expand the unit_size until we use >= 75% of the units allocated.
1873 * space. 2289 * Related to atom_size, which could be much larger than the unit_size.
1874 */ 2290 */
1875 last_allocs = INT_MAX; 2291 last_allocs = INT_MAX;
1876 for (upa = max_upa; upa; upa--) { 2292 for (upa = max_upa; upa; upa--) {
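The unit-sizing logic rounds the minimum unit up to the atom size, derives how many units fit in one atom-sized allocation (upa), then shrinks upa until each unit both divides the allocation evenly and stays page aligned; the loop above then settles on the largest unit size that still keeps roughly 75% or more of the allocated units in use. A rough standalone sketch of the upa derivation only (not the full group accounting), with invented sizes:

    #include <stdio.h>

    #define PAGE_SIZE 4096

    static int roundup_to(int x, int a) { return (x + a - 1) / a * a; }

    int main(void)
    {
            int min_unit_size = 45056;               /* hypothetical: 11 pages */
            int atom_size = 2 * 1024 * 1024;         /* e.g. a 2MB mapping granule */

            int alloc_size = roundup_to(min_unit_size, atom_size);   /* 2MB */
            int upa = alloc_size / min_unit_size;                    /* 46 */

            /* unit size must divide the allocation evenly and be page aligned */
            while (alloc_size % upa || (alloc_size / upa) % PAGE_SIZE)
                    upa--;

            printf("max_upa=%d unit_size=%d\n", upa, alloc_size / upa);  /* 32, 65536 */
            return 0;
    }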
@@ -2299,36 +2715,6 @@ void __init setup_per_cpu_areas(void)
2299#endif /* CONFIG_SMP */ 2715#endif /* CONFIG_SMP */
2300 2716
2301/* 2717/*
2302 * First and reserved chunks are initialized with temporary allocation
2303 * map in initdata so that they can be used before slab is online.
2304 * This function is called after slab is brought up and replaces those
2305 * with properly allocated maps.
2306 */
2307void __init percpu_init_late(void)
2308{
2309 struct pcpu_chunk *target_chunks[] =
2310 { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
2311 struct pcpu_chunk *chunk;
2312 unsigned long flags;
2313 int i;
2314
2315 for (i = 0; (chunk = target_chunks[i]); i++) {
2316 int *map;
2317 const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
2318
2319 BUILD_BUG_ON(size > PAGE_SIZE);
2320
2321 map = pcpu_mem_zalloc(size);
2322 BUG_ON(!map);
2323
2324 spin_lock_irqsave(&pcpu_lock, flags);
2325 memcpy(map, chunk->map, size);
2326 chunk->map = map;
2327 spin_unlock_irqrestore(&pcpu_lock, flags);
2328 }
2329}
2330
2331/*
2332 * Percpu allocator is initialized early during boot when neither slab or 2718 * Percpu allocator is initialized early during boot when neither slab or
2333 * workqueue is available. Plug async management until everything is up 2719 * workqueue is available. Plug async management until everything is up
2334 * and running. 2720 * and running.