Diffstat (limited to 'kernel/irq/affinity.c')
-rw-r--r-- | kernel/irq/affinity.c | 239 |
1 file changed, 200 insertions(+), 39 deletions(-)
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index c7cca942bd8a..d905e844bf3a 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/cpu.h> | 9 | #include <linux/cpu.h> |
10 | #include <linux/sort.h> | ||
10 | 11 | ||
11 | static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, | 12 | static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, |
12 | unsigned int cpus_per_vec) | 13 | unsigned int cpus_per_vec) |
@@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, | |||
94 | return nodes; | 95 | return nodes; |
95 | } | 96 | } |
96 | 97 | ||
98 | struct node_vectors { | ||
99 | unsigned id; | ||
100 | |||
101 | union { | ||
102 | unsigned nvectors; | ||
103 | unsigned ncpus; | ||
104 | }; | ||
105 | }; | ||
106 | |||
107 | static int ncpus_cmp_func(const void *l, const void *r) | ||
108 | { | ||
109 | const struct node_vectors *ln = l; | ||
110 | const struct node_vectors *rn = r; | ||
111 | |||
112 | return ln->ncpus - rn->ncpus; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Allocate the number of vectors for each node, so that for each node: ||
117 | * | ||
118 | * 1) the allocated number is >= 1 | ||
119 | * | ||
120 | * 2) the allocated number is <= the number of active CPUs on this node ||
121 | * | ||
122 | * The total number of allocated vectors may be less than @numvecs when ||
123 | * the total number of active CPUs is less than @numvecs. ||
124 | * | ||
125 | * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' | ||
126 | * for each node. | ||
127 | */ | ||
128 | static void alloc_nodes_vectors(unsigned int numvecs, | ||
129 | const cpumask_var_t *node_to_cpumask, | ||
130 | const struct cpumask *cpu_mask, | ||
131 | const nodemask_t nodemsk, | ||
132 | struct cpumask *nmsk, | ||
133 | struct node_vectors *node_vectors) | ||
134 | { | ||
135 | unsigned n, remaining_ncpus = 0; | ||
136 | |||
137 | for (n = 0; n < nr_node_ids; n++) { | ||
138 | node_vectors[n].id = n; | ||
139 | node_vectors[n].ncpus = UINT_MAX; | ||
140 | } | ||
141 | |||
142 | for_each_node_mask(n, nodemsk) { | ||
143 | unsigned ncpus; | ||
144 | |||
145 | cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); | ||
146 | ncpus = cpumask_weight(nmsk); | ||
147 | |||
148 | if (!ncpus) | ||
149 | continue; | ||
150 | remaining_ncpus += ncpus; | ||
151 | node_vectors[n].ncpus = ncpus; | ||
152 | } | ||
153 | |||
154 | numvecs = min_t(unsigned, remaining_ncpus, numvecs); | ||
155 | |||
156 | sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), | ||
157 | ncpus_cmp_func, NULL); | ||
158 | |||
159 | /* | ||
160 | * Allocate vectors for each node according to the ratio of this | ||
161 | * node's nr_cpus to the remaining un-assigned ncpus. 'numvecs' is ||
162 | * bigger than the number of active numa nodes. Always start the ||
163 | * allocation from the node with the smallest nr_cpus. ||
164 | * | ||
165 | * This guarantees that each active node is allocated at ||
166 | * least one vector, and the reasoning is simple: over-allocation ||
167 | * only happens on a node that is assigned exactly one vector, so ||
168 | * the other nodes will still be allocated >= 1 vector, since 'numvecs' ||
169 | * is bigger than the number of numa nodes. ||
170 | * | ||
171 | * A key invariant is that the number of vectors allocated to ||
172 | * each node is <= the CPU count of that node: ||
173 | * | ||
174 | * 1) suppose there are two nodes: A and B | ||
175 | * ncpu(X) is CPU count of node X | ||
176 | * vecs(X) is the vector count allocated to node X via this | ||
177 | * algorithm | ||
178 | * | ||
179 | * ncpu(A) <= ncpu(B) | ||
180 | * ncpu(A) + ncpu(B) = N | ||
181 | * vecs(A) + vecs(B) = V | ||
182 | * | ||
183 | * vecs(A) = max(1, round_down(V * ncpu(A) / N)) | ||
184 | * vecs(B) = V - vecs(A) | ||
185 | * | ||
186 | * both N and V are integers, and 2 <= V <= N; suppose ||
187 | * V = N - delta, and 0 <= delta <= N - 2 | ||
188 | * | ||
189 | * 2) obviously vecs(A) <= ncpu(A) because: | ||
190 | * | ||
191 | * if vecs(A) is 1, then vecs(A) <= ncpu(A) given | ||
192 | * ncpu(A) >= 1 | ||
193 | * | ||
194 | * otherwise, | ||
195 | * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N | ||
196 | * | ||
197 | * 3) prove that vecs(B) <= ncpu(B): ||
198 | * | ||
199 | * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be | ||
200 | * over-allocated, so vecs(B) <= ncpu(B), | ||
201 | * | ||
202 | * otherwise: | ||
203 | * | ||
204 | * vecs(A) = | ||
205 | * round_down(V * ncpu(A) / N) = | ||
206 | * round_down((N - delta) * ncpu(A) / N) = | ||
207 | * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= | ||
208 | * round_down((N * ncpu(A) - delta * N) / N) = | ||
209 | * ncpu(A) - delta ||
210 | * | ||
211 | * then: | ||
212 | * | ||
213 | * vecs(A) - V >= ncpu(A) - delta - V | ||
214 | * => | ||
215 | * V - vecs(A) <= V + delta - ncpu(A) | ||
216 | * => | ||
217 | * vecs(B) <= N - ncpu(A) (since V + delta = N) ||
218 | * => ||
219 | * vecs(B) <= ncpu(B) ||
220 | * | ||
221 | * For >= 3 nodes, the case can be treated as one node plus one ||
222 | * combined big node, which is exactly how this algorithm proceeds: ||
223 | * 'remaining_ncpus' and 'numvecs' are re-calculated after each node, ||
224 | * so finally for each node X: vecs(X) <= ncpu(X). ||
225 | * | ||
226 | */ | ||
227 | for (n = 0; n < nr_node_ids; n++) { | ||
228 | unsigned nvectors, ncpus; | ||
229 | |||
230 | if (node_vectors[n].ncpus == UINT_MAX) | ||
231 | continue; | ||
232 | |||
233 | WARN_ON_ONCE(numvecs == 0); | ||
234 | |||
235 | ncpus = node_vectors[n].ncpus; | ||
236 | nvectors = max_t(unsigned, 1, | ||
237 | numvecs * ncpus / remaining_ncpus); | ||
238 | WARN_ON_ONCE(nvectors > ncpus); | ||
239 | |||
240 | node_vectors[n].nvectors = nvectors; | ||
241 | |||
242 | remaining_ncpus -= ncpus; | ||
243 | numvecs -= nvectors; | ||
244 | } | ||
245 | } | ||
246 | |||
97 | static int __irq_build_affinity_masks(unsigned int startvec, | 247 | static int __irq_build_affinity_masks(unsigned int startvec, |
98 | unsigned int numvecs, | 248 | unsigned int numvecs, |
99 | unsigned int firstvec, | 249 | unsigned int firstvec, |
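The comment block in the hunk above proves that the proportional split keeps vecs(X) <= ncpu(X) for every node. A minimal standalone sketch of that allocation step, using made-up CPU counts rather than anything taken from the patch, shows how starting from the node with the fewest CPUs and re-normalizing after each node hands out exactly 'numvecs' vectors in total:

/*
 * Illustrative userspace sketch (not kernel code) of the proportional
 * allocation step performed by alloc_nodes_vectors().  The node CPU
 * counts and vector count below are assumed example values.
 */
#include <stdio.h>

int main(void)
{
	/* example: three NUMA nodes with 2, 6 and 8 active CPUs */
	unsigned ncpus[] = { 2, 6, 8 };	/* already sorted ascending */
	unsigned nnodes = 3;
	unsigned numvecs = 5;		/* vectors left to distribute */
	unsigned remaining_ncpus = 2 + 6 + 8;
	unsigned n;

	for (n = 0; n < nnodes; n++) {
		/* at least one vector, otherwise proportional to CPU share */
		unsigned nvecs = numvecs * ncpus[n] / remaining_ncpus;

		if (nvecs < 1)
			nvecs = 1;

		printf("node %u: %u CPUs -> %u vectors\n", n, ncpus[n], nvecs);

		/* re-normalize for the remaining nodes */
		remaining_ncpus -= ncpus[n];
		numvecs -= nvecs;
	}
	return 0;
}

With 2, 6 and 8 active CPUs and 5 vectors, the nodes receive 1, 1 and 3 vectors respectively, each within its own CPU count.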
@@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec, | |||
102 | struct cpumask *nmsk, | 252 | struct cpumask *nmsk, |
103 | struct irq_affinity_desc *masks) | 253 | struct irq_affinity_desc *masks) |
104 | { | 254 | { |
105 | unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0; | 255 | unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; |
106 | unsigned int last_affv = firstvec + numvecs; | 256 | unsigned int last_affv = firstvec + numvecs; |
107 | unsigned int curvec = startvec; | 257 | unsigned int curvec = startvec; |
108 | nodemask_t nodemsk = NODE_MASK_NONE; | 258 | nodemask_t nodemsk = NODE_MASK_NONE; |
259 | struct node_vectors *node_vectors; | ||
109 | 260 | ||
110 | if (!cpumask_weight(cpu_mask)) | 261 | if (!cpumask_weight(cpu_mask)) |
111 | return 0; | 262 | return 0; |
@@ -126,53 +277,57 @@ static int __irq_build_affinity_masks(unsigned int startvec, | |||
126 | return numvecs; | 277 | return numvecs; |
127 | } | 278 | } |
128 | 279 | ||
129 | for_each_node_mask(n, nodemsk) { | 280 | node_vectors = kcalloc(nr_node_ids, |
130 | unsigned int ncpus, v, vecs_to_assign, vecs_per_node; | 281 | sizeof(struct node_vectors), |
282 | GFP_KERNEL); | ||
283 | if (!node_vectors) | ||
284 | return -ENOMEM; | ||
285 | |||
286 | /* allocate the number of vectors for each node */ ||
287 | alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, | ||
288 | nodemsk, nmsk, node_vectors); | ||
289 | |||
290 | for (i = 0; i < nr_node_ids; i++) { | ||
291 | unsigned int ncpus, v; | ||
292 | struct node_vectors *nv = &node_vectors[i]; | ||
293 | |||
294 | if (nv->nvectors == UINT_MAX) | ||
295 | continue; | ||
131 | 296 | ||
132 | /* Get the cpus on this node which are in the mask */ | 297 | /* Get the cpus on this node which are in the mask */ |
133 | cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); | 298 | cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); |
134 | ncpus = cpumask_weight(nmsk); | 299 | ncpus = cpumask_weight(nmsk); |
135 | if (!ncpus) | 300 | if (!ncpus) |
136 | continue; | 301 | continue; |
137 | 302 | ||
138 | /* | 303 | WARN_ON_ONCE(nv->nvectors > ncpus); |
139 | * Calculate the number of cpus per vector | ||
140 | * | ||
141 | * Spread the vectors evenly per node. If the requested | ||
142 | * vector number has been reached, simply allocate one | ||
143 | * vector for each remaining node so that all nodes can | ||
144 | * be covered | ||
145 | */ | ||
146 | if (numvecs > done) | ||
147 | vecs_per_node = max_t(unsigned, | ||
148 | (numvecs - done) / nodes, 1); | ||
149 | else | ||
150 | vecs_per_node = 1; | ||
151 | |||
152 | vecs_to_assign = min(vecs_per_node, ncpus); | ||
153 | 304 | ||
154 | /* Account for rounding errors */ | 305 | /* Account for rounding errors */ |
155 | extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign); | 306 | extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); |
156 | 307 | ||
157 | for (v = 0; curvec < last_affv && v < vecs_to_assign; | 308 | /* Spread allocated vectors on CPUs of the current node */ |
158 | curvec++, v++) { | 309 | for (v = 0; v < nv->nvectors; v++, curvec++) { |
159 | cpus_per_vec = ncpus / vecs_to_assign; | 310 | cpus_per_vec = ncpus / nv->nvectors; |
160 | 311 | ||
161 | /* Account for extra vectors to compensate rounding errors */ | 312 | /* Account for extra vectors to compensate rounding errors */ |
162 | if (extra_vecs) { | 313 | if (extra_vecs) { |
163 | cpus_per_vec++; | 314 | cpus_per_vec++; |
164 | --extra_vecs; | 315 | --extra_vecs; |
165 | } | 316 | } |
317 | |||
318 | /* | ||
319 | * wrapping has to be considered given 'startvec' | ||
320 | * may start anywhere | ||
321 | */ | ||
322 | if (curvec >= last_affv) | ||
323 | curvec = firstvec; | ||
166 | irq_spread_init_one(&masks[curvec].mask, nmsk, | 324 | irq_spread_init_one(&masks[curvec].mask, nmsk, |
167 | cpus_per_vec); | 325 | cpus_per_vec); |
168 | } | 326 | } |
169 | 327 | done += nv->nvectors; | |
170 | done += v; | ||
171 | if (curvec >= last_affv) | ||
172 | curvec = firstvec; | ||
173 | --nodes; | ||
174 | } | 328 | } |
175 | return done < numvecs ? done : numvecs; | 329 | kfree(node_vectors); |
330 | return done; | ||
176 | } | 331 | } |
177 | 332 | ||
178 | /* | 333 | /* |
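The per-node loop in the hunk above divides a node's active CPUs across its allocated vectors, with 'extra_vecs' absorbing the division remainder. A small sketch with assumed example values (10 CPUs, 3 vectors), not taken from the kernel code, illustrates the distribution:

/*
 * Minimal sketch of how the CPUs of one node are split across its
 * vectors; 'extra_vecs' compensates for integer-division rounding.
 */
#include <stdio.h>

int main(void)
{
	unsigned ncpus = 10;	/* assumed: active CPUs on this node */
	unsigned nvectors = 3;	/* assumed: vectors allocated to this node */
	unsigned extra_vecs = ncpus - nvectors * (ncpus / nvectors);
	unsigned v;

	for (v = 0; v < nvectors; v++) {
		unsigned cpus_per_vec = ncpus / nvectors;

		/* the first 'extra_vecs' vectors take one extra CPU */
		if (extra_vecs) {
			cpus_per_vec++;
			--extra_vecs;
		}
		printf("vector %u gets %u CPUs\n", v, cpus_per_vec);
	}
	return 0;	/* prints 4, 3, 3 */
}

The first remainder vectors each take one extra CPU, so 10 CPUs over 3 vectors come out as 4, 3 and 3.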
@@ -184,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, | |||
184 | unsigned int firstvec, | 339 | unsigned int firstvec, |
185 | struct irq_affinity_desc *masks) | 340 | struct irq_affinity_desc *masks) |
186 | { | 341 | { |
187 | unsigned int curvec = startvec, nr_present, nr_others; | 342 | unsigned int curvec = startvec, nr_present = 0, nr_others = 0; |
188 | cpumask_var_t *node_to_cpumask; | 343 | cpumask_var_t *node_to_cpumask; |
189 | cpumask_var_t nmsk, npresmsk; | 344 | cpumask_var_t nmsk, npresmsk; |
190 | int ret = -ENOMEM; | 345 | int ret = -ENOMEM; |
@@ -199,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, | |||
199 | if (!node_to_cpumask) | 354 | if (!node_to_cpumask) |
200 | goto fail_npresmsk; | 355 | goto fail_npresmsk; |
201 | 356 | ||
202 | ret = 0; | ||
203 | /* Stabilize the cpumasks */ | 357 | /* Stabilize the cpumasks */ |
204 | get_online_cpus(); | 358 | get_online_cpus(); |
205 | build_node_to_cpumask(node_to_cpumask); | 359 | build_node_to_cpumask(node_to_cpumask); |
206 | 360 | ||
207 | /* Spread on present CPUs starting from affd->pre_vectors */ | 361 | /* Spread on present CPUs starting from affd->pre_vectors */ |
208 | nr_present = __irq_build_affinity_masks(curvec, numvecs, | 362 | ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, |
209 | firstvec, node_to_cpumask, | 363 | node_to_cpumask, cpu_present_mask, |
210 | cpu_present_mask, nmsk, masks); | 364 | nmsk, masks); |
365 | if (ret < 0) | ||
366 | goto fail_build_affinity; | ||
367 | nr_present = ret; | ||
211 | 368 | ||
212 | /* | 369 | /* |
213 | * Spread on non present CPUs starting from the next vector to be | 370 | * Spread on non present CPUs starting from the next vector to be |
@@ -220,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, | |||
220 | else | 377 | else |
221 | curvec = firstvec + nr_present; | 378 | curvec = firstvec + nr_present; |
222 | cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); | 379 | cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); |
223 | nr_others = __irq_build_affinity_masks(curvec, numvecs, | 380 | ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, |
224 | firstvec, node_to_cpumask, | 381 | node_to_cpumask, npresmsk, nmsk, |
225 | npresmsk, nmsk, masks); | 382 | masks); |
383 | if (ret >= 0) | ||
384 | nr_others = ret; | ||
385 | |||
386 | fail_build_affinity: | ||
226 | put_online_cpus(); | 387 | put_online_cpus(); |
227 | 388 | ||
228 | if (nr_present < numvecs) | 389 | if (ret >= 0) |
229 | WARN_ON(nr_present + nr_others < numvecs); | 390 | WARN_ON(nr_present + nr_others < numvecs); |
230 | 391 | ||
231 | free_node_to_cpumask(node_to_cpumask); | 392 | free_node_to_cpumask(node_to_cpumask); |
@@ -235,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, | |||
235 | 396 | ||
236 | fail_nmsk: | 397 | fail_nmsk: |
237 | free_cpumask_var(nmsk); | 398 | free_cpumask_var(nmsk); |
238 | return ret; | 399 | return ret < 0 ? ret : 0; |
239 | } | 400 | } |
240 | 401 | ||
241 | static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) | 402 | static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) |
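Both spreading passes rely on the wrap-around handling introduced by this patch: 'curvec' may start anywhere in [firstvec, firstvec + numvecs) and wraps back to 'firstvec' once it reaches 'last_affv', which is what lets the non-present pass continue from wherever the present pass stopped. A sketch with illustrative values (not the kernel code itself):

/*
 * Sketch of the vector-index wrap handling; firstvec, numvecs and the
 * starting position are assumed example values.
 */
#include <stdio.h>

int main(void)
{
	unsigned firstvec = 2, numvecs = 4;
	unsigned last_affv = firstvec + numvecs;
	unsigned curvec = firstvec + 3;		/* e.g. where pass one stopped */
	unsigned i;

	for (i = 0; i < numvecs; i++, curvec++) {
		if (curvec >= last_affv)
			curvec = firstvec;	/* wrap back to the first vector */
		printf("assign vector %u\n", curvec);
	}
	return 0;	/* prints 5, 2, 3, 4 */
}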