author	Ming Lei <ming.lei@redhat.com>	2019-08-15 22:28:49 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2019-08-27 10:31:17 -0400
commit	b1a5a73e64e99faa5f4deef2ae96d7371a0fb5d0 (patch)
tree	b1746030db9be93db21499a30aa7e6fe8a1939d4
parent	53c1788b7d7720565214a466afffdc818d8c6e5f (diff)
genirq/affinity: Spread vectors on node according to nr_cpu ratio
Currently __irq_build_affinity_masks() spreads vectors evenly per node, but when the NUMA nodes have different numbers of CPUs there are cases where not all vectors get spread, which triggers the warning in the spreading code.

Improve the spreading algorithm by

 - assigning vectors according to the ratio of the number of CPUs on a
   node to the number of remaining CPUs.

 - running the assignment from smaller nodes to bigger nodes to
   guarantee that every active node gets allocated at least one vector.

This ensures that all vectors are spread out. Aside from that, the spread becomes fairer if the nodes have different numbers of CPUs.

For example, on the following machine:

	CPU(s):              16
	On-line CPU(s) list: 0-15
	Thread(s) per core:  1
	Core(s) per socket:  8
	Socket(s):           2
	NUMA node(s):        2
	...
	NUMA node0 CPU(s):   0,1,3,5-9,11,13-15
	NUMA node1 CPU(s):   2,4,10,12

When a driver requests 8 vectors, the following spread results:

	irq 31, cpu list 2,4
	irq 32, cpu list 10,12
	irq 33, cpu list 0-1
	irq 34, cpu list 3,5
	irq 35, cpu list 6-7
	irq 36, cpu list 8-9
	irq 37, cpu list 11,13
	irq 38, cpu list 14-15

So Node 0 now has 6 vectors and Node 1 has 2. The original algorithm assigned 4 vectors to each node, which was unfair to Node 0.

[ tglx: Massaged changelog ]

Reported-by: Jon Derrick <jonathan.derrick@intel.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Jon Derrick <jonathan.derrick@intel.com>
Link: https://lkml.kernel.org/r/20190816022849.14075-3-ming.lei@redhat.com
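The per-node allocation rule is easiest to see on the example above. The following is a minimal userspace sketch of that rule, not the kernel code: plain arrays stand in for cpumasks, and the two nodes are already ordered smallest-first, as alloc_nodes_vectors() arranges via sort().

/*
 * Userspace sketch of the ratio-based allocation described in the
 * changelog, using the example topology (node1: 4 CPUs, node0: 12 CPUs,
 * 8 vectors requested). Not kernel code.
 */
#include <stdio.h>

int main(void)
{
	unsigned int ncpus[] = { 4, 12 };	/* node1, node0: smallest node first */
	unsigned int node_id[] = { 1, 0 };
	unsigned int remaining_ncpus = 16, numvecs = 8;

	for (int i = 0; i < 2; i++) {
		/* this node's share of the remaining vectors, at least 1 */
		unsigned int nvecs = numvecs * ncpus[i] / remaining_ncpus;

		if (nvecs < 1)
			nvecs = 1;
		printf("node%u: %u CPUs -> %u vectors\n",
		       node_id[i], ncpus[i], nvecs);
		remaining_ncpus -= ncpus[i];
		numvecs -= nvecs;
	}
	return 0;
}

Run as-is this prints 2 vectors for node 1 and 6 for node 0, matching the irq 31-38 spread shown above.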
-rw-r--r--	kernel/irq/affinity.c	239
1 file changed, 200 insertions(+), 39 deletions(-)
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index c7cca942bd8a..d905e844bf3a 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/sort.h>
 
 static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
 				unsigned int cpus_per_vec)
@@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 	return nodes;
 }
 
+struct node_vectors {
+	unsigned id;
+
+	union {
+		unsigned nvectors;
+		unsigned ncpus;
+	};
+};
+
+static int ncpus_cmp_func(const void *l, const void *r)
+{
+	const struct node_vectors *ln = l;
+	const struct node_vectors *rn = r;
+
+	return ln->ncpus - rn->ncpus;
+}
+
+/*
+ * Allocate vector number for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated number is <= active CPU number of this node
+ *
+ * The actual allocated total vectors may be less than @numvecs when
+ * active total CPU number is less than @numvecs.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+static void alloc_nodes_vectors(unsigned int numvecs,
+				const cpumask_var_t *node_to_cpumask,
+				const struct cpumask *cpu_mask,
+				const nodemask_t nodemsk,
+				struct cpumask *nmsk,
+				struct node_vectors *node_vectors)
+{
+	unsigned n, remaining_ncpus = 0;
+
+	for (n = 0; n < nr_node_ids; n++) {
+		node_vectors[n].id = n;
+		node_vectors[n].ncpus = UINT_MAX;
+	}
+
+	for_each_node_mask(n, nodemsk) {
+		unsigned ncpus;
+
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+		ncpus = cpumask_weight(nmsk);
+
+		if (!ncpus)
+			continue;
+		remaining_ncpus += ncpus;
+		node_vectors[n].ncpus = ncpus;
+	}
+
+	numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+
+	sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+	     ncpus_cmp_func, NULL);
+
+	/*
+	 * Allocate vectors for each node according to the ratio of this
+	 * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
+	 * bigger than number of active numa nodes. Always start the
+	 * allocation from the node with minimized nr_cpus.
+	 *
+	 * This way guarantees that each active node gets allocated at
+	 * least one vector, and the theory is simple: over-allocation
+	 * is only done when this node is assigned by one vector, so
+	 * other nodes will be allocated >= 1 vector, since 'numvecs' is
+	 * bigger than number of numa nodes.
+	 *
+	 * One perfect invariant is that number of allocated vectors for
+	 * each node is <= CPU count of this node:
+	 *
+	 * 1) suppose there are two nodes: A and B
+	 *	ncpu(X) is CPU count of node X
+	 *	vecs(X) is the vector count allocated to node X via this
+	 *	algorithm
+	 *
+	 *	ncpu(A) <= ncpu(B)
+	 *	ncpu(A) + ncpu(B) = N
+	 *	vecs(A) + vecs(B) = V
+	 *
+	 *	vecs(A) = max(1, round_down(V * ncpu(A) / N))
+	 *	vecs(B) = V - vecs(A)
+	 *
+	 *	both N and V are integer, and 2 <= V <= N, suppose
+	 *	V = N - delta, and 0 <= delta <= N - 2
+	 *
+	 * 2) obviously vecs(A) <= ncpu(A) because:
+	 *
+	 *	if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+	 *	ncpu(A) >= 1
+	 *
+	 *	otherwise,
+	 *		vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+	 *
+	 * 3) prove how vecs(B) <= ncpu(B):
+	 *
+	 *	if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
+	 *	over-allocated, so vecs(B) <= ncpu(B),
+	 *
+	 *	otherwise:
+	 *
+	 *	vecs(A) =
+	 *		round_down(V * ncpu(A) / N) =
+	 *		round_down((N - delta) * ncpu(A) / N) =
+	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+	 *		round_down((N * ncpu(A) - delta * N) / N) =
+	 *		ncpu(A) - delta
+	 *
+	 *	then:
+	 *
+	 *	vecs(A) - V >= ncpu(A) - delta - V
+	 *	=>
+	 *	V - vecs(A) <= V + delta - ncpu(A) = N - ncpu(A)
+	 *	=>
+	 *	vecs(B) <= N - ncpu(A)
+	 *	=>
+	 *	vecs(B) <= ncpu(B)
+	 *
+	 *	For nodes >= 3, it can be thought as one node and another big
+	 *	node given that is exactly what this algorithm is implemented,
+	 *	and we always re-calculate 'remaining_ncpus' & 'numvecs', and
+	 *	finally for each node X: vecs(X) <= ncpu(X).
+	 *
+	 */
+	for (n = 0; n < nr_node_ids; n++) {
+		unsigned nvectors, ncpus;
+
+		if (node_vectors[n].ncpus == UINT_MAX)
+			continue;
+
+		WARN_ON_ONCE(numvecs == 0);
+
+		ncpus = node_vectors[n].ncpus;
+		nvectors = max_t(unsigned, 1,
+				 numvecs * ncpus / remaining_ncpus);
+		WARN_ON_ONCE(nvectors > ncpus);
+
+		node_vectors[n].nvectors = nvectors;
+
+		remaining_ncpus -= ncpus;
+		numvecs -= nvectors;
+	}
+}
+
 static int __irq_build_affinity_masks(unsigned int startvec,
 				      unsigned int numvecs,
 				      unsigned int firstvec,
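The two-node invariant argued in the vecs(A)/vecs(B) comment above can also be sanity-checked by brute force. The following is a standalone userspace check, not part of the patch; the bound of 64 CPUs per node is an arbitrary assumption.

/*
 * Brute-force check of the invariant: with ncpu(A) <= ncpu(B),
 * N = ncpu(A) + ncpu(B), 2 <= V <= N,
 * vecs(A) = max(1, V * ncpu(A) / N) and vecs(B) = V - vecs(A),
 * every node ends up with vecs(X) <= ncpu(X).
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	for (unsigned a = 1; a <= 64; a++) {
		for (unsigned b = a; b <= 64; b++) {	/* ncpu(A) <= ncpu(B) */
			unsigned n = a + b;

			for (unsigned v = 2; v <= n; v++) {
				unsigned vecs_a = v * a / n;	/* round_down */

				if (vecs_a < 1)
					vecs_a = 1;
				assert(vecs_a <= a);		/* vecs(A) <= ncpu(A) */
				assert(v - vecs_a <= b);	/* vecs(B) <= ncpu(B) */
			}
		}
	}
	printf("invariant holds for all ncpu(A), ncpu(B) <= 64\n");
	return 0;
}

This is the property that the WARN_ON_ONCE(nvectors > ncpus) in alloc_nodes_vectors() relies on.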
@@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 				      struct cpumask *nmsk,
 				      struct irq_affinity_desc *masks)
 {
-	unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+	unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
 	unsigned int last_affv = firstvec + numvecs;
 	unsigned int curvec = startvec;
 	nodemask_t nodemsk = NODE_MASK_NONE;
+	struct node_vectors *node_vectors;
 
 	if (!cpumask_weight(cpu_mask))
 		return 0;
@@ -126,53 +277,57 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 		return numvecs;
 	}
 
-	for_each_node_mask(n, nodemsk) {
-		unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
+	node_vectors = kcalloc(nr_node_ids,
+			       sizeof(struct node_vectors),
+			       GFP_KERNEL);
+	if (!node_vectors)
+		return -ENOMEM;
+
+	/* allocate vector number for each node */
+	alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
+			    nodemsk, nmsk, node_vectors);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		unsigned int ncpus, v;
+		struct node_vectors *nv = &node_vectors[i];
+
+		if (nv->nvectors == UINT_MAX)
+			continue;
 
 		/* Get the cpus on this node which are in the mask */
-		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
 		ncpus = cpumask_weight(nmsk);
 		if (!ncpus)
 			continue;
 
-		/*
-		 * Calculate the number of cpus per vector
-		 *
-		 * Spread the vectors evenly per node. If the requested
-		 * vector number has been reached, simply allocate one
-		 * vector for each remaining node so that all nodes can
-		 * be covered
-		 */
-		if (numvecs > done)
-			vecs_per_node = max_t(unsigned,
-					(numvecs - done) / nodes, 1);
-		else
-			vecs_per_node = 1;
-
-		vecs_to_assign = min(vecs_per_node, ncpus);
+		WARN_ON_ONCE(nv->nvectors > ncpus);
 
 		/* Account for rounding errors */
-		extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
+		extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
 
-		for (v = 0; curvec < last_affv && v < vecs_to_assign;
-					curvec++, v++) {
-			cpus_per_vec = ncpus / vecs_to_assign;
+		/* Spread allocated vectors on CPUs of the current node */
+		for (v = 0; v < nv->nvectors; v++, curvec++) {
+			cpus_per_vec = ncpus / nv->nvectors;
 
 			/* Account for extra vectors to compensate rounding errors */
 			if (extra_vecs) {
 				cpus_per_vec++;
 				--extra_vecs;
 			}
+
+			/*
+			 * wrapping has to be considered given 'startvec'
+			 * may start anywhere
+			 */
+			if (curvec >= last_affv)
+				curvec = firstvec;
 			irq_spread_init_one(&masks[curvec].mask, nmsk,
 						cpus_per_vec);
 		}
-
-		done += v;
-		if (curvec >= last_affv)
-			curvec = firstvec;
-		--nodes;
+		done += nv->nvectors;
 	}
-	return done < numvecs ? done : numvecs;
+	kfree(node_vectors);
+	return done;
 }
 
 /*
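The extra_vecs handling in the loop above distributes the division remainder so that every CPU of a node ends up in exactly one vector mask. The following small standalone illustration uses hypothetical counts (11 CPUs over 4 vectors) and is not kernel code.

/*
 * Split ncpus CPUs across nvectors vectors; the first 'extra_vecs'
 * vectors take one extra CPU so no CPU is left unassigned.
 */
#include <stdio.h>

int main(void)
{
	unsigned int ncpus = 11, nvectors = 4;
	unsigned int extra_vecs = ncpus - nvectors * (ncpus / nvectors);

	for (unsigned int v = 0; v < nvectors; v++) {
		unsigned int cpus_per_vec = ncpus / nvectors;

		if (extra_vecs) {	/* compensate the rounding error */
			cpus_per_vec++;
			--extra_vecs;
		}
		printf("vector %u -> %u CPUs\n", v, cpus_per_vec);
	}
	return 0;
}

With these counts the first three vectors get 3 CPUs each and the last one gets 2, so the sum is again 11.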
@@ -184,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 				    unsigned int firstvec,
 				    struct irq_affinity_desc *masks)
 {
-	unsigned int curvec = startvec, nr_present, nr_others;
+	unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
 	cpumask_var_t *node_to_cpumask;
 	cpumask_var_t nmsk, npresmsk;
 	int ret = -ENOMEM;
@@ -199,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 	if (!node_to_cpumask)
 		goto fail_npresmsk;
 
-	ret = 0;
 	/* Stabilize the cpumasks */
 	get_online_cpus();
 	build_node_to_cpumask(node_to_cpumask);
 
 	/* Spread on present CPUs starting from affd->pre_vectors */
-	nr_present = __irq_build_affinity_masks(curvec, numvecs,
-						firstvec, node_to_cpumask,
-						cpu_present_mask, nmsk, masks);
+	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+					 node_to_cpumask, cpu_present_mask,
+					 nmsk, masks);
+	if (ret < 0)
+		goto fail_build_affinity;
+	nr_present = ret;
 
 	/*
 	 * Spread on non present CPUs starting from the next vector to be
@@ -220,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 	else
 		curvec = firstvec + nr_present;
 	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
-	nr_others = __irq_build_affinity_masks(curvec, numvecs,
-					       firstvec, node_to_cpumask,
-					       npresmsk, nmsk, masks);
+	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+					 node_to_cpumask, npresmsk, nmsk,
+					 masks);
+	if (ret >= 0)
+		nr_others = ret;
+
+ fail_build_affinity:
 	put_online_cpus();
 
-	if (nr_present < numvecs)
-		WARN_ON(nr_present + nr_others < numvecs);
+	if (ret >= 0)
+		WARN_ON(nr_present + nr_others < numvecs);
 
 	free_node_to_cpumask(node_to_cpumask);
@@ -235,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 
  fail_nmsk:
 	free_cpumask_var(nmsk);
-	return ret;
+	return ret < 0 ? ret : 0;
 }
 
 static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
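The last hunks also change the return contract: __irq_build_affinity_masks() may now fail with -ENOMEM (the kcalloc() failure path), and irq_build_affinity_masks() returns 0 on success or a negative errno, instead of relying on vector counts. The following standalone model of that plumbing uses stub functions and is not the kernel code.

#include <errno.h>
#include <stdio.h>

/* Stand-in for __irq_build_affinity_masks(): either fails with -ENOMEM
 * or reports how many vectors it spread. */
static int inner_spread(int fail, unsigned int numvecs)
{
	return fail ? -ENOMEM : (int)numvecs;
}

/* Models the outer function after the patch: forward a negative result,
 * only evaluate the "incomplete spread" warning on success, and
 * normalize the return value to 0 or a negative errno. */
static int outer_build(int fail_present, unsigned int numvecs)
{
	unsigned int nr_present = 0, nr_others = 0;
	int ret;

	ret = inner_spread(fail_present, numvecs);	/* present CPUs */
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	ret = inner_spread(0, numvecs - nr_present);	/* non-present CPUs */
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	if (ret >= 0 && nr_present + nr_others < numvecs)
		fprintf(stderr, "WARN: incomplete spread\n");

	return ret < 0 ? ret : 0;
}

int main(void)
{
	printf("success path: %d\n", outer_build(0, 8));	/* prints 0 */
	printf("failure path: %d\n", outer_build(1, 8));	/* prints -ENOMEM */
	return 0;
}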