author		Gregory Haskins <ghaskins@novell.com>	2008-01-25 15:08:11 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-01-25 15:08:11 -0500
commit		6e1254d2c41215da27025add8900ed187bca121d (patch)
tree		f68081e3b87d44c20a1b2739cf0119d8624208b0
parent		318e0893ce3f524ca045f9fd9dfd567c0a6f9446 (diff)
sched: optimize RT affinity
The current code base assumes a relatively flat CPU/core topology and will
route RT tasks to any CPU fairly equally. In the real world, there are
various topologies and affinities that govern where a task is best suited to
run with the smallest amount of overhead. NUMA and multi-core CPUs are
prime examples of topologies that can impact cache performance.

Fortunately, Linux is already structured to represent these topologies via
the sched_domains interface. So we change our RT router to consult a
combination of topology and affinity policy to best place tasks during
migration.
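
To make the two-stage selection concrete, here is a rough user-space sketch
(not kernel code) of the policy this patch implements: build a mask of the
allowed CPUs whose runqueues hold the lowest-priority work, then prefer the
task's last CPU if it is in that mask. The per-CPU priority snapshot, the
8-CPU bitmask representation, the helper names, and the reduction of the
sched_domain walk to "take the first candidate" are assumptions made purely
for illustration.

/*
 * Illustrative user-space model of the selection policy (not kernel code).
 * Priorities follow the kernel convention: numerically lower means more
 * important, and a value >= MAX_RT_PRIO means the CPU runs no RT task.
 */
#include <stdio.h>

#define NCPUS		8
#define MAX_RT_PRIO	100

/* Hypothetical snapshot of each runqueue's highest RT priority. */
static int highest_prio[NCPUS] = { 90, 85, 85, 90, 90, 85, 85, 90 };

/* Stage 1: mask of allowed CPUs whose runqueues hold the lowest-prio work. */
static unsigned int find_lowest_cpus(unsigned int allowed, int task_prio)
{
	unsigned int mask = 0;
	int lowest_prio = -1;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (!(allowed & (1u << cpu)))
			continue;

		/* A CPU running no RT task at all is always good enough. */
		if (highest_prio[cpu] >= MAX_RT_PRIO)
			return 1u << cpu;

		if (highest_prio[cpu] > task_prio &&
		    highest_prio[cpu] >= lowest_prio) {
			if (highest_prio[cpu] > lowest_prio) {
				/* new low - clear old data */
				lowest_prio = highest_prio[cpu];
				mask = 0;
			}
			mask |= 1u << cpu;
		}
	}
	return mask;
}

/* Stage 2: among the candidates, prefer the (likely cache-hot) last CPU. */
static int pick_cpu(unsigned int allowed, int task_prio, int last_cpu)
{
	unsigned int mask = find_lowest_cpus(allowed, task_prio);

	if (!mask)
		return -1;
	if (mask & (1u << last_cpu))
		return last_cpu;

	/* The real patch walks sched_domains here; just take the first bit. */
	return __builtin_ctz(mask);
}

int main(void)
{
	/* RT prio 80, allowed on CPUs 0-7, last ran on CPU 4: picks CPU 4. */
	printf("picked CPU %d\n", pick_cpu(0xff, 80, 4));
	return 0;
}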
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--	kernel/sched_rt.c	100
1 file changed, 88 insertions, 12 deletions
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ac7d06786454..a9d7d4408160 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -281,35 +281,111 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
 }
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+static DEFINE_PER_CPU(cpumask_t, valid_cpu_mask);
 
-static int find_lowest_rq(struct task_struct *task)
+static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 {
 	int cpu;
-	cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
-	struct rq *lowest_rq = NULL;
+	cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask);
+	int lowest_prio = -1;
+	int ret = 0;
 
-	cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
+	cpus_clear(*lowest_mask);
+	cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed);
 
 	/*
 	 * Scan each rq for the lowest prio.
 	 */
-	for_each_cpu_mask(cpu, *cpu_mask) {
+	for_each_cpu_mask(cpu, *valid_mask) {
 		struct rq *rq = cpu_rq(cpu);
 
 		/* We look for lowest RT prio or non-rt CPU */
 		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-			lowest_rq = rq;
-			break;
+			if (ret)
+				cpus_clear(*lowest_mask);
+			cpu_set(rq->cpu, *lowest_mask);
+			return 1;
 		}
 
 		/* no locking for now */
-		if (rq->rt.highest_prio > task->prio &&
-		    (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
-			lowest_rq = rq;
+		if ((rq->rt.highest_prio > task->prio)
+		    && (rq->rt.highest_prio >= lowest_prio)) {
+			if (rq->rt.highest_prio > lowest_prio) {
+				/* new low - clear old data */
+				lowest_prio = rq->rt.highest_prio;
+				cpus_clear(*lowest_mask);
+			}
+			cpu_set(rq->cpu, *lowest_mask);
+			ret = 1;
+		}
+	}
+
+	return ret;
+}
+
+static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+{
+	int first;
+
+	/* "this_cpu" is cheaper to preempt than a remote processor */
+	if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+		return this_cpu;
+
+	first = first_cpu(*mask);
+	if (first != NR_CPUS)
+		return first;
+
+	return -1;
+}
+
+static int find_lowest_rq(struct task_struct *task)
+{
+	struct sched_domain *sd;
+	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+	int this_cpu = smp_processor_id();
+	int cpu = task_cpu(task);
+
+	if (!find_lowest_cpus(task, lowest_mask))
+		return -1;
+
+	/*
+	 * At this point we have built a mask of cpus representing the
+	 * lowest priority tasks in the system. Now we want to elect
+	 * the best one based on our affinity and topology.
+	 *
+	 * We prioritize the last cpu that the task executed on since
+	 * it is most likely cache-hot in that location.
+	 */
+	if (cpu_isset(cpu, *lowest_mask))
+		return cpu;
+
+	/*
+	 * Otherwise, we consult the sched_domains span maps to figure
+	 * out which cpu is logically closest to our hot cache data.
+	 */
+	if (this_cpu == cpu)
+		this_cpu = -1; /* Skip this_cpu opt if the same */
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_AFFINE) {
+			cpumask_t domain_mask;
+			int best_cpu;
+
+			cpus_and(domain_mask, sd->span, *lowest_mask);
+
+			best_cpu = pick_optimal_cpu(this_cpu,
+						    &domain_mask);
+			if (best_cpu != -1)
+				return best_cpu;
 		}
 	}
 
-	return lowest_rq ? lowest_rq->cpu : -1;
+	/*
+	 * And finally, if there were no matches within the domains
+	 * just give the caller *something* to work with from the compatible
+	 * locations.
+	 */
+	return pick_optimal_cpu(this_cpu, lowest_mask);
 }
 
 /* Will lock the rq it finds */