author		Gregory Haskins <ghaskins@novell.com>	2008-01-25 15:08:07 -0500
committer	Ingo Molnar <mingo@elte.hu>		2008-01-25 15:08:07 -0500
commit		73fe6aae84400e2b475e2a1dc4e8592cd3ed6e69 (patch)
tree		97c7d6a866d75563082c422491fc423b47aca9d7
parent		c7a1e46aa9782a947cf2ed506245d43396dbf991 (diff)
sched: add RT-balance cpu-weight
Some RT tasks (particularly kthreads) are bound to one specific CPU. It is fairly common for two or more bound tasks to get queued up at the same time. Consider, for instance, softirq_timer and softirq_sched. A timer goes off in an ISR, which schedules softirq_timer to run at RT50. Then the timer handler determines that it is time to smp-rebalance the system, so it schedules softirq_sched to run. We are now in a situation where two RT50 tasks are queued, and the system will go into rt-overload condition to request help from other CPUs.

This causes two problems in the current code:

1) If a high-priority bound task and a low-priority unbound task queue up behind the running task, we will fail to ever relocate the unbound task, because we terminate the search on the first unmovable task.

2) We spend precious, futile cycles in the fast path trying to pull overloaded tasks over. It is therefore optimal to avoid that overhead altogether if we can cheaply detect the condition before overload even occurs.

This patch achieves the optimization by using the Hamming weight of the task->cpus_allowed mask. A weight of 1 indicates that the task cannot be migrated. We then use this information to skip non-migratable tasks and to eliminate unnecessary rebalance attempts.

We introduce a per-rq variable to count the number of migratable tasks that are currently queued. We only go into overload if we have more than one RT task, AND at least one of them is migratable.

In addition, we introduce a per-task variable to cache the cpus_allowed weight, since the Hamming calculation is relatively expensive. We only update the cached value when the mask is updated, which should be infrequent, especially compared to scheduling frequency in the fast path.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
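[Editor's note] The core idea can be sketched outside the kernel: cache the Hamming weight of an affinity mask whenever the mask changes, and declare overload only when more than one RT task is queued AND at least one of them is migratable. The user-space sketch below uses hypothetical stand-ins (task_sim, rq_sim, set_mask, enqueue); it is a simplification of the patch's logic, not the kernel code, which operates on cpumask_t and the real runqueue.

#include <stdio.h>

/* Hypothetical stand-ins for task_struct and rt_rq. */
struct task_sim {
	unsigned long cpus_allowed;	/* affinity bitmask, one bit per CPU */
	int nr_cpus_allowed;		/* cached Hamming weight of the mask */
};

struct rq_sim {
	unsigned long rt_nr_running;	/* queued RT tasks */
	unsigned long rt_nr_migratory;	/* queued RT tasks with weight > 1 */
	int overloaded;
};

/* Recompute the weight only when the mask changes (the slow path). */
static void set_mask(struct task_sim *p, unsigned long mask)
{
	p->cpus_allowed = mask;
	p->nr_cpus_allowed = __builtin_popcountl(mask); /* Hamming weight */
}

/* Mirrors update_rt_migration(): overload needs >1 RT task AND a migratory one. */
static void update_migration(struct rq_sim *rq)
{
	rq->overloaded = rq->rt_nr_migratory && (rq->rt_nr_running > 1);
}

static void enqueue(struct rq_sim *rq, struct task_sim *p)
{
	rq->rt_nr_running++;
	if (p->nr_cpus_allowed > 1)
		rq->rt_nr_migratory++;
	update_migration(rq);
}

int main(void)
{
	struct rq_sim rq = { 0, 0, 0 };
	struct task_sim timer_thread, sched_thread;

	/* Two kthreads pinned to CPU0: weight 1, never migratable. */
	set_mask(&timer_thread, 1UL << 0);
	set_mask(&sched_thread, 1UL << 0);

	enqueue(&rq, &timer_thread);
	enqueue(&rq, &sched_thread);

	/* Two RT tasks queued, but neither can move: no overload. */
	printf("overloaded = %d\n", rq.overloaded);	/* prints 0 */
	return 0;
}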
-rw-r--r--	include/linux/init_task.h	 1
-rw-r--r--	include/linux/sched.h		 2
-rw-r--r--	kernel/fork.c			 1
-rw-r--r--	kernel/sched.c			 9
-rw-r--r--	kernel/sched_rt.c		50
5 files changed, 57 insertions(+), 6 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index cae35b6b9aec..572c65bcc80f 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -130,6 +130,7 @@ extern struct group_info init_groups;
 	.normal_prio	= MAX_PRIO-20,				\
 	.policy		= SCHED_NORMAL,				\
 	.cpus_allowed	= CPU_MASK_ALL,				\
+	.nr_cpus_allowed = NR_CPUS,				\
 	.mm		= NULL,					\
 	.active_mm	= &init_mm,				\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0846f1f9e196..b07a2cf76401 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -847,6 +847,7 @@ struct sched_class {
 	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
+	void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask);
 };
 
 struct load_weight {
@@ -956,6 +957,7 @@ struct task_struct {
 
 	unsigned int policy;
 	cpumask_t cpus_allowed;
+	int nr_cpus_allowed;
 	unsigned int time_slice;
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
diff --git a/kernel/fork.c b/kernel/fork.c
index 09c0b90a69cc..930c51865ab4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1242,6 +1242,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * parent's CPU). This avoids alot of nasty races.
 	 */
 	p->cpus_allowed = current->cpus_allowed;
+	p->nr_cpus_allowed = current->nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
diff --git a/kernel/sched.c b/kernel/sched.c
index 357d3a084de8..66e99b419b31 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -343,6 +343,7 @@ struct rt_rq {
 	int rt_load_balance_idx;
 	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
 	unsigned long rt_nr_running;
+	unsigned long rt_nr_migratory;
 	/* highest queued rt task prio */
 	int highest_prio;
 };
@@ -5144,7 +5145,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 		goto out;
 	}
 
-	p->cpus_allowed = new_mask;
+	if (p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, &new_mask);
+	else {
+		p->cpus_allowed = new_mask;
+		p->nr_cpus_allowed = cpus_weight(new_mask);
+	}
+
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpu_isset(task_cpu(p), new_mask))
 		goto out;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c492fd2b2eec..ae4995c09aac 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -33,6 +33,14 @@ static inline void rt_clear_overload(struct rq *rq)
 	atomic_dec(&rto_count);
 	cpu_clear(rq->cpu, rt_overload_mask);
 }
+
+static void update_rt_migration(struct rq *rq)
+{
+	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
+		rt_set_overload(rq);
+	else
+		rt_clear_overload(rq);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -65,8 +73,10 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
 #ifdef CONFIG_SMP
 	if (p->prio < rq->rt.highest_prio)
 		rq->rt.highest_prio = p->prio;
-	if (rq->rt.rt_nr_running > 1)
-		rt_set_overload(rq);
+	if (p->nr_cpus_allowed > 1)
+		rq->rt.rt_nr_migratory++;
+
+	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
 }
72 82
@@ -88,8 +98,10 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
 		} /* otherwise leave rq->highest prio alone */
 	} else
 		rq->rt.highest_prio = MAX_RT_PRIO;
-	if (rq->rt.rt_nr_running < 2)
-		rt_clear_overload(rq);
+	if (p->nr_cpus_allowed > 1)
+		rq->rt.rt_nr_migratory--;
+
+	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
 }
95 107
@@ -182,7 +194,8 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
+	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+	    (p->nr_cpus_allowed > 1))
 		return 1;
 	return 0;
 }
@@ -584,6 +597,32 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	/* don't touch RT tasks */
 	return 0;
 }
+static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
+{
+	int weight = cpus_weight(*new_mask);
+
+	BUG_ON(!rt_task(p));
+
+	/*
+	 * Update the migration status of the RQ if we have an RT task
+	 * which is running AND changing its weight value.
+	 */
+	if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
+		struct rq *rq = task_rq(p);
+
+		if ((p->nr_cpus_allowed <= 1) && (weight > 1))
+			rq->rt.rt_nr_migratory++;
+		else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) {
+			BUG_ON(!rq->rt.rt_nr_migratory);
+			rq->rt.rt_nr_migratory--;
+		}
+
+		update_rt_migration(rq);
+	}
+
+	p->cpus_allowed = *new_mask;
+	p->nr_cpus_allowed = weight;
+}
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)	do { } while (0)
 # define schedule_balance_rt(rq, prev)	do { } while (0)
@@ -637,6 +676,7 @@ const struct sched_class rt_sched_class = {
 #ifdef CONFIG_SMP
 	.load_balance		= load_balance_rt,
 	.move_one_task		= move_one_task_rt,
+	.set_cpus_allowed	= set_cpus_allowed_rt,
 #endif
 
 	.set_curr_task		= set_curr_task_rt,
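[Editor's note] For a sense of how the new hook path is exercised, the fragment below sketches an RT kthread pinning itself to one CPU through the set_cpus_allowed() interface patched above. The kthread setup is illustrative and not part of the patch; the call flow in the comments follows the hunks shown.

	/* Illustrative fragment: an RT task pins itself to CPU 0. */
	cpumask_t mask = cpumask_of_cpu(0);

	set_cpus_allowed(current, mask);
	/*
	 * For an RT task this now dispatches to set_cpus_allowed_rt():
	 *   - weight = cpus_weight(mask) == 1
	 *   - if the task is on a runqueue and was migratable before,
	 *     rq->rt.rt_nr_migratory is decremented
	 *   - update_rt_migration() re-evaluates the overload state
	 *   - p->nr_cpus_allowed is cached as 1, so pick_rt_task()
	 *     will skip this task during future pull attempts
	 */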