author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2008-01-25 15:08:30 -0500
committer  Ingo Molnar <mingo@elte.hu>                2008-01-25 15:08:30 -0500
commit     6f505b16425a51270058e4a93441fe64de3dd435 (patch)
tree       be21e711d93bc4d088b97c4a4f585a5044dbaa7d
parent     fa85ae2418e6843953107cd6a06f645752829bc0 (diff)
sched: rt group scheduling
Extend group scheduling to also cover the realtime classes. It uses the time limiting introduced by the previous patch to allow multiple realtime groups.

The hard time limit is required to keep behaviour deterministic.

The algorithms used make the realtime scheduler O(tg), linear scaling wrt the number of task groups. This is the worst case behaviour I can't seem to get out of, the avg. case of the algorithms can be improved, I focused on correctness and worst case.

[ akpm@linux-foundation.org: move side-effects out of BUG_ON(). ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--  include/linux/init_task.h |   5
-rw-r--r--  include/linux/sched.h     |  10
-rw-r--r--  kernel/fork.c             |   2
-rw-r--r--  kernel/sched.c            | 283
-rw-r--r--  kernel/sched_rt.c         | 455
5 files changed, 549 insertions(+), 206 deletions(-)
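
Before the diff itself, a brief orientation: the patch accounts RT runtime per rt_rq and throttles a group once its rt_time exceeds rt_ratio of each period, un-throttling it when the period rolls over. The following is a minimal user-space model of that arithmetic, a sketch rather than kernel code; the 16-bit fixed-point scale (SCHED_RT_FRAC = 1 << 16) is an assumption inferred from the constants in the diff, and the field and function names are borrowed from it.

/*
 * Minimal user-space model of the per-group RT throttling arithmetic in this
 * patch (a sketch, not kernel code).  The 1 << 16 fixed-point scale is an
 * assumption based on the constants visible in the diff.
 */
#include <stdint.h>
#include <stdio.h>

#define SCHED_RT_FRAC_SHIFT 16
#define NSEC_PER_MSEC       1000000ULL

struct model_rt_rq {
	uint64_t rt_time;       /* ns of RT execution in the current period */
	int      rt_throttled;
};

static const uint64_t rt_period_ms = 1000;   /* like sysctl_sched_rt_period */
static const uint64_t rt_ratio     = 62259;  /* ~95% in 16-bit fixed point */

static uint64_t rt_quota(void)
{
	uint64_t period = rt_period_ms * NSEC_PER_MSEC;

	return (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;   /* ~950 ms in ns */
}

/* Mirrors sched_rt_ratio_exceeded(): throttle once rt_time passes the quota. */
static int ratio_exceeded(struct model_rt_rq *rt_rq)
{
	if (rt_rq->rt_time > rt_quota())
		rt_rq->rt_throttled = 1;
	return rt_rq->rt_throttled;
}

/* Mirrors __update_sched_rt_period(): refund one quota, clear the throttle. */
static void period_tick(struct model_rt_rq *rt_rq)
{
	uint64_t quota = rt_quota();

	rt_rq->rt_time -= rt_rq->rt_time < quota ? rt_rq->rt_time : quota;
	rt_rq->rt_throttled = 0;
}

int main(void)
{
	struct model_rt_rq rq = { 0, 0 };

	rq.rt_time = 960 * NSEC_PER_MSEC;          /* 960 ms used this period */
	printf("throttled: %d\n", ratio_exceeded(&rq));        /* 1 */
	period_tick(&rq);
	printf("after period tick: %d\n", rq.rt_throttled);    /* 0 */
	return 0;
}

The real patch applies this per rt_rq in the group hierarchy and dequeues the group's sched_rt_entity while it is throttled; see kernel/sched_rt.c below.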
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index ee65d87bedb7..796019b22b6f 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -130,12 +130,13 @@ extern struct group_info init_groups;
130 .normal_prio = MAX_PRIO-20, \ 130 .normal_prio = MAX_PRIO-20, \
131 .policy = SCHED_NORMAL, \ 131 .policy = SCHED_NORMAL, \
132 .cpus_allowed = CPU_MASK_ALL, \ 132 .cpus_allowed = CPU_MASK_ALL, \
133 .nr_cpus_allowed = NR_CPUS, \
134 .mm = NULL, \ 133 .mm = NULL, \
135 .active_mm = &init_mm, \ 134 .active_mm = &init_mm, \
136 .rt = { \ 135 .rt = { \
137 .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ 136 .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
138 .time_slice = HZ, }, \ 137 .time_slice = HZ, \
138 .nr_cpus_allowed = NR_CPUS, \
139 }, \
139 .ioprio = 0, \ 140 .ioprio = 0, \
140 .tasks = LIST_HEAD_INIT(tsk.tasks), \ 141 .tasks = LIST_HEAD_INIT(tsk.tasks), \
141 .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ 142 .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d5ea144df836..04eecbf0241e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -934,6 +934,15 @@ struct sched_rt_entity {
934 struct list_head run_list; 934 struct list_head run_list;
935 unsigned int time_slice; 935 unsigned int time_slice;
936 unsigned long timeout; 936 unsigned long timeout;
937 int nr_cpus_allowed;
938
939#ifdef CONFIG_FAIR_GROUP_SCHED
940 struct sched_rt_entity *parent;
941 /* rq on which this entity is (to be) queued: */
942 struct rt_rq *rt_rq;
943 /* rq "owned" by this entity/group: */
944 struct rt_rq *my_q;
945#endif
937}; 946};
938 947
939struct task_struct { 948struct task_struct {
@@ -978,7 +987,6 @@ struct task_struct {
978 987
979 unsigned int policy; 988 unsigned int policy;
980 cpumask_t cpus_allowed; 989 cpumask_t cpus_allowed;
981 int nr_cpus_allowed;
982 990
983#ifdef CONFIG_PREEMPT_RCU 991#ifdef CONFIG_PREEMPT_RCU
984 int rcu_read_lock_nesting; 992 int rcu_read_lock_nesting;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f8ef32cbc7a..0c969f4fade0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1246,7 +1246,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1246 * parent's CPU). This avoids alot of nasty races. 1246 * parent's CPU). This avoids alot of nasty races.
1247 */ 1247 */
1248 p->cpus_allowed = current->cpus_allowed; 1248 p->cpus_allowed = current->cpus_allowed;
1249 p->nr_cpus_allowed = current->nr_cpus_allowed; 1249 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1250 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || 1250 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1251 !cpu_online(task_cpu(p)))) 1251 !cpu_online(task_cpu(p))))
1252 set_task_cpu(p, smp_processor_id()); 1252 set_task_cpu(p, smp_processor_id());
diff --git a/kernel/sched.c b/kernel/sched.c
index e9a7beee9b79..5ea2c533b432 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -161,6 +161,8 @@ struct rt_prio_array {
161 161
162struct cfs_rq; 162struct cfs_rq;
163 163
164static LIST_HEAD(task_groups);
165
164/* task group related information */ 166/* task group related information */
165struct task_group { 167struct task_group {
166#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -171,6 +173,11 @@ struct task_group {
171 /* runqueue "owned" by this group on each cpu */ 173 /* runqueue "owned" by this group on each cpu */
172 struct cfs_rq **cfs_rq; 174 struct cfs_rq **cfs_rq;
173 175
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
174 /* 181 /*
175 * shares assigned to a task group governs how much of cpu bandwidth 182 * shares assigned to a task group governs how much of cpu bandwidth
176 * is allocated to the group. The more shares a group has, the more is 183 * is allocated to the group. The more shares a group has, the more is
@@ -208,6 +215,7 @@ struct task_group {
208 unsigned long shares; 215 unsigned long shares;
209 216
210 struct rcu_head rcu; 217 struct rcu_head rcu;
218 struct list_head list;
211}; 219};
212 220
213/* Default task group's sched entity on each cpu */ 221/* Default task group's sched entity on each cpu */
@@ -215,9 +223,15 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
215/* Default task group's cfs_rq on each cpu */ 223/* Default task group's cfs_rq on each cpu */
216static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
217 225
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
218static struct sched_entity *init_sched_entity_p[NR_CPUS]; 229static struct sched_entity *init_sched_entity_p[NR_CPUS];
219static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 230static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
220 231
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS];
234
221/* task_group_mutex serializes add/remove of task groups and also changes to 235/* task_group_mutex serializes add/remove of task groups and also changes to
222 * a task group's cpu shares. 236 * a task group's cpu shares.
223 */ 237 */
@@ -240,6 +254,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares);
240struct task_group init_task_group = { 254struct task_group init_task_group = {
241 .se = init_sched_entity_p, 255 .se = init_sched_entity_p,
242 .cfs_rq = init_cfs_rq_p, 256 .cfs_rq = init_cfs_rq_p,
257
258 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p,
243}; 260};
244 261
245#ifdef CONFIG_FAIR_USER_SCHED 262#ifdef CONFIG_FAIR_USER_SCHED
@@ -269,10 +286,13 @@ static inline struct task_group *task_group(struct task_struct *p)
269} 286}
270 287
271/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
272static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) 289static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
273{ 290{
274 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
275 p->se.parent = task_group(p)->se[cpu]; 292 p->se.parent = task_group(p)->se[cpu];
293
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu];
276} 296}
277 297
278static inline void lock_task_group_list(void) 298static inline void lock_task_group_list(void)
@@ -297,7 +317,7 @@ static inline void unlock_doms_cur(void)
297 317
298#else 318#else
299 319
300static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } 320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
301static inline void lock_task_group_list(void) { } 321static inline void lock_task_group_list(void) { }
302static inline void unlock_task_group_list(void) { } 322static inline void unlock_task_group_list(void) { }
303static inline void lock_doms_cur(void) { } 323static inline void lock_doms_cur(void) { }
@@ -343,13 +363,22 @@ struct cfs_rq {
343struct rt_rq { 363struct rt_rq {
344 struct rt_prio_array active; 364 struct rt_prio_array active;
345 unsigned long rt_nr_running; 365 unsigned long rt_nr_running;
366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */
368#endif
346#ifdef CONFIG_SMP 369#ifdef CONFIG_SMP
347 unsigned long rt_nr_migratory; 370 unsigned long rt_nr_migratory;
348 int highest_prio; /* highest queued rt task prio */
349 int overloaded; 371 int overloaded;
350#endif 372#endif
373 int rt_throttled;
351 u64 rt_time; 374 u64 rt_time;
352 u64 rt_throttled; 375
376#ifdef CONFIG_FAIR_GROUP_SCHED
377 struct rq *rq;
378 struct list_head leaf_rt_rq_list;
379 struct task_group *tg;
380 struct sched_rt_entity *rt_se;
381#endif
353}; 382};
354 383
355#ifdef CONFIG_SMP 384#ifdef CONFIG_SMP
@@ -411,12 +440,14 @@ struct rq {
411 u64 nr_switches; 440 u64 nr_switches;
412 441
413 struct cfs_rq cfs; 442 struct cfs_rq cfs;
443 struct rt_rq rt;
444 u64 rt_period_expire;
445
414#ifdef CONFIG_FAIR_GROUP_SCHED 446#ifdef CONFIG_FAIR_GROUP_SCHED
415 /* list of leaf cfs_rq on this cpu: */ 447 /* list of leaf cfs_rq on this cpu: */
416 struct list_head leaf_cfs_rq_list; 448 struct list_head leaf_cfs_rq_list;
449 struct list_head leaf_rt_rq_list;
417#endif 450#endif
418 struct rt_rq rt;
419 u64 rt_period_expire;
420 451
421 /* 452 /*
422 * This is part of a global counter where only the total sum 453 * This is part of a global counter where only the total sum
@@ -613,9 +644,9 @@ const_debug unsigned int sysctl_sched_rt_period = 1000;
613 644
614/* 645/*
615 * ratio of time -rt tasks may consume. 646 * ratio of time -rt tasks may consume.
616 * default: 100% 647 * default: 95%
617 */ 648 */
618const_debug unsigned int sysctl_sched_rt_ratio = SCHED_RT_FRAC; 649const_debug unsigned int sysctl_sched_rt_ratio = 62259;
619 650
620/* 651/*
621 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 652 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
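(On the new default just above: assuming SCHED_RT_FRAC is the 16-bit fixed-point unit introduced by the previous patch, 62259 / 65536 is roughly 0.95, so realtime tasks may now consume about 95% of each sched_rt_period instead of the previous 100%, leaving some headroom for non-RT work even when an RT group misbehaves.)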
@@ -1337,7 +1368,7 @@ unsigned long weighted_cpuload(const int cpu)
1337 1368
1338static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1369static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1339{ 1370{
1340 set_task_cfs_rq(p, cpu); 1371 set_task_rq(p, cpu);
1341#ifdef CONFIG_SMP 1372#ifdef CONFIG_SMP
1342 /* 1373 /*
1343 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1374 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
@@ -5281,7 +5312,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5281 p->sched_class->set_cpus_allowed(p, &new_mask); 5312 p->sched_class->set_cpus_allowed(p, &new_mask);
5282 else { 5313 else {
5283 p->cpus_allowed = new_mask; 5314 p->cpus_allowed = new_mask;
5284 p->nr_cpus_allowed = cpus_weight(new_mask); 5315 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5285 } 5316 }
5286 5317
5287 /* Can the task run on the task's current CPU? If so, we're done */ 5318 /* Can the task run on the task's current CPU? If so, we're done */
@@ -7079,8 +7110,50 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7079 7110
7080 rt_rq->rt_time = 0; 7111 rt_rq->rt_time = 0;
7081 rt_rq->rt_throttled = 0; 7112 rt_rq->rt_throttled = 0;
7113
7114#ifdef CONFIG_FAIR_GROUP_SCHED
7115 rt_rq->rq = rq;
7116#endif
7082} 7117}
7083 7118
7119#ifdef CONFIG_FAIR_GROUP_SCHED
7120static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7121 struct cfs_rq *cfs_rq, struct sched_entity *se,
7122 int cpu, int add)
7123{
7124 tg->cfs_rq[cpu] = cfs_rq;
7125 init_cfs_rq(cfs_rq, rq);
7126 cfs_rq->tg = tg;
7127 if (add)
7128 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7129
7130 tg->se[cpu] = se;
7131 se->cfs_rq = &rq->cfs;
7132 se->my_q = cfs_rq;
7133 se->load.weight = tg->shares;
7134 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7135 se->parent = NULL;
7136}
7137
7138static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7139 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7140 int cpu, int add)
7141{
7142 tg->rt_rq[cpu] = rt_rq;
7143 init_rt_rq(rt_rq, rq);
7144 rt_rq->tg = tg;
7145 rt_rq->rt_se = rt_se;
7146 if (add)
7147 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7148
7149 tg->rt_se[cpu] = rt_se;
7150 rt_se->rt_rq = &rq->rt;
7151 rt_se->my_q = rt_rq;
7152 rt_se->parent = NULL;
7153 INIT_LIST_HEAD(&rt_se->run_list);
7154}
7155#endif
7156
7084void __init sched_init(void) 7157void __init sched_init(void)
7085{ 7158{
7086 int highest_cpu = 0; 7159 int highest_cpu = 0;
@@ -7090,6 +7163,10 @@ void __init sched_init(void)
7090 init_defrootdomain(); 7163 init_defrootdomain();
7091#endif 7164#endif
7092 7165
7166#ifdef CONFIG_FAIR_GROUP_SCHED
7167 list_add(&init_task_group.list, &task_groups);
7168#endif
7169
7093 for_each_possible_cpu(i) { 7170 for_each_possible_cpu(i) {
7094 struct rq *rq; 7171 struct rq *rq;
7095 7172
@@ -7099,30 +7176,20 @@ void __init sched_init(void)
7099 rq->nr_running = 0; 7176 rq->nr_running = 0;
7100 rq->clock = 1; 7177 rq->clock = 1;
7101 init_cfs_rq(&rq->cfs, rq); 7178 init_cfs_rq(&rq->cfs, rq);
7179 init_rt_rq(&rq->rt, rq);
7102#ifdef CONFIG_FAIR_GROUP_SCHED 7180#ifdef CONFIG_FAIR_GROUP_SCHED
7103 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7104 {
7105 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
7106 struct sched_entity *se =
7107 &per_cpu(init_sched_entity, i);
7108
7109 init_cfs_rq_p[i] = cfs_rq;
7110 init_cfs_rq(cfs_rq, rq);
7111 cfs_rq->tg = &init_task_group;
7112 list_add(&cfs_rq->leaf_cfs_rq_list,
7113 &rq->leaf_cfs_rq_list);
7114
7115 init_sched_entity_p[i] = se;
7116 se->cfs_rq = &rq->cfs;
7117 se->my_q = cfs_rq;
7118 se->load.weight = init_task_group_load;
7119 se->load.inv_weight =
7120 div64_64(1ULL<<32, init_task_group_load);
7121 se->parent = NULL;
7122 }
7123 init_task_group.shares = init_task_group_load; 7181 init_task_group.shares = init_task_group_load;
7182 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7183 init_tg_cfs_entry(rq, &init_task_group,
7184 &per_cpu(init_cfs_rq, i),
7185 &per_cpu(init_sched_entity, i), i, 1);
7186
7187 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
7188 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7189 init_tg_rt_entry(rq, &init_task_group,
7190 &per_cpu(init_rt_rq, i),
7191 &per_cpu(init_sched_rt_entity, i), i, 1);
7124#endif 7192#endif
7125 init_rt_rq(&rq->rt, rq);
7126 rq->rt_period_expire = 0; 7193 rq->rt_period_expire = 0;
7127 7194
7128 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7195 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -7460,12 +7527,36 @@ static int load_balance_monitor(void *unused)
7460} 7527}
7461#endif /* CONFIG_SMP */ 7528#endif /* CONFIG_SMP */
7462 7529
7530static void free_sched_group(struct task_group *tg)
7531{
7532 int i;
7533
7534 for_each_possible_cpu(i) {
7535 if (tg->cfs_rq)
7536 kfree(tg->cfs_rq[i]);
7537 if (tg->se)
7538 kfree(tg->se[i]);
7539 if (tg->rt_rq)
7540 kfree(tg->rt_rq[i]);
7541 if (tg->rt_se)
7542 kfree(tg->rt_se[i]);
7543 }
7544
7545 kfree(tg->cfs_rq);
7546 kfree(tg->se);
7547 kfree(tg->rt_rq);
7548 kfree(tg->rt_se);
7549 kfree(tg);
7550}
7551
7463/* allocate runqueue etc for a new task group */ 7552/* allocate runqueue etc for a new task group */
7464struct task_group *sched_create_group(void) 7553struct task_group *sched_create_group(void)
7465{ 7554{
7466 struct task_group *tg; 7555 struct task_group *tg;
7467 struct cfs_rq *cfs_rq; 7556 struct cfs_rq *cfs_rq;
7468 struct sched_entity *se; 7557 struct sched_entity *se;
7558 struct rt_rq *rt_rq;
7559 struct sched_rt_entity *rt_se;
7469 struct rq *rq; 7560 struct rq *rq;
7470 int i; 7561 int i;
7471 7562
@@ -7479,100 +7570,89 @@ struct task_group *sched_create_group(void)
7479 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7570 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
7480 if (!tg->se) 7571 if (!tg->se)
7481 goto err; 7572 goto err;
7573 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7574 if (!tg->rt_rq)
7575 goto err;
7576 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7577 if (!tg->rt_se)
7578 goto err;
7579
7580 tg->shares = NICE_0_LOAD;
7581 tg->rt_ratio = 0; /* XXX */
7482 7582
7483 for_each_possible_cpu(i) { 7583 for_each_possible_cpu(i) {
7484 rq = cpu_rq(i); 7584 rq = cpu_rq(i);
7485 7585
7486 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, 7586 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
7487 cpu_to_node(i)); 7587 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7488 if (!cfs_rq) 7588 if (!cfs_rq)
7489 goto err; 7589 goto err;
7490 7590
7491 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, 7591 se = kmalloc_node(sizeof(struct sched_entity),
7492 cpu_to_node(i)); 7592 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7493 if (!se) 7593 if (!se)
7494 goto err; 7594 goto err;
7495 7595
7496 memset(cfs_rq, 0, sizeof(struct cfs_rq)); 7596 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7497 memset(se, 0, sizeof(struct sched_entity)); 7597 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7598 if (!rt_rq)
7599 goto err;
7498 7600
7499 tg->cfs_rq[i] = cfs_rq; 7601 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
7500 init_cfs_rq(cfs_rq, rq); 7602 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7501 cfs_rq->tg = tg; 7603 if (!rt_se)
7604 goto err;
7502 7605
7503 tg->se[i] = se; 7606 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7504 se->cfs_rq = &rq->cfs; 7607 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7505 se->my_q = cfs_rq;
7506 se->load.weight = NICE_0_LOAD;
7507 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
7508 se->parent = NULL;
7509 } 7608 }
7510 7609
7511 tg->shares = NICE_0_LOAD;
7512
7513 lock_task_group_list(); 7610 lock_task_group_list();
7514 for_each_possible_cpu(i) { 7611 for_each_possible_cpu(i) {
7515 rq = cpu_rq(i); 7612 rq = cpu_rq(i);
7516 cfs_rq = tg->cfs_rq[i]; 7613 cfs_rq = tg->cfs_rq[i];
7517 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7614 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7615 rt_rq = tg->rt_rq[i];
7616 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7518 } 7617 }
7618 list_add_rcu(&tg->list, &task_groups);
7519 unlock_task_group_list(); 7619 unlock_task_group_list();
7520 7620
7521 return tg; 7621 return tg;
7522 7622
7523err: 7623err:
7524 for_each_possible_cpu(i) { 7624 free_sched_group(tg);
7525 if (tg->cfs_rq)
7526 kfree(tg->cfs_rq[i]);
7527 if (tg->se)
7528 kfree(tg->se[i]);
7529 }
7530 kfree(tg->cfs_rq);
7531 kfree(tg->se);
7532 kfree(tg);
7533
7534 return ERR_PTR(-ENOMEM); 7625 return ERR_PTR(-ENOMEM);
7535} 7626}
7536 7627
7537/* rcu callback to free various structures associated with a task group */ 7628/* rcu callback to free various structures associated with a task group */
7538static void free_sched_group(struct rcu_head *rhp) 7629static void free_sched_group_rcu(struct rcu_head *rhp)
7539{ 7630{
7540 struct task_group *tg = container_of(rhp, struct task_group, rcu);
7541 struct cfs_rq *cfs_rq;
7542 struct sched_entity *se;
7543 int i;
7544
7545 /* now it should be safe to free those cfs_rqs */ 7631 /* now it should be safe to free those cfs_rqs */
7546 for_each_possible_cpu(i) { 7632 free_sched_group(container_of(rhp, struct task_group, rcu));
7547 cfs_rq = tg->cfs_rq[i];
7548 kfree(cfs_rq);
7549
7550 se = tg->se[i];
7551 kfree(se);
7552 }
7553
7554 kfree(tg->cfs_rq);
7555 kfree(tg->se);
7556 kfree(tg);
7557} 7633}
7558 7634
7559/* Destroy runqueue etc associated with a task group */ 7635/* Destroy runqueue etc associated with a task group */
7560void sched_destroy_group(struct task_group *tg) 7636void sched_destroy_group(struct task_group *tg)
7561{ 7637{
7562 struct cfs_rq *cfs_rq = NULL; 7638 struct cfs_rq *cfs_rq = NULL;
7639 struct rt_rq *rt_rq = NULL;
7563 int i; 7640 int i;
7564 7641
7565 lock_task_group_list(); 7642 lock_task_group_list();
7566 for_each_possible_cpu(i) { 7643 for_each_possible_cpu(i) {
7567 cfs_rq = tg->cfs_rq[i]; 7644 cfs_rq = tg->cfs_rq[i];
7568 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7645 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7646 rt_rq = tg->rt_rq[i];
7647 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7569 } 7648 }
7649 list_del_rcu(&tg->list);
7570 unlock_task_group_list(); 7650 unlock_task_group_list();
7571 7651
7572 BUG_ON(!cfs_rq); 7652 BUG_ON(!cfs_rq);
7573 7653
7574 /* wait for possible concurrent references to cfs_rqs complete */ 7654 /* wait for possible concurrent references to cfs_rqs complete */
7575 call_rcu(&tg->rcu, free_sched_group); 7655 call_rcu(&tg->rcu, free_sched_group_rcu);
7576} 7656}
7577 7657
7578/* change task's runqueue when it moves between groups. 7658/* change task's runqueue when it moves between groups.
@@ -7588,11 +7668,6 @@ void sched_move_task(struct task_struct *tsk)
7588 7668
7589 rq = task_rq_lock(tsk, &flags); 7669 rq = task_rq_lock(tsk, &flags);
7590 7670
7591 if (tsk->sched_class != &fair_sched_class) {
7592 set_task_cfs_rq(tsk, task_cpu(tsk));
7593 goto done;
7594 }
7595
7596 update_rq_clock(rq); 7671 update_rq_clock(rq);
7597 7672
7598 running = task_current(rq, tsk); 7673 running = task_current(rq, tsk);
@@ -7604,7 +7679,7 @@ void sched_move_task(struct task_struct *tsk)
7604 tsk->sched_class->put_prev_task(rq, tsk); 7679 tsk->sched_class->put_prev_task(rq, tsk);
7605 } 7680 }
7606 7681
7607 set_task_cfs_rq(tsk, task_cpu(tsk)); 7682 set_task_rq(tsk, task_cpu(tsk));
7608 7683
7609 if (on_rq) { 7684 if (on_rq) {
7610 if (unlikely(running)) 7685 if (unlikely(running))
@@ -7612,7 +7687,6 @@ void sched_move_task(struct task_struct *tsk)
7612 enqueue_task(rq, tsk, 0); 7687 enqueue_task(rq, tsk, 0);
7613 } 7688 }
7614 7689
7615done:
7616 task_rq_unlock(rq, &flags); 7690 task_rq_unlock(rq, &flags);
7617} 7691}
7618 7692
@@ -7697,6 +7771,31 @@ unsigned long sched_group_shares(struct task_group *tg)
7697 return tg->shares; 7771 return tg->shares;
7698} 7772}
7699 7773
7774/*
7775 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
7776 */
7777int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
7778{
7779 struct task_group *tgi;
7780 unsigned long total = 0;
7781
7782 rcu_read_lock();
7783 list_for_each_entry_rcu(tgi, &task_groups, list)
7784 total += tgi->rt_ratio;
7785 rcu_read_unlock();
7786
7787 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
7788 return -EINVAL;
7789
7790 tg->rt_ratio = rt_ratio;
7791 return 0;
7792}
7793
7794unsigned long sched_group_rt_ratio(struct task_group *tg)
7795{
7796 return tg->rt_ratio;
7797}
7798
7700#endif /* CONFIG_FAIR_GROUP_SCHED */ 7799#endif /* CONFIG_FAIR_GROUP_SCHED */
7701 7800
7702#ifdef CONFIG_FAIR_CGROUP_SCHED 7801#ifdef CONFIG_FAIR_CGROUP_SCHED
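The sched_group_set_rt_ratio() hunk above enforces a simple admission rule: the ratios of all task groups must sum to no more than the global sysctl_sched_rt_ratio. A hedged user-space sketch of that check follows; the group values are made-up examples, not kernel defaults.

/* Sketch of the admission rule in sched_group_set_rt_ratio(); the example
 * group ratios are invented for illustration. */
#include <stdio.h>

#define SYSCTL_SCHED_RT_RATIO 62259UL                    /* global cap, ~95% */

static unsigned long group_ratio[] = { 30000, 20000 };   /* existing groups */

static int set_rt_ratio(unsigned long old_ratio, unsigned long new_ratio)
{
	unsigned long total = 0;
	unsigned int i;

	for (i = 0; i < sizeof(group_ratio) / sizeof(group_ratio[0]); i++)
		total += group_ratio[i];

	/* same test as the patch: the new total must stay under the cap */
	if (total + new_ratio - old_ratio > SYSCTL_SCHED_RT_RATIO)
		return -1;                               /* -EINVAL in the kernel */
	return 0;
}

int main(void)
{
	/* raise the first group from 30000 to 40000: total 60000 <= 62259, ok */
	printf("%d\n", set_rt_ratio(30000, 40000));
	/* raise it to 45000 instead: total 65000 > 62259, rejected */
	printf("%d\n", set_rt_ratio(30000, 45000));
	return 0;
}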
@@ -7772,12 +7871,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7772 return (u64) tg->shares; 7871 return (u64) tg->shares;
7773} 7872}
7774 7873
7874static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7875 u64 rt_ratio_val)
7876{
7877 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
7878}
7879
7880static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
7881{
7882 struct task_group *tg = cgroup_tg(cgrp);
7883
7884 return (u64) tg->rt_ratio;
7885}
7886
7775static struct cftype cpu_files[] = { 7887static struct cftype cpu_files[] = {
7776 { 7888 {
7777 .name = "shares", 7889 .name = "shares",
7778 .read_uint = cpu_shares_read_uint, 7890 .read_uint = cpu_shares_read_uint,
7779 .write_uint = cpu_shares_write_uint, 7891 .write_uint = cpu_shares_write_uint,
7780 }, 7892 },
7893 {
7894 .name = "rt_ratio",
7895 .read_uint = cpu_rt_ratio_read_uint,
7896 .write_uint = cpu_rt_ratio_write_uint,
7897 },
7781}; 7898};
7782 7899
7783static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7900static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fd10d965aa06..1178257613ad 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -45,47 +45,167 @@ static void update_rt_migration(struct rq *rq)
45} 45}
46#endif /* CONFIG_SMP */ 46#endif /* CONFIG_SMP */
47 47
48static int sched_rt_ratio_exceeded(struct rq *rq, struct rt_rq *rt_rq) 48static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
49{ 49{
50 return container_of(rt_se, struct task_struct, rt);
51}
52
53static inline int on_rt_rq(struct sched_rt_entity *rt_se)
54{
55 return !list_empty(&rt_se->run_list);
56}
57
58#ifdef CONFIG_FAIR_GROUP_SCHED
59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
61{
62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC;
64
65 return rt_rq->tg->rt_ratio;
66}
67
68#define for_each_leaf_rt_rq(rt_rq, rq) \
69 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
70
71static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
72{
73 return rt_rq->rq;
74}
75
76static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
77{
78 return rt_se->rt_rq;
79}
80
81#define for_each_sched_rt_entity(rt_se) \
82 for (; rt_se; rt_se = rt_se->parent)
83
84static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
85{
86 return rt_se->my_q;
87}
88
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95
96 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
97 enqueue_rt_entity(rt_se);
98 resched_task(rq_of_rt_rq(rt_rq)->curr);
99 }
100}
101
102static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
103{
104 struct sched_rt_entity *rt_se = rt_rq->rt_se;
105
106 if (rt_se && on_rt_rq(rt_se))
107 dequeue_rt_entity(rt_se);
108}
109
110#else
111
112static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
113{
114 return sysctl_sched_rt_ratio;
115}
116
117#define for_each_leaf_rt_rq(rt_rq, rq) \
118 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
119
120static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
121{
122 return container_of(rt_rq, struct rq, rt);
123}
124
125static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
126{
127 struct task_struct *p = rt_task_of(rt_se);
128 struct rq *rq = task_rq(p);
129
130 return &rq->rt;
131}
132
133#define for_each_sched_rt_entity(rt_se) \
134 for (; rt_se; rt_se = NULL)
135
136static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
137{
138 return NULL;
139}
140
141static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
142{
143}
144
145static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
146{
147}
148
149#endif
150
151static inline int rt_se_prio(struct sched_rt_entity *rt_se)
152{
153#ifdef CONFIG_FAIR_GROUP_SCHED
154 struct rt_rq *rt_rq = group_rt_rq(rt_se);
155
156 if (rt_rq)
157 return rt_rq->highest_prio;
158#endif
159
160 return rt_task_of(rt_se)->prio;
161}
162
163static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
164{
165 unsigned int rt_ratio = sched_rt_ratio(rt_rq);
50 u64 period, ratio; 166 u64 period, ratio;
51 167
52 if (sysctl_sched_rt_ratio == SCHED_RT_FRAC) 168 if (rt_ratio == SCHED_RT_FRAC)
53 return 0; 169 return 0;
54 170
55 if (rt_rq->rt_throttled) 171 if (rt_rq->rt_throttled)
56 return 1; 172 return 1;
57 173
58 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; 174 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
59 ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT; 175 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
60 176
61 if (rt_rq->rt_time > ratio) { 177 if (rt_rq->rt_time > ratio) {
62 rt_rq->rt_throttled = rq->clock + period - rt_rq->rt_time; 178 rt_rq->rt_throttled = 1;
179 sched_rt_ratio_dequeue(rt_rq);
63 return 1; 180 return 1;
64 } 181 }
65 182
66 return 0; 183 return 0;
67} 184}
68 185
186static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
187{
188 unsigned long rt_ratio = sched_rt_ratio(rt_rq);
189 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
190
191 rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
192 if (rt_rq->rt_throttled) {
193 rt_rq->rt_throttled = 0;
194 sched_rt_ratio_enqueue(rt_rq);
195 }
196}
197
69static void update_sched_rt_period(struct rq *rq) 198static void update_sched_rt_period(struct rq *rq)
70{ 199{
71 while (rq->clock > rq->rt_period_expire) { 200 struct rt_rq *rt_rq;
72 u64 period, ratio; 201 u64 period;
73 202
203 while (rq->clock > rq->rt_period_expire) {
74 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; 204 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
75 ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;
76
77 rq->rt.rt_time -= min(rq->rt.rt_time, ratio);
78 rq->rt_period_expire += period; 205 rq->rt_period_expire += period;
79 }
80 206
81 /* 207 for_each_leaf_rt_rq(rt_rq, rq)
82 * When the rt throttle is expired, let them rip. 208 __update_sched_rt_period(rt_rq, period);
83 * (XXX: use hrtick when available)
84 */
85 if (rq->rt.rt_throttled && rq->clock > rq->rt.rt_throttled) {
86 rq->rt.rt_throttled = 0;
87 if (!sched_rt_ratio_exceeded(rq, &rq->rt))
88 resched_task(rq->curr);
89 } 209 }
90} 210}
91 211
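With the defaults visible in this patch (sysctl_sched_rt_period = 1000, i.e. a 1000 ms period, and a ratio of roughly 95% assuming the 16-bit fixed point noted earlier), an rt_rq can accumulate about 950 ms of rt_time per period before sched_rt_ratio_exceeded() marks it throttled and sched_rt_ratio_dequeue() pulls its group entity off the parent runqueue; update_sched_rt_period() then refunds one quota and re-enqueues the group at the next period boundary.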
@@ -96,6 +216,8 @@ static void update_sched_rt_period(struct rq *rq)
96static void update_curr_rt(struct rq *rq) 216static void update_curr_rt(struct rq *rq)
97{ 217{
98 struct task_struct *curr = rq->curr; 218 struct task_struct *curr = rq->curr;
219 struct sched_rt_entity *rt_se = &curr->rt;
220 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
99 u64 delta_exec; 221 u64 delta_exec;
100 222
101 if (!task_has_rt_policy(curr)) 223 if (!task_has_rt_policy(curr))
@@ -111,95 +233,184 @@ static void update_curr_rt(struct rq *rq)
111 curr->se.exec_start = rq->clock; 233 curr->se.exec_start = rq->clock;
112 cpuacct_charge(curr, delta_exec); 234 cpuacct_charge(curr, delta_exec);
113 235
114 rq->rt.rt_time += delta_exec; 236 rt_rq->rt_time += delta_exec;
115 update_sched_rt_period(rq); 237 /*
116 if (sched_rt_ratio_exceeded(rq, &rq->rt)) 238 * might make it a tad more accurate:
239 *
240 * update_sched_rt_period(rq);
241 */
242 if (sched_rt_ratio_exceeded(rt_rq))
117 resched_task(curr); 243 resched_task(curr);
118} 244}
119 245
120static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) 246static inline
247void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
121{ 248{
122 WARN_ON(!rt_task(p)); 249 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
123 rq->rt.rt_nr_running++; 250 rt_rq->rt_nr_running++;
251#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
252 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
253 rt_rq->highest_prio = rt_se_prio(rt_se);
254#endif
124#ifdef CONFIG_SMP 255#ifdef CONFIG_SMP
125 if (p->prio < rq->rt.highest_prio) 256 if (rt_se->nr_cpus_allowed > 1) {
126 rq->rt.highest_prio = p->prio; 257 struct rq *rq = rq_of_rt_rq(rt_rq);
127 if (p->nr_cpus_allowed > 1)
128 rq->rt.rt_nr_migratory++; 258 rq->rt.rt_nr_migratory++;
259 }
129 260
130 update_rt_migration(rq); 261 update_rt_migration(rq_of_rt_rq(rt_rq));
131#endif /* CONFIG_SMP */ 262#endif
132} 263}
133 264
134static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) 265static inline
266void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
135{ 267{
136 WARN_ON(!rt_task(p)); 268 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
137 WARN_ON(!rq->rt.rt_nr_running); 269 WARN_ON(!rt_rq->rt_nr_running);
138 rq->rt.rt_nr_running--; 270 rt_rq->rt_nr_running--;
139#ifdef CONFIG_SMP 271#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
140 if (rq->rt.rt_nr_running) { 272 if (rt_rq->rt_nr_running) {
141 struct rt_prio_array *array; 273 struct rt_prio_array *array;
142 274
143 WARN_ON(p->prio < rq->rt.highest_prio); 275 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
144 if (p->prio == rq->rt.highest_prio) { 276 if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
145 /* recalculate */ 277 /* recalculate */
146 array = &rq->rt.active; 278 array = &rt_rq->active;
147 rq->rt.highest_prio = 279 rt_rq->highest_prio =
148 sched_find_first_bit(array->bitmap); 280 sched_find_first_bit(array->bitmap);
149 } /* otherwise leave rq->highest prio alone */ 281 } /* otherwise leave rq->highest prio alone */
150 } else 282 } else
151 rq->rt.highest_prio = MAX_RT_PRIO; 283 rt_rq->highest_prio = MAX_RT_PRIO;
152 if (p->nr_cpus_allowed > 1) 284#endif
285#ifdef CONFIG_SMP
286 if (rt_se->nr_cpus_allowed > 1) {
287 struct rq *rq = rq_of_rt_rq(rt_rq);
153 rq->rt.rt_nr_migratory--; 288 rq->rt.rt_nr_migratory--;
289 }
154 290
155 update_rt_migration(rq); 291 update_rt_migration(rq_of_rt_rq(rt_rq));
156#endif /* CONFIG_SMP */ 292#endif /* CONFIG_SMP */
157} 293}
158 294
159static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 295static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
160{ 296{
161 struct rt_prio_array *array = &rq->rt.active; 297 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
298 struct rt_prio_array *array = &rt_rq->active;
299 struct rt_rq *group_rq = group_rt_rq(rt_se);
162 300
163 list_add_tail(&p->rt.run_list, array->queue + p->prio); 301 if (group_rq && group_rq->rt_throttled)
164 __set_bit(p->prio, array->bitmap); 302 return;
165 inc_cpu_load(rq, p->se.load.weight);
166 303
167 inc_rt_tasks(p, rq); 304 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
305 __set_bit(rt_se_prio(rt_se), array->bitmap);
168 306
169 if (wakeup) 307 inc_rt_tasks(rt_se, rt_rq);
170 p->rt.timeout = 0; 308}
309
310static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
311{
312 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
313 struct rt_prio_array *array = &rt_rq->active;
314
315 list_del_init(&rt_se->run_list);
316 if (list_empty(array->queue + rt_se_prio(rt_se)))
317 __clear_bit(rt_se_prio(rt_se), array->bitmap);
318
319 dec_rt_tasks(rt_se, rt_rq);
320}
321
322/*
323 * Because the prio of an upper entry depends on the lower
324 * entries, we must remove entries top - down.
325 *
326 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
327 * doesn't matter much for now, as h=2 for GROUP_SCHED.
328 */
329static void dequeue_rt_stack(struct task_struct *p)
330{
331 struct sched_rt_entity *rt_se, *top_se;
332
333 /*
334 * dequeue all, top - down.
335 */
336 do {
337 rt_se = &p->rt;
338 top_se = NULL;
339 for_each_sched_rt_entity(rt_se) {
340 if (on_rt_rq(rt_se))
341 top_se = rt_se;
342 }
343 if (top_se)
344 dequeue_rt_entity(top_se);
345 } while (top_se);
171} 346}
172 347
173/* 348/*
174 * Adding/removing a task to/from a priority array: 349 * Adding/removing a task to/from a priority array:
175 */ 350 */
351static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
352{
353 struct sched_rt_entity *rt_se = &p->rt;
354
355 if (wakeup)
356 rt_se->timeout = 0;
357
358 dequeue_rt_stack(p);
359
360 /*
361 * enqueue everybody, bottom - up.
362 */
363 for_each_sched_rt_entity(rt_se)
364 enqueue_rt_entity(rt_se);
365
366 inc_cpu_load(rq, p->se.load.weight);
367}
368
176static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 369static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
177{ 370{
178 struct rt_prio_array *array = &rq->rt.active; 371 struct sched_rt_entity *rt_se = &p->rt;
372 struct rt_rq *rt_rq;
179 373
180 update_curr_rt(rq); 374 update_curr_rt(rq);
181 375
182 list_del(&p->rt.run_list); 376 dequeue_rt_stack(p);
183 if (list_empty(array->queue + p->prio)) 377
184 __clear_bit(p->prio, array->bitmap); 378 /*
185 dec_cpu_load(rq, p->se.load.weight); 379 * re-enqueue all non-empty rt_rq entities.
380 */
381 for_each_sched_rt_entity(rt_se) {
382 rt_rq = group_rt_rq(rt_se);
383 if (rt_rq && rt_rq->rt_nr_running)
384 enqueue_rt_entity(rt_se);
385 }
186 386
187 dec_rt_tasks(p, rq); 387 dec_cpu_load(rq, p->se.load.weight);
188} 388}
189 389
190/* 390/*
191 * Put task to the end of the run list without the overhead of dequeue 391 * Put task to the end of the run list without the overhead of dequeue
192 * followed by enqueue. 392 * followed by enqueue.
193 */ 393 */
394static
395void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
396{
397 struct rt_prio_array *array = &rt_rq->active;
398
399 list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
400}
401
194static void requeue_task_rt(struct rq *rq, struct task_struct *p) 402static void requeue_task_rt(struct rq *rq, struct task_struct *p)
195{ 403{
196 struct rt_prio_array *array = &rq->rt.active; 404 struct sched_rt_entity *rt_se = &p->rt;
405 struct rt_rq *rt_rq;
197 406
198 list_move_tail(&p->rt.run_list, array->queue + p->prio); 407 for_each_sched_rt_entity(rt_se) {
408 rt_rq = rt_rq_of_se(rt_se);
409 requeue_rt_entity(rt_rq, rt_se);
410 }
199} 411}
200 412
201static void 413static void yield_task_rt(struct rq *rq)
202yield_task_rt(struct rq *rq)
203{ 414{
204 requeue_task_rt(rq, rq->curr); 415 requeue_task_rt(rq, rq->curr);
205} 416}
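A note on the ordering used by dequeue_rt_stack() and enqueue_task_rt() above: since rt_se_prio() of a group entity is its runqueue's highest_prio, a group entity's queue position goes stale the moment one of its children is added or removed, so the stack is torn down top-down and rebuilt bottom-up so that every group entity is re-inserted at a priority that already reflects its children. This is the O(h^2) walk the XXX comment in the hunk refers to; with only two levels of grouping it stays cheap.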
@@ -229,7 +440,7 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
229 * cold cache anyway. 440 * cold cache anyway.
230 */ 441 */
231 if (unlikely(rt_task(rq->curr)) && 442 if (unlikely(rt_task(rq->curr)) &&
232 (p->nr_cpus_allowed > 1)) { 443 (p->rt.nr_cpus_allowed > 1)) {
233 int cpu = find_lowest_rq(p); 444 int cpu = find_lowest_rq(p);
234 445
235 return (cpu == -1) ? task_cpu(p) : cpu; 446 return (cpu == -1) ? task_cpu(p) : cpu;
@@ -252,27 +463,51 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
252 resched_task(rq->curr); 463 resched_task(rq->curr);
253} 464}
254 465
255static struct task_struct *pick_next_task_rt(struct rq *rq) 466static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
467 struct rt_rq *rt_rq)
256{ 468{
257 struct rt_prio_array *array = &rq->rt.active; 469 struct rt_prio_array *array = &rt_rq->active;
258 struct task_struct *next; 470 struct sched_rt_entity *next = NULL;
259 struct list_head *queue; 471 struct list_head *queue;
260 struct rt_rq *rt_rq = &rq->rt;
261 int idx; 472 int idx;
262 473
263 if (sched_rt_ratio_exceeded(rq, rt_rq)) 474 if (sched_rt_ratio_exceeded(rt_rq))
264 return NULL; 475 goto out;
265 476
266 idx = sched_find_first_bit(array->bitmap); 477 idx = sched_find_first_bit(array->bitmap);
267 if (idx >= MAX_RT_PRIO) 478 BUG_ON(idx >= MAX_RT_PRIO);
268 return NULL;
269 479
270 queue = array->queue + idx; 480 queue = array->queue + idx;
271 next = list_entry(queue->next, struct task_struct, rt.run_list); 481 next = list_entry(queue->next, struct sched_rt_entity, run_list);
482 out:
483 return next;
484}
272 485
273 next->se.exec_start = rq->clock; 486static struct task_struct *pick_next_task_rt(struct rq *rq)
487{
488 struct sched_rt_entity *rt_se;
489 struct task_struct *p;
490 struct rt_rq *rt_rq;
274 491
275 return next; 492 retry:
493 rt_rq = &rq->rt;
494
495 if (unlikely(!rt_rq->rt_nr_running))
496 return NULL;
497
498 if (sched_rt_ratio_exceeded(rt_rq))
499 return NULL;
500
501 do {
502 rt_se = pick_next_rt_entity(rq, rt_rq);
503 if (unlikely(!rt_se))
504 goto retry;
505 rt_rq = group_rt_rq(rt_se);
506 } while (rt_rq);
507
508 p = rt_task_of(rt_se);
509 p->se.exec_start = rq->clock;
510 return p;
276} 511}
277 512
278static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 513static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
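pick_next_task_rt() above is now a two-phase walk: after checking rq->rt for runnable tasks and throttling, it repeatedly takes the highest-priority entity of the current rt_rq and, while that entity is a group (group_rt_rq() returns its own runqueue), descends into that runqueue until it reaches a task-level entity; if a group it descends into turns out to have exceeded its ratio, the retry label restarts the pick from the top-level rt_rq.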
@@ -282,6 +517,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
282} 517}
283 518
284#ifdef CONFIG_SMP 519#ifdef CONFIG_SMP
520
285/* Only try algorithms three times */ 521/* Only try algorithms three times */
286#define RT_MAX_TRIES 3 522#define RT_MAX_TRIES 3
287 523
@@ -292,7 +528,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
292{ 528{
293 if (!task_running(rq, p) && 529 if (!task_running(rq, p) &&
294 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && 530 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
295 (p->nr_cpus_allowed > 1)) 531 (p->rt.nr_cpus_allowed > 1))
296 return 1; 532 return 1;
297 return 0; 533 return 0;
298} 534}
@@ -300,52 +536,33 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
300/* Return the second highest RT task, NULL otherwise */ 536/* Return the second highest RT task, NULL otherwise */
301static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) 537static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
302{ 538{
303 struct rt_prio_array *array = &rq->rt.active; 539 struct task_struct *next = NULL;
304 struct task_struct *next; 540 struct sched_rt_entity *rt_se;
305 struct list_head *queue; 541 struct rt_prio_array *array;
542 struct rt_rq *rt_rq;
306 int idx; 543 int idx;
307 544
308 if (likely(rq->rt.rt_nr_running < 2)) 545 for_each_leaf_rt_rq(rt_rq, rq) {
309 return NULL; 546 array = &rt_rq->active;
310 547 idx = sched_find_first_bit(array->bitmap);
311 idx = sched_find_first_bit(array->bitmap); 548 next_idx:
312 if (unlikely(idx >= MAX_RT_PRIO)) { 549 if (idx >= MAX_RT_PRIO)
313 WARN_ON(1); /* rt_nr_running is bad */ 550 continue;
314 return NULL; 551 if (next && next->prio < idx)
315 } 552 continue;
316 553 list_for_each_entry(rt_se, array->queue + idx, run_list) {
317 queue = array->queue + idx; 554 struct task_struct *p = rt_task_of(rt_se);
318 BUG_ON(list_empty(queue)); 555 if (pick_rt_task(rq, p, cpu)) {
319 556 next = p;
320 next = list_entry(queue->next, struct task_struct, rt.run_list); 557 break;
321 if (unlikely(pick_rt_task(rq, next, cpu))) 558 }
322 goto out; 559 }
323 560 if (!next) {
324 if (queue->next->next != queue) { 561 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
325 /* same prio task */ 562 goto next_idx;
326 next = list_entry(queue->next->next, struct task_struct, 563 }
327 rt.run_list);
328 if (pick_rt_task(rq, next, cpu))
329 goto out;
330 }
331
332 retry:
333 /* slower, but more flexible */
334 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
335 if (unlikely(idx >= MAX_RT_PRIO))
336 return NULL;
337
338 queue = array->queue + idx;
339 BUG_ON(list_empty(queue));
340
341 list_for_each_entry(next, queue, rt.run_list) {
342 if (pick_rt_task(rq, next, cpu))
343 goto out;
344 } 564 }
345 565
346 goto retry;
347
348 out:
349 return next; 566 return next;
350} 567}
351 568
@@ -774,12 +991,12 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
774 * Update the migration status of the RQ if we have an RT task 991 * Update the migration status of the RQ if we have an RT task
775 * which is running AND changing its weight value. 992 * which is running AND changing its weight value.
776 */ 993 */
777 if (p->se.on_rq && (weight != p->nr_cpus_allowed)) { 994 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
778 struct rq *rq = task_rq(p); 995 struct rq *rq = task_rq(p);
779 996
780 if ((p->nr_cpus_allowed <= 1) && (weight > 1)) { 997 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
781 rq->rt.rt_nr_migratory++; 998 rq->rt.rt_nr_migratory++;
782 } else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) { 999 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
783 BUG_ON(!rq->rt.rt_nr_migratory); 1000 BUG_ON(!rq->rt.rt_nr_migratory);
784 rq->rt.rt_nr_migratory--; 1001 rq->rt.rt_nr_migratory--;
785 } 1002 }
@@ -788,7 +1005,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
788 } 1005 }
789 1006
790 p->cpus_allowed = *new_mask; 1007 p->cpus_allowed = *new_mask;
791 p->nr_cpus_allowed = weight; 1008 p->rt.nr_cpus_allowed = weight;
792} 1009}
793 1010
794/* Assumes rq->lock is held */ 1011/* Assumes rq->lock is held */