-rw-r--r--  include/linux/init_task.h  |   5
-rw-r--r--  include/linux/sched.h      |  10
-rw-r--r--  kernel/fork.c              |   2
-rw-r--r--  kernel/sched.c             | 283
-rw-r--r--  kernel/sched_rt.c          | 455
5 files changed, 549 insertions, 206 deletions
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index ee65d87bedb7..796019b22b6f 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -130,12 +130,13 @@ extern struct group_info init_groups;
130 .normal_prio = MAX_PRIO-20, \ 130 .normal_prio = MAX_PRIO-20, \
131 .policy = SCHED_NORMAL, \ 131 .policy = SCHED_NORMAL, \
132 .cpus_allowed = CPU_MASK_ALL, \ 132 .cpus_allowed = CPU_MASK_ALL, \
133 .nr_cpus_allowed = NR_CPUS, \
134 .mm = NULL, \ 133 .mm = NULL, \
135 .active_mm = &init_mm, \ 134 .active_mm = &init_mm, \
136 .rt = { \ 135 .rt = { \
137 .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ 136 .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
138 .time_slice = HZ, }, \ 137 .time_slice = HZ, \
138 .nr_cpus_allowed = NR_CPUS, \
139 }, \
139 .ioprio = 0, \ 140 .ioprio = 0, \
140 .tasks = LIST_HEAD_INIT(tsk.tasks), \ 141 .tasks = LIST_HEAD_INIT(tsk.tasks), \
141 .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ 142 .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d5ea144df836..04eecbf0241e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -934,6 +934,15 @@ struct sched_rt_entity {
934 struct list_head run_list; 934 struct list_head run_list;
935 unsigned int time_slice; 935 unsigned int time_slice;
936 unsigned long timeout; 936 unsigned long timeout;
937 int nr_cpus_allowed;
938
939#ifdef CONFIG_FAIR_GROUP_SCHED
940 struct sched_rt_entity *parent;
941 /* rq on which this entity is (to be) queued: */
942 struct rt_rq *rt_rq;
943 /* rq "owned" by this entity/group: */
944 struct rt_rq *my_q;
945#endif
937}; 946};
938 947
939struct task_struct { 948struct task_struct {
@@ -978,7 +987,6 @@ struct task_struct {
978 987
979 unsigned int policy; 988 unsigned int policy;
980 cpumask_t cpus_allowed; 989 cpumask_t cpus_allowed;
981 int nr_cpus_allowed;
982 990
983#ifdef CONFIG_PREEMPT_RCU 991#ifdef CONFIG_PREEMPT_RCU
984 int rcu_read_lock_nesting; 992 int rcu_read_lock_nesting;
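Taken together, the two sched.h hunks move nr_cpus_allowed out of task_struct into the per-task struct sched_rt_entity, and give that entity the same group-scheduling plumbing struct sched_entity already has. A rough sketch of the resulting structure, as implied by the diff (the last three members exist only under CONFIG_FAIR_GROUP_SCHED):

    struct sched_rt_entity {
            struct list_head        run_list;
            unsigned int            time_slice;
            unsigned long           timeout;
            int                     nr_cpus_allowed;  /* moved from task_struct */
    #ifdef CONFIG_FAIR_GROUP_SCHED
            struct sched_rt_entity  *parent;  /* entity of the parent group on this cpu */
            struct rt_rq            *rt_rq;   /* rq on which this entity is (to be) queued */
            struct rt_rq            *my_q;    /* rq "owned" by this entity/group */
    #endif
    };

For a plain task my_q stays NULL (it is never set for tasks); for a group's per-cpu entity it points at that group's rt_rq, which is what group_rt_rq() in kernel/sched_rt.c later uses to tell the two apart.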
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f8ef32cbc7a..0c969f4fade0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1246,7 +1246,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1246 * parent's CPU). This avoids alot of nasty races. 1246 * parent's CPU). This avoids alot of nasty races.
1247 */ 1247 */
1248 p->cpus_allowed = current->cpus_allowed; 1248 p->cpus_allowed = current->cpus_allowed;
1249 p->nr_cpus_allowed = current->nr_cpus_allowed; 1249 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1250 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || 1250 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1251 !cpu_online(task_cpu(p)))) 1251 !cpu_online(task_cpu(p))))
1252 set_task_cpu(p, smp_processor_id()); 1252 set_task_cpu(p, smp_processor_id());
diff --git a/kernel/sched.c b/kernel/sched.c
index e9a7beee9b79..5ea2c533b432 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -161,6 +161,8 @@ struct rt_prio_array {
161 161
162struct cfs_rq; 162struct cfs_rq;
163 163
164static LIST_HEAD(task_groups);
165
164/* task group related information */ 166/* task group related information */
165struct task_group { 167struct task_group {
166#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -171,6 +173,11 @@ struct task_group {
171 /* runqueue "owned" by this group on each cpu */ 173 /* runqueue "owned" by this group on each cpu */
172 struct cfs_rq **cfs_rq; 174 struct cfs_rq **cfs_rq;
173 175
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
174 /* 181 /*
175 * shares assigned to a task group governs how much of cpu bandwidth 182 * shares assigned to a task group governs how much of cpu bandwidth
176 * is allocated to the group. The more shares a group has, the more is 183 * is allocated to the group. The more shares a group has, the more is
@@ -208,6 +215,7 @@ struct task_group {
208 unsigned long shares; 215 unsigned long shares;
209 216
210 struct rcu_head rcu; 217 struct rcu_head rcu;
218 struct list_head list;
211}; 219};
212 220
213/* Default task group's sched entity on each cpu */ 221/* Default task group's sched entity on each cpu */
@@ -215,9 +223,15 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
215/* Default task group's cfs_rq on each cpu */ 223/* Default task group's cfs_rq on each cpu */
216static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
217 225
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
218static struct sched_entity *init_sched_entity_p[NR_CPUS]; 229static struct sched_entity *init_sched_entity_p[NR_CPUS];
219static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 230static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
220 231
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS];
234
221/* task_group_mutex serializes add/remove of task groups and also changes to 235/* task_group_mutex serializes add/remove of task groups and also changes to
222 * a task group's cpu shares. 236 * a task group's cpu shares.
223 */ 237 */
@@ -240,6 +254,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares);
240struct task_group init_task_group = { 254struct task_group init_task_group = {
241 .se = init_sched_entity_p, 255 .se = init_sched_entity_p,
242 .cfs_rq = init_cfs_rq_p, 256 .cfs_rq = init_cfs_rq_p,
257
258 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p,
243}; 260};
244 261
245#ifdef CONFIG_FAIR_USER_SCHED 262#ifdef CONFIG_FAIR_USER_SCHED
@@ -269,10 +286,13 @@ static inline struct task_group *task_group(struct task_struct *p)
269} 286}
270 287
271/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
272static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) 289static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
273{ 290{
274 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
275 p->se.parent = task_group(p)->se[cpu]; 292 p->se.parent = task_group(p)->se[cpu];
293
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu];
276} 296}
277 297
278static inline void lock_task_group_list(void) 298static inline void lock_task_group_list(void)
@@ -297,7 +317,7 @@ static inline void unlock_doms_cur(void)
297 317
298#else 318#else
299 319
300static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } 320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
301static inline void lock_task_group_list(void) { } 321static inline void lock_task_group_list(void) { }
302static inline void unlock_task_group_list(void) { } 322static inline void unlock_task_group_list(void) { }
303static inline void lock_doms_cur(void) { } 323static inline void lock_doms_cur(void) { }
@@ -343,13 +363,22 @@ struct cfs_rq {
343struct rt_rq { 363struct rt_rq {
344 struct rt_prio_array active; 364 struct rt_prio_array active;
345 unsigned long rt_nr_running; 365 unsigned long rt_nr_running;
366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */
368#endif
346#ifdef CONFIG_SMP 369#ifdef CONFIG_SMP
347 unsigned long rt_nr_migratory; 370 unsigned long rt_nr_migratory;
348 int highest_prio; /* highest queued rt task prio */
349 int overloaded; 371 int overloaded;
350#endif 372#endif
373 int rt_throttled;
351 u64 rt_time; 374 u64 rt_time;
352 u64 rt_throttled; 375
376#ifdef CONFIG_FAIR_GROUP_SCHED
377 struct rq *rq;
378 struct list_head leaf_rt_rq_list;
379 struct task_group *tg;
380 struct sched_rt_entity *rt_se;
381#endif
353}; 382};
354 383
355#ifdef CONFIG_SMP 384#ifdef CONFIG_SMP
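After this hunk every rt_rq, both the per-cpu one embedded in struct rq and (with group scheduling) the per-group per-cpu ones, carries roughly the following state. highest_prio leaves the CONFIG_SMP-only block because the group code needs it to derive a group entity's priority, and rt_throttled shrinks from a u64 expiry timestamp to a plain flag that the period update clears. A sketch of the result as implied by the diff:

    struct rt_rq {
            struct rt_prio_array    active;
            unsigned long           rt_nr_running;
    #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
            int                     highest_prio;     /* highest queued rt task prio */
    #endif
    #ifdef CONFIG_SMP
            unsigned long           rt_nr_migratory;
            int                     overloaded;
    #endif
            int                     rt_throttled;     /* over its ratio this period? */
            u64                     rt_time;          /* rt runtime used this period */
    #ifdef CONFIG_FAIR_GROUP_SCHED
            struct rq               *rq;              /* cpu runqueue we hang off */
            struct list_head        leaf_rt_rq_list;  /* entry in rq->leaf_rt_rq_list */
            struct task_group       *tg;
            struct sched_rt_entity  *rt_se;           /* group entity representing us */
    #endif
    };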
@@ -411,12 +440,14 @@ struct rq {
411 u64 nr_switches; 440 u64 nr_switches;
412 441
413 struct cfs_rq cfs; 442 struct cfs_rq cfs;
443 struct rt_rq rt;
444 u64 rt_period_expire;
445
414#ifdef CONFIG_FAIR_GROUP_SCHED 446#ifdef CONFIG_FAIR_GROUP_SCHED
415 /* list of leaf cfs_rq on this cpu: */ 447 /* list of leaf cfs_rq on this cpu: */
416 struct list_head leaf_cfs_rq_list; 448 struct list_head leaf_cfs_rq_list;
449 struct list_head leaf_rt_rq_list;
417#endif 450#endif
418 struct rt_rq rt;
419 u64 rt_period_expire;
420 451
421 /* 452 /*
422 * This is part of a global counter where only the total sum 453 * This is part of a global counter where only the total sum
@@ -613,9 +644,9 @@ const_debug unsigned int sysctl_sched_rt_period = 1000;
613 644
614/* 645/*
615 * ratio of time -rt tasks may consume. 646 * ratio of time -rt tasks may consume.
616 * default: 100% 647 * default: 95%
617 */ 648 */
618const_debug unsigned int sysctl_sched_rt_ratio = SCHED_RT_FRAC; 649const_debug unsigned int sysctl_sched_rt_ratio = 62259;
619 650
620/* 651/*
621 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 652 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
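The ratio is a fixed-point fraction of SCHED_RT_FRAC (1 << SCHED_RT_FRAC_SHIFT, i.e. 1 << 16 = 65536), so the new default is simply 95% rounded down to the nearest representable value:

    65536 * 95 / 100 = 62259.2  ->  62259 / 65536 ≈ 0.95

Combined with the sysctl_sched_rt_period default of 1000 (ms) visible above, realtime tasks now get roughly 950 ms of CPU per second by default, instead of the previous unthrottled 100%.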
@@ -1337,7 +1368,7 @@ unsigned long weighted_cpuload(const int cpu)
1337 1368
1338static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1369static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1339{ 1370{
1340 set_task_cfs_rq(p, cpu); 1371 set_task_rq(p, cpu);
1341#ifdef CONFIG_SMP 1372#ifdef CONFIG_SMP
1342 /* 1373 /*
1343 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1374 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
@@ -5281,7 +5312,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5281 p->sched_class->set_cpus_allowed(p, &new_mask); 5312 p->sched_class->set_cpus_allowed(p, &new_mask);
5282 else { 5313 else {
5283 p->cpus_allowed = new_mask; 5314 p->cpus_allowed = new_mask;
5284 p->nr_cpus_allowed = cpus_weight(new_mask); 5315 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5285 } 5316 }
5286 5317
5287 /* Can the task run on the task's current CPU? If so, we're done */ 5318 /* Can the task run on the task's current CPU? If so, we're done */
@@ -7079,8 +7110,50 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7079 7110
7080 rt_rq->rt_time = 0; 7111 rt_rq->rt_time = 0;
7081 rt_rq->rt_throttled = 0; 7112 rt_rq->rt_throttled = 0;
7113
7114#ifdef CONFIG_FAIR_GROUP_SCHED
7115 rt_rq->rq = rq;
7116#endif
7082} 7117}
7083 7118
7119#ifdef CONFIG_FAIR_GROUP_SCHED
7120static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7121 struct cfs_rq *cfs_rq, struct sched_entity *se,
7122 int cpu, int add)
7123{
7124 tg->cfs_rq[cpu] = cfs_rq;
7125 init_cfs_rq(cfs_rq, rq);
7126 cfs_rq->tg = tg;
7127 if (add)
7128 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7129
7130 tg->se[cpu] = se;
7131 se->cfs_rq = &rq->cfs;
7132 se->my_q = cfs_rq;
7133 se->load.weight = tg->shares;
7134 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7135 se->parent = NULL;
7136}
7137
7138static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7139 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7140 int cpu, int add)
7141{
7142 tg->rt_rq[cpu] = rt_rq;
7143 init_rt_rq(rt_rq, rq);
7144 rt_rq->tg = tg;
7145 rt_rq->rt_se = rt_se;
7146 if (add)
7147 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7148
7149 tg->rt_se[cpu] = rt_se;
7150 rt_se->rt_rq = &rq->rt;
7151 rt_se->my_q = rt_rq;
7152 rt_se->parent = NULL;
7153 INIT_LIST_HEAD(&rt_se->run_list);
7154}
7155#endif
7156
7084void __init sched_init(void) 7157void __init sched_init(void)
7085{ 7158{
7086 int highest_cpu = 0; 7159 int highest_cpu = 0;
@@ -7090,6 +7163,10 @@ void __init sched_init(void)
7090 init_defrootdomain(); 7163 init_defrootdomain();
7091#endif 7164#endif
7092 7165
7166#ifdef CONFIG_FAIR_GROUP_SCHED
7167 list_add(&init_task_group.list, &task_groups);
7168#endif
7169
7093 for_each_possible_cpu(i) { 7170 for_each_possible_cpu(i) {
7094 struct rq *rq; 7171 struct rq *rq;
7095 7172
@@ -7099,30 +7176,20 @@ void __init sched_init(void)
7099 rq->nr_running = 0; 7176 rq->nr_running = 0;
7100 rq->clock = 1; 7177 rq->clock = 1;
7101 init_cfs_rq(&rq->cfs, rq); 7178 init_cfs_rq(&rq->cfs, rq);
7179 init_rt_rq(&rq->rt, rq);
7102#ifdef CONFIG_FAIR_GROUP_SCHED 7180#ifdef CONFIG_FAIR_GROUP_SCHED
7103 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7104 {
7105 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
7106 struct sched_entity *se =
7107 &per_cpu(init_sched_entity, i);
7108
7109 init_cfs_rq_p[i] = cfs_rq;
7110 init_cfs_rq(cfs_rq, rq);
7111 cfs_rq->tg = &init_task_group;
7112 list_add(&cfs_rq->leaf_cfs_rq_list,
7113 &rq->leaf_cfs_rq_list);
7114
7115 init_sched_entity_p[i] = se;
7116 se->cfs_rq = &rq->cfs;
7117 se->my_q = cfs_rq;
7118 se->load.weight = init_task_group_load;
7119 se->load.inv_weight =
7120 div64_64(1ULL<<32, init_task_group_load);
7121 se->parent = NULL;
7122 }
7123 init_task_group.shares = init_task_group_load; 7181 init_task_group.shares = init_task_group_load;
7182 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7183 init_tg_cfs_entry(rq, &init_task_group,
7184 &per_cpu(init_cfs_rq, i),
7185 &per_cpu(init_sched_entity, i), i, 1);
7186
7187 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
7188 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7189 init_tg_rt_entry(rq, &init_task_group,
7190 &per_cpu(init_rt_rq, i),
7191 &per_cpu(init_sched_rt_entity, i), i, 1);
7124#endif 7192#endif
7125 init_rt_rq(&rq->rt, rq);
7126 rq->rt_period_expire = 0; 7193 rq->rt_period_expire = 0;
7127 7194
7128 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7195 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -7460,12 +7527,36 @@ static int load_balance_monitor(void *unused)
7460} 7527}
7461#endif /* CONFIG_SMP */ 7528#endif /* CONFIG_SMP */
7462 7529
7530static void free_sched_group(struct task_group *tg)
7531{
7532 int i;
7533
7534 for_each_possible_cpu(i) {
7535 if (tg->cfs_rq)
7536 kfree(tg->cfs_rq[i]);
7537 if (tg->se)
7538 kfree(tg->se[i]);
7539 if (tg->rt_rq)
7540 kfree(tg->rt_rq[i]);
7541 if (tg->rt_se)
7542 kfree(tg->rt_se[i]);
7543 }
7544
7545 kfree(tg->cfs_rq);
7546 kfree(tg->se);
7547 kfree(tg->rt_rq);
7548 kfree(tg->rt_se);
7549 kfree(tg);
7550}
7551
7463/* allocate runqueue etc for a new task group */ 7552/* allocate runqueue etc for a new task group */
7464struct task_group *sched_create_group(void) 7553struct task_group *sched_create_group(void)
7465{ 7554{
7466 struct task_group *tg; 7555 struct task_group *tg;
7467 struct cfs_rq *cfs_rq; 7556 struct cfs_rq *cfs_rq;
7468 struct sched_entity *se; 7557 struct sched_entity *se;
7558 struct rt_rq *rt_rq;
7559 struct sched_rt_entity *rt_se;
7469 struct rq *rq; 7560 struct rq *rq;
7470 int i; 7561 int i;
7471 7562
@@ -7479,100 +7570,89 @@ struct task_group *sched_create_group(void)
7479 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7570 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
7480 if (!tg->se) 7571 if (!tg->se)
7481 goto err; 7572 goto err;
7573 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7574 if (!tg->rt_rq)
7575 goto err;
7576 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7577 if (!tg->rt_se)
7578 goto err;
7579
7580 tg->shares = NICE_0_LOAD;
7581 tg->rt_ratio = 0; /* XXX */
7482 7582
7483 for_each_possible_cpu(i) { 7583 for_each_possible_cpu(i) {
7484 rq = cpu_rq(i); 7584 rq = cpu_rq(i);
7485 7585
7486 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, 7586 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
7487 cpu_to_node(i)); 7587 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7488 if (!cfs_rq) 7588 if (!cfs_rq)
7489 goto err; 7589 goto err;
7490 7590
7491 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, 7591 se = kmalloc_node(sizeof(struct sched_entity),
7492 cpu_to_node(i)); 7592 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7493 if (!se) 7593 if (!se)
7494 goto err; 7594 goto err;
7495 7595
7496 memset(cfs_rq, 0, sizeof(struct cfs_rq)); 7596 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7497 memset(se, 0, sizeof(struct sched_entity)); 7597 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7598 if (!rt_rq)
7599 goto err;
7498 7600
7499 tg->cfs_rq[i] = cfs_rq; 7601 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
7500 init_cfs_rq(cfs_rq, rq); 7602 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7501 cfs_rq->tg = tg; 7603 if (!rt_se)
7604 goto err;
7502 7605
7503 tg->se[i] = se; 7606 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7504 se->cfs_rq = &rq->cfs; 7607 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7505 se->my_q = cfs_rq;
7506 se->load.weight = NICE_0_LOAD;
7507 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
7508 se->parent = NULL;
7509 } 7608 }
7510 7609
7511 tg->shares = NICE_0_LOAD;
7512
7513 lock_task_group_list(); 7610 lock_task_group_list();
7514 for_each_possible_cpu(i) { 7611 for_each_possible_cpu(i) {
7515 rq = cpu_rq(i); 7612 rq = cpu_rq(i);
7516 cfs_rq = tg->cfs_rq[i]; 7613 cfs_rq = tg->cfs_rq[i];
7517 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7614 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7615 rt_rq = tg->rt_rq[i];
7616 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7518 } 7617 }
7618 list_add_rcu(&tg->list, &task_groups);
7519 unlock_task_group_list(); 7619 unlock_task_group_list();
7520 7620
7521 return tg; 7621 return tg;
7522 7622
7523err: 7623err:
7524 for_each_possible_cpu(i) { 7624 free_sched_group(tg);
7525 if (tg->cfs_rq)
7526 kfree(tg->cfs_rq[i]);
7527 if (tg->se)
7528 kfree(tg->se[i]);
7529 }
7530 kfree(tg->cfs_rq);
7531 kfree(tg->se);
7532 kfree(tg);
7533
7534 return ERR_PTR(-ENOMEM); 7625 return ERR_PTR(-ENOMEM);
7535} 7626}
7536 7627
7537/* rcu callback to free various structures associated with a task group */ 7628/* rcu callback to free various structures associated with a task group */
7538static void free_sched_group(struct rcu_head *rhp) 7629static void free_sched_group_rcu(struct rcu_head *rhp)
7539{ 7630{
7540 struct task_group *tg = container_of(rhp, struct task_group, rcu);
7541 struct cfs_rq *cfs_rq;
7542 struct sched_entity *se;
7543 int i;
7544
7545 /* now it should be safe to free those cfs_rqs */ 7631 /* now it should be safe to free those cfs_rqs */
7546 for_each_possible_cpu(i) { 7632 free_sched_group(container_of(rhp, struct task_group, rcu));
7547 cfs_rq = tg->cfs_rq[i];
7548 kfree(cfs_rq);
7549
7550 se = tg->se[i];
7551 kfree(se);
7552 }
7553
7554 kfree(tg->cfs_rq);
7555 kfree(tg->se);
7556 kfree(tg);
7557} 7633}
7558 7634
7559/* Destroy runqueue etc associated with a task group */ 7635/* Destroy runqueue etc associated with a task group */
7560void sched_destroy_group(struct task_group *tg) 7636void sched_destroy_group(struct task_group *tg)
7561{ 7637{
7562 struct cfs_rq *cfs_rq = NULL; 7638 struct cfs_rq *cfs_rq = NULL;
7639 struct rt_rq *rt_rq = NULL;
7563 int i; 7640 int i;
7564 7641
7565 lock_task_group_list(); 7642 lock_task_group_list();
7566 for_each_possible_cpu(i) { 7643 for_each_possible_cpu(i) {
7567 cfs_rq = tg->cfs_rq[i]; 7644 cfs_rq = tg->cfs_rq[i];
7568 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7645 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7646 rt_rq = tg->rt_rq[i];
7647 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7569 } 7648 }
7649 list_del_rcu(&tg->list);
7570 unlock_task_group_list(); 7650 unlock_task_group_list();
7571 7651
7572 BUG_ON(!cfs_rq); 7652 BUG_ON(!cfs_rq);
7573 7653
7574 /* wait for possible concurrent references to cfs_rqs complete */ 7654 /* wait for possible concurrent references to cfs_rqs complete */
7575 call_rcu(&tg->rcu, free_sched_group); 7655 call_rcu(&tg->rcu, free_sched_group_rcu);
7576} 7656}
7577 7657
7578/* change task's runqueue when it moves between groups. 7658/* change task's runqueue when it moves between groups.
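One small but useful property of the restructured allocation path: the per-cpu pointer arrays are kzalloc()ed and kfree(NULL) is a no-op, so a half-constructed group can be handed straight to the new free_sched_group(), which also guards each array before indexing it. The old hand-rolled err: cleanup therefore collapses to a single call and the RCU callback becomes a thin wrapper, roughly:

    err:
            free_sched_group(tg);   /* tolerates NULL arrays and NULL
                                       per-cpu slots from partial setup */
            return ERR_PTR(-ENOMEM);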
@@ -7588,11 +7668,6 @@ void sched_move_task(struct task_struct *tsk)
7588 7668
7589 rq = task_rq_lock(tsk, &flags); 7669 rq = task_rq_lock(tsk, &flags);
7590 7670
7591 if (tsk->sched_class != &fair_sched_class) {
7592 set_task_cfs_rq(tsk, task_cpu(tsk));
7593 goto done;
7594 }
7595
7596 update_rq_clock(rq); 7671 update_rq_clock(rq);
7597 7672
7598 running = task_current(rq, tsk); 7673 running = task_current(rq, tsk);
@@ -7604,7 +7679,7 @@ void sched_move_task(struct task_struct *tsk)
7604 tsk->sched_class->put_prev_task(rq, tsk); 7679 tsk->sched_class->put_prev_task(rq, tsk);
7605 } 7680 }
7606 7681
7607 set_task_cfs_rq(tsk, task_cpu(tsk)); 7682 set_task_rq(tsk, task_cpu(tsk));
7608 7683
7609 if (on_rq) { 7684 if (on_rq) {
7610 if (unlikely(running)) 7685 if (unlikely(running))
@@ -7612,7 +7687,6 @@ void sched_move_task(struct task_struct *tsk)
7612 enqueue_task(rq, tsk, 0); 7687 enqueue_task(rq, tsk, 0);
7613 } 7688 }
7614 7689
7615done:
7616 task_rq_unlock(rq, &flags); 7690 task_rq_unlock(rq, &flags);
7617} 7691}
7618 7692
@@ -7697,6 +7771,31 @@ unsigned long sched_group_shares(struct task_group *tg)
7697 return tg->shares; 7771 return tg->shares;
7698} 7772}
7699 7773
7774/*
7775 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
7776 */
7777int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
7778{
7779 struct task_group *tgi;
7780 unsigned long total = 0;
7781
7782 rcu_read_lock();
7783 list_for_each_entry_rcu(tgi, &task_groups, list)
7784 total += tgi->rt_ratio;
7785 rcu_read_unlock();
7786
7787 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
7788 return -EINVAL;
7789
7790 tg->rt_ratio = rt_ratio;
7791 return 0;
7792}
7793
7794unsigned long sched_group_rt_ratio(struct task_group *tg)
7795{
7796 return tg->rt_ratio;
7797}
7798
7700#endif /* CONFIG_FAIR_GROUP_SCHED */ 7799#endif /* CONFIG_FAIR_GROUP_SCHED */
7701 7800
7702#ifdef CONFIG_FAIR_CGROUP_SCHED 7801#ifdef CONFIG_FAIR_CGROUP_SCHED
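sched_group_set_rt_ratio() is an admission test over the new flat task_groups list: the sum of every group's rt_ratio, with the caller's current value swapped for the requested one, must stay within the global sysctl_sched_rt_ratio. A sketch with the defaults set up elsewhere in this patch (the request value is illustrative only):

    sysctl_sched_rt_ratio     = 62259   (~95%)
    init_task_group.rt_ratio  = 62259   (sched_init, the XXX line)
    new group's rt_ratio      = 0       (sched_create_group, the other XXX)

    request: raise the new group to 6554 (~10%)
    total + rt_ratio - tg->rt_ratio = 62259 + 6554 - 0 = 68813 > 62259  ->  -EINVAL

So with these defaults a child group can only be given rt bandwidth after init_task_group's own ratio has been lowered to make room.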
@@ -7772,12 +7871,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7772 return (u64) tg->shares; 7871 return (u64) tg->shares;
7773} 7872}
7774 7873
7874static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7875 u64 rt_ratio_val)
7876{
7877 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
7878}
7879
7880static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
7881{
7882 struct task_group *tg = cgroup_tg(cgrp);
7883
7884 return (u64) tg->rt_ratio;
7885}
7886
7775static struct cftype cpu_files[] = { 7887static struct cftype cpu_files[] = {
7776 { 7888 {
7777 .name = "shares", 7889 .name = "shares",
7778 .read_uint = cpu_shares_read_uint, 7890 .read_uint = cpu_shares_read_uint,
7779 .write_uint = cpu_shares_write_uint, 7891 .write_uint = cpu_shares_write_uint,
7780 }, 7892 },
7893 {
7894 .name = "rt_ratio",
7895 .read_uint = cpu_rt_ratio_read_uint,
7896 .write_uint = cpu_rt_ratio_write_uint,
7897 },
7781}; 7898};
7782 7899
7783static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7900static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fd10d965aa06..1178257613ad 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -45,47 +45,167 @@ static void update_rt_migration(struct rq *rq)
45} 45}
46#endif /* CONFIG_SMP */ 46#endif /* CONFIG_SMP */
47 47
48static int sched_rt_ratio_exceeded(struct rq *rq, struct rt_rq *rt_rq) 48static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
49{ 49{
50 return container_of(rt_se, struct task_struct, rt);
51}
52
53static inline int on_rt_rq(struct sched_rt_entity *rt_se)
54{
55 return !list_empty(&rt_se->run_list);
56}
57
58#ifdef CONFIG_FAIR_GROUP_SCHED
59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
61{
62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC;
64
65 return rt_rq->tg->rt_ratio;
66}
67
68#define for_each_leaf_rt_rq(rt_rq, rq) \
69 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
70
71static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
72{
73 return rt_rq->rq;
74}
75
76static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
77{
78 return rt_se->rt_rq;
79}
80
81#define for_each_sched_rt_entity(rt_se) \
82 for (; rt_se; rt_se = rt_se->parent)
83
84static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
85{
86 return rt_se->my_q;
87}
88
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95
96 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
97 enqueue_rt_entity(rt_se);
98 resched_task(rq_of_rt_rq(rt_rq)->curr);
99 }
100}
101
102static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
103{
104 struct sched_rt_entity *rt_se = rt_rq->rt_se;
105
106 if (rt_se && on_rt_rq(rt_se))
107 dequeue_rt_entity(rt_se);
108}
109
110#else
111
112static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
113{
114 return sysctl_sched_rt_ratio;
115}
116
117#define for_each_leaf_rt_rq(rt_rq, rq) \
118 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
119
120static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
121{
122 return container_of(rt_rq, struct rq, rt);
123}
124
125static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
126{
127 struct task_struct *p = rt_task_of(rt_se);
128 struct rq *rq = task_rq(p);
129
130 return &rq->rt;
131}
132
133#define for_each_sched_rt_entity(rt_se) \
134 for (; rt_se; rt_se = NULL)
135
136static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
137{
138 return NULL;
139}
140
141static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
142{
143}
144
145static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
146{
147}
148
149#endif
150
151static inline int rt_se_prio(struct sched_rt_entity *rt_se)
152{
153#ifdef CONFIG_FAIR_GROUP_SCHED
154 struct rt_rq *rt_rq = group_rt_rq(rt_se);
155
156 if (rt_rq)
157 return rt_rq->highest_prio;
158#endif
159
160 return rt_task_of(rt_se)->prio;
161}
162
163static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
164{
165 unsigned int rt_ratio = sched_rt_ratio(rt_rq);
50 u64 period, ratio; 166 u64 period, ratio;
51 167
52 if (sysctl_sched_rt_ratio == SCHED_RT_FRAC) 168 if (rt_ratio == SCHED_RT_FRAC)
53 return 0; 169 return 0;
54 170
55 if (rt_rq->rt_throttled) 171 if (rt_rq->rt_throttled)
56 return 1; 172 return 1;
57 173
58 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; 174 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
59 ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT; 175 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
60 176
61 if (rt_rq->rt_time > ratio) { 177 if (rt_rq->rt_time > ratio) {
62 rt_rq->rt_throttled = rq->clock + period - rt_rq->rt_time; 178 rt_rq->rt_throttled = 1;
179 sched_rt_ratio_dequeue(rt_rq);
63 return 1; 180 return 1;
64 } 181 }
65 182
66 return 0; 183 return 0;
67} 184}
68 185
186static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
187{
188 unsigned long rt_ratio = sched_rt_ratio(rt_rq);
189 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
190
191 rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
192 if (rt_rq->rt_throttled) {
193 rt_rq->rt_throttled = 0;
194 sched_rt_ratio_enqueue(rt_rq);
195 }
196}
197
69static void update_sched_rt_period(struct rq *rq) 198static void update_sched_rt_period(struct rq *rq)
70{ 199{
71 while (rq->clock > rq->rt_period_expire) { 200 struct rt_rq *rt_rq;
72 u64 period, ratio; 201 u64 period;
73 202
203 while (rq->clock > rq->rt_period_expire) {
74 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; 204 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
75 ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;
76
77 rq->rt.rt_time -= min(rq->rt.rt_time, ratio);
78 rq->rt_period_expire += period; 205 rq->rt_period_expire += period;
79 }
80 206
81 /* 207 for_each_leaf_rt_rq(rt_rq, rq)
82 * When the rt throttle is expired, let them rip. 208 __update_sched_rt_period(rt_rq, period);
83 * (XXX: use hrtick when available)
84 */
85 if (rq->rt.rt_throttled && rq->clock > rq->rt.rt_throttled) {
86 rq->rt.rt_throttled = 0;
87 if (!sched_rt_ratio_exceeded(rq, &rq->rt))
88 resched_task(rq->curr);
89 } 209 }
90} 210}
91 211
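The throttle test now works per rt_rq with that group's own ratio, and the budget is recomputed from the ratio each time rather than cached. With the defaults from kernel/sched.c the arithmetic works out as follows (a worked example, not code from the patch):

    period = sysctl_sched_rt_period * NSEC_PER_MSEC
           = 1000 * 1000000                  = 1,000,000,000 ns
    ratio  = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT
           = (1,000,000,000 * 62259) >> 16   ≈ 950,000,000 ns

Once rt_rq->rt_time exceeds that budget the rt_rq is marked throttled and, for a group, its entity is dequeued via sched_rt_ratio_dequeue(); on every expired period __update_sched_rt_period() refunds up to one budget worth of rt_time and re-enqueues a throttled group with sched_rt_ratio_enqueue().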
@@ -96,6 +216,8 @@ static void update_sched_rt_period(struct rq *rq)
96static void update_curr_rt(struct rq *rq) 216static void update_curr_rt(struct rq *rq)
97{ 217{
98 struct task_struct *curr = rq->curr; 218 struct task_struct *curr = rq->curr;
219 struct sched_rt_entity *rt_se = &curr->rt;
220 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
99 u64 delta_exec; 221 u64 delta_exec;
100 222
101 if (!task_has_rt_policy(curr)) 223 if (!task_has_rt_policy(curr))
@@ -111,95 +233,184 @@ static void update_curr_rt(struct rq *rq)
111 curr->se.exec_start = rq->clock; 233 curr->se.exec_start = rq->clock;
112 cpuacct_charge(curr, delta_exec); 234 cpuacct_charge(curr, delta_exec);
113 235
114 rq->rt.rt_time += delta_exec; 236 rt_rq->rt_time += delta_exec;
115 update_sched_rt_period(rq); 237 /*
116 if (sched_rt_ratio_exceeded(rq, &rq->rt)) 238 * might make it a tad more accurate:
239 *
240 * update_sched_rt_period(rq);
241 */
242 if (sched_rt_ratio_exceeded(rt_rq))
117 resched_task(curr); 243 resched_task(curr);
118} 244}
119 245
120static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) 246static inline
247void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
121{ 248{
122 WARN_ON(!rt_task(p)); 249 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
123 rq->rt.rt_nr_running++; 250 rt_rq->rt_nr_running++;
251#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
252 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
253 rt_rq->highest_prio = rt_se_prio(rt_se);
254#endif
124#ifdef CONFIG_SMP 255#ifdef CONFIG_SMP
125 if (p->prio < rq->rt.highest_prio) 256 if (rt_se->nr_cpus_allowed > 1) {
126 rq->rt.highest_prio = p->prio; 257 struct rq *rq = rq_of_rt_rq(rt_rq);
127 if (p->nr_cpus_allowed > 1)
128 rq->rt.rt_nr_migratory++; 258 rq->rt.rt_nr_migratory++;
259 }
129 260
130 update_rt_migration(rq); 261 update_rt_migration(rq_of_rt_rq(rt_rq));
131#endif /* CONFIG_SMP */ 262#endif
132} 263}
133 264
134static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) 265static inline
266void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
135{ 267{
136 WARN_ON(!rt_task(p)); 268 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
137 WARN_ON(!rq->rt.rt_nr_running); 269 WARN_ON(!rt_rq->rt_nr_running);
138 rq->rt.rt_nr_running--; 270 rt_rq->rt_nr_running--;
139#ifdef CONFIG_SMP 271#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
140 if (rq->rt.rt_nr_running) { 272 if (rt_rq->rt_nr_running) {
141 struct rt_prio_array *array; 273 struct rt_prio_array *array;
142 274
143 WARN_ON(p->prio < rq->rt.highest_prio); 275 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
144 if (p->prio == rq->rt.highest_prio) { 276 if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
145 /* recalculate */ 277 /* recalculate */
146 array = &rq->rt.active; 278 array = &rt_rq->active;
147 rq->rt.highest_prio = 279 rt_rq->highest_prio =
148 sched_find_first_bit(array->bitmap); 280 sched_find_first_bit(array->bitmap);
149 } /* otherwise leave rq->highest prio alone */ 281 } /* otherwise leave rq->highest prio alone */
150 } else 282 } else
151 rq->rt.highest_prio = MAX_RT_PRIO; 283 rt_rq->highest_prio = MAX_RT_PRIO;
152 if (p->nr_cpus_allowed > 1) 284#endif
285#ifdef CONFIG_SMP
286 if (rt_se->nr_cpus_allowed > 1) {
287 struct rq *rq = rq_of_rt_rq(rt_rq);
153 rq->rt.rt_nr_migratory--; 288 rq->rt.rt_nr_migratory--;
289 }
154 290
155 update_rt_migration(rq); 291 update_rt_migration(rq_of_rt_rq(rt_rq));
156#endif /* CONFIG_SMP */ 292#endif /* CONFIG_SMP */
157} 293}
158 294
159static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 295static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
160{ 296{
161 struct rt_prio_array *array = &rq->rt.active; 297 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
298 struct rt_prio_array *array = &rt_rq->active;
299 struct rt_rq *group_rq = group_rt_rq(rt_se);
162 300
163 list_add_tail(&p->rt.run_list, array->queue + p->prio); 301 if (group_rq && group_rq->rt_throttled)
164 __set_bit(p->prio, array->bitmap); 302 return;
165 inc_cpu_load(rq, p->se.load.weight);
166 303
167 inc_rt_tasks(p, rq); 304 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
305 __set_bit(rt_se_prio(rt_se), array->bitmap);
168 306
169 if (wakeup) 307 inc_rt_tasks(rt_se, rt_rq);
170 p->rt.timeout = 0; 308}
309
310static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
311{
312 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
313 struct rt_prio_array *array = &rt_rq->active;
314
315 list_del_init(&rt_se->run_list);
316 if (list_empty(array->queue + rt_se_prio(rt_se)))
317 __clear_bit(rt_se_prio(rt_se), array->bitmap);
318
319 dec_rt_tasks(rt_se, rt_rq);
320}
321
322/*
323 * Because the prio of an upper entry depends on the lower
324 * entries, we must remove entries top - down.
325 *
326 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
327 * doesn't matter much for now, as h=2 for GROUP_SCHED.
328 */
329static void dequeue_rt_stack(struct task_struct *p)
330{
331 struct sched_rt_entity *rt_se, *top_se;
332
333 /*
334 * dequeue all, top - down.
335 */
336 do {
337 rt_se = &p->rt;
338 top_se = NULL;
339 for_each_sched_rt_entity(rt_se) {
340 if (on_rt_rq(rt_se))
341 top_se = rt_se;
342 }
343 if (top_se)
344 dequeue_rt_entity(top_se);
345 } while (top_se);
171} 346}
172 347
173/* 348/*
174 * Adding/removing a task to/from a priority array: 349 * Adding/removing a task to/from a priority array:
175 */ 350 */
351static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
352{
353 struct sched_rt_entity *rt_se = &p->rt;
354
355 if (wakeup)
356 rt_se->timeout = 0;
357
358 dequeue_rt_stack(p);
359
360 /*
361 * enqueue everybody, bottom - up.
362 */
363 for_each_sched_rt_entity(rt_se)
364 enqueue_rt_entity(rt_se);
365
366 inc_cpu_load(rq, p->se.load.weight);
367}
368
176static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 369static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
177{ 370{
178 struct rt_prio_array *array = &rq->rt.active; 371 struct sched_rt_entity *rt_se = &p->rt;
372 struct rt_rq *rt_rq;
179 373
180 update_curr_rt(rq); 374 update_curr_rt(rq);
181 375
182 list_del(&p->rt.run_list); 376 dequeue_rt_stack(p);
183 if (list_empty(array->queue + p->prio)) 377
184 __clear_bit(p->prio, array->bitmap); 378 /*
185 dec_cpu_load(rq, p->se.load.weight); 379 * re-enqueue all non-empty rt_rq entities.
380 */
381 for_each_sched_rt_entity(rt_se) {
382 rt_rq = group_rt_rq(rt_se);
383 if (rt_rq && rt_rq->rt_nr_running)
384 enqueue_rt_entity(rt_se);
385 }
186 386
187 dec_rt_tasks(p, rq); 387 dec_cpu_load(rq, p->se.load.weight);
188} 388}
189 389
190/* 390/*
191 * Put task to the end of the run list without the overhead of dequeue 391 * Put task to the end of the run list without the overhead of dequeue
192 * followed by enqueue. 392 * followed by enqueue.
193 */ 393 */
394static
395void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
396{
397 struct rt_prio_array *array = &rt_rq->active;
398
399 list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
400}
401
194static void requeue_task_rt(struct rq *rq, struct task_struct *p) 402static void requeue_task_rt(struct rq *rq, struct task_struct *p)
195{ 403{
196 struct rt_prio_array *array = &rq->rt.active; 404 struct sched_rt_entity *rt_se = &p->rt;
405 struct rt_rq *rt_rq;
197 406
198 list_move_tail(&p->rt.run_list, array->queue + p->prio); 407 for_each_sched_rt_entity(rt_se) {
408 rt_rq = rt_rq_of_se(rt_se);
409 requeue_rt_entity(rt_rq, rt_se);
410 }
199} 411}
200 412
201static void 413static void yield_task_rt(struct rq *rq)
202yield_task_rt(struct rq *rq)
203{ 414{
204 requeue_task_rt(rq, rq->curr); 415 requeue_task_rt(rq, rq->curr);
205} 416}
@@ -229,7 +440,7 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
229 * cold cache anyway. 440 * cold cache anyway.
230 */ 441 */
231 if (unlikely(rt_task(rq->curr)) && 442 if (unlikely(rt_task(rq->curr)) &&
232 (p->nr_cpus_allowed > 1)) { 443 (p->rt.nr_cpus_allowed > 1)) {
233 int cpu = find_lowest_rq(p); 444 int cpu = find_lowest_rq(p);
234 445
235 return (cpu == -1) ? task_cpu(p) : cpu; 446 return (cpu == -1) ? task_cpu(p) : cpu;
@@ -252,27 +463,51 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
252 resched_task(rq->curr); 463 resched_task(rq->curr);
253} 464}
254 465
255static struct task_struct *pick_next_task_rt(struct rq *rq) 466static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
467 struct rt_rq *rt_rq)
256{ 468{
257 struct rt_prio_array *array = &rq->rt.active; 469 struct rt_prio_array *array = &rt_rq->active;
258 struct task_struct *next; 470 struct sched_rt_entity *next = NULL;
259 struct list_head *queue; 471 struct list_head *queue;
260 struct rt_rq *rt_rq = &rq->rt;
261 int idx; 472 int idx;
262 473
263 if (sched_rt_ratio_exceeded(rq, rt_rq)) 474 if (sched_rt_ratio_exceeded(rt_rq))
264 return NULL; 475 goto out;
265 476
266 idx = sched_find_first_bit(array->bitmap); 477 idx = sched_find_first_bit(array->bitmap);
267 if (idx >= MAX_RT_PRIO) 478 BUG_ON(idx >= MAX_RT_PRIO);
268 return NULL;
269 479
270 queue = array->queue + idx; 480 queue = array->queue + idx;
271 next = list_entry(queue->next, struct task_struct, rt.run_list); 481 next = list_entry(queue->next, struct sched_rt_entity, run_list);
482 out:
483 return next;
484}
272 485
273 next->se.exec_start = rq->clock; 486static struct task_struct *pick_next_task_rt(struct rq *rq)
487{
488 struct sched_rt_entity *rt_se;
489 struct task_struct *p;
490 struct rt_rq *rt_rq;
274 491
275 return next; 492 retry:
493 rt_rq = &rq->rt;
494
495 if (unlikely(!rt_rq->rt_nr_running))
496 return NULL;
497
498 if (sched_rt_ratio_exceeded(rt_rq))
499 return NULL;
500
501 do {
502 rt_se = pick_next_rt_entity(rq, rt_rq);
503 if (unlikely(!rt_se))
504 goto retry;
505 rt_rq = group_rt_rq(rt_se);
506 } while (rt_rq);
507
508 p = rt_task_of(rt_se);
509 p->se.exec_start = rq->clock;
510 return p;
276} 511}
277 512
278static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 513static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
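Task selection now descends the hierarchy: starting at rq->rt it picks the highest-priority queued entity, and while that entity is a group (group_rt_rq() != NULL) it repeats the pick inside that group's rt_rq until it reaches a task. The old "idx >= MAX_RT_PRIO, return NULL" case becomes a BUG_ON() because rt_nr_running has already been checked at the top. Roughly:

    rq->rt (not empty, not throttled)
      -> best entity is group A's rt_se, my_q = A's rt_rq
           -> best entity in A's rt_rq belongs to a task
                -> rt_task_of(rt_se) is returned and stamped with exec_start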
@@ -282,6 +517,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
282} 517}
283 518
284#ifdef CONFIG_SMP 519#ifdef CONFIG_SMP
520
285/* Only try algorithms three times */ 521/* Only try algorithms three times */
286#define RT_MAX_TRIES 3 522#define RT_MAX_TRIES 3
287 523
@@ -292,7 +528,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
292{ 528{
293 if (!task_running(rq, p) && 529 if (!task_running(rq, p) &&
294 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && 530 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
295 (p->nr_cpus_allowed > 1)) 531 (p->rt.nr_cpus_allowed > 1))
296 return 1; 532 return 1;
297 return 0; 533 return 0;
298} 534}
@@ -300,52 +536,33 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
300/* Return the second highest RT task, NULL otherwise */ 536/* Return the second highest RT task, NULL otherwise */
301static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) 537static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
302{ 538{
303 struct rt_prio_array *array = &rq->rt.active; 539 struct task_struct *next = NULL;
304 struct task_struct *next; 540 struct sched_rt_entity *rt_se;
305 struct list_head *queue; 541 struct rt_prio_array *array;
542 struct rt_rq *rt_rq;
306 int idx; 543 int idx;
307 544
308 if (likely(rq->rt.rt_nr_running < 2)) 545 for_each_leaf_rt_rq(rt_rq, rq) {
309 return NULL; 546 array = &rt_rq->active;
310 547 idx = sched_find_first_bit(array->bitmap);
311 idx = sched_find_first_bit(array->bitmap); 548 next_idx:
312 if (unlikely(idx >= MAX_RT_PRIO)) { 549 if (idx >= MAX_RT_PRIO)
313 WARN_ON(1); /* rt_nr_running is bad */ 550 continue;
314 return NULL; 551 if (next && next->prio < idx)
315 } 552 continue;
316 553 list_for_each_entry(rt_se, array->queue + idx, run_list) {
317 queue = array->queue + idx; 554 struct task_struct *p = rt_task_of(rt_se);
318 BUG_ON(list_empty(queue)); 555 if (pick_rt_task(rq, p, cpu)) {
319 556 next = p;
320 next = list_entry(queue->next, struct task_struct, rt.run_list); 557 break;
321 if (unlikely(pick_rt_task(rq, next, cpu))) 558 }
322 goto out; 559 }
323 560 if (!next) {
324 if (queue->next->next != queue) { 561 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
325 /* same prio task */ 562 goto next_idx;
326 next = list_entry(queue->next->next, struct task_struct, 563 }
327 rt.run_list);
328 if (pick_rt_task(rq, next, cpu))
329 goto out;
330 }
331
332 retry:
333 /* slower, but more flexible */
334 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
335 if (unlikely(idx >= MAX_RT_PRIO))
336 return NULL;
337
338 queue = array->queue + idx;
339 BUG_ON(list_empty(queue));
340
341 list_for_each_entry(next, queue, rt.run_list) {
342 if (pick_rt_task(rq, next, cpu))
343 goto out;
344 } 564 }
345 565
346 goto retry;
347
348 out:
349 return next; 566 return next;
350} 567}
351 568
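With groups, the second-highest runnable rt task (the push candidate) can sit on any leaf rt_rq, so the scan now iterates for_each_leaf_rt_rq() instead of only rq->rt.active. The `next && next->prio < idx` test merely prunes rt_rqs whose best priority level cannot beat the candidate already found (lower value means higher priority). A hypothetical walk:

    rq->rt           : prio 10 task is rq->curr  -> rejected by pick_rt_task();
                       find_next_bit() moves on, a pushable prio 15 task -> next
    group A's rt_rq  : first set bit is idx 20   -> 15 < 20, whole rt_rq skipped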
@@ -774,12 +991,12 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
774 * Update the migration status of the RQ if we have an RT task 991 * Update the migration status of the RQ if we have an RT task
775 * which is running AND changing its weight value. 992 * which is running AND changing its weight value.
776 */ 993 */
777 if (p->se.on_rq && (weight != p->nr_cpus_allowed)) { 994 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
778 struct rq *rq = task_rq(p); 995 struct rq *rq = task_rq(p);
779 996
780 if ((p->nr_cpus_allowed <= 1) && (weight > 1)) { 997 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
781 rq->rt.rt_nr_migratory++; 998 rq->rt.rt_nr_migratory++;
782 } else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) { 999 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
783 BUG_ON(!rq->rt.rt_nr_migratory); 1000 BUG_ON(!rq->rt.rt_nr_migratory);
784 rq->rt.rt_nr_migratory--; 1001 rq->rt.rt_nr_migratory--;
785 } 1002 }
@@ -788,7 +1005,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
788 } 1005 }
789 1006
790 p->cpus_allowed = *new_mask; 1007 p->cpus_allowed = *new_mask;
791 p->nr_cpus_allowed = weight; 1008 p->rt.nr_cpus_allowed = weight;
792} 1009}
793 1010
794/* Assumes rq->lock is held */ 1011/* Assumes rq->lock is held */