diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2008-01-25 15:08:30 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-01-25 15:08:30 -0500 |
commit | 6f505b16425a51270058e4a93441fe64de3dd435 (patch) | |
tree | be21e711d93bc4d088b97c4a4f585a5044dbaa7d /kernel/sched.c | |
parent | fa85ae2418e6843953107cd6a06f645752829bc0 (diff) |
sched: rt group scheduling
Extend group scheduling to also cover the realtime classes. It uses the time
limiting introduced by the previous patch to allow multiple realtime groups.
The hard time limit is required to keep behaviour deterministic.
The algorithms used make the realtime scheduler O(tg), i.e. linear scaling with
respect to the number of task groups. This is the worst-case behaviour I can't seem
to get out of; the average case of the algorithms can be improved — I focused on
correctness and the worst case.
[ akpm@linux-foundation.org: move side-effects out of BUG_ON(). ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 283 |
1 files changed, 200 insertions, 83 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index e9a7beee9b79..5ea2c533b432 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -161,6 +161,8 @@ struct rt_prio_array { | |||
161 | 161 | ||
162 | struct cfs_rq; | 162 | struct cfs_rq; |
163 | 163 | ||
164 | static LIST_HEAD(task_groups); | ||
165 | |||
164 | /* task group related information */ | 166 | /* task group related information */ |
165 | struct task_group { | 167 | struct task_group { |
166 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 168 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -171,6 +173,11 @@ struct task_group { | |||
171 | /* runqueue "owned" by this group on each cpu */ | 173 | /* runqueue "owned" by this group on each cpu */ |
172 | struct cfs_rq **cfs_rq; | 174 | struct cfs_rq **cfs_rq; |
173 | 175 | ||
176 | struct sched_rt_entity **rt_se; | ||
177 | struct rt_rq **rt_rq; | ||
178 | |||
179 | unsigned int rt_ratio; | ||
180 | |||
174 | /* | 181 | /* |
175 | * shares assigned to a task group governs how much of cpu bandwidth | 182 | * shares assigned to a task group governs how much of cpu bandwidth |
176 | * is allocated to the group. The more shares a group has, the more is | 183 | * is allocated to the group. The more shares a group has, the more is |
@@ -208,6 +215,7 @@ struct task_group { | |||
208 | unsigned long shares; | 215 | unsigned long shares; |
209 | 216 | ||
210 | struct rcu_head rcu; | 217 | struct rcu_head rcu; |
218 | struct list_head list; | ||
211 | }; | 219 | }; |
212 | 220 | ||
213 | /* Default task group's sched entity on each cpu */ | 221 | /* Default task group's sched entity on each cpu */ |
@@ -215,9 +223,15 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | |||
215 | /* Default task group's cfs_rq on each cpu */ | 223 | /* Default task group's cfs_rq on each cpu */ |
216 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 224 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
217 | 225 | ||
226 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
227 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | ||
228 | |||
218 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | 229 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; |
219 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | 230 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; |
220 | 231 | ||
232 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | ||
233 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | ||
234 | |||
221 | /* task_group_mutex serializes add/remove of task groups and also changes to | 235 | /* task_group_mutex serializes add/remove of task groups and also changes to |
222 | * a task group's cpu shares. | 236 | * a task group's cpu shares. |
223 | */ | 237 | */ |
@@ -240,6 +254,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares); | |||
240 | struct task_group init_task_group = { | 254 | struct task_group init_task_group = { |
241 | .se = init_sched_entity_p, | 255 | .se = init_sched_entity_p, |
242 | .cfs_rq = init_cfs_rq_p, | 256 | .cfs_rq = init_cfs_rq_p, |
257 | |||
258 | .rt_se = init_sched_rt_entity_p, | ||
259 | .rt_rq = init_rt_rq_p, | ||
243 | }; | 260 | }; |
244 | 261 | ||
245 | #ifdef CONFIG_FAIR_USER_SCHED | 262 | #ifdef CONFIG_FAIR_USER_SCHED |
@@ -269,10 +286,13 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
269 | } | 286 | } |
270 | 287 | ||
271 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 288 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
272 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) | 289 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
273 | { | 290 | { |
274 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 291 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
275 | p->se.parent = task_group(p)->se[cpu]; | 292 | p->se.parent = task_group(p)->se[cpu]; |
293 | |||
294 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
295 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
276 | } | 296 | } |
277 | 297 | ||
278 | static inline void lock_task_group_list(void) | 298 | static inline void lock_task_group_list(void) |
@@ -297,7 +317,7 @@ static inline void unlock_doms_cur(void) | |||
297 | 317 | ||
298 | #else | 318 | #else |
299 | 319 | ||
300 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } | 320 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
301 | static inline void lock_task_group_list(void) { } | 321 | static inline void lock_task_group_list(void) { } |
302 | static inline void unlock_task_group_list(void) { } | 322 | static inline void unlock_task_group_list(void) { } |
303 | static inline void lock_doms_cur(void) { } | 323 | static inline void lock_doms_cur(void) { } |
@@ -343,13 +363,22 @@ struct cfs_rq { | |||
343 | struct rt_rq { | 363 | struct rt_rq { |
344 | struct rt_prio_array active; | 364 | struct rt_prio_array active; |
345 | unsigned long rt_nr_running; | 365 | unsigned long rt_nr_running; |
366 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
367 | int highest_prio; /* highest queued rt task prio */ | ||
368 | #endif | ||
346 | #ifdef CONFIG_SMP | 369 | #ifdef CONFIG_SMP |
347 | unsigned long rt_nr_migratory; | 370 | unsigned long rt_nr_migratory; |
348 | int highest_prio; /* highest queued rt task prio */ | ||
349 | int overloaded; | 371 | int overloaded; |
350 | #endif | 372 | #endif |
373 | int rt_throttled; | ||
351 | u64 rt_time; | 374 | u64 rt_time; |
352 | u64 rt_throttled; | 375 | |
376 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
377 | struct rq *rq; | ||
378 | struct list_head leaf_rt_rq_list; | ||
379 | struct task_group *tg; | ||
380 | struct sched_rt_entity *rt_se; | ||
381 | #endif | ||
353 | }; | 382 | }; |
354 | 383 | ||
355 | #ifdef CONFIG_SMP | 384 | #ifdef CONFIG_SMP |
@@ -411,12 +440,14 @@ struct rq { | |||
411 | u64 nr_switches; | 440 | u64 nr_switches; |
412 | 441 | ||
413 | struct cfs_rq cfs; | 442 | struct cfs_rq cfs; |
443 | struct rt_rq rt; | ||
444 | u64 rt_period_expire; | ||
445 | |||
414 | #ifdef CONFIG_FAIR_GROUP_SCHED | 446 | #ifdef CONFIG_FAIR_GROUP_SCHED |
415 | /* list of leaf cfs_rq on this cpu: */ | 447 | /* list of leaf cfs_rq on this cpu: */ |
416 | struct list_head leaf_cfs_rq_list; | 448 | struct list_head leaf_cfs_rq_list; |
449 | struct list_head leaf_rt_rq_list; | ||
417 | #endif | 450 | #endif |
418 | struct rt_rq rt; | ||
419 | u64 rt_period_expire; | ||
420 | 451 | ||
421 | /* | 452 | /* |
422 | * This is part of a global counter where only the total sum | 453 | * This is part of a global counter where only the total sum |
@@ -613,9 +644,9 @@ const_debug unsigned int sysctl_sched_rt_period = 1000; | |||
613 | 644 | ||
614 | /* | 645 | /* |
615 | * ratio of time -rt tasks may consume. | 646 | * ratio of time -rt tasks may consume. |
616 | * default: 100% | 647 | * default: 95% |
617 | */ | 648 | */ |
618 | const_debug unsigned int sysctl_sched_rt_ratio = SCHED_RT_FRAC; | 649 | const_debug unsigned int sysctl_sched_rt_ratio = 62259; |
619 | 650 | ||
620 | /* | 651 | /* |
621 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 652 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
@@ -1337,7 +1368,7 @@ unsigned long weighted_cpuload(const int cpu) | |||
1337 | 1368 | ||
1338 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1369 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1339 | { | 1370 | { |
1340 | set_task_cfs_rq(p, cpu); | 1371 | set_task_rq(p, cpu); |
1341 | #ifdef CONFIG_SMP | 1372 | #ifdef CONFIG_SMP |
1342 | /* | 1373 | /* |
1343 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1374 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
@@ -5281,7 +5312,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
5281 | p->sched_class->set_cpus_allowed(p, &new_mask); | 5312 | p->sched_class->set_cpus_allowed(p, &new_mask); |
5282 | else { | 5313 | else { |
5283 | p->cpus_allowed = new_mask; | 5314 | p->cpus_allowed = new_mask; |
5284 | p->nr_cpus_allowed = cpus_weight(new_mask); | 5315 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); |
5285 | } | 5316 | } |
5286 | 5317 | ||
5287 | /* Can the task run on the task's current CPU? If so, we're done */ | 5318 | /* Can the task run on the task's current CPU? If so, we're done */ |
@@ -7079,8 +7110,50 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7079 | 7110 | ||
7080 | rt_rq->rt_time = 0; | 7111 | rt_rq->rt_time = 0; |
7081 | rt_rq->rt_throttled = 0; | 7112 | rt_rq->rt_throttled = 0; |
7113 | |||
7114 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7115 | rt_rq->rq = rq; | ||
7116 | #endif | ||
7082 | } | 7117 | } |
7083 | 7118 | ||
7119 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7120 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | ||
7121 | struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
7122 | int cpu, int add) | ||
7123 | { | ||
7124 | tg->cfs_rq[cpu] = cfs_rq; | ||
7125 | init_cfs_rq(cfs_rq, rq); | ||
7126 | cfs_rq->tg = tg; | ||
7127 | if (add) | ||
7128 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7129 | |||
7130 | tg->se[cpu] = se; | ||
7131 | se->cfs_rq = &rq->cfs; | ||
7132 | se->my_q = cfs_rq; | ||
7133 | se->load.weight = tg->shares; | ||
7134 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | ||
7135 | se->parent = NULL; | ||
7136 | } | ||
7137 | |||
7138 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | ||
7139 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | ||
7140 | int cpu, int add) | ||
7141 | { | ||
7142 | tg->rt_rq[cpu] = rt_rq; | ||
7143 | init_rt_rq(rt_rq, rq); | ||
7144 | rt_rq->tg = tg; | ||
7145 | rt_rq->rt_se = rt_se; | ||
7146 | if (add) | ||
7147 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7148 | |||
7149 | tg->rt_se[cpu] = rt_se; | ||
7150 | rt_se->rt_rq = &rq->rt; | ||
7151 | rt_se->my_q = rt_rq; | ||
7152 | rt_se->parent = NULL; | ||
7153 | INIT_LIST_HEAD(&rt_se->run_list); | ||
7154 | } | ||
7155 | #endif | ||
7156 | |||
7084 | void __init sched_init(void) | 7157 | void __init sched_init(void) |
7085 | { | 7158 | { |
7086 | int highest_cpu = 0; | 7159 | int highest_cpu = 0; |
@@ -7090,6 +7163,10 @@ void __init sched_init(void) | |||
7090 | init_defrootdomain(); | 7163 | init_defrootdomain(); |
7091 | #endif | 7164 | #endif |
7092 | 7165 | ||
7166 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7167 | list_add(&init_task_group.list, &task_groups); | ||
7168 | #endif | ||
7169 | |||
7093 | for_each_possible_cpu(i) { | 7170 | for_each_possible_cpu(i) { |
7094 | struct rq *rq; | 7171 | struct rq *rq; |
7095 | 7172 | ||
@@ -7099,30 +7176,20 @@ void __init sched_init(void) | |||
7099 | rq->nr_running = 0; | 7176 | rq->nr_running = 0; |
7100 | rq->clock = 1; | 7177 | rq->clock = 1; |
7101 | init_cfs_rq(&rq->cfs, rq); | 7178 | init_cfs_rq(&rq->cfs, rq); |
7179 | init_rt_rq(&rq->rt, rq); | ||
7102 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7180 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7103 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
7104 | { | ||
7105 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
7106 | struct sched_entity *se = | ||
7107 | &per_cpu(init_sched_entity, i); | ||
7108 | |||
7109 | init_cfs_rq_p[i] = cfs_rq; | ||
7110 | init_cfs_rq(cfs_rq, rq); | ||
7111 | cfs_rq->tg = &init_task_group; | ||
7112 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
7113 | &rq->leaf_cfs_rq_list); | ||
7114 | |||
7115 | init_sched_entity_p[i] = se; | ||
7116 | se->cfs_rq = &rq->cfs; | ||
7117 | se->my_q = cfs_rq; | ||
7118 | se->load.weight = init_task_group_load; | ||
7119 | se->load.inv_weight = | ||
7120 | div64_64(1ULL<<32, init_task_group_load); | ||
7121 | se->parent = NULL; | ||
7122 | } | ||
7123 | init_task_group.shares = init_task_group_load; | 7181 | init_task_group.shares = init_task_group_load; |
7182 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
7183 | init_tg_cfs_entry(rq, &init_task_group, | ||
7184 | &per_cpu(init_cfs_rq, i), | ||
7185 | &per_cpu(init_sched_entity, i), i, 1); | ||
7186 | |||
7187 | init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ | ||
7188 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | ||
7189 | init_tg_rt_entry(rq, &init_task_group, | ||
7190 | &per_cpu(init_rt_rq, i), | ||
7191 | &per_cpu(init_sched_rt_entity, i), i, 1); | ||
7124 | #endif | 7192 | #endif |
7125 | init_rt_rq(&rq->rt, rq); | ||
7126 | rq->rt_period_expire = 0; | 7193 | rq->rt_period_expire = 0; |
7127 | 7194 | ||
7128 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7195 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
@@ -7460,12 +7527,36 @@ static int load_balance_monitor(void *unused) | |||
7460 | } | 7527 | } |
7461 | #endif /* CONFIG_SMP */ | 7528 | #endif /* CONFIG_SMP */ |
7462 | 7529 | ||
7530 | static void free_sched_group(struct task_group *tg) | ||
7531 | { | ||
7532 | int i; | ||
7533 | |||
7534 | for_each_possible_cpu(i) { | ||
7535 | if (tg->cfs_rq) | ||
7536 | kfree(tg->cfs_rq[i]); | ||
7537 | if (tg->se) | ||
7538 | kfree(tg->se[i]); | ||
7539 | if (tg->rt_rq) | ||
7540 | kfree(tg->rt_rq[i]); | ||
7541 | if (tg->rt_se) | ||
7542 | kfree(tg->rt_se[i]); | ||
7543 | } | ||
7544 | |||
7545 | kfree(tg->cfs_rq); | ||
7546 | kfree(tg->se); | ||
7547 | kfree(tg->rt_rq); | ||
7548 | kfree(tg->rt_se); | ||
7549 | kfree(tg); | ||
7550 | } | ||
7551 | |||
7463 | /* allocate runqueue etc for a new task group */ | 7552 | /* allocate runqueue etc for a new task group */ |
7464 | struct task_group *sched_create_group(void) | 7553 | struct task_group *sched_create_group(void) |
7465 | { | 7554 | { |
7466 | struct task_group *tg; | 7555 | struct task_group *tg; |
7467 | struct cfs_rq *cfs_rq; | 7556 | struct cfs_rq *cfs_rq; |
7468 | struct sched_entity *se; | 7557 | struct sched_entity *se; |
7558 | struct rt_rq *rt_rq; | ||
7559 | struct sched_rt_entity *rt_se; | ||
7469 | struct rq *rq; | 7560 | struct rq *rq; |
7470 | int i; | 7561 | int i; |
7471 | 7562 | ||
@@ -7479,100 +7570,89 @@ struct task_group *sched_create_group(void) | |||
7479 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 7570 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); |
7480 | if (!tg->se) | 7571 | if (!tg->se) |
7481 | goto err; | 7572 | goto err; |
7573 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | ||
7574 | if (!tg->rt_rq) | ||
7575 | goto err; | ||
7576 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | ||
7577 | if (!tg->rt_se) | ||
7578 | goto err; | ||
7579 | |||
7580 | tg->shares = NICE_0_LOAD; | ||
7581 | tg->rt_ratio = 0; /* XXX */ | ||
7482 | 7582 | ||
7483 | for_each_possible_cpu(i) { | 7583 | for_each_possible_cpu(i) { |
7484 | rq = cpu_rq(i); | 7584 | rq = cpu_rq(i); |
7485 | 7585 | ||
7486 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | 7586 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), |
7487 | cpu_to_node(i)); | 7587 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7488 | if (!cfs_rq) | 7588 | if (!cfs_rq) |
7489 | goto err; | 7589 | goto err; |
7490 | 7590 | ||
7491 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | 7591 | se = kmalloc_node(sizeof(struct sched_entity), |
7492 | cpu_to_node(i)); | 7592 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7493 | if (!se) | 7593 | if (!se) |
7494 | goto err; | 7594 | goto err; |
7495 | 7595 | ||
7496 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | 7596 | rt_rq = kmalloc_node(sizeof(struct rt_rq), |
7497 | memset(se, 0, sizeof(struct sched_entity)); | 7597 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7598 | if (!rt_rq) | ||
7599 | goto err; | ||
7498 | 7600 | ||
7499 | tg->cfs_rq[i] = cfs_rq; | 7601 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), |
7500 | init_cfs_rq(cfs_rq, rq); | 7602 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7501 | cfs_rq->tg = tg; | 7603 | if (!rt_se) |
7604 | goto err; | ||
7502 | 7605 | ||
7503 | tg->se[i] = se; | 7606 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); |
7504 | se->cfs_rq = &rq->cfs; | 7607 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); |
7505 | se->my_q = cfs_rq; | ||
7506 | se->load.weight = NICE_0_LOAD; | ||
7507 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
7508 | se->parent = NULL; | ||
7509 | } | 7608 | } |
7510 | 7609 | ||
7511 | tg->shares = NICE_0_LOAD; | ||
7512 | |||
7513 | lock_task_group_list(); | 7610 | lock_task_group_list(); |
7514 | for_each_possible_cpu(i) { | 7611 | for_each_possible_cpu(i) { |
7515 | rq = cpu_rq(i); | 7612 | rq = cpu_rq(i); |
7516 | cfs_rq = tg->cfs_rq[i]; | 7613 | cfs_rq = tg->cfs_rq[i]; |
7517 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7614 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7615 | rt_rq = tg->rt_rq[i]; | ||
7616 | list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7518 | } | 7617 | } |
7618 | list_add_rcu(&tg->list, &task_groups); | ||
7519 | unlock_task_group_list(); | 7619 | unlock_task_group_list(); |
7520 | 7620 | ||
7521 | return tg; | 7621 | return tg; |
7522 | 7622 | ||
7523 | err: | 7623 | err: |
7524 | for_each_possible_cpu(i) { | 7624 | free_sched_group(tg); |
7525 | if (tg->cfs_rq) | ||
7526 | kfree(tg->cfs_rq[i]); | ||
7527 | if (tg->se) | ||
7528 | kfree(tg->se[i]); | ||
7529 | } | ||
7530 | kfree(tg->cfs_rq); | ||
7531 | kfree(tg->se); | ||
7532 | kfree(tg); | ||
7533 | |||
7534 | return ERR_PTR(-ENOMEM); | 7625 | return ERR_PTR(-ENOMEM); |
7535 | } | 7626 | } |
7536 | 7627 | ||
7537 | /* rcu callback to free various structures associated with a task group */ | 7628 | /* rcu callback to free various structures associated with a task group */ |
7538 | static void free_sched_group(struct rcu_head *rhp) | 7629 | static void free_sched_group_rcu(struct rcu_head *rhp) |
7539 | { | 7630 | { |
7540 | struct task_group *tg = container_of(rhp, struct task_group, rcu); | ||
7541 | struct cfs_rq *cfs_rq; | ||
7542 | struct sched_entity *se; | ||
7543 | int i; | ||
7544 | |||
7545 | /* now it should be safe to free those cfs_rqs */ | 7631 | /* now it should be safe to free those cfs_rqs */ |
7546 | for_each_possible_cpu(i) { | 7632 | free_sched_group(container_of(rhp, struct task_group, rcu)); |
7547 | cfs_rq = tg->cfs_rq[i]; | ||
7548 | kfree(cfs_rq); | ||
7549 | |||
7550 | se = tg->se[i]; | ||
7551 | kfree(se); | ||
7552 | } | ||
7553 | |||
7554 | kfree(tg->cfs_rq); | ||
7555 | kfree(tg->se); | ||
7556 | kfree(tg); | ||
7557 | } | 7633 | } |
7558 | 7634 | ||
7559 | /* Destroy runqueue etc associated with a task group */ | 7635 | /* Destroy runqueue etc associated with a task group */ |
7560 | void sched_destroy_group(struct task_group *tg) | 7636 | void sched_destroy_group(struct task_group *tg) |
7561 | { | 7637 | { |
7562 | struct cfs_rq *cfs_rq = NULL; | 7638 | struct cfs_rq *cfs_rq = NULL; |
7639 | struct rt_rq *rt_rq = NULL; | ||
7563 | int i; | 7640 | int i; |
7564 | 7641 | ||
7565 | lock_task_group_list(); | 7642 | lock_task_group_list(); |
7566 | for_each_possible_cpu(i) { | 7643 | for_each_possible_cpu(i) { |
7567 | cfs_rq = tg->cfs_rq[i]; | 7644 | cfs_rq = tg->cfs_rq[i]; |
7568 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 7645 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
7646 | rt_rq = tg->rt_rq[i]; | ||
7647 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
7569 | } | 7648 | } |
7649 | list_del_rcu(&tg->list); | ||
7570 | unlock_task_group_list(); | 7650 | unlock_task_group_list(); |
7571 | 7651 | ||
7572 | BUG_ON(!cfs_rq); | 7652 | BUG_ON(!cfs_rq); |
7573 | 7653 | ||
7574 | /* wait for possible concurrent references to cfs_rqs complete */ | 7654 | /* wait for possible concurrent references to cfs_rqs complete */ |
7575 | call_rcu(&tg->rcu, free_sched_group); | 7655 | call_rcu(&tg->rcu, free_sched_group_rcu); |
7576 | } | 7656 | } |
7577 | 7657 | ||
7578 | /* change task's runqueue when it moves between groups. | 7658 | /* change task's runqueue when it moves between groups. |
@@ -7588,11 +7668,6 @@ void sched_move_task(struct task_struct *tsk) | |||
7588 | 7668 | ||
7589 | rq = task_rq_lock(tsk, &flags); | 7669 | rq = task_rq_lock(tsk, &flags); |
7590 | 7670 | ||
7591 | if (tsk->sched_class != &fair_sched_class) { | ||
7592 | set_task_cfs_rq(tsk, task_cpu(tsk)); | ||
7593 | goto done; | ||
7594 | } | ||
7595 | |||
7596 | update_rq_clock(rq); | 7671 | update_rq_clock(rq); |
7597 | 7672 | ||
7598 | running = task_current(rq, tsk); | 7673 | running = task_current(rq, tsk); |
@@ -7604,7 +7679,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7604 | tsk->sched_class->put_prev_task(rq, tsk); | 7679 | tsk->sched_class->put_prev_task(rq, tsk); |
7605 | } | 7680 | } |
7606 | 7681 | ||
7607 | set_task_cfs_rq(tsk, task_cpu(tsk)); | 7682 | set_task_rq(tsk, task_cpu(tsk)); |
7608 | 7683 | ||
7609 | if (on_rq) { | 7684 | if (on_rq) { |
7610 | if (unlikely(running)) | 7685 | if (unlikely(running)) |
@@ -7612,7 +7687,6 @@ void sched_move_task(struct task_struct *tsk) | |||
7612 | enqueue_task(rq, tsk, 0); | 7687 | enqueue_task(rq, tsk, 0); |
7613 | } | 7688 | } |
7614 | 7689 | ||
7615 | done: | ||
7616 | task_rq_unlock(rq, &flags); | 7690 | task_rq_unlock(rq, &flags); |
7617 | } | 7691 | } |
7618 | 7692 | ||
@@ -7697,6 +7771,31 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
7697 | return tg->shares; | 7771 | return tg->shares; |
7698 | } | 7772 | } |
7699 | 7773 | ||
7774 | /* | ||
7775 | * Ensure the total rt_ratio <= sysctl_sched_rt_ratio | ||
7776 | */ | ||
7777 | int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) | ||
7778 | { | ||
7779 | struct task_group *tgi; | ||
7780 | unsigned long total = 0; | ||
7781 | |||
7782 | rcu_read_lock(); | ||
7783 | list_for_each_entry_rcu(tgi, &task_groups, list) | ||
7784 | total += tgi->rt_ratio; | ||
7785 | rcu_read_unlock(); | ||
7786 | |||
7787 | if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) | ||
7788 | return -EINVAL; | ||
7789 | |||
7790 | tg->rt_ratio = rt_ratio; | ||
7791 | return 0; | ||
7792 | } | ||
7793 | |||
7794 | unsigned long sched_group_rt_ratio(struct task_group *tg) | ||
7795 | { | ||
7796 | return tg->rt_ratio; | ||
7797 | } | ||
7798 | |||
7700 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7799 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7701 | 7800 | ||
7702 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 7801 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -7772,12 +7871,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
7772 | return (u64) tg->shares; | 7871 | return (u64) tg->shares; |
7773 | } | 7872 | } |
7774 | 7873 | ||
7874 | static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, | ||
7875 | u64 rt_ratio_val) | ||
7876 | { | ||
7877 | return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); | ||
7878 | } | ||
7879 | |||
7880 | static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
7881 | { | ||
7882 | struct task_group *tg = cgroup_tg(cgrp); | ||
7883 | |||
7884 | return (u64) tg->rt_ratio; | ||
7885 | } | ||
7886 | |||
7775 | static struct cftype cpu_files[] = { | 7887 | static struct cftype cpu_files[] = { |
7776 | { | 7888 | { |
7777 | .name = "shares", | 7889 | .name = "shares", |
7778 | .read_uint = cpu_shares_read_uint, | 7890 | .read_uint = cpu_shares_read_uint, |
7779 | .write_uint = cpu_shares_write_uint, | 7891 | .write_uint = cpu_shares_write_uint, |
7780 | }, | 7892 | }, |
7893 | { | ||
7894 | .name = "rt_ratio", | ||
7895 | .read_uint = cpu_rt_ratio_read_uint, | ||
7896 | .write_uint = cpu_rt_ratio_write_uint, | ||
7897 | }, | ||
7781 | }; | 7898 | }; |
7782 | 7899 | ||
7783 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 7900 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) |