commit 29f59db3a74b0bdf78a1f5b53ef773caa82692dc
Author:    Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>  2007-10-15 11:00:07 -0400
Committer: Ingo Molnar <mingo@elte.hu>  2007-10-15 11:00:07 -0400
Tree:      5ac877639bac41d3749d08b7f624bd13052e5ec6
Parent:    119fe5e06800afc197781ebc8c2d8ca7d03497c8

sched: group-scheduler core

Add interface to control cpu bandwidth allocation to task-groups.
(not yet configurable, due to missing CONFIG_CONTAINERS)

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
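For orientation (not part of the commit itself): once the container filesystem is available and mounted, a group's CPU weight is changed by writing a decimal value to its "shares" control file. cpu_shares_write() parses that value with simple_strtoul() and applies it to the group's per-CPU sched entities through set_se_shares(). Below is a minimal userspace sketch of that interaction; the mount point /containers, the group directory mygrp, and the visible file name cpu.shares are illustrative assumptions, since the patch only registers a cftype named "shares" for the "cpu" subsystem.

/*
 * Sketch only: drive a group's "shares" file from userspace.
 * The path is an assumption about how the container filesystem
 * exposes the file; the patch itself only defines the handlers.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/containers/mygrp/cpu.shares"; /* assumed layout */
        const char *val  = "2048";      /* ~2x the default weight */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, val, strlen(val)) != (ssize_t)strlen(val)) {
                perror("write");
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}

Relative to the default group weight of NICE_0_LOAD (1024), a written value of 2048 would roughly double the group's share of each per-CPU runqueue.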
-rw-r--r--  init/Kconfig            |   9
-rw-r--r--  kernel/sched.c          | 346
-rw-r--r--  kernel/sched_fair.c     |   3
-rw-r--r--  kernel/sched_idletask.c |   5
-rw-r--r--  kernel/sched_rt.c       |   5
5 files changed, 350 insertions(+), 18 deletions(-)
diff --git a/init/Kconfig b/init/Kconfig
index d54d0cadcc06..11c6762a6529 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -281,6 +281,15 @@ config CPUSETS
 
           Say N if unsure.
 
+config FAIR_GROUP_SCHED
+        bool "Fair group scheduler"
+        depends on EXPERIMENTAL && CONTAINERS
+        help
+          This option enables you to group tasks and control CPU resource
+          allocation to such groups.
+
+          Say N if unsure.
+
 config SYSFS_DEPRECATED
         bool "Create deprecated sysfs files"
         default y
diff --git a/kernel/sched.c b/kernel/sched.c
index 4ad789d268fe..b2688ce54b11 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -171,6 +171,58 @@ struct rt_prio_array {
         struct list_head queue[MAX_RT_PRIO];
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+#include <linux/container.h>
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_grp {
+        struct container_subsys_state css;
+        /* schedulable entities of this group on each cpu */
+        struct sched_entity **se;
+        /* runqueue "owned" by this group on each cpu */
+        struct cfs_rq **cfs_rq;
+        unsigned long shares;
+};
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS];
+
+/* Default task group.
+ * Every task in system belong to this group at bootup.
+ */
+static struct task_grp init_task_grp = {
+        .se     = init_sched_entity_p,
+        .cfs_rq = init_cfs_rq_p,
+};
+
+/* return group to which a task belongs */
+static inline struct task_grp *task_grp(struct task_struct *p)
+{
+        return container_of(task_subsys_state(p, cpu_subsys_id),
+                            struct task_grp, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+        p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)];
+        p->se.parent = task_grp(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif  /* CONFIG_FAIR_GROUP_SCHED */
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
         struct load_weight load;
@@ -197,6 +249,7 @@ struct cfs_rq {
          * list is used during load balance.
          */
         struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+        struct task_grp *tg;    /* group that "owns" this runqueue */
 #endif
 };
 
@@ -419,18 +472,6 @@ unsigned long long cpu_clock(int cpu)
         return now;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Change a task's ->cfs_rq if it moves across CPUs */
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-        p->se.cfs_rq = &task_rq(p)->cfs;
-}
-#else
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-}
-#endif
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next) do { } while (0)
 #endif
@@ -970,8 +1011,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 #ifdef CONFIG_SMP
         task_thread_info(p)->cpu = cpu;
-        set_task_cfs_rq(p);
 #endif
+        set_task_cfs_rq(p);
 }
 
 #ifdef CONFIG_SMP
@@ -3885,8 +3926,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
         oldprio = p->prio;
         on_rq = p->se.on_rq;
-        if (on_rq)
+        if (on_rq) {
                 dequeue_task(rq, p, 0);
+                if (task_running(rq, p))
+                        p->sched_class->put_prev_task(rq, p);
+        }
 
         if (rt_prio(prio))
                 p->sched_class = &rt_sched_class;
@@ -3905,6 +3949,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (task_running(rq, p)) {
                         if (p->prio > oldprio)
                                 resched_task(rq->curr);
+                        p->sched_class->set_curr_task(rq);
                 } else {
                         check_preempt_curr(rq, p);
                 }
@@ -4190,8 +4235,11 @@ recheck:
         }
         update_rq_clock(rq);
         on_rq = p->se.on_rq;
-        if (on_rq)
+        if (on_rq) {
                 deactivate_task(rq, p, 0);
+                if (task_running(rq, p))
+                        p->sched_class->put_prev_task(rq, p);
+        }
         oldprio = p->prio;
         __setscheduler(rq, p, policy, param->sched_priority);
         if (on_rq) {
@@ -4204,6 +4252,7 @@ recheck:
                 if (task_running(rq, p)) {
                         if (p->prio > oldprio)
                                 resched_task(rq->curr);
+                        p->sched_class->set_curr_task(rq);
                 } else {
                         check_preempt_curr(rq, p);
                 }
@@ -6444,7 +6493,25 @@ void __init sched_init(void)
                 init_cfs_rq(&rq->cfs, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-                list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+                {
+                        struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
+                        struct sched_entity *se =
+                                        &per_cpu(init_sched_entity, i);
+
+                        init_cfs_rq_p[i] = cfs_rq;
+                        init_cfs_rq(cfs_rq, rq);
+                        cfs_rq->tg = &init_task_grp;
+                        list_add(&cfs_rq->leaf_cfs_rq_list,
+                                 &rq->leaf_cfs_rq_list);
+
+                        init_sched_entity_p[i] = se;
+                        se->cfs_rq = &rq->cfs;
+                        se->my_q = cfs_rq;
+                        se->load.weight = NICE_0_LOAD;
+                        se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+                        se->parent = NULL;
+                }
+                init_task_grp.shares = NICE_0_LOAD;
 #endif
 
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -6632,3 +6699,250 @@ void set_curr_task(int cpu, struct task_struct *p)
 }
 
 #endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* return corresponding task_grp object of a container */
+static inline struct task_grp *container_tg(struct container *cont)
+{
+        return container_of(container_subsys_state(cont, cpu_subsys_id),
+                            struct task_grp, css);
+}
+
+/* allocate runqueue etc for a new task group */
+static struct container_subsys_state *
+sched_create_group(struct container_subsys *ss, struct container *cont)
+{
+        struct task_grp *tg;
+        struct cfs_rq *cfs_rq;
+        struct sched_entity *se;
+        int i;
+
+        if (!cont->parent) {
+                /* This is early initialization for the top container */
+                init_task_grp.css.container = cont;
+                return &init_task_grp.css;
+        }
+
+        /* we support only 1-level deep hierarchical scheduler atm */
+        if (cont->parent->parent)
+                return ERR_PTR(-EINVAL);
+
+        tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+        if (!tg)
+                return ERR_PTR(-ENOMEM);
+
+        tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL);
+        if (!tg->cfs_rq)
+                goto err;
+        tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL);
+        if (!tg->se)
+                goto err;
+
+        for_each_possible_cpu(i) {
+                struct rq *rq = cpu_rq(i);
+
+                cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
+                                      cpu_to_node(i));
+                if (!cfs_rq)
+                        goto err;
+
+                se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
+                                  cpu_to_node(i));
+                if (!se)
+                        goto err;
+
+                memset(cfs_rq, 0, sizeof(struct cfs_rq));
+                memset(se, 0, sizeof(struct sched_entity));
+
+                tg->cfs_rq[i] = cfs_rq;
+                init_cfs_rq(cfs_rq, rq);
+                cfs_rq->tg = tg;
+                list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+                tg->se[i] = se;
+                se->cfs_rq = &rq->cfs;
+                se->my_q = cfs_rq;
+                se->load.weight = NICE_0_LOAD;
+                se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+                se->parent = NULL;
+        }
+
+        tg->shares = NICE_0_LOAD;
+
+        /* Bind the container to task_grp object we just created */
+        tg->css.container = cont;
+
+        return &tg->css;
+
+err:
+        for_each_possible_cpu(i) {
+                if (tg->cfs_rq && tg->cfs_rq[i])
+                        kfree(tg->cfs_rq[i]);
+                if (tg->se && tg->se[i])
+                        kfree(tg->se[i]);
+        }
+        if (tg->cfs_rq)
+                kfree(tg->cfs_rq);
+        if (tg->se)
+                kfree(tg->se);
+        if (tg)
+                kfree(tg);
+
+        return ERR_PTR(-ENOMEM);
+}
+
+
+/* destroy runqueue etc associated with a task group */
+static void sched_destroy_group(struct container_subsys *ss,
+                                struct container *cont)
+{
+        struct task_grp *tg = container_tg(cont);
+        struct cfs_rq *cfs_rq;
+        struct sched_entity *se;
+        int i;
+
+        for_each_possible_cpu(i) {
+                cfs_rq = tg->cfs_rq[i];
+                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+        }
+
+        /* wait for possible concurrent references to cfs_rqs complete */
+        synchronize_sched();
+
+        /* now it should be safe to free those cfs_rqs */
+        for_each_possible_cpu(i) {
+                cfs_rq = tg->cfs_rq[i];
+                kfree(cfs_rq);
+
+                se = tg->se[i];
+                kfree(se);
+        }
+
+        kfree(tg->cfs_rq);
+        kfree(tg->se);
+        kfree(tg);
+}
+
+static int sched_can_attach(struct container_subsys *ss,
+                            struct container *cont, struct task_struct *tsk)
+{
+        /* We don't support RT-tasks being in separate groups */
+        if (tsk->sched_class != &fair_sched_class)
+                return -EINVAL;
+
+        return 0;
+}
+
+/* change task's runqueue when it moves between groups */
+static void sched_move_task(struct container_subsys *ss, struct container *cont,
+                            struct container *old_cont, struct task_struct *tsk)
+{
+        int on_rq, running;
+        unsigned long flags;
+        struct rq *rq;
+
+        rq = task_rq_lock(tsk, &flags);
+
+        if (tsk->sched_class != &fair_sched_class)
+                goto done;
+
+        update_rq_clock(rq);
+
+        running = task_running(rq, tsk);
+        on_rq = tsk->se.on_rq;
+
+        if (on_rq) {
+                dequeue_task(rq, tsk, 0);
+                if (unlikely(running))
+                        tsk->sched_class->put_prev_task(rq, tsk);
+        }
+
+        set_task_cfs_rq(tsk);
+
+        if (on_rq) {
+                enqueue_task(rq, tsk, 0);
+                if (unlikely(running))
+                        tsk->sched_class->set_curr_task(rq);
+        }
+
+done:
+        task_rq_unlock(rq, &flags);
+}
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+        struct cfs_rq *cfs_rq = se->cfs_rq;
+        struct rq *rq = cfs_rq->rq;
+        int on_rq;
+
+        spin_lock_irq(&rq->lock);
+
+        on_rq = se->on_rq;
+        if (on_rq)
+                dequeue_entity(cfs_rq, se, 0);
+
+        se->load.weight = shares;
+        se->load.inv_weight = div64_64((1ULL<<32), shares);
+
+        if (on_rq)
+                enqueue_entity(cfs_rq, se, 0);
+
+        spin_unlock_irq(&rq->lock);
+}
+
+static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype,
+                                struct file *file, const char __user *userbuf,
+                                size_t nbytes, loff_t *ppos)
+{
+        int i;
+        unsigned long shareval;
+        struct task_grp *tg = container_tg(cont);
+        char buffer[2*sizeof(unsigned long) + 1];
+
+        if (nbytes > 2*sizeof(unsigned long))   /* safety check */
+                return -E2BIG;
+
+        if (copy_from_user(buffer, userbuf, nbytes))
+                return -EFAULT;
+
+        buffer[nbytes] = 0;     /* nul-terminate */
+        shareval = simple_strtoul(buffer, NULL, 10);
+
+        tg->shares = shareval;
+        for_each_possible_cpu(i)
+                set_se_shares(tg->se[i], shareval);
+
+        return nbytes;
+}
+
+static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft)
+{
+        struct task_grp *tg = container_tg(cont);
+
+        return (u64) tg->shares;
+}
+
+struct cftype cpuctl_share = {
+        .name = "shares",
+        .read_uint = cpu_shares_read_uint,
+        .write = cpu_shares_write,
+};
+
+static int sched_populate(struct container_subsys *ss, struct container *cont)
+{
+        return container_add_file(cont, ss, &cpuctl_share);
+}
+
+struct container_subsys cpu_subsys = {
+        .name           = "cpu",
+        .create         = sched_create_group,
+        .destroy        = sched_destroy_group,
+        .can_attach     = sched_can_attach,
+        .attach         = sched_move_task,
+        .populate       = sched_populate,
+        .subsys_id      = cpu_subsys_id,
+        .early_init     = 1,
+};
+
+#endif  /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ec445cadbb01..12ab9338d563 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -610,8 +610,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  */
 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 {
-        /* A later patch will take group into account */
-        return &cpu_rq(this_cpu)->cfs;
+        return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3503fb2d9f96..5ebf829cdd73 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
 {
 }
 
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
@@ -66,6 +70,7 @@ static struct sched_class idle_sched_class __read_mostly = {
 
         .load_balance           = load_balance_idle,
 
+        .set_curr_task          = set_curr_task_idle,
         .task_tick              = task_tick_idle,
         /* no .task_new for idle tasks */
 };
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4b87476a02d0..45b339f56aea 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -218,6 +218,10 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
         }
 }
 
+static void set_curr_task_rt(struct rq *rq)
+{
+}
+
 static struct sched_class rt_sched_class __read_mostly = {
         .enqueue_task           = enqueue_task_rt,
         .dequeue_task           = dequeue_task_rt,
@@ -230,5 +234,6 @@ static struct sched_class rt_sched_class __read_mostly = {
 
         .load_balance           = load_balance_rt,
 
+        .set_curr_task          = set_curr_task_rt,
         .task_tick              = task_tick_rt,
 };