author		Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>	2007-10-15 11:00:07 -0400
committer	Ingo Molnar <mingo@elte.hu>	2007-10-15 11:00:07 -0400
commit		29f59db3a74b0bdf78a1f5b53ef773caa82692dc (patch)
tree		5ac877639bac41d3749d08b7f624bd13052e5ec6 /kernel/sched.c
parent		119fe5e06800afc197781ebc8c2d8ca7d03497c8 (diff)
sched: group-scheduler core
Add interface to control cpu bandwidth allocation to task-groups.
(not yet configurable, due to missing CONFIG_CONTAINERS)
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	346
1 file changed, 330 insertions(+), 16 deletions(-)
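For context, the interface added below is consumed through the process-container filesystem: each directory created in the mounted "cpu" hierarchy becomes a task_grp, and its per-group weight is set by writing an integer to the "shares" control file (parsed by cpu_shares_write() and applied per CPU via set_se_shares()). The following is a minimal userspace sketch of that intended interaction once CONFIG_CONTAINERS support is merged; the mount point /containers, the group name "browsers", the visible file names cpu.shares and tasks are assumptions for illustration, not anything this commit defines.

/*
 * Illustrative userspace sketch only -- not part of this patch.
 * Assumes the container filesystem is mounted at /containers and that
 * the "cpu" subsystem's "shares" cftype shows up as "cpu.shares".
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *shares = "2048";	/* twice the NICE_0_LOAD default of 1024 */
	int fd;

	/* creating a group directory triggers sched_create_group() */
	if (mkdir("/containers/browsers", 0755) && errno != EEXIST) {
		perror("mkdir");
		return 1;
	}

	/* write the group's CPU bandwidth weight (cpu_shares_write()) */
	fd = open("/containers/browsers/cpu.shares", O_WRONLY);
	if (fd < 0) {
		perror("open shares");
		return 1;
	}
	if (write(fd, shares, strlen(shares)) < 0)
		perror("write shares");
	close(fd);

	/* move the calling task into the group (sched_move_task()) */
	fd = open("/containers/browsers/tasks", O_WRONLY);
	if (fd >= 0) {
		dprintf(fd, "%d\n", getpid());
		close(fd);
	}
	return 0;
}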
diff --git a/kernel/sched.c b/kernel/sched.c
index 4ad789d268fe..b2688ce54b11 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -171,6 +171,58 @@ struct rt_prio_array {
 	struct list_head queue[MAX_RT_PRIO];
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+#include <linux/container.h>
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_grp {
+	struct container_subsys_state css;
+	/* schedulable entities of this group on each cpu */
+	struct sched_entity **se;
+	/* runqueue "owned" by this group on each cpu */
+	struct cfs_rq **cfs_rq;
+	unsigned long shares;
+};
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS];
+
+/* Default task group.
+ * Every task in system belong to this group at bootup.
+ */
+static struct task_grp init_task_grp = {
+	.se	= init_sched_entity_p,
+	.cfs_rq	= init_cfs_rq_p,
+};
+
+/* return group to which a task belongs */
+static inline struct task_grp *task_grp(struct task_struct *p)
+{
+	return container_of(task_subsys_state(p, cpu_subsys_id),
+			    struct task_grp, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+	p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)];
+	p->se.parent = task_grp(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
@@ -197,6 +249,7 @@ struct cfs_rq {
 	 * list is used during load balance.
 	 */
 	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+	struct task_grp *tg;	/* group that "owns" this runqueue */
 #endif
 };
 
@@ -419,18 +472,6 @@ unsigned long long cpu_clock(int cpu)
 	return now;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Change a task's ->cfs_rq if it moves across CPUs */
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-	p->se.cfs_rq = &task_rq(p)->cfs;
-}
-#else
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-}
-#endif
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
@@ -970,8 +1011,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 #ifdef CONFIG_SMP
 	task_thread_info(p)->cpu = cpu;
-	set_task_cfs_rq(p);
 #endif
+	set_task_cfs_rq(p);
 }
 
 #ifdef CONFIG_SMP
@@ -3885,8 +3926,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		if (task_running(rq, p))
+			p->sched_class->put_prev_task(rq, p);
+	}
 
 	if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
@@ -3905,6 +3949,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (task_running(rq, p)) {
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
+		p->sched_class->set_curr_task(rq);
 	} else {
 		check_preempt_curr(rq, p);
 	}
@@ -4190,8 +4235,11 @@ recheck:
 	}
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		deactivate_task(rq, p, 0);
+		if (task_running(rq, p))
+			p->sched_class->put_prev_task(rq, p);
+	}
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 	if (on_rq) {
@@ -4204,6 +4252,7 @@ recheck:
 	if (task_running(rq, p)) {
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
+		p->sched_class->set_curr_task(rq);
 	} else {
 		check_preempt_curr(rq, p);
 	}
@@ -6444,7 +6493,25 @@ void __init sched_init(void)
 		init_cfs_rq(&rq->cfs, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-		list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+		{
+			struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
+			struct sched_entity *se =
+					&per_cpu(init_sched_entity, i);
+
+			init_cfs_rq_p[i] = cfs_rq;
+			init_cfs_rq(cfs_rq, rq);
+			cfs_rq->tg = &init_task_grp;
+			list_add(&cfs_rq->leaf_cfs_rq_list,
+					&rq->leaf_cfs_rq_list);
+
+			init_sched_entity_p[i] = se;
+			se->cfs_rq = &rq->cfs;
+			se->my_q = cfs_rq;
+			se->load.weight = NICE_0_LOAD;
+			se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+			se->parent = NULL;
+		}
+		init_task_grp.shares = NICE_0_LOAD;
 #endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -6632,3 +6699,250 @@ void set_curr_task(int cpu, struct task_struct *p)
 }
 
 #endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* return corresponding task_grp object of a container */
+static inline struct task_grp *container_tg(struct container *cont)
+{
+	return container_of(container_subsys_state(cont, cpu_subsys_id),
+			    struct task_grp, css);
+}
+
+/* allocate runqueue etc for a new task group */
+static struct container_subsys_state *
+sched_create_group(struct container_subsys *ss, struct container *cont)
+{
+	struct task_grp *tg;
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se;
+	int i;
+
+	if (!cont->parent) {
+		/* This is early initialization for the top container */
+		init_task_grp.css.container = cont;
+		return &init_task_grp.css;
+	}
+
+	/* we support only 1-level deep hierarchical scheduler atm */
+	if (cont->parent->parent)
+		return ERR_PTR(-EINVAL);
+
+	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+	if (!tg)
+		return ERR_PTR(-ENOMEM);
+
+	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL);
+	if (!tg->cfs_rq)
+		goto err;
+	tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL);
+	if (!tg->se)
+		goto err;
+
+	for_each_possible_cpu(i) {
+		struct rq *rq = cpu_rq(i);
+
+		cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
+				      cpu_to_node(i));
+		if (!cfs_rq)
+			goto err;
+
+		se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
+				  cpu_to_node(i));
+		if (!se)
+			goto err;
+
+		memset(cfs_rq, 0, sizeof(struct cfs_rq));
+		memset(se, 0, sizeof(struct sched_entity));
+
+		tg->cfs_rq[i] = cfs_rq;
+		init_cfs_rq(cfs_rq, rq);
+		cfs_rq->tg = tg;
+		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+		tg->se[i] = se;
+		se->cfs_rq = &rq->cfs;
+		se->my_q = cfs_rq;
+		se->load.weight = NICE_0_LOAD;
+		se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+		se->parent = NULL;
+	}
+
+	tg->shares = NICE_0_LOAD;
+
+	/* Bind the container to task_grp object we just created */
+	tg->css.container = cont;
+
+	return &tg->css;
+
+err:
+	for_each_possible_cpu(i) {
+		if (tg->cfs_rq && tg->cfs_rq[i])
+			kfree(tg->cfs_rq[i]);
+		if (tg->se && tg->se[i])
+			kfree(tg->se[i]);
+	}
+	if (tg->cfs_rq)
+		kfree(tg->cfs_rq);
+	if (tg->se)
+		kfree(tg->se);
+	if (tg)
+		kfree(tg);
+
+	return ERR_PTR(-ENOMEM);
+}
+
+
+/* destroy runqueue etc associated with a task group */
+static void sched_destroy_group(struct container_subsys *ss,
+				struct container *cont)
+{
+	struct task_grp *tg = container_tg(cont);
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se;
+	int i;
+
+	for_each_possible_cpu(i) {
+		cfs_rq = tg->cfs_rq[i];
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+	}
+
+	/* wait for possible concurrent references to cfs_rqs complete */
+	synchronize_sched();
+
+	/* now it should be safe to free those cfs_rqs */
+	for_each_possible_cpu(i) {
+		cfs_rq = tg->cfs_rq[i];
+		kfree(cfs_rq);
+
+		se = tg->se[i];
+		kfree(se);
+	}
+
+	kfree(tg->cfs_rq);
+	kfree(tg->se);
+	kfree(tg);
+}
+
+static int sched_can_attach(struct container_subsys *ss,
+			    struct container *cont, struct task_struct *tsk)
+{
+	/* We don't support RT-tasks being in separate groups */
+	if (tsk->sched_class != &fair_sched_class)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* change task's runqueue when it moves between groups */
+static void sched_move_task(struct container_subsys *ss, struct container *cont,
+			    struct container *old_cont, struct task_struct *tsk)
+{
+	int on_rq, running;
+	unsigned long flags;
+	struct rq *rq;
+
+	rq = task_rq_lock(tsk, &flags);
+
+	if (tsk->sched_class != &fair_sched_class)
+		goto done;
+
+	update_rq_clock(rq);
+
+	running = task_running(rq, tsk);
+	on_rq = tsk->se.on_rq;
+
+	if (on_rq) {
+		dequeue_task(rq, tsk, 0);
+		if (unlikely(running))
+			tsk->sched_class->put_prev_task(rq, tsk);
+	}
+
+	set_task_cfs_rq(tsk);
+
+	if (on_rq) {
+		enqueue_task(rq, tsk, 0);
+		if (unlikely(running))
+			tsk->sched_class->set_curr_task(rq);
+	}
+
+done:
+	task_rq_unlock(rq, &flags);
+}
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	int on_rq;
+
+	spin_lock_irq(&rq->lock);
+
+	on_rq = se->on_rq;
+	if (on_rq)
+		dequeue_entity(cfs_rq, se, 0);
+
+	se->load.weight = shares;
+	se->load.inv_weight = div64_64((1ULL<<32), shares);
+
+	if (on_rq)
+		enqueue_entity(cfs_rq, se, 0);
+
+	spin_unlock_irq(&rq->lock);
+}
+
+static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype,
+				struct file *file, const char __user *userbuf,
+				size_t nbytes, loff_t *ppos)
+{
+	int i;
+	unsigned long shareval;
+	struct task_grp *tg = container_tg(cont);
+	char buffer[2*sizeof(unsigned long) + 1];
+
+	if (nbytes > 2*sizeof(unsigned long))	/* safety check */
+		return -E2BIG;
+
+	if (copy_from_user(buffer, userbuf, nbytes))
+		return -EFAULT;
+
+	buffer[nbytes] = 0;	/* nul-terminate */
+	shareval = simple_strtoul(buffer, NULL, 10);
+
+	tg->shares = shareval;
+	for_each_possible_cpu(i)
+		set_se_shares(tg->se[i], shareval);
+
+	return nbytes;
+}
+
+static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft)
+{
+	struct task_grp *tg = container_tg(cont);
+
+	return (u64) tg->shares;
+}
+
+struct cftype cpuctl_share = {
+	.name = "shares",
+	.read_uint = cpu_shares_read_uint,
+	.write = cpu_shares_write,
+};
+
+static int sched_populate(struct container_subsys *ss, struct container *cont)
+{
+	return container_add_file(cont, ss, &cpuctl_share);
+}
+
+struct container_subsys cpu_subsys = {
+	.name = "cpu",
+	.create = sched_create_group,
+	.destroy = sched_destroy_group,
+	.can_attach = sched_can_attach,
+	.attach = sched_move_task,
+	.populate = sched_populate,
+	.subsys_id = cpu_subsys_id,
+	.early_init = 1,
+};
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
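A note on the weight math used in set_se_shares() and the sched_init() group setup above: the reciprocal of the weight is pre-computed as a 32.32 fixed-point value, inv_weight = 2^32 / weight (the div64_64(1ULL<<32, ...) calls), so later divisions by the weight become a multiply and a shift. Below is a standalone sketch of that relationship, assuming the same 32-bit shift; it is an illustration only, not kernel code, and the helper names are made up.

#include <stdint.h>
#include <stdio.h>

#define WMULT_SHIFT	32	/* matches the 1ULL<<32 passed to div64_64() */

/* weight/inv_weight pair, shaped like struct load_weight */
struct load_weight {
	unsigned long weight;
	uint64_t inv_weight;	/* roughly 2^32 / weight */
};

/* what set_se_shares() does to se->load, minus locking and requeueing */
static void set_shares(struct load_weight *lw, unsigned long shares)
{
	lw->weight = shares;
	lw->inv_weight = (1ULL << WMULT_SHIFT) / shares;
}

/* divide a quantity by the weight using the precomputed inverse */
static uint64_t div_by_weight(uint64_t delta, const struct load_weight *lw)
{
	return (delta * lw->inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
	struct load_weight lw;

	set_shares(&lw, 1024);	/* a NICE_0_LOAD-sized share */
	printf("%llu\n", (unsigned long long)div_by_weight(4096, &lw));	/* prints 4 */
	return 0;
}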