path: root/kernel/sched.c
author     Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>  2008-01-25 15:08:00 -0500
committer  Ingo Molnar <mingo@elte.hu>                    2008-01-25 15:08:00 -0500
commit     6b2d7700266b9402e12824e11e0099ae6a4a6a79 (patch)
tree       d72c25b03150901ad8643f931186a11eb85635dc /kernel/sched.c
parent     a183561567b5446d3362b4839bd4f744f4b2af1e (diff)
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise group
fairness.

For example: on an 8-cpu system, I created 3 groups as under:

	a = 8 tasks (cpu.shares = 1024)
	b = 4 tasks (cpu.shares = 1024)
	c = 3 tasks (cpu.shares = 1024)

a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.

This is what I get with the latest scheduler git tree:

--------------------------------------------------------------------------------
Col1  | Col2    | Col3  | Col4
------|---------|-------|-------------------------------------------------------
a     | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b     | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c     |  86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------

Explanation of o/p:

Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of
        that group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window,
        expressed in percentage. Col3 data is derived as:
                Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
                Col4 = 100 * cpu_time_recd_by_task / 60

[I can share the test case that produces a similar o/p if reqd]

The deviation from desired group fairness is as below:

	a = +24.47%
	b =  -9.13%
	c = -15.33%

which is quite high.

After the patch below is applied, here are the results:

--------------------------------------------------------------------------------
Col1  | Col2    | Col3  | Col4
------|---------|-------|-------------------------------------------------------
a     | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b     | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c     | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------

Deviation from desired group fairness is as below:

	a = +0.67%
	b = -0.83%
	c = +0.17%

which is far better IMO. Most other runs have yielded a deviation within
+-2% at the most, which is good.

Why do we see bad (group) fairness with the current scheduler?
==============================================================

Currently a cpu's weight is just the summation of individual task
weights. This can yield incorrect results. For example, consider three
groups as below on a 2-cpu system:

	CPU0	CPU1
	---------------------------
	A (10)	B(5)  C(5)
	---------------------------

Group A has 10 tasks, all on CPU0; groups B and C have 5 tasks each, all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD = 1024).

The current scheme would yield a cpu weight of 10240 (10*1024) for each
cpu, so the load balancer will think both CPUs are perfectly balanced and
won't move around any tasks. This, however, would yield this bandwidth:

	A = 50%
	B = 25%
	C = 25%

which is not the desired result.

What's changing in the patch?
=============================

	- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
	  defined (see below)
	- API Change
		- Two tunables introduced in sysfs (under SCHED_DEBUG) to
		  control the frequency at which the load balance monitor
		  thread runs.

The basic change made in this patch is how cpu weight (rq->load.weight)
is calculated. It is now calculated as the summation of group weights on
a cpu, rather than the summation of task weights.
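As a quick sanity check on the numbers above, here is a minimal standalone
sketch (not part of the patch) that reproduces the Col3 and deviation
arithmetic from the pre-patch table, using the quoted Col2 runtimes, 8 cpus
and a 60sec window:

/*
 * fairness_check.c - hypothetical helper, not part of the patch.
 * Col3 = 100 * Col2 / (NR_CPUS * 60); deviation = Col3 - 33.33%.
 */
#include <stdio.h>

int main(void)
{
        const char *group[] = { "a", "b", "c" };
        const double col2[] = { 277.676, 116.108, 86.326 }; /* seconds */
        const int nr_cpus = 8, window = 60;     /* 8 cpus, 60sec window */
        const double fair = 100.0 / 3;          /* 3 equal-share groups */
        int i;

        for (i = 0; i < 3; i++) {
                double col3 = 100.0 * col2[i] / (nr_cpus * window);
                printf("%s: Col3 = %.1f%%, deviation = %+.2f%%\n",
                       group[i], col3, col3 - fair);
        }
        return 0;
}

This yields roughly +24.5%, -9.1% and -15.3%, matching the deviations
quoted above.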
Weight exerted by a group on a cpu is dependent on the shares allocated
to it and also the number of tasks the group has on that cpu compared to
the total number of (runnable) tasks the group has in the system.

Let,
	W(K,i)  = Weight of group K on cpu i
	T(K,i)  = Task load present in group K's cfs_rq on cpu i
	T(K)    = Total task load of group K across various cpus
	S(K)    = Shares allocated to group K
	NRCPUS  = Number of online cpus in the scheduler domain to which
	          group K is assigned.

Then,
	W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)

A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.

Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>

	- don't start the load_balance_monitor when there is only a single
	  cpu.
	- rename the kthread because its currently longer than
	  TASK_COMM_LEN

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
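To make the W(K,i) formula concrete, here is a minimal standalone sketch
(again not part of the patch). The struct group type and the
group_cpu_weight() helper are hypothetical names invented for the example;
in the patch itself the equivalent computation is performed by
rebalance_shares() on live cfs_rq load figures. The input reuses the
two-cpu example from the comment added to struct task_group: group A with
shares = 1000, two runnable tasks on cpu0 and one on cpu1, each of weight
NICE_0_LOAD:

/*
 * group_weight.c - hypothetical sketch, not part of the patch.
 * Evaluates W(K,i) = S(K) * NRCPUS * T(K,i) / T(K).
 */
#include <stdio.h>

#define NRCPUS          2
#define NICE_0_LOAD     1024UL

struct group {
        unsigned long shares;           /* S(K) */
        unsigned long load[NRCPUS];     /* T(K,i): cfs_rq load per cpu */
};

static unsigned long group_cpu_weight(const struct group *g, int cpu)
{
        unsigned long total = 0;        /* T(K) */
        int i;

        for (i = 0; i < NRCPUS; i++)
                total += g->load[i];
        if (!total)
                return 0;
        return g->shares * NRCPUS * g->load[cpu] / total;
}

int main(void)
{
        /* group A: shares=1000, two tasks on cpu0, one task on cpu1 */
        struct group A = { 1000, { 2 * NICE_0_LOAD, 1 * NICE_0_LOAD } };
        int cpu;

        for (cpu = 0; cpu < NRCPUS; cpu++)
                printf("W(A,%d) = %lu\n", cpu, group_cpu_weight(&A, cpu));
        return 0;
}

This prints W(A,0) = 1333 and W(A,1) = 666 (667 before integer
truncation), i.e. the 2/3 : 1/3 split that rebalance_shares() aims to
establish.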
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 270
1 file changed, 256 insertions(+), 14 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index d9585f15043f..86e55a9c2de6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,7 +168,43 @@ struct task_group {
         struct sched_entity **se;
         /* runqueue "owned" by this group on each cpu */
         struct cfs_rq **cfs_rq;
+
+        /*
+         * shares assigned to a task group governs how much of cpu bandwidth
+         * is allocated to the group. The more shares a group has, the more is
+         * the cpu bandwidth allocated to it.
+         *
+         * For ex, lets say that there are three task groups, A, B and C which
+         * have been assigned shares 1000, 2000 and 3000 respectively. Then,
+         * cpu bandwidth allocated by the scheduler to task groups A, B and C
+         * should be:
+         *
+         *      Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
+         *      Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
+         *      Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+         *
+         * The weight assigned to a task group's schedulable entities on every
+         * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
+         * group's shares. For ex: lets say that task group A has been
+         * assigned shares of 1000 and there are two CPUs in a system. Then,
+         *
+         *      tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
+         *
+         * Note: It's not necessary that each of a task group's schedulable
+         *       entities have the same weight on all CPUs. If the group
+         *       has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+         *       better distribution of weight could be:
+         *
+         *      tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
+         *      tg_A->se[1]->load.weight = 1/3 * 2000 =  667
+         *
+         * rebalance_shares() is responsible for distributing the shares of a
+         * task group like this among the group's schedulable entities across
+         * cpus.
+         *
+         */
         unsigned long shares;
+
         struct rcu_head rcu;
 };
 
@@ -188,6 +224,14 @@ static DEFINE_MUTEX(task_group_mutex);
 /* doms_cur_mutex serializes access to doms_cur[] array */
 static DEFINE_MUTEX(doms_cur_mutex);
 
+#ifdef CONFIG_SMP
+/* kernel thread that runs rebalance_shares() periodically */
+static struct task_struct *lb_monitor_task;
+static int load_balance_monitor(void *unused);
+#endif
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
 /* Default task group.
  * Every task in system belong to this group at bootup.
  */
@@ -202,6 +246,8 @@ struct task_group init_task_group = {
 # define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
 #endif
 
+#define MIN_GROUP_SHARES        2
+
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 
 /* return group to which a task belongs */
@@ -6736,6 +6782,21 @@ void __init sched_init_smp(void)
         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
                 BUG();
         sched_init_granularity();
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+        if (nr_cpu_ids == 1)
+                return;
+
+        lb_monitor_task = kthread_create(load_balance_monitor, NULL,
+                                         "group_balance");
+        if (!IS_ERR(lb_monitor_task)) {
+                lb_monitor_task->flags |= PF_NOFREEZE;
+                wake_up_process(lb_monitor_task);
+        } else {
+                printk(KERN_ERR "Could not create load balance monitor thread"
+                        "(error = %ld) \n", PTR_ERR(lb_monitor_task));
+        }
+#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -6988,6 +7049,157 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+#ifdef CONFIG_SMP
+/*
+ * distribute shares of all task groups among their schedulable entities,
+ * to reflect load distribution across cpus.
+ */
+static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+{
+        struct cfs_rq *cfs_rq;
+        struct rq *rq = cpu_rq(this_cpu);
+        cpumask_t sdspan = sd->span;
+        int balanced = 1;
+
+        /* Walk thr' all the task groups that we have */
+        for_each_leaf_cfs_rq(rq, cfs_rq) {
+                int i;
+                unsigned long total_load = 0, total_shares;
+                struct task_group *tg = cfs_rq->tg;
+
+                /* Gather total task load of this group across cpus */
+                for_each_cpu_mask(i, sdspan)
+                        total_load += tg->cfs_rq[i]->load.weight;
+
+                /* Nothing to do if this group has no load */
+                if (!total_load)
+                        continue;
+
+                /*
+                 * tg->shares represents the number of cpu shares the task group
+                 * is eligible to hold on a single cpu. On N cpus, it is
+                 * eligible to hold (N * tg->shares) number of cpu shares.
+                 */
+                total_shares = tg->shares * cpus_weight(sdspan);
+
+                /*
+                 * redistribute total_shares across cpus as per the task load
+                 * distribution.
+                 */
+                for_each_cpu_mask(i, sdspan) {
+                        unsigned long local_load, local_shares;
+
+                        local_load = tg->cfs_rq[i]->load.weight;
+                        local_shares = (local_load * total_shares) / total_load;
+                        if (!local_shares)
+                                local_shares = MIN_GROUP_SHARES;
+                        if (local_shares == tg->se[i]->load.weight)
+                                continue;
+
+                        spin_lock_irq(&cpu_rq(i)->lock);
+                        set_se_shares(tg->se[i], local_shares);
+                        spin_unlock_irq(&cpu_rq(i)->lock);
+                        balanced = 0;
+                }
+        }
+
+        return balanced;
+}
+
+/*
+ * How frequently should we rebalance_shares() across cpus?
+ *
+ * The more frequently we rebalance shares, the more accurate is the fairness
+ * of cpu bandwidth distribution between task groups. However higher frequency
+ * also implies increased scheduling overhead.
+ *
+ * sysctl_sched_min_bal_int_shares represents the minimum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * sysctl_sched_max_bal_int_shares represents the maximum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * These settings allow for the appropriate tradeoff between accuracy of
+ * fairness and the associated overhead.
+ *
+ */
+
+/* default: 8ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
+
+/* default: 128ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
+
+/* kernel thread that runs rebalance_shares() periodically */
+static int load_balance_monitor(void *unused)
+{
+        unsigned int timeout = sysctl_sched_min_bal_int_shares;
+        struct sched_param schedparm;
+        int ret;
+
+        /*
+         * We don't want this thread's execution to be limited by the shares
+         * assigned to default group (init_task_group). Hence make it run
+         * as a SCHED_RR RT task at the lowest priority.
+         */
+        schedparm.sched_priority = 1;
+        ret = sched_setscheduler(current, SCHED_RR, &schedparm);
+        if (ret)
+                printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
+                                " monitor thread (error = %d) \n", ret);
+
+        while (!kthread_should_stop()) {
+                int i, cpu, balanced = 1;
+
+                /* Prevent cpus going down or coming up */
+                lock_cpu_hotplug();
+                /* lockout changes to doms_cur[] array */
+                lock_doms_cur();
+                /*
+                 * Enter a rcu read-side critical section to safely walk rq->sd
+                 * chain on various cpus and to walk task group list
+                 * (rq->leaf_cfs_rq_list) in rebalance_shares().
+                 */
+                rcu_read_lock();
+
+                for (i = 0; i < ndoms_cur; i++) {
+                        cpumask_t cpumap = doms_cur[i];
+                        struct sched_domain *sd = NULL, *sd_prev = NULL;
+
+                        cpu = first_cpu(cpumap);
+
+                        /* Find the highest domain at which to balance shares */
+                        for_each_domain(cpu, sd) {
+                                if (!(sd->flags & SD_LOAD_BALANCE))
+                                        continue;
+                                sd_prev = sd;
+                        }
+
+                        sd = sd_prev;
+                        /* sd == NULL? No load balance reqd in this domain */
+                        if (!sd)
+                                continue;
+
+                        balanced &= rebalance_shares(sd, cpu);
+                }
+
+                rcu_read_unlock();
+
+                unlock_doms_cur();
+                unlock_cpu_hotplug();
+
+                if (!balanced)
+                        timeout = sysctl_sched_min_bal_int_shares;
+                else if (timeout < sysctl_sched_max_bal_int_shares)
+                        timeout *= 2;
+
+                msleep_interruptible(timeout);
+        }
+
+        return 0;
+}
+#endif  /* CONFIG_SMP */
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(void)
 {
@@ -7144,47 +7356,77 @@ done:
         task_rq_unlock(rq, &flags);
 }
 
+/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
         struct cfs_rq *cfs_rq = se->cfs_rq;
         struct rq *rq = cfs_rq->rq;
         int on_rq;
 
-        spin_lock_irq(&rq->lock);
+        if (!shares)
+                shares = MIN_GROUP_SHARES;
 
         on_rq = se->on_rq;
-        if (on_rq)
+        if (on_rq) {
                 dequeue_entity(cfs_rq, se, 0);
+                dec_cpu_load(rq, se->load.weight);
+        }
 
         se->load.weight = shares;
         se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-        if (on_rq)
+        if (on_rq) {
                 enqueue_entity(cfs_rq, se, 0);
-
-        spin_unlock_irq(&rq->lock);
+                inc_cpu_load(rq, se->load.weight);
+        }
 }
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
         int i;
-
-        /*
-         * A weight of 0 or 1 can cause arithmetics problems.
-         * (The default weight is 1024 - so there's no practical
-         *  limitation from this.)
-         */
-        if (shares < 2)
-                shares = 2;
+        struct cfs_rq *cfs_rq;
+        struct rq *rq;
 
         lock_task_group_list();
         if (tg->shares == shares)
                 goto done;
 
+        if (shares < MIN_GROUP_SHARES)
+                shares = MIN_GROUP_SHARES;
+
+        /*
+         * Prevent any load balance activity (rebalance_shares,
+         * load_balance_fair) from referring to this group first,
+         * by taking it off the rq->leaf_cfs_rq_list on each cpu.
+         */
+        for_each_possible_cpu(i) {
+                cfs_rq = tg->cfs_rq[i];
+                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+        }
+
+        /* wait for any ongoing reference to this group to finish */
+        synchronize_sched();
+
+        /*
+         * Now we are free to modify the group's share on each cpu
+         * w/o tripping rebalance_share or load_balance_fair.
+         */
         tg->shares = shares;
-        for_each_possible_cpu(i)
+        for_each_possible_cpu(i) {
+                spin_lock_irq(&cpu_rq(i)->lock);
                 set_se_shares(tg->se[i], shares);
+                spin_unlock_irq(&cpu_rq(i)->lock);
+        }
 
+        /*
+         * Enable load balance activity on this group, by inserting it back on
+         * each cpu's rq->leaf_cfs_rq_list.
+         */
+        for_each_possible_cpu(i) {
+                rq = cpu_rq(i);
+                cfs_rq = tg->cfs_rq[i];
+                list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+        }
 done:
         unlock_task_group_list();
         return 0;