aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c270
1 files changed, 256 insertions, 14 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index d9585f15043f..86e55a9c2de6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,7 +168,43 @@ struct task_group {
168 struct sched_entity **se; 168 struct sched_entity **se;
169 /* runqueue "owned" by this group on each cpu */ 169 /* runqueue "owned" by this group on each cpu */
170 struct cfs_rq **cfs_rq; 170 struct cfs_rq **cfs_rq;
171
172 /*
173 * shares assigned to a task group governs how much of cpu bandwidth
174 * is allocated to the group. The more shares a group has, the more is
175 * the cpu bandwidth allocated to it.
176 *
177 * For ex, lets say that there are three task groups, A, B and C which
178 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
179 * cpu bandwidth allocated by the scheduler to task groups A, B and C
180 * should be:
181 *
182 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
183 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
184 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
185 *
186 * The weight assigned to a task group's schedulable entities on every
187 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
188 * group's shares. For ex: lets say that task group A has been
189 * assigned shares of 1000 and there are two CPUs in a system. Then,
190 *
191 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
192 *
193 * Note: It's not necessary that each of a task's group schedulable
194 * entity have the same weight on all CPUs. If the group
195 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
196 * better distribution of weight could be:
197 *
198 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
199 * tg_A->se[1]->load.weight = 1/2 * 2000 = 667
200 *
201 * rebalance_shares() is responsible for distributing the shares of a
202 * task groups like this among the group's schedulable entities across
203 * cpus.
204 *
205 */
171 unsigned long shares; 206 unsigned long shares;
207
172 struct rcu_head rcu; 208 struct rcu_head rcu;
173}; 209};
174 210
@@ -188,6 +224,14 @@ static DEFINE_MUTEX(task_group_mutex);
188/* doms_cur_mutex serializes access to doms_cur[] array */ 224/* doms_cur_mutex serializes access to doms_cur[] array */
189static DEFINE_MUTEX(doms_cur_mutex); 225static DEFINE_MUTEX(doms_cur_mutex);
190 226
227#ifdef CONFIG_SMP
228/* kernel thread that runs rebalance_shares() periodically */
229static struct task_struct *lb_monitor_task;
230static int load_balance_monitor(void *unused);
231#endif
232
233static void set_se_shares(struct sched_entity *se, unsigned long shares);
234
191/* Default task group. 235/* Default task group.
192 * Every task in system belong to this group at bootup. 236 * Every task in system belong to this group at bootup.
193 */ 237 */
@@ -202,6 +246,8 @@ struct task_group init_task_group = {
202# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 246# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
203#endif 247#endif
204 248
249#define MIN_GROUP_SHARES 2
250
205static int init_task_group_load = INIT_TASK_GROUP_LOAD; 251static int init_task_group_load = INIT_TASK_GROUP_LOAD;
206 252
207/* return group to which a task belongs */ 253/* return group to which a task belongs */
@@ -6736,6 +6782,21 @@ void __init sched_init_smp(void)
6736 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6782 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6737 BUG(); 6783 BUG();
6738 sched_init_granularity(); 6784 sched_init_granularity();
6785
6786#ifdef CONFIG_FAIR_GROUP_SCHED
6787 if (nr_cpu_ids == 1)
6788 return;
6789
6790 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
6791 "group_balance");
6792 if (!IS_ERR(lb_monitor_task)) {
6793 lb_monitor_task->flags |= PF_NOFREEZE;
6794 wake_up_process(lb_monitor_task);
6795 } else {
6796 printk(KERN_ERR "Could not create load balance monitor thread"
6797 "(error = %ld) \n", PTR_ERR(lb_monitor_task));
6798 }
6799#endif
6739} 6800}
6740#else 6801#else
6741void __init sched_init_smp(void) 6802void __init sched_init_smp(void)
@@ -6988,6 +7049,157 @@ void set_curr_task(int cpu, struct task_struct *p)
6988 7049
6989#ifdef CONFIG_FAIR_GROUP_SCHED 7050#ifdef CONFIG_FAIR_GROUP_SCHED
6990 7051
7052#ifdef CONFIG_SMP
7053/*
7054 * distribute shares of all task groups among their schedulable entities,
7055 * to reflect load distrbution across cpus.
7056 */
7057static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7058{
7059 struct cfs_rq *cfs_rq;
7060 struct rq *rq = cpu_rq(this_cpu);
7061 cpumask_t sdspan = sd->span;
7062 int balanced = 1;
7063
7064 /* Walk thr' all the task groups that we have */
7065 for_each_leaf_cfs_rq(rq, cfs_rq) {
7066 int i;
7067 unsigned long total_load = 0, total_shares;
7068 struct task_group *tg = cfs_rq->tg;
7069
7070 /* Gather total task load of this group across cpus */
7071 for_each_cpu_mask(i, sdspan)
7072 total_load += tg->cfs_rq[i]->load.weight;
7073
7074 /* Nothing to do if this group has no load */
7075 if (!total_load)
7076 continue;
7077
7078 /*
7079 * tg->shares represents the number of cpu shares the task group
7080 * is eligible to hold on a single cpu. On N cpus, it is
7081 * eligible to hold (N * tg->shares) number of cpu shares.
7082 */
7083 total_shares = tg->shares * cpus_weight(sdspan);
7084
7085 /*
7086 * redistribute total_shares across cpus as per the task load
7087 * distribution.
7088 */
7089 for_each_cpu_mask(i, sdspan) {
7090 unsigned long local_load, local_shares;
7091
7092 local_load = tg->cfs_rq[i]->load.weight;
7093 local_shares = (local_load * total_shares) / total_load;
7094 if (!local_shares)
7095 local_shares = MIN_GROUP_SHARES;
7096 if (local_shares == tg->se[i]->load.weight)
7097 continue;
7098
7099 spin_lock_irq(&cpu_rq(i)->lock);
7100 set_se_shares(tg->se[i], local_shares);
7101 spin_unlock_irq(&cpu_rq(i)->lock);
7102 balanced = 0;
7103 }
7104 }
7105
7106 return balanced;
7107}
7108
7109/*
7110 * How frequently should we rebalance_shares() across cpus?
7111 *
7112 * The more frequently we rebalance shares, the more accurate is the fairness
7113 * of cpu bandwidth distribution between task groups. However higher frequency
7114 * also implies increased scheduling overhead.
7115 *
7116 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7117 * consecutive calls to rebalance_shares() in the same sched domain.
7118 *
7119 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7120 * consecutive calls to rebalance_shares() in the same sched domain.
7121 *
7122 * These settings allows for the appropriate tradeoff between accuracy of
7123 * fairness and the associated overhead.
7124 *
7125 */
7126
7127/* default: 8ms, units: milliseconds */
7128const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7129
7130/* default: 128ms, units: milliseconds */
7131const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7132
7133/* kernel thread that runs rebalance_shares() periodically */
7134static int load_balance_monitor(void *unused)
7135{
7136 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7137 struct sched_param schedparm;
7138 int ret;
7139
7140 /*
7141 * We don't want this thread's execution to be limited by the shares
7142 * assigned to default group (init_task_group). Hence make it run
7143 * as a SCHED_RR RT task at the lowest priority.
7144 */
7145 schedparm.sched_priority = 1;
7146 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7147 if (ret)
7148 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7149 " monitor thread (error = %d) \n", ret);
7150
7151 while (!kthread_should_stop()) {
7152 int i, cpu, balanced = 1;
7153
7154 /* Prevent cpus going down or coming up */
7155 lock_cpu_hotplug();
7156 /* lockout changes to doms_cur[] array */
7157 lock_doms_cur();
7158 /*
7159 * Enter a rcu read-side critical section to safely walk rq->sd
7160 * chain on various cpus and to walk task group list
7161 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7162 */
7163 rcu_read_lock();
7164
7165 for (i = 0; i < ndoms_cur; i++) {
7166 cpumask_t cpumap = doms_cur[i];
7167 struct sched_domain *sd = NULL, *sd_prev = NULL;
7168
7169 cpu = first_cpu(cpumap);
7170
7171 /* Find the highest domain at which to balance shares */
7172 for_each_domain(cpu, sd) {
7173 if (!(sd->flags & SD_LOAD_BALANCE))
7174 continue;
7175 sd_prev = sd;
7176 }
7177
7178 sd = sd_prev;
7179 /* sd == NULL? No load balance reqd in this domain */
7180 if (!sd)
7181 continue;
7182
7183 balanced &= rebalance_shares(sd, cpu);
7184 }
7185
7186 rcu_read_unlock();
7187
7188 unlock_doms_cur();
7189 unlock_cpu_hotplug();
7190
7191 if (!balanced)
7192 timeout = sysctl_sched_min_bal_int_shares;
7193 else if (timeout < sysctl_sched_max_bal_int_shares)
7194 timeout *= 2;
7195
7196 msleep_interruptible(timeout);
7197 }
7198
7199 return 0;
7200}
7201#endif /* CONFIG_SMP */
7202
6991/* allocate runqueue etc for a new task group */ 7203/* allocate runqueue etc for a new task group */
6992struct task_group *sched_create_group(void) 7204struct task_group *sched_create_group(void)
6993{ 7205{
@@ -7144,47 +7356,77 @@ done:
7144 task_rq_unlock(rq, &flags); 7356 task_rq_unlock(rq, &flags);
7145} 7357}
7146 7358
7359/* rq->lock to be locked by caller */
7147static void set_se_shares(struct sched_entity *se, unsigned long shares) 7360static void set_se_shares(struct sched_entity *se, unsigned long shares)
7148{ 7361{
7149 struct cfs_rq *cfs_rq = se->cfs_rq; 7362 struct cfs_rq *cfs_rq = se->cfs_rq;
7150 struct rq *rq = cfs_rq->rq; 7363 struct rq *rq = cfs_rq->rq;
7151 int on_rq; 7364 int on_rq;
7152 7365
7153 spin_lock_irq(&rq->lock); 7366 if (!shares)
7367 shares = MIN_GROUP_SHARES;
7154 7368
7155 on_rq = se->on_rq; 7369 on_rq = se->on_rq;
7156 if (on_rq) 7370 if (on_rq) {
7157 dequeue_entity(cfs_rq, se, 0); 7371 dequeue_entity(cfs_rq, se, 0);
7372 dec_cpu_load(rq, se->load.weight);
7373 }
7158 7374
7159 se->load.weight = shares; 7375 se->load.weight = shares;
7160 se->load.inv_weight = div64_64((1ULL<<32), shares); 7376 se->load.inv_weight = div64_64((1ULL<<32), shares);
7161 7377
7162 if (on_rq) 7378 if (on_rq) {
7163 enqueue_entity(cfs_rq, se, 0); 7379 enqueue_entity(cfs_rq, se, 0);
7164 7380 inc_cpu_load(rq, se->load.weight);
7165 spin_unlock_irq(&rq->lock); 7381 }
7166} 7382}
7167 7383
7168int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7384int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7169{ 7385{
7170 int i; 7386 int i;
7171 7387 struct cfs_rq *cfs_rq;
7172 /* 7388 struct rq *rq;
7173 * A weight of 0 or 1 can cause arithmetics problems.
7174 * (The default weight is 1024 - so there's no practical
7175 * limitation from this.)
7176 */
7177 if (shares < 2)
7178 shares = 2;
7179 7389
7180 lock_task_group_list(); 7390 lock_task_group_list();
7181 if (tg->shares == shares) 7391 if (tg->shares == shares)
7182 goto done; 7392 goto done;
7183 7393
7394 if (shares < MIN_GROUP_SHARES)
7395 shares = MIN_GROUP_SHARES;
7396
7397 /*
7398 * Prevent any load balance activity (rebalance_shares,
7399 * load_balance_fair) from referring to this group first,
7400 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7401 */
7402 for_each_possible_cpu(i) {
7403 cfs_rq = tg->cfs_rq[i];
7404 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7405 }
7406
7407 /* wait for any ongoing reference to this group to finish */
7408 synchronize_sched();
7409
7410 /*
7411 * Now we are free to modify the group's share on each cpu
7412 * w/o tripping rebalance_share or load_balance_fair.
7413 */
7184 tg->shares = shares; 7414 tg->shares = shares;
7185 for_each_possible_cpu(i) 7415 for_each_possible_cpu(i) {
7416 spin_lock_irq(&cpu_rq(i)->lock);
7186 set_se_shares(tg->se[i], shares); 7417 set_se_shares(tg->se[i], shares);
7418 spin_unlock_irq(&cpu_rq(i)->lock);
7419 }
7187 7420
7421 /*
7422 * Enable load balance activity on this group, by inserting it back on
7423 * each cpu's rq->leaf_cfs_rq_list.
7424 */
7425 for_each_possible_cpu(i) {
7426 rq = cpu_rq(i);
7427 cfs_rq = tg->cfs_rq[i];
7428 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7429 }
7188done: 7430done:
7189 unlock_task_group_list(); 7431 unlock_task_group_list();
7190 return 0; 7432 return 0;