 include/linux/sched.h |   4
 kernel/sched.c        | 270
 kernel/sched_fair.c   |  84
 kernel/sysctl.c       |  18
 4 files changed, 331 insertions(+), 45 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d6eacda765ca..288245f83bd4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1453,6 +1453,10 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+extern unsigned int sysctl_sched_min_bal_int_shares;
+extern unsigned int sysctl_sched_max_bal_int_shares;
+#endif
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
                 struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c
index d9585f15043f..86e55a9c2de6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,7 +168,43 @@ struct task_group {
         struct sched_entity **se;
         /* runqueue "owned" by this group on each cpu */
         struct cfs_rq **cfs_rq;
+
+        /*
+         * The shares assigned to a task group govern how much of the cpu
+         * bandwidth is allocated to the group. The more shares a group has,
+         * the more cpu bandwidth is allocated to it.
+         *
+         * For example, let's say that there are three task groups, A, B and
+         * C, which have been assigned shares 1000, 2000 and 3000
+         * respectively. Then, the cpu bandwidth allocated by the scheduler
+         * to task groups A, B and C should be:
+         *
+         *      Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
+         *      Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
+         *      Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+         *
+         * The weight assigned to a task group's schedulable entities on
+         * every cpu (task_group.se[a_cpu]->load.weight) is derived from the
+         * task group's shares. For example, let's say that task group A has
+         * been assigned shares of 1000 and there are two CPUs in the system.
+         * Then,
+         *
+         *      tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
+         *
+         * Note: It's not necessary that each of a task group's schedulable
+         *       entities have the same weight on all CPUs. If the group
+         *       has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+         *       better distribution of weight could be:
+         *
+         *      tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
+         *      tg_A->se[1]->load.weight = 1/3 * 2000 = 667
+         *
+         * rebalance_shares() is responsible for distributing the shares of
+         * a task group like this among the group's schedulable entities
+         * across cpus.
+         */
         unsigned long shares;
+
         struct rcu_head rcu;
 };
 
@@ -188,6 +224,14 @@ static DEFINE_MUTEX(task_group_mutex);
 /* doms_cur_mutex serializes access to doms_cur[] array */
 static DEFINE_MUTEX(doms_cur_mutex);
 
+#ifdef CONFIG_SMP
+/* kernel thread that runs rebalance_shares() periodically */
+static struct task_struct *lb_monitor_task;
+static int load_balance_monitor(void *unused);
+#endif
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
 /* Default task group.
  * Every task in system belong to this group at bootup.
  */
@@ -202,6 +246,8 @@ struct task_group init_task_group = {
 # define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
 #endif
 
+#define MIN_GROUP_SHARES        2
+
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 
 /* return group to which a task belongs */
@@ -6736,6 +6782,21 @@ void __init sched_init_smp(void)
         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
                 BUG();
         sched_init_granularity();
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+        if (nr_cpu_ids == 1)
+                return;
+
+        lb_monitor_task = kthread_create(load_balance_monitor, NULL,
+                                         "group_balance");
+        if (!IS_ERR(lb_monitor_task)) {
+                lb_monitor_task->flags |= PF_NOFREEZE;
+                wake_up_process(lb_monitor_task);
+        } else {
+                printk(KERN_ERR "Could not create load balance monitor thread"
+                        " (error = %ld)\n", PTR_ERR(lb_monitor_task));
+        }
+#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -6988,6 +7049,157 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+#ifdef CONFIG_SMP
+/*
+ * distribute shares of all task groups among their schedulable entities,
+ * to reflect load distribution across cpus.
+ */
+static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+{
+        struct cfs_rq *cfs_rq;
+        struct rq *rq = cpu_rq(this_cpu);
+        cpumask_t sdspan = sd->span;
+        int balanced = 1;
+
+        /* Walk through all the task groups that we have */
+        for_each_leaf_cfs_rq(rq, cfs_rq) {
+                int i;
+                unsigned long total_load = 0, total_shares;
+                struct task_group *tg = cfs_rq->tg;
+
+                /* Gather total task load of this group across cpus */
+                for_each_cpu_mask(i, sdspan)
+                        total_load += tg->cfs_rq[i]->load.weight;
+
+                /* Nothing to do if this group has no load */
+                if (!total_load)
+                        continue;
+
+                /*
+                 * tg->shares represents the number of cpu shares the task
+                 * group is eligible to hold on a single cpu. On N cpus, it
+                 * is eligible to hold (N * tg->shares) number of cpu shares.
+                 */
+                total_shares = tg->shares * cpus_weight(sdspan);
+
+                /*
+                 * redistribute total_shares across cpus as per the task load
+                 * distribution.
+                 */
+                for_each_cpu_mask(i, sdspan) {
+                        unsigned long local_load, local_shares;
+
+                        local_load = tg->cfs_rq[i]->load.weight;
+                        local_shares = (local_load * total_shares) / total_load;
+                        if (!local_shares)
+                                local_shares = MIN_GROUP_SHARES;
+                        if (local_shares == tg->se[i]->load.weight)
+                                continue;
+
+                        spin_lock_irq(&cpu_rq(i)->lock);
+                        set_se_shares(tg->se[i], local_shares);
+                        spin_unlock_irq(&cpu_rq(i)->lock);
+                        balanced = 0;
+                }
+        }
+
+        return balanced;
+}
+
+/*
+ * How frequently should we rebalance_shares() across cpus?
+ *
+ * The more frequently we rebalance shares, the more accurate is the fairness
+ * of cpu bandwidth distribution between task groups. However, a higher
+ * frequency also implies increased scheduling overhead.
+ *
+ * sysctl_sched_min_bal_int_shares represents the minimum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * sysctl_sched_max_bal_int_shares represents the maximum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * These settings allow for the appropriate trade-off between accuracy of
+ * fairness and the associated overhead.
+ */
+
+/* default: 8ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
+
+/* default: 128ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
+
+/* kernel thread that runs rebalance_shares() periodically */
+static int load_balance_monitor(void *unused)
+{
+        unsigned int timeout = sysctl_sched_min_bal_int_shares;
+        struct sched_param schedparm;
+        int ret;
+
+        /*
+         * We don't want this thread's execution to be limited by the shares
+         * assigned to the default group (init_task_group). Hence make it run
+         * as a SCHED_RR RT task at the lowest priority.
+         */
+        schedparm.sched_priority = 1;
+        ret = sched_setscheduler(current, SCHED_RR, &schedparm);
+        if (ret)
+                printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
+                        " monitor thread (error = %d)\n", ret);
+
+        while (!kthread_should_stop()) {
+                int i, cpu, balanced = 1;
+
+                /* Prevent cpus going down or coming up */
+                lock_cpu_hotplug();
+                /* lockout changes to doms_cur[] array */
+                lock_doms_cur();
+                /*
+                 * Enter a rcu read-side critical section to safely walk rq->sd
+                 * chain on various cpus and to walk the task group list
+                 * (rq->leaf_cfs_rq_list) in rebalance_shares().
+                 */
+                rcu_read_lock();
+
+                for (i = 0; i < ndoms_cur; i++) {
+                        cpumask_t cpumap = doms_cur[i];
+                        struct sched_domain *sd = NULL, *sd_prev = NULL;
+
+                        cpu = first_cpu(cpumap);
+
+                        /* Find the highest domain at which to balance shares */
+                        for_each_domain(cpu, sd) {
+                                if (!(sd->flags & SD_LOAD_BALANCE))
+                                        continue;
+                                sd_prev = sd;
+                        }
+
+                        sd = sd_prev;
+                        /* sd == NULL? No load balance reqd in this domain */
+                        if (!sd)
+                                continue;
+
+                        balanced &= rebalance_shares(sd, cpu);
+                }
+
+                rcu_read_unlock();
+
+                unlock_doms_cur();
+                unlock_cpu_hotplug();
+
+                if (!balanced)
+                        timeout = sysctl_sched_min_bal_int_shares;
+                else if (timeout < sysctl_sched_max_bal_int_shares)
+                        timeout *= 2;
+
+                msleep_interruptible(timeout);
+        }
+
+        return 0;
+}
+#endif  /* CONFIG_SMP */
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(void)
 {
@@ -7144,47 +7356,77 @@ done:
         task_rq_unlock(rq, &flags);
 }
 
+/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
         struct cfs_rq *cfs_rq = se->cfs_rq;
         struct rq *rq = cfs_rq->rq;
         int on_rq;
 
-        spin_lock_irq(&rq->lock);
+        if (!shares)
+                shares = MIN_GROUP_SHARES;
 
         on_rq = se->on_rq;
-        if (on_rq)
+        if (on_rq) {
                 dequeue_entity(cfs_rq, se, 0);
+                dec_cpu_load(rq, se->load.weight);
+        }
 
         se->load.weight = shares;
         se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-        if (on_rq)
+        if (on_rq) {
                 enqueue_entity(cfs_rq, se, 0);
-
-        spin_unlock_irq(&rq->lock);
+                inc_cpu_load(rq, se->load.weight);
+        }
 }
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
         int i;
-
-        /*
-         * A weight of 0 or 1 can cause arithmetics problems.
-         * (The default weight is 1024 - so there's no practical
-         * limitation from this.)
-         */
-        if (shares < 2)
-                shares = 2;
+        struct cfs_rq *cfs_rq;
+        struct rq *rq;
 
         lock_task_group_list();
         if (tg->shares == shares)
                 goto done;
 
+        if (shares < MIN_GROUP_SHARES)
+                shares = MIN_GROUP_SHARES;
+
+        /*
+         * Prevent any load balance activity (rebalance_shares,
+         * load_balance_fair) from referring to this group first,
+         * by taking it off the rq->leaf_cfs_rq_list on each cpu.
+         */
+        for_each_possible_cpu(i) {
+                cfs_rq = tg->cfs_rq[i];
+                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+        }
+
+        /* wait for any ongoing reference to this group to finish */
+        synchronize_sched();
+
+        /*
+         * Now we are free to modify the group's share on each cpu
+         * w/o tripping rebalance_shares or load_balance_fair.
+         */
         tg->shares = shares;
-        for_each_possible_cpu(i)
+        for_each_possible_cpu(i) {
+                spin_lock_irq(&cpu_rq(i)->lock);
                 set_se_shares(tg->se[i], shares);
+                spin_unlock_irq(&cpu_rq(i)->lock);
+        }
 
+        /*
+         * Enable load balance activity on this group, by inserting it back on
+         * each cpu's rq->leaf_cfs_rq_list.
+         */
+        for_each_possible_cpu(i) {
+                rq = cpu_rq(i);
+                cfs_rq = tg->cfs_rq[i];
+                list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+        }
 done:
         unlock_task_group_list();
         return 0;
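
The share redistribution performed by rebalance_shares() above is plain integer arithmetic: each cpu's slice of (tg->shares * number of cpus in the domain) is proportional to that cpu's share of the group's total task load, with MIN_GROUP_SHARES as the floor. The following stand-alone sketch reproduces the two-cpu example from the struct task_group comment; the loads (2048/1024) and the shares value (1000) are made-up illustration values, not taken from a real run.

/*
 * Hypothetical user-space sketch of the per-cpu split done by
 * rebalance_shares(); all numbers are example values.
 */
#include <stdio.h>

#define MIN_GROUP_SHARES 2

int main(void)
{
        unsigned long shares = 1000;                  /* tg->shares */
        unsigned long load[2] = { 2048, 1024 };       /* tg->cfs_rq[i]->load.weight */
        unsigned long total_load = 0, total_shares, local_shares;
        int i, ncpus = 2;

        for (i = 0; i < ncpus; i++)
                total_load += load[i];

        /* a group may hold tg->shares worth of weight on each cpu */
        total_shares = shares * ncpus;

        for (i = 0; i < ncpus; i++) {
                /* split total_shares in proportion to each cpu's task load */
                local_shares = (load[i] * total_shares) / total_load;
                if (!local_shares)
                        local_shares = MIN_GROUP_SHARES;
                printf("cpu%d: se->load.weight = %lu\n", i, local_shares);
        }
        return 0;       /* prints 1333 and 666 for this example */
}
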
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 30ae9c2a2861..5c208e090ae4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -707,6 +707,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
         return se->parent;
 }
 
+#define GROUP_IMBALANCE_PCT     20
+
 #else   /* CONFIG_FAIR_GROUP_SCHED */
 
 #define for_each_sched_entity(se) \
@@ -967,25 +969,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
-{
-        struct sched_entity *curr;
-        struct task_struct *p;
-
-        if (!cfs_rq->nr_running)
-                return MAX_PRIO;
-
-        curr = cfs_rq->curr;
-        if (!curr)
-                curr = __pick_next_entity(cfs_rq);
-
-        p = task_of(curr);
-
-        return p->prio;
-}
-#endif
-
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -995,28 +978,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         struct cfs_rq *busy_cfs_rq;
         long rem_load_move = max_load_move;
         struct rq_iterator cfs_rq_iterator;
+        unsigned long load_moved;
 
         cfs_rq_iterator.start = load_balance_start_fair;
         cfs_rq_iterator.next = load_balance_next_fair;
 
         for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-                struct cfs_rq *this_cfs_rq;
-                long imbalance;
-                unsigned long maxload;
+                struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
+                unsigned long maxload, task_load, group_weight;
+                unsigned long thisload, per_task_load;
+                struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
 
-                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+                task_load = busy_cfs_rq->load.weight;
+                group_weight = se->load.weight;
 
-                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-                if (imbalance <= 0)
+                /*
+                 * 'group_weight' is contributed by tasks of total weight
+                 * 'task_load'. To move 'rem_load_move' worth of weight only,
+                 * we need to move a maximum task load of:
+                 *
+                 *      maxload = (rem_load_move / group_weight) * task_load;
+                 */
+                maxload = (rem_load_move * task_load) / group_weight;
+
+                if (!maxload || !task_load)
                         continue;
 
-                /* Don't pull more than imbalance/2 */
-                imbalance /= 2;
-                maxload = min(rem_load_move, imbalance);
+                per_task_load = task_load / busy_cfs_rq->nr_running;
+                /*
+                 * balance_tasks will try to forcibly move at least one task
+                 * if possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that
+                 * if maxload is less than GROUP_IMBALANCE_PCT% of the
+                 * per_task_load.
+                 */
+                if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
+                        continue;
 
-                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+                /* Disable priority-based load balance */
+                *this_best_prio = 0;
+                thisload = this_cfs_rq->load.weight;
 #else
 # define maxload rem_load_move
 #endif
@@ -1025,11 +1025,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  * load_balance_[start|next]_fair iterators
                  */
                 cfs_rq_iterator.arg = busy_cfs_rq;
-                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+                load_moved = balance_tasks(this_rq, this_cpu, busiest,
                                 maxload, sd, idle, all_pinned,
                                 this_best_prio,
                                 &cfs_rq_iterator);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+                /*
+                 * load_moved holds the task load that was moved. The
+                 * effective (group) weight moved would be:
+                 *      load_moved_eff = load_moved/task_load * group_weight;
+                 */
+                load_moved = (group_weight * load_moved) / task_load;
+
+                /* Adjust shares on both cpus to reflect load_moved */
+                group_weight -= load_moved;
+                set_se_shares(se, group_weight);
+
+                se = busy_cfs_rq->tg->se[this_cpu];
+                if (!thisload)
+                        group_weight = load_moved;
+                else
+                        group_weight = se->load.weight + load_moved;
+                set_se_shares(se, group_weight);
+#endif
+
+                rem_load_move -= load_moved;
+
                 if (rem_load_move <= 0)
                         break;
         }
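
The conversions in load_balance_fair() between task load and group weight are simple proportions: maxload caps how much task load may be pulled so that at most rem_load_move of group weight moves, and the moved task load is scaled back into group weight before the shares on both cpus are adjusted. A small stand-alone sketch with made-up numbers follows; every value is an assumed illustration, not output from the patch.

/*
 * Hypothetical sketch of the task-load <-> group-weight conversion used
 * by load_balance_fair(); all numbers are example values.
 */
#include <stdio.h>

int main(void)
{
        unsigned long group_weight  = 1000;     /* se->load.weight on busiest cpu */
        unsigned long task_load     = 3072;     /* busy_cfs_rq->load.weight */
        unsigned long rem_load_move = 500;      /* group weight still to move */
        unsigned long maxload, load_moved, load_moved_eff;

        /* cap the task load to pull so at most rem_load_move of group weight moves */
        maxload = (rem_load_move * task_load) / group_weight;

        /* pretend balance_tasks() managed to move one task of weight 1024 */
        load_moved = 1024;

        /* convert the moved task load back into group weight */
        load_moved_eff = (group_weight * load_moved) / task_load;

        printf("maxload = %lu, group weight moved = %lu\n",
               maxload, load_moved_eff);        /* 1536 and 333 */
        return 0;
}
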
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c68f68dcc605..c95f3ed34474 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -309,6 +309,24 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "sched_min_bal_int_shares",
+                .data           = &sysctl_sched_min_bal_int_shares,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "sched_max_bal_int_shares",
+                .data           = &sysctl_sched_max_bal_int_shares,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
+#endif
 #endif
         {
                 .ctl_name       = CTL_UNNUMBERED,
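
The two sysctls registered above bound the adaptive polling period of load_balance_monitor(): whenever a pass finds something to rebalance the interval drops back to sched_min_bal_int_shares, and while every domain stays balanced it doubles up to sched_max_bal_int_shares. A stand-alone sketch of that back-off, using the patch's default values; the loop, the assumed "balanced" outcome and the printed schedule are illustrative only.

/*
 * Hypothetical sketch of the interval back-off in load_balance_monitor();
 * min/max mirror the sysctl defaults, the balanced outcome is assumed.
 */
#include <stdio.h>

int main(void)
{
        unsigned int min_int = 8, max_int = 128;        /* ms, patch defaults */
        unsigned int timeout = min_int;
        int round, balanced;

        for (round = 0; round < 8; round++) {
                balanced = 1;   /* pretend every domain was already balanced */

                if (!balanced)
                        timeout = min_int;      /* back to fast polling */
                else if (timeout < max_int)
                        timeout *= 2;           /* exponential back-off */

                printf("round %d: next wakeup in %u ms\n", round, timeout);
        }
        return 0;       /* 16, 32, 64, 128, 128, ... */
}
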