diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 270 |
1 files changed, 256 insertions, 14 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index d9585f15043f..86e55a9c2de6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -168,7 +168,43 @@ struct task_group { | |||
168 | struct sched_entity **se; | 168 | struct sched_entity **se; |
169 | /* runqueue "owned" by this group on each cpu */ | 169 | /* runqueue "owned" by this group on each cpu */ |
170 | struct cfs_rq **cfs_rq; | 170 | struct cfs_rq **cfs_rq; |
171 | |||
172 | /* | ||
173 | * The shares assigned to a task group govern how much cpu bandwidth | ||
174 | * is allocated to the group. The more shares a group has, the more | ||
175 | * cpu bandwidth is allocated to it. | ||
176 | * | ||
177 | * For example, say there are three task groups, A, B and C, which | ||
178 | * have been assigned shares 1000, 2000 and 3000 respectively. The | ||
179 | * cpu bandwidth allocated by the scheduler to task groups A, B and C | ||
180 | * should then be: | ||
181 | * | ||
182 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% | ||
183 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% | ||
184 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% | ||
185 | * | ||
186 | * The weight assigned to a task group's schedulable entities on each | ||
187 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task | ||
188 | * group's shares. For example, say task group A has been assigned | ||
189 | * shares of 1000 and there are two CPUs in the system. Then: | ||
190 | * | ||
191 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; | ||
192 | * | ||
193 | * Note: it is not necessary that each of a task group's schedulable | ||
194 | * entities has the same weight on all CPUs. If the group | ||
195 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a | ||
196 | * better distribution of weight would be: | ||
197 | * | ||
198 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 | ||
199 | * tg_A->se[1]->load.weight = 1/3 * 2000 = 667 | ||
200 | * | ||
201 | * rebalance_shares() is responsible for distributing a task group's | ||
202 | * shares like this among the group's schedulable entities across | ||
203 | * cpus. | ||
204 | * | ||
205 | */ | ||
171 | unsigned long shares; | 206 | unsigned long shares; |
207 | |||
172 | struct rcu_head rcu; | 208 | struct rcu_head rcu; |
173 | }; | 209 | }; |
174 | 210 | ||
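To make the bandwidth arithmetic in the comment above concrete, here is a minimal user-space sketch (not part of the patch; the names and share values simply mirror the A/B/C example in the comment):

#include <stdio.h>

/* shares assigned to task groups A, B and C, as in the comment above */
static const unsigned long shares[] = { 1000, 2000, 3000 };

int main(void)
{
	unsigned long total = 0;
	unsigned int i;

	for (i = 0; i < 3; i++)
		total += shares[i];

	/* Bw(group) = shares(group) / sum of all shares * 100 */
	for (i = 0; i < 3; i++)
		printf("Bw(%c) = %.2f%%\n", 'A' + i,
		       100.0 * shares[i] / total);

	return 0;	/* prints 16.67%, 33.33% and 50.00% */
}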
@@ -188,6 +224,14 @@ static DEFINE_MUTEX(task_group_mutex); | |||
188 | /* doms_cur_mutex serializes access to doms_cur[] array */ | 224 | /* doms_cur_mutex serializes access to doms_cur[] array */ |
189 | static DEFINE_MUTEX(doms_cur_mutex); | 225 | static DEFINE_MUTEX(doms_cur_mutex); |
190 | 226 | ||
227 | #ifdef CONFIG_SMP | ||
228 | /* kernel thread that runs rebalance_shares() periodically */ | ||
229 | static struct task_struct *lb_monitor_task; | ||
230 | static int load_balance_monitor(void *unused); | ||
231 | #endif | ||
232 | |||
233 | static void set_se_shares(struct sched_entity *se, unsigned long shares); | ||
234 | |||
191 | /* Default task group. | 235 | /* Default task group. |
192 | * Every task in system belong to this group at bootup. | 236 | * Every task in system belong to this group at bootup. |
193 | */ | 237 | */ |
@@ -202,6 +246,8 @@ struct task_group init_task_group = { | |||
202 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 246 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
203 | #endif | 247 | #endif |
204 | 248 | ||
249 | #define MIN_GROUP_SHARES 2 | ||
250 | |||
205 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 251 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; |
206 | 252 | ||
207 | /* return group to which a task belongs */ | 253 | /* return group to which a task belongs */ |
@@ -6736,6 +6782,21 @@ void __init sched_init_smp(void) | |||
6736 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6782 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6737 | BUG(); | 6783 | BUG(); |
6738 | sched_init_granularity(); | 6784 | sched_init_granularity(); |
6785 | |||
6786 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
6787 | if (nr_cpu_ids == 1) | ||
6788 | return; | ||
6789 | |||
6790 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, | ||
6791 | "group_balance"); | ||
6792 | if (!IS_ERR(lb_monitor_task)) { | ||
6793 | lb_monitor_task->flags |= PF_NOFREEZE; | ||
6794 | wake_up_process(lb_monitor_task); | ||
6795 | } else { | ||
6796 | printk(KERN_ERR "Could not create load balance monitor thread" | ||
6797 | " (error = %ld)\n", PTR_ERR(lb_monitor_task)); | ||
6798 | } | ||
6799 | #endif | ||
6739 | } | 6800 | } |
6740 | #else | 6801 | #else |
6741 | void __init sched_init_smp(void) | 6802 | void __init sched_init_smp(void) |
@@ -6988,6 +7049,157 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
6988 | 7049 | ||
6989 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7050 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6990 | 7051 | ||
7052 | #ifdef CONFIG_SMP | ||
7053 | /* | ||
7054 | * Distribute the shares of all task groups among their schedulable | ||
7055 | * entities, to reflect the load distribution across cpus. | ||
7056 | */ | ||
7057 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) | ||
7058 | { | ||
7059 | struct cfs_rq *cfs_rq; | ||
7060 | struct rq *rq = cpu_rq(this_cpu); | ||
7061 | cpumask_t sdspan = sd->span; | ||
7062 | int balanced = 1; | ||
7063 | |||
7064 | /* Walk through all the task groups that we have */ | ||
7065 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
7066 | int i; | ||
7067 | unsigned long total_load = 0, total_shares; | ||
7068 | struct task_group *tg = cfs_rq->tg; | ||
7069 | |||
7070 | /* Gather total task load of this group across cpus */ | ||
7071 | for_each_cpu_mask(i, sdspan) | ||
7072 | total_load += tg->cfs_rq[i]->load.weight; | ||
7073 | |||
7074 | /* Nothing to do if this group has no load */ | ||
7075 | if (!total_load) | ||
7076 | continue; | ||
7077 | |||
7078 | /* | ||
7079 | * tg->shares represents the number of cpu shares the task group | ||
7080 | * is eligible to hold on a single cpu. On N cpus, it is | ||
7081 | * eligible to hold (N * tg->shares) cpu shares in total. | ||
7082 | */ | ||
7083 | total_shares = tg->shares * cpus_weight(sdspan); | ||
7084 | |||
7085 | /* | ||
7086 | * redistribute total_shares across cpus as per the task load | ||
7087 | * distribution. | ||
7088 | */ | ||
7089 | for_each_cpu_mask(i, sdspan) { | ||
7090 | unsigned long local_load, local_shares; | ||
7091 | |||
7092 | local_load = tg->cfs_rq[i]->load.weight; | ||
7093 | local_shares = (local_load * total_shares) / total_load; | ||
7094 | if (!local_shares) | ||
7095 | local_shares = MIN_GROUP_SHARES; | ||
7096 | if (local_shares == tg->se[i]->load.weight) | ||
7097 | continue; | ||
7098 | |||
7099 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7100 | set_se_shares(tg->se[i], local_shares); | ||
7101 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7102 | balanced = 0; | ||
7103 | } | ||
7104 | } | ||
7105 | |||
7106 | return balanced; | ||
7107 | } | ||
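To illustrate the redistribution formula used above (local_shares = local_load * total_shares / total_load), here is a small stand-alone sketch, not part of the patch; the per-cpu load figures are made up for the example:

#include <stdio.h>

#define MIN_GROUP_SHARES 2

int main(void)
{
	/* hypothetical per-cpu load.weight of one group's cfs_rq */
	unsigned long load[2] = { 2048, 1024 };		/* cpu0 carries 2x the load */
	unsigned long shares = 1000;			/* tg->shares */
	unsigned long total_load = load[0] + load[1];
	unsigned long total_shares = shares * 2;	/* 2 cpus in the domain */
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long local = load[i] * total_shares / total_load;

		if (!local)
			local = MIN_GROUP_SHARES;
		printf("cpu%d: se->load.weight = %lu\n", i, local);
	}
	return 0;	/* cpu0: 1333, cpu1: 666 */
}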
7108 | |||
7109 | /* | ||
7110 | * How frequently should we rebalance_shares() across cpus? | ||
7111 | * | ||
7112 | * The more frequently we rebalance shares, the more accurate the fairness | ||
7113 | * of cpu bandwidth distribution between task groups is. However, a higher | ||
7114 | * frequency also implies increased scheduling overhead. | ||
7115 | * | ||
7116 | * sysctl_sched_min_bal_int_shares represents the minimum interval between | ||
7117 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7118 | * | ||
7119 | * sysctl_sched_max_bal_int_shares represents the maximum interval between | ||
7120 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7121 | * | ||
7122 | * These settings allow for an appropriate tradeoff between accuracy of | ||
7123 | * fairness and the associated overhead. | ||
7124 | * | ||
7125 | */ | ||
7126 | |||
7127 | /* default: 8ms, units: milliseconds */ | ||
7128 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; | ||
7129 | |||
7130 | /* default: 128ms, units: milliseconds */ | ||
7131 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; | ||
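Taken together, the two sysctls describe an exponential backoff: the interval starts at the minimum, doubles while every domain stays balanced, and snaps back to the minimum as soon as an imbalance has to be corrected. A hedged sketch of just that policy (the helper name is made up; load_balance_monitor() below open-codes the same logic):

/* illustrative only: compute the next rebalance interval in ms */
static unsigned int next_balance_interval(unsigned int timeout, int balanced)
{
	if (!balanced)
		return sysctl_sched_min_bal_int_shares;
	if (timeout < sysctl_sched_max_bal_int_shares)
		return timeout * 2;
	return timeout;
}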
7132 | |||
7133 | /* kernel thread that runs rebalance_shares() periodically */ | ||
7134 | static int load_balance_monitor(void *unused) | ||
7135 | { | ||
7136 | unsigned int timeout = sysctl_sched_min_bal_int_shares; | ||
7137 | struct sched_param schedparm; | ||
7138 | int ret; | ||
7139 | |||
7140 | /* | ||
7141 | * We don't want this thread's execution to be limited by the shares | ||
7142 | * assigned to the default group (init_task_group). Hence make it run | ||
7143 | * as a SCHED_RR RT task at the lowest RT priority. | ||
7144 | */ | ||
7145 | schedparm.sched_priority = 1; | ||
7146 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); | ||
7147 | if (ret) | ||
7148 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" | ||
7149 | " monitor thread (error = %d) \n", ret); | ||
7150 | |||
7151 | while (!kthread_should_stop()) { | ||
7152 | int i, cpu, balanced = 1; | ||
7153 | |||
7154 | /* Prevent cpus going down or coming up */ | ||
7155 | lock_cpu_hotplug(); | ||
7156 | /* lockout changes to doms_cur[] array */ | ||
7157 | lock_doms_cur(); | ||
7158 | /* | ||
7159 | * Enter a rcu read-side critical section to safely walk rq->sd | ||
7160 | * chain on various cpus and to walk task group list | ||
7161 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). | ||
7162 | */ | ||
7163 | rcu_read_lock(); | ||
7164 | |||
7165 | for (i = 0; i < ndoms_cur; i++) { | ||
7166 | cpumask_t cpumap = doms_cur[i]; | ||
7167 | struct sched_domain *sd = NULL, *sd_prev = NULL; | ||
7168 | |||
7169 | cpu = first_cpu(cpumap); | ||
7170 | |||
7171 | /* Find the highest domain at which to balance shares */ | ||
7172 | for_each_domain(cpu, sd) { | ||
7173 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
7174 | continue; | ||
7175 | sd_prev = sd; | ||
7176 | } | ||
7177 | |||
7178 | sd = sd_prev; | ||
7179 | /* sd == NULL? No load balancing required in this domain */ | ||
7180 | if (!sd) | ||
7181 | continue; | ||
7182 | |||
7183 | balanced &= rebalance_shares(sd, cpu); | ||
7184 | } | ||
7185 | |||
7186 | rcu_read_unlock(); | ||
7187 | |||
7188 | unlock_doms_cur(); | ||
7189 | unlock_cpu_hotplug(); | ||
7190 | |||
7191 | if (!balanced) | ||
7192 | timeout = sysctl_sched_min_bal_int_shares; | ||
7193 | else if (timeout < sysctl_sched_max_bal_int_shares) | ||
7194 | timeout *= 2; | ||
7195 | |||
7196 | msleep_interruptible(timeout); | ||
7197 | } | ||
7198 | |||
7199 | return 0; | ||
7200 | } | ||
7201 | #endif /* CONFIG_SMP */ | ||
7202 | |||
6991 | /* allocate runqueue etc for a new task group */ | 7203 | /* allocate runqueue etc for a new task group */ |
6992 | struct task_group *sched_create_group(void) | 7204 | struct task_group *sched_create_group(void) |
6993 | { | 7205 | { |
@@ -7144,47 +7356,77 @@ done: | |||
7144 | task_rq_unlock(rq, &flags); | 7356 | task_rq_unlock(rq, &flags); |
7145 | } | 7357 | } |
7146 | 7358 | ||
7359 | /* caller must hold rq->lock */ | ||
7147 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 7360 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
7148 | { | 7361 | { |
7149 | struct cfs_rq *cfs_rq = se->cfs_rq; | 7362 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7150 | struct rq *rq = cfs_rq->rq; | 7363 | struct rq *rq = cfs_rq->rq; |
7151 | int on_rq; | 7364 | int on_rq; |
7152 | 7365 | ||
7153 | spin_lock_irq(&rq->lock); | 7366 | if (!shares) |
7367 | shares = MIN_GROUP_SHARES; | ||
7154 | 7368 | ||
7155 | on_rq = se->on_rq; | 7369 | on_rq = se->on_rq; |
7156 | if (on_rq) | 7370 | if (on_rq) { |
7157 | dequeue_entity(cfs_rq, se, 0); | 7371 | dequeue_entity(cfs_rq, se, 0); |
7372 | dec_cpu_load(rq, se->load.weight); | ||
7373 | } | ||
7158 | 7374 | ||
7159 | se->load.weight = shares; | 7375 | se->load.weight = shares; |
7160 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 7376 | se->load.inv_weight = div64_64((1ULL<<32), shares); |
7161 | 7377 | ||
7162 | if (on_rq) | 7378 | if (on_rq) { |
7163 | enqueue_entity(cfs_rq, se, 0); | 7379 | enqueue_entity(cfs_rq, se, 0); |
7164 | 7380 | inc_cpu_load(rq, se->load.weight); | |
7165 | spin_unlock_irq(&rq->lock); | 7381 | } |
7166 | } | 7382 | } |
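With the locking moved out of set_se_shares(), every caller is now responsible for taking the corresponding rq->lock itself, exactly as rebalance_shares() and sched_group_set_shares() do elsewhere in this patch. A minimal sketch of the expected calling pattern (the helper name and arguments are illustrative):

/* illustrative caller: reweight one group entity on one cpu */
static void reweight_group_entity(struct task_group *tg, int cpu,
				  unsigned long shares)
{
	struct rq *rq = cpu_rq(cpu);

	spin_lock_irq(&rq->lock);	/* set_se_shares() expects this held */
	set_se_shares(tg->se[cpu], shares);
	spin_unlock_irq(&rq->lock);
}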
7167 | 7383 | ||
7168 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 7384 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
7169 | { | 7385 | { |
7170 | int i; | 7386 | int i; |
7171 | 7387 | struct cfs_rq *cfs_rq; | |
7172 | /* | 7388 | struct rq *rq; |
7173 | * A weight of 0 or 1 can cause arithmetics problems. | ||
7174 | * (The default weight is 1024 - so there's no practical | ||
7175 | * limitation from this.) | ||
7176 | */ | ||
7177 | if (shares < 2) | ||
7178 | shares = 2; | ||
7179 | 7389 | ||
7180 | lock_task_group_list(); | 7390 | lock_task_group_list(); |
7181 | if (tg->shares == shares) | 7391 | if (tg->shares == shares) |
7182 | goto done; | 7392 | goto done; |
7183 | 7393 | ||
7394 | if (shares < MIN_GROUP_SHARES) | ||
7395 | shares = MIN_GROUP_SHARES; | ||
7396 | |||
7397 | /* | ||
7398 | * First, prevent any load balance activity (rebalance_shares, | ||
7399 | * load_balance_fair) from referring to this group, by taking | ||
7400 | * it off the rq->leaf_cfs_rq_list on each cpu. | ||
7401 | */ | ||
7402 | for_each_possible_cpu(i) { | ||
7403 | cfs_rq = tg->cfs_rq[i]; | ||
7404 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
7405 | } | ||
7406 | |||
7407 | /* wait for any ongoing reference to this group to finish */ | ||
7408 | synchronize_sched(); | ||
7409 | |||
7410 | /* | ||
7411 | * Now we are free to modify the group's shares on each cpu | ||
7412 | * without tripping rebalance_shares() or load_balance_fair(). | ||
7413 | */ | ||
7184 | tg->shares = shares; | 7414 | tg->shares = shares; |
7185 | for_each_possible_cpu(i) | 7415 | for_each_possible_cpu(i) { |
7416 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7186 | set_se_shares(tg->se[i], shares); | 7417 | set_se_shares(tg->se[i], shares); |
7418 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7419 | } | ||
7187 | 7420 | ||
7421 | /* | ||
7422 | * Re-enable load balance activity on this group by inserting it back | ||
7423 | * on each cpu's rq->leaf_cfs_rq_list. | ||
7424 | */ | ||
7425 | for_each_possible_cpu(i) { | ||
7426 | rq = cpu_rq(i); | ||
7427 | cfs_rq = tg->cfs_rq[i]; | ||
7428 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7429 | } | ||
7188 | done: | 7430 | done: |
7189 | unlock_task_group_list(); | 7431 | unlock_task_group_list(); |
7190 | return 0; | 7432 | return 0; |
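The update above follows a classic RCU update sequence: unpublish the group's per-cpu cfs_rq's from every rq->leaf_cfs_rq_list, wait for all in-flight readers to drain, reweight under each rq->lock, then republish. A condensed sketch of that choreography (hypothetical helper; it assumes the task-group list lock is already held, as it is in sched_group_set_shares()):

/* condensed view of the RCU choreography in sched_group_set_shares() */
static void update_group_shares(struct task_group *tg, unsigned long shares)
{
	int i;

	/* 1. hide the group from RCU walkers (rebalance_shares() etc.) */
	for_each_possible_cpu(i)
		list_del_rcu(&tg->cfs_rq[i]->leaf_cfs_rq_list);

	/* 2. wait until no cpu can still be referencing the group */
	synchronize_sched();

	/* 3. now safe to reweight each per-cpu entity under its rq->lock */
	tg->shares = shares;
	for_each_possible_cpu(i) {
		spin_lock_irq(&cpu_rq(i)->lock);
		set_se_shares(tg->se[i], shares);
		spin_unlock_irq(&cpu_rq(i)->lock);
	}

	/* 4. make the group visible to load balancing again */
	for_each_possible_cpu(i)
		list_add_rcu(&tg->cfs_rq[i]->leaf_cfs_rq_list,
			     &cpu_rq(i)->leaf_cfs_rq_list);
}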