 include/linux/sched.h |   4
 kernel/sched.c        | 270
 kernel/sched_fair.c   |  84
 kernel/sysctl.c       |  18
 4 files changed, 331 insertions(+), 45 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d6eacda765ca..288245f83bd4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1453,6 +1453,10 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+extern unsigned int sysctl_sched_min_bal_int_shares;
+extern unsigned int sysctl_sched_max_bal_int_shares;
+#endif
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c
index d9585f15043f..86e55a9c2de6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,7 +168,43 @@ struct task_group {
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
+
+	/*
+	 * shares assigned to a task group govern how much of the cpu
+	 * bandwidth is allocated to the group. The more shares a group has,
+	 * the more cpu bandwidth it is allocated.
+	 *
+	 * For example, let's say that there are three task groups, A, B and C
+	 * which have been assigned shares 1000, 2000 and 3000 respectively.
+	 * Then, the cpu bandwidth allocated by the scheduler to task groups
+	 * A, B and C should be:
+	 *
+	 *	Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
+	 *	Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
+	 *	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+	 *
+	 * The weight assigned to a task group's schedulable entities on every
+	 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
+	 * group's shares. For example, let's say that task group A has been
+	 * assigned shares of 1000 and there are two CPUs in a system. Then,
+	 *
+	 *	tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
+	 *
+	 * Note: It's not necessary that a task group's schedulable
+	 *	 entities have the same weight on all CPUs. If the group
+	 *	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+	 *	 better distribution of weight could be:
+	 *
+	 *	tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
+	 *	tg_A->se[1]->load.weight = 1/3 * 2000 = 667
+	 *
+	 * rebalance_shares() is responsible for distributing the shares of
+	 * task groups like this among the groups' schedulable entities across
+	 * cpus.
+	 *
+	 */
 	unsigned long shares;
+
 	struct rcu_head rcu;
 };
 
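A standalone sanity check of the weight-distribution arithmetic in the comment added above (illustrative only, not part of the patch; it uses plain task counts as a stand-in for per-cpu cfs_rq load):

#include <stdio.h>

int main(void)
{
	unsigned long shares = 1000;          /* tg_A->shares */
	unsigned long ncpus = 2;
	unsigned long load[2] = { 2, 1 };     /* 2 tasks on CPU0, 1 task on CPU1 */
	unsigned long total_shares = shares * ncpus;    /* 2000 */
	unsigned long total_load = load[0] + load[1];   /* 3 */
	int i;

	for (i = 0; i < 2; i++)               /* prints 1333 and 666 (~667) */
		printf("tg_A->se[%d]->load.weight = %lu\n", i,
		       load[i] * total_shares / total_load);
	return 0;
}

Integer division yields 666 rather than the rounded 667 quoted in the comment; rebalance_shares() below performs the same integer division per cpu.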
@@ -188,6 +224,14 @@ static DEFINE_MUTEX(task_group_mutex);
 /* doms_cur_mutex serializes access to doms_cur[] array */
 static DEFINE_MUTEX(doms_cur_mutex);
 
+#ifdef CONFIG_SMP
+/* kernel thread that runs rebalance_shares() periodically */
+static struct task_struct *lb_monitor_task;
+static int load_balance_monitor(void *unused);
+#endif
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
 /* Default task group.
  *	Every task in system belong to this group at bootup.
  */
@@ -202,6 +246,8 @@ struct task_group init_task_group = {
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
+#define MIN_GROUP_SHARES	2
+
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 
 /* return group to which a task belongs */
@@ -6736,6 +6782,21 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (nr_cpu_ids == 1)
+		return;
+
+	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
+					 "group_balance");
+	if (!IS_ERR(lb_monitor_task)) {
+		lb_monitor_task->flags |= PF_NOFREEZE;
+		wake_up_process(lb_monitor_task);
+	} else {
+		printk(KERN_ERR "Could not create load balance monitor thread"
+			" (error = %ld)\n", PTR_ERR(lb_monitor_task));
+	}
+#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -6988,6 +7049,157 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+#ifdef CONFIG_SMP
+/*
+ * distribute shares of all task groups among their schedulable entities,
+ * to reflect load distribution across cpus.
+ */
+static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(this_cpu);
+	cpumask_t sdspan = sd->span;
+	int balanced = 1;
+
+	/* Walk through all the task groups that we have */
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		int i;
+		unsigned long total_load = 0, total_shares;
+		struct task_group *tg = cfs_rq->tg;
+
+		/* Gather total task load of this group across cpus */
+		for_each_cpu_mask(i, sdspan)
+			total_load += tg->cfs_rq[i]->load.weight;
+
+		/* Nothing to do if this group has no load */
+		if (!total_load)
+			continue;
+
+		/*
+		 * tg->shares represents the number of cpu shares the task group
+		 * is eligible to hold on a single cpu. On N cpus, it is
+		 * eligible to hold (N * tg->shares) number of cpu shares.
+		 */
+		total_shares = tg->shares * cpus_weight(sdspan);
+
+		/*
+		 * redistribute total_shares across cpus as per the task load
+		 * distribution.
+		 */
+		for_each_cpu_mask(i, sdspan) {
+			unsigned long local_load, local_shares;
+
+			local_load = tg->cfs_rq[i]->load.weight;
+			local_shares = (local_load * total_shares) / total_load;
+			if (!local_shares)
+				local_shares = MIN_GROUP_SHARES;
+			if (local_shares == tg->se[i]->load.weight)
+				continue;
+
+			spin_lock_irq(&cpu_rq(i)->lock);
+			set_se_shares(tg->se[i], local_shares);
+			spin_unlock_irq(&cpu_rq(i)->lock);
+			balanced = 0;
+		}
+	}
+
+	return balanced;
+}
+
+/*
+ * How frequently should we rebalance_shares() across cpus?
+ *
+ * The more frequently we rebalance shares, the more accurate is the fairness
+ * of cpu bandwidth distribution between task groups. However, a higher
+ * frequency also implies increased scheduling overhead.
+ *
+ * sysctl_sched_min_bal_int_shares represents the minimum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * sysctl_sched_max_bal_int_shares represents the maximum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * These settings allow for the appropriate tradeoff between accuracy of
+ * fairness and the associated overhead.
+ *
+ */
+
+/* default: 8ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
+
+/* default: 128ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
+
+/* kernel thread that runs rebalance_shares() periodically */
+static int load_balance_monitor(void *unused)
+{
+	unsigned int timeout = sysctl_sched_min_bal_int_shares;
+	struct sched_param schedparm;
+	int ret;
+
+	/*
+	 * We don't want this thread's execution to be limited by the shares
+	 * assigned to default group (init_task_group). Hence make it run
+	 * as a SCHED_RR RT task at the lowest priority.
+	 */
+	schedparm.sched_priority = 1;
+	ret = sched_setscheduler(current, SCHED_RR, &schedparm);
+	if (ret)
+		printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
+				" monitor thread (error = %d)\n", ret);
+
+	while (!kthread_should_stop()) {
+		int i, cpu, balanced = 1;
+
+		/* Prevent cpus going down or coming up */
+		lock_cpu_hotplug();
+		/* lockout changes to doms_cur[] array */
+		lock_doms_cur();
+		/*
+		 * Enter an RCU read-side critical section to safely walk the
+		 * rq->sd chain on various cpus and to walk the task group list
+		 * (rq->leaf_cfs_rq_list) in rebalance_shares().
+		 */
+		rcu_read_lock();
+
+		for (i = 0; i < ndoms_cur; i++) {
+			cpumask_t cpumap = doms_cur[i];
+			struct sched_domain *sd = NULL, *sd_prev = NULL;
+
+			cpu = first_cpu(cpumap);
+
+			/* Find the highest domain at which to balance shares */
+			for_each_domain(cpu, sd) {
+				if (!(sd->flags & SD_LOAD_BALANCE))
+					continue;
+				sd_prev = sd;
+			}
+
+			sd = sd_prev;
+			/* sd == NULL? No load balance reqd in this domain */
+			if (!sd)
+				continue;
+
+			balanced &= rebalance_shares(sd, cpu);
+		}
+
+		rcu_read_unlock();
+
+		unlock_doms_cur();
+		unlock_cpu_hotplug();
+
+		if (!balanced)
+			timeout = sysctl_sched_min_bal_int_shares;
+		else if (timeout < sysctl_sched_max_bal_int_shares)
+			timeout *= 2;
+
+		msleep_interruptible(timeout);
+	}
+
+	return 0;
+}
+#endif	/* CONFIG_SMP */
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(void)
 {
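An illustration of the per-cpu split performed by rebalance_shares() above, with made-up numbers (not part of the patch): a group with tg->shares = 1024 in a 4-cpu domain whose per-cpu group load is 3000, 1000, 0 and 0.

#include <stdio.h>

#define MIN_GROUP_SHARES 2

int main(void)
{
	unsigned long shares = 1024;                  /* tg->shares */
	unsigned long load[4] = { 3000, 1000, 0, 0 }; /* tg->cfs_rq[i]->load.weight */
	unsigned long total_shares = shares * 4;      /* eligible weight across the domain */
	unsigned long total_load = 0, local_shares;
	int i;

	for (i = 0; i < 4; i++)
		total_load += load[i];

	for (i = 0; i < 4; i++) {
		local_shares = load[i] * total_shares / total_load;
		if (!local_shares)
			local_shares = MIN_GROUP_SHARES;
		/* prints 3072, 1024, 2, 2 */
		printf("cpu%d: se->load.weight = %lu\n", i, local_shares);
	}
	return 0;
}

The idle cpus keep only MIN_GROUP_SHARES, so nearly all of the group's eligible weight ends up on the cpus where its tasks actually run.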
@@ -7144,47 +7356,77 @@ done:
 	task_rq_unlock(rq, &flags);
 }
 
+/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
 	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
+	if (!shares)
+		shares = MIN_GROUP_SHARES;
 
 	on_rq = se->on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_entity(cfs_rq, se, 0);
+		dec_cpu_load(rq, se->load.weight);
+	}
 
 	se->load.weight = shares;
 	se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-	if (on_rq)
+	if (on_rq) {
 		enqueue_entity(cfs_rq, se, 0);
-
-	spin_unlock_irq(&rq->lock);
+		inc_cpu_load(rq, se->load.weight);
+	}
 }
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
 	int i;
-
-	/*
-	 * A weight of 0 or 1 can cause arithmetics problems.
-	 * (The default weight is 1024 - so there's no practical
-	 * limitation from this.)
-	 */
-	if (shares < 2)
-		shares = 2;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
 
 	lock_task_group_list();
 	if (tg->shares == shares)
 		goto done;
 
+	if (shares < MIN_GROUP_SHARES)
+		shares = MIN_GROUP_SHARES;
+
+	/*
+	 * Prevent any load balance activity (rebalance_shares,
+	 * load_balance_fair) from referring to this group first,
+	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
+	 */
+	for_each_possible_cpu(i) {
+		cfs_rq = tg->cfs_rq[i];
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+	}
+
+	/* wait for any ongoing reference to this group to finish */
+	synchronize_sched();
+
+	/*
+	 * Now we are free to modify the group's shares on each cpu
+	 * without tripping rebalance_shares() or load_balance_fair().
+	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		spin_lock_irq(&cpu_rq(i)->lock);
 		set_se_shares(tg->se[i], shares);
+		spin_unlock_irq(&cpu_rq(i)->lock);
+	}
 
+	/*
+	 * Enable load balance activity on this group, by inserting it back on
+	 * each cpu's rq->leaf_cfs_rq_list.
+	 */
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+		cfs_rq = tg->cfs_rq[i];
+		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+	}
 done:
 	unlock_task_group_list();
 	return 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 30ae9c2a2861..5c208e090ae4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -707,6 +707,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
+#define GROUP_IMBALANCE_PCT	20
+
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 #define for_each_sched_entity(se) \
@@ -967,25 +969,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running)
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
-
-	p = task_of(curr);
-
-	return p->prio;
-}
-#endif
-
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -995,28 +978,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
 	struct rq_iterator cfs_rq_iterator;
+	unsigned long load_moved;
 
 	cfs_rq_iterator.start = load_balance_start_fair;
 	cfs_rq_iterator.next = load_balance_next_fair;
 
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
-		long imbalance;
-		unsigned long maxload;
+		struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
+		unsigned long maxload, task_load, group_weight;
+		unsigned long thisload, per_task_load;
+		struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		task_load = busy_cfs_rq->load.weight;
+		group_weight = se->load.weight;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		/*
+		 * 'group_weight' is contributed by tasks of total weight
+		 * 'task_load'. To move 'rem_load_move' worth of weight only,
+		 * we need to move a maximum task load of:
+		 *
+		 *	maxload = (remload / group_weight) * task_load;
+		 */
+		maxload = (rem_load_move * task_load) / group_weight;
+
+		if (!maxload || !task_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		per_task_load = task_load / busy_cfs_rq->nr_running;
+		/*
+		 * balance_tasks will try to forcibly move at least one task if
+		 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
+		 * maxload is less than GROUP_IMBALANCE_PCT% of per_task_load.
+		 */
+		if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
+			continue;
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+		/* Disable priority-based load balance */
+		*this_best_prio = 0;
+		thisload = this_cfs_rq->load.weight;
 #else
 # define maxload rem_load_move
 #endif
@@ -1025,11 +1025,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * load_balance_[start|next]_fair iterators
 		 */
 		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+		load_moved = balance_tasks(this_rq, this_cpu, busiest,
 				maxload, sd, idle, all_pinned,
 				this_best_prio,
 				&cfs_rq_iterator);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		/*
+		 * load_moved holds the task load that was moved. The
+		 * effective (group) weight moved would be:
+		 *	load_moved_eff = load_moved/task_load * group_weight;
+		 */
+		load_moved = (group_weight * load_moved) / task_load;
+
+		/* Adjust shares on both cpus to reflect load_moved */
+		group_weight -= load_moved;
+		set_se_shares(se, group_weight);
+
+		se = busy_cfs_rq->tg->se[this_cpu];
+		if (!thisload)
+			group_weight = load_moved;
+		else
+			group_weight = se->load.weight + load_moved;
+		set_se_shares(se, group_weight);
+#endif
+
+		rem_load_move -= load_moved;
+
 		if (rem_load_move <= 0)
 			break;
 	}
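A worked example of the two conversions load_balance_fair() now performs (illustrative numbers, not part of the patch): maxload translates the remaining group weight to move into task load, and the "effective weight" step translates the task load actually moved back into group weight.

#include <stdio.h>

int main(void)
{
	unsigned long group_weight = 2048;   /* se->load.weight on the busiest cpu */
	unsigned long task_load = 4096;      /* busy_cfs_rq->load.weight */
	unsigned long rem_load_move = 1024;  /* group weight still to be moved */
	unsigned long maxload, load_moved, moved_eff;

	/* group weight -> task load: how much to ask balance_tasks() for */
	maxload = rem_load_move * task_load / group_weight;   /* 2048 */

	/* suppose balance_tasks() managed to move half of that */
	load_moved = 1024;

	/* task load -> group weight actually transferred between the cpus */
	moved_eff = group_weight * load_moved / task_load;    /* 512 */

	printf("maxload = %lu, effective group weight moved = %lu\n",
	       maxload, moved_eff);
	printf("busiest se->load.weight: %lu -> %lu\n",
	       group_weight, group_weight - moved_eff);
	return 0;
}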
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c68f68dcc605..c95f3ed34474 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -309,6 +309,24 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_min_bal_int_shares",
+		.data		= &sysctl_sched_min_bal_int_shares,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_max_bal_int_shares",
+		.data		= &sysctl_sched_max_bal_int_shares,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
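For reference, how the two sysctls above interact with load_balance_monitor()'s back-off (illustrative sketch, not part of the patch): as long as every pass finds the groups balanced, the sleep interval doubles from sched_min_bal_int_shares up to sched_max_bal_int_shares (8, 16, 32, 64, 128 ms with the defaults); any pass that had to rebalance snaps it back to the minimum.

#include <stdio.h>

int main(void)
{
	unsigned int min_int = 8, max_int = 128;  /* the patch defaults, in ms */
	unsigned int timeout = min_int;
	/* 1 = pass found everything balanced, 0 = shares had to be rebalanced */
	int balanced[8] = { 1, 1, 1, 1, 1, 1, 0, 1 };
	int i;

	for (i = 0; i < 8; i++) {
		if (!balanced[i])
			timeout = min_int;        /* imbalance seen: poll quickly again */
		else if (timeout < max_int)
			timeout *= 2;             /* quiet: back off geometrically */
		printf("pass %d: sleep %u ms\n", i, timeout);
	}
	return 0;
}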