author     Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>   2008-01-25 15:08:00 -0500
committer  Ingo Molnar <mingo@elte.hu>                     2008-01-25 15:08:00 -0500
commit     6b2d7700266b9402e12824e11e0099ae6a4a6a79 (patch)
tree       d72c25b03150901ad8643f931186a11eb85635dc /kernel/sched_fair.c
parent     a183561567b5446d3362b4839bd4f744f4b2af1e (diff)
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.

For example: on an 8-cpu system, I created 3 groups as under:

    a = 8 tasks (cpu.shares = 1024)
    b = 4 tasks (cpu.shares = 1024)
    c = 3 tasks (cpu.shares = 1024)

a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.

This is what I get with the latest scheduler git tree:

--------------------------------------------------------------------------------
Col1  | Col2    | Col3  | Col4
------|---------|-------|-------------------------------------------------------
a     | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b     | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c     |  86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------

Explanation of o/p:

Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of
        that group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window,
        expressed in percentage. Col3 data is derived as:
            Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
            Col4 = 100 * cpu_time_recd_by_task / 60

[I can share the test case that produces a similar o/p if reqd]

The deviation from desired group fairness (Col3 - 33.33%) is as below:

    a = +24.47%
    b =  -9.13%
    c = -15.33%

which is quite high.

After the patch below is applied, here are the results:

--------------------------------------------------------------------------------
Col1  | Col2    | Col3  | Col4
------|---------|-------|-------------------------------------------------------
a     | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b     | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c     | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------

Deviation from desired group fairness is as below:

    a = +0.67%
    b = -0.83%
    c = +0.17%

which is far better IMO. Most other runs have yielded a deviation within
+-2% at the most, which is good.

Why do we see bad (group) fairness with the current scheduler?
==============================================================

Currently a cpu's weight is just the summation of the individual task
weights on it. This can yield incorrect results. For example, consider
three groups as below on a 2-cpu system:

    CPU0        CPU1
    ---------------------------
    A (10)      B(5)  C(5)
    ---------------------------

Group A has 10 tasks, all on CPU0, while groups B and C have 5 tasks
each, all of which are on CPU1. Each task has the same weight
(NICE_0_LOAD = 1024).

The current scheme would yield a cpu weight of 10240 (10*1024) for each
cpu, so the load balancer will think both CPUs are perfectly balanced
and won't move any tasks around. This, however, yields the following
bandwidth split:

    A = 50%
    B = 25%
    C = 25%

which is not the desired result.

What's changing in the patch?
=============================

    - How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
      defined (see below)
    - API change
        - Two tunables introduced in sysfs (under SCHED_DEBUG) to
          control the frequency at which the load balance monitor
          thread runs.

The basic change made in this patch is how cpu weight (rq->load.weight)
is calculated. It is now calculated as the summation of group weights on
a cpu, rather than the summation of task weights.
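Before looking at how group weights are computed, the old per-task summation
for the 2-cpu example above can be reproduced with a small stand-alone sketch
(illustration only, not kernel code; the program, its variable names and the
assumption that each fully-busy cpu hands out 50% of the machine are mine):

#include <stdio.h>

#define NICE_0_LOAD 1024.0

int main(void)
{
        /* Old scheme: a cpu's weight is just the sum of its tasks' weights. */
        double cpu0 = 10 * NICE_0_LOAD;                  /* group A: 10 tasks */
        double cpu1 = 5 * NICE_0_LOAD + 5 * NICE_0_LOAD; /* groups B and C    */

        printf("cpu0 weight = %.0f, cpu1 weight = %.0f\n", cpu0, cpu1);

        /*
         * Both cpus look perfectly balanced (10240 each), so no tasks are
         * moved.  Each cpu then splits its half of the machine among the
         * groups running on it, in proportion to their task weight.
         */
        printf("A = %.0f%%  B = %.0f%%  C = %.0f%%\n",
               50.0 * (10 * NICE_0_LOAD) / cpu0,
               50.0 * (5 * NICE_0_LOAD) / cpu1,
               50.0 * (5 * NICE_0_LOAD) / cpu1);

        return 0;
}

This prints the 10240/10240 "balanced" weights and the unfair 50/25/25 split
described above.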
Weight exerted by a group on a cpu is dependent on the shares allocated
to it and also on the number of tasks the group has on that cpu compared
to the total number of (runnable) tasks the group has in the system.

Let,
    W(K,i)  = Weight of group K on cpu i
    T(K,i)  = Task load present in group K's cfs_rq on cpu i
    T(K)    = Total task load of group K across various cpus
    S(K)    = Shares allocated to group K
    NRCPUS  = Number of online cpus in the scheduler domain to which
              group K is assigned

Then,
    W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)

A load balance monitor thread is created at bootup, which periodically
runs and adjusts each group's weight on each cpu. To avoid its overhead,
two min/max tunables are introduced (under SCHED_DEBUG) to control the
rate at which it runs.

Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>

- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because it's currently longer than TASK_COMM_LEN

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
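As a quick check of the W(K,i) formula (again illustration only;
group_cpu_weight() is a made-up helper, not a kernel function), the same
2-cpu example under the new scheme:

#include <stdio.h>

#define NICE_0_LOAD 1024UL

/* W(K,i) = S(K) * NRCPUS * T(K,i) / T(K) */
static unsigned long group_cpu_weight(unsigned long shares, unsigned long nrcpus,
                                      unsigned long load_on_cpu,
                                      unsigned long total_load)
{
        return shares * nrcpus * load_on_cpu / total_load;
}

int main(void)
{
        /* Group A: 10 tasks on cpu0; groups B and C: 5 tasks each on cpu1. */
        unsigned long w_a0 = group_cpu_weight(1024, 2, 10 * NICE_0_LOAD, 10 * NICE_0_LOAD);
        unsigned long w_b1 = group_cpu_weight(1024, 2, 5 * NICE_0_LOAD, 5 * NICE_0_LOAD);
        unsigned long w_c1 = group_cpu_weight(1024, 2, 5 * NICE_0_LOAD, 5 * NICE_0_LOAD);

        /* A cpu's weight is now the sum of the group weights on that cpu. */
        printf("W(A,0) = %lu, W(B,1) = %lu, W(C,1) = %lu\n", w_a0, w_b1, w_c1);
        printf("cpu0 weight = %lu, cpu1 weight = %lu\n", w_a0, w_b1 + w_c1);

        return 0;
}

Each group contributes 2048, so cpu0 now weighs 2048 against cpu1's 4096:
unlike the old scheme's 10240 vs 10240, the imbalance between the groups is
visible to the load balancer, which can spread tasks until the groups approach
the ~33% bandwidth each shown in the results above.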
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c  |  84
1 file changed, 53 insertions(+), 31 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 30ae9c2a2861..5c208e090ae4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -707,6 +707,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
         return se->parent;
 }
 
+#define GROUP_IMBALANCE_PCT     20
+
 #else   /* CONFIG_FAIR_GROUP_SCHED */
 
 #define for_each_sched_entity(se) \
@@ -967,25 +969,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
-{
-        struct sched_entity *curr;
-        struct task_struct *p;
-
-        if (!cfs_rq->nr_running)
-                return MAX_PRIO;
-
-        curr = cfs_rq->curr;
-        if (!curr)
-                curr = __pick_next_entity(cfs_rq);
-
-        p = task_of(curr);
-
-        return p->prio;
-}
-#endif
-
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -995,28 +978,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         struct cfs_rq *busy_cfs_rq;
         long rem_load_move = max_load_move;
         struct rq_iterator cfs_rq_iterator;
+        unsigned long load_moved;
 
         cfs_rq_iterator.start = load_balance_start_fair;
         cfs_rq_iterator.next = load_balance_next_fair;
 
         for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-                struct cfs_rq *this_cfs_rq;
-                long imbalance;
-                unsigned long maxload;
+                struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
+                unsigned long maxload, task_load, group_weight;
+                unsigned long thisload, per_task_load;
+                struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
 
-                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+                task_load = busy_cfs_rq->load.weight;
+                group_weight = se->load.weight;
 
-                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-                if (imbalance <= 0)
+                /*
+                 * 'group_weight' is contributed by tasks of total weight
+                 * 'task_load'. To move 'rem_load_move' worth of weight only,
+                 * we need to move a maximum task load of:
+                 *
+                 *      maxload = (remload / group_weight) * task_load;
+                 */
+                maxload = (rem_load_move * task_load) / group_weight;
+
+                if (!maxload || !task_load)
                         continue;
 
-                /* Don't pull more than imbalance/2 */
-                imbalance /= 2;
-                maxload = min(rem_load_move, imbalance);
+                per_task_load = task_load / busy_cfs_rq->nr_running;
+                /*
+                 * balance_tasks will try to forcibly move atleast one task if
+                 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
+                 * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
+                 */
+                if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
+                        continue;
 
-                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+                /* Disable priority-based load balance */
+                *this_best_prio = 0;
+                thisload = this_cfs_rq->load.weight;
 #else
 # define maxload rem_load_move
 #endif
@@ -1025,11 +1025,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  * load_balance_[start|next]_fair iterators
                  */
                 cfs_rq_iterator.arg = busy_cfs_rq;
-                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+                load_moved = balance_tasks(this_rq, this_cpu, busiest,
                                                maxload, sd, idle, all_pinned,
                                                this_best_prio,
                                                &cfs_rq_iterator);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+                /*
+                 * load_moved holds the task load that was moved. The
+                 * effective (group) weight moved would be:
+                 *      load_moved_eff = load_moved/task_load * group_weight;
+                 */
+                load_moved = (group_weight * load_moved) / task_load;
+
+                /* Adjust shares on both cpus to reflect load_moved */
+                group_weight -= load_moved;
+                set_se_shares(se, group_weight);
+
+                se = busy_cfs_rq->tg->se[this_cpu];
+                if (!thisload)
+                        group_weight = load_moved;
+                else
+                        group_weight = se->load.weight + load_moved;
+                set_se_shares(se, group_weight);
+#endif
+
+                rem_load_move -= load_moved;
+
                 if (rem_load_move <= 0)
                         break;
         }