path: root/kernel/sched.c
author    Venkatesh Pallipadi <venki@google.com>    2010-05-17 21:14:43 -0400
committer Ingo Molnar <mingo@elte.hu>               2010-06-09 04:34:51 -0400
commit    fdf3e95d3916f18bf8703fb065499fdbc4dfe34c (patch)
tree      b9bfc0f78135502adf7c83313948a705fb19384b /kernel/sched.c
parent    246d86b51845063e4b06b27579990492dc5fa317 (diff)
sched: Avoid side-effect of tickless idle on update_cpu_load
Tickless idle has a negative side effect on update_cpu_load(), which in turn can affect load balancing behavior.

update_cpu_load() is supposed to be called every tick, to keep track of the various load indices. With tickless idle, there are no scheduler ticks on the idle CPUs. Idle CPUs may still do load balancing (with the idle_load_balance CPU) using the stale cpu_load. It also causes problems when all CPUs go idle for a while and become active again: in that case the loads would not degrade as expected.

This is how the change in rq->nr_load_updates looks under different conditions:

<cpu_num> <nr_load_updates change>

All CPUs idle for 10 seconds (HZ=1000)
 0 1621
 10 496
 11 139
 12 875
 13 1672
 14 12
 15 21
 1 1472
 2 2426
 3 1161
 4 2108
 5 1525
 6 701
 7 249
 8 766
 9 1967

One CPU busy, rest idle for 10 seconds
 0 10003
 10 601
 11 95
 12 966
 13 1597
 14 114
 15 98
 1 3457
 2 93
 3 6679
 4 1425
 5 1479
 6 595
 7 193
 8 633
 9 1687

All CPUs busy for 10 seconds
 0 10026
 10 10026
 11 10026
 12 10026
 13 10025
 14 10025
 15 10025
 1 10026
 2 10026
 3 10026
 4 10026
 5 10026
 6 10026
 7 10026
 8 10026
 9 10026

That is, update_cpu_load() works properly only when all CPUs are busy. If all of them are idle, every CPU gets far fewer updates. And when a few CPUs are busy and the rest are idle, only the busy CPUs and the ilb CPU do proper updates; the remaining idle CPUs get far fewer updates.

The patch keeps track of when the last update was done and fixes up the load average based on the current time.

On one of my test systems, running SPECjbb with warehouses 1..numcpus, the patch improves throughput by ~1% (average of 6 runs). On another test system (with a different domain hierarchy) there is no noticeable change in performance.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <AANLkTilLtDWQsAUrIxJ6s04WTgmw9GuOODc5AOrYsaR5@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
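
To make the fix concrete, here is a minimal standalone sketch (plain C, with hypothetical names such as tick_update() and catch_up(); this is not the kernel code) of the per-tick cpu_load averaging and the naive way a CPU would catch up after missed idle ticks. The patch below implements the same catch-up with a table-driven decay_load_missed() instead of a per-tick loop.

#include <stdio.h>

/*
 * Minimal sketch with hypothetical names (tick_update, catch_up); this is
 * not the kernel code.  The per-tick update is
 *     load = ((2^idx - 1) * load + cur_load) / 2^idx
 * and a CPU that missed n-1 ticks while idle (cur_load == 0) first has to
 * decay its stale load by ((2^idx - 1) / 2^idx)^(n-1).
 */
static unsigned long tick_update(unsigned long load, unsigned long cur_load, int idx)
{
        unsigned long scale = 1UL << idx;

        return (load * (scale - 1) + cur_load) / scale;
}

static unsigned long catch_up(unsigned long load, unsigned long missed, int idx)
{
        /* naive O(missed) loop; the patch does this in O(log missed)
           with the table-driven decay_load_missed() */
        while (missed--)
                load = tick_update(load, 0, idx);
        return load;
}

int main(void)
{
        unsigned long load = 2048;

        /* idle for 9 ticks, busy again on the 10th: decay, then update */
        load = catch_up(load, 9, 2);
        load = tick_update(load, 1024, 2);
        printf("cpu_load[2] after catch-up: %lu\n", load);
        return 0;
}

The real kernel update also rounds the division up when the load is increasing and fast-tracks idx 0 to the instantaneous load; this sketch omits both for brevity.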
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  100
1 file changed, 95 insertions(+), 5 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index f37a9618fac3..a757f6b11cbd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -457,6 +457,7 @@ struct rq {
         unsigned long nr_running;
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+        unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
         u64 nohz_stamp;
         unsigned char in_nohz_recently;
@@ -1803,6 +1804,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
+static void update_cpu_load(struct rq *this_rq);
 
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
@@ -3050,23 +3052,102 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 
 /*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT           7
+static const unsigned char
+                degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+                degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+                                        {0, 0, 0, 0, 0, 0, 0, 0},
+                                        {64, 32, 8, 0, 0, 0, 0, 0},
+                                        {96, 72, 40, 12, 1, 0, 0},
+                                        {112, 98, 75, 43, 15, 1, 0},
+                                        {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+        int j = 0;
+
+        if (!missed_updates)
+                return load;
+
+        if (missed_updates >= degrade_zero_ticks[idx])
+                return 0;
+
+        if (idx == 1)
+                return load >> missed_updates;
+
+        while (missed_updates) {
+                if (missed_updates % 2)
+                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+                missed_updates >>= 1;
+                j++;
+        }
+        return load;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
  */
 static void update_cpu_load(struct rq *this_rq)
 {
         unsigned long this_load = this_rq->load.weight;
+        unsigned long curr_jiffies = jiffies;
+        unsigned long pending_updates;
         int i, scale;
 
         this_rq->nr_load_updates++;
 
+        /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+        if (curr_jiffies == this_rq->last_load_update_tick)
+                return;
+
+        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+        this_rq->last_load_update_tick = curr_jiffies;
+
         /* Update our load: */
-        for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                 unsigned long old_load, new_load;
 
                 /* scale is effectively 1 << i now, and >> i divides by scale */
 
                 old_load = this_rq->cpu_load[i];
+                old_load = decay_load_missed(old_load, pending_updates - 1, i);
                 new_load = this_load;
                 /*
                  * Round up the averaging division if load is increasing. This
@@ -3074,9 +3155,15 @@ static void update_cpu_load(struct rq *this_rq)
                  * example.
                  */
                 if (new_load > old_load)
-                        new_load += scale-1;
-                this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+                        new_load += scale - 1;
+
+                this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
         }
+}
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+        update_cpu_load(this_rq);
 
         calc_load_account_active(this_rq);
 }
@@ -3464,7 +3551,7 @@ void scheduler_tick(void)
 
         raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
-        update_cpu_load(rq);
+        update_cpu_load_active(rq);
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
 
@@ -7688,6 +7775,9 @@ void __init sched_init(void)
 
         for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                 rq->cpu_load[j] = 0;
+
+        rq->last_load_update_tick = jiffies;
+
 #ifdef CONFIG_SMP
         rq->sd = NULL;
         rq->rd = NULL;
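
As a sanity check of the degrade_factor table added above, the entries can be reproduced by a small standalone program, assuming each entry [idx][col] is the truncated value of ((2^idx - 1) / 2^idx)^(2^col) on the 128-point scale. This is my own sketch, not part of the patch.

/*
 * Sanity-check sketch (not part of the patch): recompute the 128-point
 * degradation table.  Entry [idx][col] is assumed to be the truncated
 * value of ((2^idx - 1) / 2^idx) ^ (2^col), scaled by 128.  Row 0 is
 * left out because the patch fast-tracks cpu_load[0] to the current load.
 *
 * Build with:  cc degrade_table.c -lm   (file name is arbitrary)
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
        for (int idx = 1; idx <= 4; idx++) {
                double f = (double)((1 << idx) - 1) / (1 << idx);

                printf("idx %d:", idx);
                for (int col = 0; col <= 7; col++) {
                        /* small epsilon so exact results (e.g. 64.0) are not
                           truncated down by floating-point error */
                        printf(" %3d", (int)(pow(f, 1 << col) * 128 + 1e-9));
                }
                printf("\n");
        }
        return 0;
}

For example, the idx 2 row comes out as 96 72 40 12 1 0 0 0, matching the kernel table, and the col 3 value 12 corresponds to the (3/4)^8 example given in the comment block of the patch.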