author		Thomas Gleixner <tglx@linutronix.de>	2009-04-11 04:43:41 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2009-05-15 09:32:45 -0400
commit		dce48a84adf1806676319f6f480e30a6daa012f9 (patch)
tree		79151f5d31d9c3dcdc723ab8877cb943b944890e /kernel
parent		2ff799d3cff1ecb274049378b28120ee5c1c5e5f (diff)
sched, timers: move calc_load() to scheduler
Dimitri Sivanich noticed that xtime_lock is held write locked across
calc_load(), which iterates over all online CPUs. That can cause long
latencies for xtime_lock readers on large SMP systems.

The load average calculation is a rough estimate anyway, so there is no
real need to protect the readers vs. the update. It's not a problem when
the avenrun array is updated while a reader copies the values.

Instead of iterating over all online CPUs, let the scheduler_tick code
update the number of active tasks shortly before the avenrun update
happens. The avenrun update itself is handled by the CPU which calls
do_timer().

[ Impact: reduce xtime_lock write locked section ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
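For readers unfamiliar with the load-average arithmetic that this patch relocates
into kernel/sched.c, below is a small standalone userspace sketch, not part of the
patch. Only the FSHIFT/FIXED_1/EXP_* constants and the calc_load() arithmetic
mirror the kernel; the main() harness and the sample task count are made up for
illustration.

/*
 * Illustrative sketch of the fixed-point load-average math used by
 * calc_global_load().  Build with: cc -std=c99 -o loadavg loadavg.c
 */
#include <stdio.h>

#define FSHIFT   11              /* bits of fixed-point precision */
#define FIXED_1  (1 << FSHIFT)   /* 1.0 in fixed-point */
#define EXP_1    1884            /* 1/exp(5sec/1min) in fixed-point */
#define EXP_5    2014            /* 1/exp(5sec/5min) */
#define EXP_15   2037            /* 1/exp(5sec/15min) */

static unsigned long avenrun[3];

/* Decaying average, same arithmetic as the new calc_load() helper */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	/* Pretend three tasks stay runnable (fixed-point count) */
	unsigned long active = 3 * FIXED_1;

	/* Simulate sixty ~5-second (LOAD_FREQ) update intervals */
	for (int i = 0; i < 60; i++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}

	/* Print integer.fraction, analogous to /proc/loadavg */
	printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
	       avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}

In the patch itself, each CPU folds the delta of its runnable plus uninterruptible
tasks into the global calc_load_tasks counter from its own tick, so only the CPU
running do_timer() ever touches avenrun[].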
Diffstat (limited to 'kernel')
 kernel/sched.c            | 84
 kernel/sched_idletask.c   |  3
 kernel/time/timekeeping.c |  2
 kernel/timer.c            | 54
 4 files changed, 79 insertions(+), 64 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 8908d190a348..f4eb88153bd1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-	unsigned long i, running = 0, uninterruptible = 0;
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
 
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
 
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
 
-	return running + uninterruptible;
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
@@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq)
 			new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
@@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -9059,6 +9118,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9166,6 +9227,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-
+	/* adjust the active tasks as we might go into a long sleep */
+	calc_load_account_active(rq);
 	return rq->idle;
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..52a8bf8931f3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..6a21d7af9620 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1123,47 +1123,6 @@ void update_process_times(int user_tick)
 }
 
 /*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-	return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_tasks();
-		do {
-			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
-}
-
-/*
  * This function runs timers and the timer-tq in bottom half context.
  */
 static void run_timer_softirq(struct softirq_action *h)
@@ -1187,16 +1146,6 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-	update_wall_time();
-	calc_load(ticks);
-}
-
-/*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
  * jiffies is defined in the linker script...
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_times(ticks);
+	update_wall_time();
+	calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM