author     Thomas Gleixner <tglx@linutronix.de>    2009-04-11 04:43:41 -0400
committer  Thomas Gleixner <tglx@linutronix.de>    2009-05-15 09:32:45 -0400
commit     dce48a84adf1806676319f6f480e30a6daa012f9
tree       79151f5d31d9c3dcdc723ab8877cb943b944890e
parent     2ff799d3cff1ecb274049378b28120ee5c1c5e5f
sched, timers: move calc_load() to scheduler
Dimitri Sivanich noticed that xtime_lock is held write locked across
calc_load() which iterates over all online CPUs. That can cause long
latencies for xtime_lock readers on large SMP systems.
The load average calculation is a rough estimate anyway, so there is
no real need to protect readers against the update. It is not a
problem if the avenrun array is updated while a reader copies the values.
Instead of iterating over all online CPUs, let the scheduler_tick code
update the number of active tasks shortly before the avenrun update
happens. The avenrun update itself is handled by the CPU which calls
do_timer().
[ Impact: reduce xtime_lock write locked section ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
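
[ Editor's note: the fixed-point averaging itself is unchanged by this
patch; only where it runs moves. Below is a minimal standalone sketch
of that arithmetic. The FSHIFT/FIXED_1/EXP_* constants are the ones
from <linux/sched.h> of this era; the main() harness and the choice of
"3 runnable tasks for 60 periods" are purely illustrative and not part
of the patch. ]

#include <stdio.h>

#define FSHIFT   11                /* bits of fixed-point precision */
#define FIXED_1  (1 << FSHIFT)     /* 1.0 in fixed-point */
#define EXP_1    1884              /* 1/exp(5sec/1min) in fixed-point */
#define EXP_5    2014              /* 1/exp(5sec/5min) */
#define EXP_15   2037              /* 1/exp(5sec/15min) */

/* same arithmetic as the kernel's CALC_LOAD macro / new calc_load() */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	/* pretend 3 tasks stay runnable for 60 LOAD_FREQ periods (~5 min) */
	unsigned long active = 3 * FIXED_1;
	int i, t;

	for (t = 0; t < 60; t++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}
	/* print like /proc/loadavg: integer part, then two decimals */
	for (i = 0; i < 3; i++)
		printf("%lu.%02lu ", avenrun[i] >> FSHIFT,
		       ((avenrun[i] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	printf("\n");
	return 0;
}

[ Run, this prints roughly "2.98 2.39 0.82": the 1-minute average has
nearly converged to 3.00 after five minutes of steady load, while the
15-minute average is still climbing, which is exactly the exponential
damping /proc/loadavg exhibits. ]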
-rw-r--r--  include/linux/sched.h      |  2
-rw-r--r--  kernel/sched.c             | 84
-rw-r--r--  kernel/sched_idletask.c    |  3
-rw-r--r--  kernel/time/timekeeping.c  |  2
-rw-r--r--  kernel/timer.c             | 54
5 files changed, 80 insertions(+), 65 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049c..6eb4892efe45 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -135,8 +135,8 @@ DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
-extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern void calc_global_load(void);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 8908d190a348..f4eb88153bd1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-	unsigned long i, running = 0, uninterruptible = 0;
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
 
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
 
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
 
-	return running + uninterruptible;
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
@@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq)
 		new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
@@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -9059,6 +9118,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9166,6 +9227,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-
+	/* adjust the active tasks as we might go into a long sleep */
+	calc_load_account_active(rq);
 	return rq->idle;
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..52a8bf8931f3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..6a21d7af9620 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1123,47 +1123,6 @@ void update_process_times(int user_tick)
 }
 
 /*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-	return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_tasks();
-		do {
-			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
-}
-
-/*
  * This function runs timers and the timer-tq in bottom half context.
  */
 static void run_timer_softirq(struct softirq_action *h)
@@ -1187,16 +1146,6 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-	update_wall_time();
-	calc_load(ticks);
-}
-
-/*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
  * jiffies is defined in the linker script...
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_times(ticks);
+	update_wall_time();
+	calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM