author     Thomas Gleixner <tglx@linutronix.de>   2009-04-11 04:43:41 -0400
committer  Thomas Gleixner <tglx@linutronix.de>   2009-05-15 09:32:45 -0400
commit     dce48a84adf1806676319f6f480e30a6daa012f9 (patch)
tree       79151f5d31d9c3dcdc723ab8877cb943b944890e /kernel/sched.c
parent     2ff799d3cff1ecb274049378b28120ee5c1c5e5f (diff)
sched, timers: move calc_load() to scheduler
Dimitri Sivanich noticed that xtime_lock is held write locked across
calc_load() which iterates over all online CPUs. That can cause long
latencies for xtime_lock readers on large SMP systems.
The load average calculation is a rough estimate anyway, so there is
no real need to protect the readers against the update. It is not a
problem if the avenrun array is updated while a reader copies the values.
Instead of iterating over all online CPUs, let the scheduler_tick code
update the number of active tasks shortly before the avenrun update
happens. The avenrun update itself is handled by the CPU which calls
do_timer().
[ Impact: reduce xtime_lock write locked section ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
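For readers who want the arithmetic in isolation: the calc_load() helper added by this patch is a fixed-point exponentially weighted moving average. The following is a minimal userspace sketch of the same recurrence, assuming the classic constants from include/linux/sched.h (FSHIFT, FIXED_1, EXP_1/EXP_5/EXP_15); the sample active-task counts are invented for the demonstration and are not taken from the patch.

/*
 * Userspace sketch of the fixed-point load-average recurrence.
 * Constants mirror include/linux/sched.h; the sample counts are fake.
 */
#include <stdio.h>

#define FSHIFT   11                 /* bits of fixed-point precision */
#define FIXED_1  (1 << FSHIFT)      /* 1.0 in fixed point (2048) */
#define EXP_1    1884               /* 1/exp(5s/1min) in fixed point */
#define EXP_5    2014               /* 1/exp(5s/5min) */
#define EXP_15   2037               /* 1/exp(5s/15min) */

static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	long samples[] = { 4, 4, 2, 8, 3 };   /* fake nr_running + nr_uninterruptible */
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned long active = samples[i] > 0 ? samples[i] * FIXED_1 : 0;

		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);

		/* Scale back to the familiar two-decimal loadavg numbers. */
		printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
		       avenrun[0] >> FSHIFT, (avenrun[0] & (FIXED_1 - 1)) * 100 / FIXED_1,
		       avenrun[1] >> FSHIFT, (avenrun[1] & (FIXED_1 - 1)) * 100 / FIXED_1,
		       avenrun[2] >> FSHIFT, (avenrun[2] & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}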
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c   84
1 file changed, 74 insertions, 10 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 8908d190a348..f4eb88153bd1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
-{
-	unsigned long i, running = 0, uninterruptible = 0;
-
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
-
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
-
-	return running + uninterruptible;
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
+
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
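The hunk above is the heart of the change: rather than one CPU walking every runqueue under xtime_lock, each CPU folds only the delta of its own active-task count into the global calc_load_tasks counter, and calc_global_load() simply reads that sum. A hypothetical userspace model of the delta-folding scheme might look like the sketch below; the struct and variable names are invented for illustration and the "CPUs" run sequentially rather than concurrently.

/*
 * Toy model of delta folding: each fake CPU publishes only the change in
 * its own active count, so the global counter stays equal to the total
 * without anyone iterating over all CPUs.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long calc_load_tasks;	/* plays the role of the kernel's atomic_long_t */

struct fake_rq {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;		/* last value this CPU folded in */
};

static void fold_active_delta(struct fake_rq *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;

	if (nr_active != rq->calc_load_active) {
		long delta = nr_active - rq->calc_load_active;

		rq->calc_load_active = nr_active;
		atomic_fetch_add(&calc_load_tasks, delta);
	}
}

int main(void)
{
	struct fake_rq cpus[4] = {
		{ .nr_running = 2 }, { .nr_running = 1, .nr_uninterruptible = 1 },
		{ .nr_running = 0 }, { .nr_running = 3 },
	};

	/* Each CPU folds its own delta from its own tick. */
	for (int i = 0; i < 4; i++)
		fold_active_delta(&cpus[i]);
	printf("global active: %ld\n", atomic_load(&calc_load_tasks));	/* 7 */

	/* Later, CPU 1 goes idle: only its delta (-2) is published. */
	cpus[1].nr_running = 0;
	cpus[1].nr_uninterruptible = 0;
	fold_active_delta(&cpus[1]);
	printf("global active: %ld\n", atomic_load(&calc_load_tasks));	/* 5 */

	return 0;
}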
@@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq)
 		new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
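This hook gives every runqueue its own LOAD_FREQ-spaced deadline, while calc_global_load() on the do_timer() CPU samples the global sum only 10 ticks after that deadline, leaving the other CPUs a window to fold their deltas first. A rough sketch of that timing contract follows; the HZ value and tick loop are invented for the example, not taken from the patch.

/*
 * Timing sketch: per-CPU folds happen at each LOAD_FREQ boundary,
 * the avenrun recomputation trails them by 10 ticks.
 */
#include <stdio.h>

#define HZ		100
#define LOAD_FREQ	(5 * HZ)

int main(void)
{
	unsigned long rq_deadline = LOAD_FREQ;      /* per-CPU: rq->calc_load_update */
	unsigned long global_deadline = LOAD_FREQ;  /* global: calc_load_update */

	for (unsigned long jiffies = 0; jiffies <= 2 * LOAD_FREQ + 10; jiffies++) {
		if (jiffies >= rq_deadline) {            /* update_cpu_load() path */
			printf("tick %5lu: CPU folds delta into calc_load_tasks\n", jiffies);
			rq_deadline += LOAD_FREQ;
		}
		if (jiffies >= global_deadline + 10) {   /* calc_global_load() path */
			printf("tick %5lu: do_timer() CPU updates avenrun\n", jiffies);
			global_deadline += LOAD_FREQ;
		}
	}
	return 0;
}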
@@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -9059,6 +9118,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9166,6 +9227,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */