diff options
author | Ingo Molnar <mingo@elte.hu> | 2011-01-05 08:14:42 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-01-05 08:14:46 -0500 |
commit | 27066fd484a32c80630136aa2b91c980f3198f9d (patch) | |
tree | 78ddabdedbfd7525d13ecd62a745525843f1d0e8 /kernel/sched.c | |
parent | 101e5f77bf35679809586e250b6c62193d2ed179 (diff) | |
parent | 3c0eee3fe6a3a1c745379547c7e7c904aa64f6d5 (diff) |
Merge commit 'v2.6.37' into sched/core
Merge reason: Merge the final .37 tree.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 287 |
1 files changed, 236 insertions, 51 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 9f9dd8dda53..f2f914e0c47 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -642,22 +642,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
642 | 642 | ||
643 | #endif /* CONFIG_CGROUP_SCHED */ | 643 | #endif /* CONFIG_CGROUP_SCHED */ |
644 | 644 | ||
645 | static u64 irq_time_cpu(int cpu); | 645 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
646 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
647 | 646 | ||
648 | inline void update_rq_clock(struct rq *rq) | 647 | static void update_rq_clock(struct rq *rq) |
649 | { | 648 | { |
650 | if (!rq->skip_clock_update) { | 649 | s64 delta; |
651 | int cpu = cpu_of(rq); | ||
652 | u64 irq_time; | ||
653 | 650 | ||
654 | rq->clock = sched_clock_cpu(cpu); | 651 | if (rq->skip_clock_update) |
655 | irq_time = irq_time_cpu(cpu); | 652 | return; |
656 | if (rq->clock - irq_time > rq->clock_task) | ||
657 | rq->clock_task = rq->clock - irq_time; | ||
658 | 653 | ||
659 | sched_irq_time_avg_update(rq, irq_time); | 654 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
660 | } | 655 | rq->clock += delta; |
656 | update_rq_clock_task(rq, delta); | ||
661 | } | 657 | } |
662 | 658 | ||
663 | /* | 659 | /* |
@@ -1795,10 +1791,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1795 | * They are read and saved off onto struct rq in update_rq_clock(). | 1791 | * They are read and saved off onto struct rq in update_rq_clock(). |
1796 | * This may result in other CPU reading this CPU's irq time and can | 1792 | * This may result in other CPU reading this CPU's irq time and can |
1797 | * race with irq/account_system_vtime on this CPU. We would either get old | 1793 | * race with irq/account_system_vtime on this CPU. We would either get old |
1798 | * or new value (or semi updated value on 32 bit) with a side effect of | 1794 | * or new value with a side effect of accounting a slice of irq time to wrong |
1799 | * accounting a slice of irq time to wrong task when irq is in progress | 1795 | * task when irq is in progress while we read rq->clock. That is a worthy |
1800 | * while we read rq->clock. That is a worthy compromise in place of having | 1796 | * compromise in place of having locks on each irq in account_system_time. |
1801 | * locks on each irq in account_system_time. | ||
1802 | */ | 1797 | */ |
1803 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | 1798 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); |
1804 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | 1799 | static DEFINE_PER_CPU(u64, cpu_softirq_time); |
@@ -1816,19 +1811,58 @@ void disable_sched_clock_irqtime(void) | |||
1816 | sched_clock_irqtime = 0; | 1811 | sched_clock_irqtime = 0; |
1817 | } | 1812 | } |
1818 | 1813 | ||
1819 | static u64 irq_time_cpu(int cpu) | 1814 | #ifndef CONFIG_64BIT |
1815 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
1816 | |||
1817 | static inline void irq_time_write_begin(void) | ||
1820 | { | 1818 | { |
1821 | if (!sched_clock_irqtime) | 1819 | __this_cpu_inc(irq_time_seq.sequence); |
1822 | return 0; | 1820 | smp_wmb(); |
1821 | } | ||
1822 | |||
1823 | static inline void irq_time_write_end(void) | ||
1824 | { | ||
1825 | smp_wmb(); | ||
1826 | __this_cpu_inc(irq_time_seq.sequence); | ||
1827 | } | ||
1828 | |||
1829 | static inline u64 irq_time_read(int cpu) | ||
1830 | { | ||
1831 | u64 irq_time; | ||
1832 | unsigned seq; | ||
1823 | 1833 | ||
1834 | do { | ||
1835 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1836 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1837 | per_cpu(cpu_hardirq_time, cpu); | ||
1838 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1839 | |||
1840 | return irq_time; | ||
1841 | } | ||
1842 | #else /* CONFIG_64BIT */ | ||
1843 | static inline void irq_time_write_begin(void) | ||
1844 | { | ||
1845 | } | ||
1846 | |||
1847 | static inline void irq_time_write_end(void) | ||
1848 | { | ||
1849 | } | ||
1850 | |||
1851 | static inline u64 irq_time_read(int cpu) | ||
1852 | { | ||
1824 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | 1853 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); |
1825 | } | 1854 | } |
1855 | #endif /* CONFIG_64BIT */ | ||
1826 | 1856 | ||
1857 | /* | ||
1858 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
1859 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
1860 | */ | ||
1827 | void account_system_vtime(struct task_struct *curr) | 1861 | void account_system_vtime(struct task_struct *curr) |
1828 | { | 1862 | { |
1829 | unsigned long flags; | 1863 | unsigned long flags; |
1864 | s64 delta; | ||
1830 | int cpu; | 1865 | int cpu; |
1831 | u64 now, delta; | ||
1832 | 1866 | ||
1833 | if (!sched_clock_irqtime) | 1867 | if (!sched_clock_irqtime) |
1834 | return; | 1868 | return; |
@@ -1836,9 +1870,10 @@ void account_system_vtime(struct task_struct *curr) | |||
1836 | local_irq_save(flags); | 1870 | local_irq_save(flags); |
1837 | 1871 | ||
1838 | cpu = smp_processor_id(); | 1872 | cpu = smp_processor_id(); |
1839 | now = sched_clock_cpu(cpu); | 1873 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); |
1840 | delta = now - per_cpu(irq_start_time, cpu); | 1874 | __this_cpu_add(irq_start_time, delta); |
1841 | per_cpu(irq_start_time, cpu) = now; | 1875 | |
1876 | irq_time_write_begin(); | ||
1842 | /* | 1877 | /* |
1843 | * We do not account for softirq time from ksoftirqd here. | 1878 | * We do not account for softirq time from ksoftirqd here. |
1844 | * We want to continue accounting softirq time to ksoftirqd thread | 1879 | * We want to continue accounting softirq time to ksoftirqd thread |
@@ -1846,33 +1881,55 @@ void account_system_vtime(struct task_struct *curr) | |||
1846 | * that do not consume any time, but still wants to run. | 1881 | * that do not consume any time, but still wants to run. |
1847 | */ | 1882 | */ |
1848 | if (hardirq_count()) | 1883 | if (hardirq_count()) |
1849 | per_cpu(cpu_hardirq_time, cpu) += delta; | 1884 | __this_cpu_add(cpu_hardirq_time, delta); |
1850 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 1885 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) |
1851 | per_cpu(cpu_softirq_time, cpu) += delta; | 1886 | __this_cpu_add(cpu_softirq_time, delta); |
1852 | 1887 | ||
1888 | irq_time_write_end(); | ||
1853 | local_irq_restore(flags); | 1889 | local_irq_restore(flags); |
1854 | } | 1890 | } |
1855 | EXPORT_SYMBOL_GPL(account_system_vtime); | 1891 | EXPORT_SYMBOL_GPL(account_system_vtime); |
1856 | 1892 | ||
1857 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | 1893 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1858 | { | 1894 | { |
1859 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | 1895 | s64 irq_delta; |
1860 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | 1896 | |
1861 | rq->prev_irq_time = curr_irq_time; | 1897 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
1862 | sched_rt_avg_update(rq, delta_irq); | 1898 | |
1863 | } | 1899 | /* |
1900 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
1901 | * this case when a previous update_rq_clock() happened inside a | ||
1902 | * {soft,}irq region. | ||
1903 | * | ||
1904 | * When this happens, we stop ->clock_task and only update the | ||
1905 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
1906 | * update will consume the rest. This ensures ->clock_task is | ||
1907 | * monotonic. | ||
1908 | * | ||
1909 | * It does however cause some slight miss-attribution of {soft,}irq | ||
1910 | * time, a more accurate solution would be to update the irq_time using | ||
1911 | * the current rq->clock timestamp, except that would require using | ||
1912 | * atomic ops. | ||
1913 | */ | ||
1914 | if (irq_delta > delta) | ||
1915 | irq_delta = delta; | ||
1916 | |||
1917 | rq->prev_irq_time += irq_delta; | ||
1918 | delta -= irq_delta; | ||
1919 | rq->clock_task += delta; | ||
1920 | |||
1921 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
1922 | sched_rt_avg_update(rq, irq_delta); | ||
1864 | } | 1923 | } |
1865 | 1924 | ||
1866 | #else | 1925 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1867 | 1926 | ||
1868 | static u64 irq_time_cpu(int cpu) | 1927 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1869 | { | 1928 | { |
1870 | return 0; | 1929 | rq->clock_task += delta; |
1871 | } | 1930 | } |
1872 | 1931 | ||
1873 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | 1932 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1874 | |||
1875 | #endif | ||
1876 | 1933 | ||
1877 | #include "sched_idletask.c" | 1934 | #include "sched_idletask.c" |
1878 | #include "sched_fair.c" | 1935 | #include "sched_fair.c" |
@@ -2001,7 +2058,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2001 | * A queue event has occurred, and we're going to schedule. In | 2058 | * A queue event has occurred, and we're going to schedule. In |
2002 | * this case, we can save a useless back to back clock update. | 2059 | * this case, we can save a useless back to back clock update. |
2003 | */ | 2060 | */ |
2004 | if (test_tsk_need_resched(rq->curr)) | 2061 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) |
2005 | rq->skip_clock_update = 1; | 2062 | rq->skip_clock_update = 1; |
2006 | } | 2063 | } |
2007 | 2064 | ||
@@ -2988,6 +3045,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
2988 | return delta; | 3045 | return delta; |
2989 | } | 3046 | } |
2990 | 3047 | ||
3048 | static unsigned long | ||
3049 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3050 | { | ||
3051 | load *= exp; | ||
3052 | load += active * (FIXED_1 - exp); | ||
3053 | load += 1UL << (FSHIFT - 1); | ||
3054 | return load >> FSHIFT; | ||
3055 | } | ||
3056 | |||
2991 | #ifdef CONFIG_NO_HZ | 3057 | #ifdef CONFIG_NO_HZ |
2992 | /* | 3058 | /* |
2993 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3059 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
@@ -3017,6 +3083,128 @@ static long calc_load_fold_idle(void) | |||
3017 | 3083 | ||
3018 | return delta; | 3084 | return delta; |
3019 | } | 3085 | } |
3086 | |||
3087 | /** | ||
3088 | * fixed_power_int - compute: x^n, in O(log n) time | ||
3089 | * | ||
3090 | * @x: base of the power | ||
3091 | * @frac_bits: fractional bits of @x | ||
3092 | * @n: power to raise @x to. | ||
3093 | * | ||
3094 | * By exploiting the relation between the definition of the natural power | ||
3095 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
3096 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
3097 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
3098 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
3099 | * of course trivially computable in O(log_2 n), the length of our binary | ||
3100 | * vector. | ||
3101 | */ | ||
3102 | static unsigned long | ||
3103 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
3104 | { | ||
3105 | unsigned long result = 1UL << frac_bits; | ||
3106 | |||
3107 | if (n) for (;;) { | ||
3108 | if (n & 1) { | ||
3109 | result *= x; | ||
3110 | result += 1UL << (frac_bits - 1); | ||
3111 | result >>= frac_bits; | ||
3112 | } | ||
3113 | n >>= 1; | ||
3114 | if (!n) | ||
3115 | break; | ||
3116 | x *= x; | ||
3117 | x += 1UL << (frac_bits - 1); | ||
3118 | x >>= frac_bits; | ||
3119 | } | ||
3120 | |||
3121 | return result; | ||
3122 | } | ||
3123 | |||
3124 | /* | ||
3125 | * a1 = a0 * e + a * (1 - e) | ||
3126 | * | ||
3127 | * a2 = a1 * e + a * (1 - e) | ||
3128 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
3129 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
3130 | * | ||
3131 | * a3 = a2 * e + a * (1 - e) | ||
3132 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
3133 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
3134 | * | ||
3135 | * ... | ||
3136 | * | ||
3137 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
3138 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
3139 | * = a0 * e^n + a * (1 - e^n) | ||
3140 | * | ||
3141 | * [1] application of the geometric series: | ||
3142 | * | ||
3143 | * n 1 - x^(n+1) | ||
3144 | * S_n := \Sum x^i = ------------- | ||
3145 | * i=0 1 - x | ||
3146 | */ | ||
3147 | static unsigned long | ||
3148 | calc_load_n(unsigned long load, unsigned long exp, | ||
3149 | unsigned long active, unsigned int n) | ||
3150 | { | ||
3151 | |||
3152 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
3153 | } | ||
3154 | |||
3155 | /* | ||
3156 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
3157 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
3158 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
3159 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
3160 | * | ||
3161 | * Once we've updated the global active value, we need to apply the exponential | ||
3162 | * weights adjusted to the number of cycles missed. | ||
3163 | */ | ||
3164 | static void calc_global_nohz(unsigned long ticks) | ||
3165 | { | ||
3166 | long delta, active, n; | ||
3167 | |||
3168 | if (time_before(jiffies, calc_load_update)) | ||
3169 | return; | ||
3170 | |||
3171 | /* | ||
3172 | * If we crossed a calc_load_update boundary, make sure to fold | ||
3173 | * any pending idle changes, the respective CPUs might have | ||
3174 | * missed the tick driven calc_load_account_active() update | ||
3175 | * due to NO_HZ. | ||
3176 | */ | ||
3177 | delta = calc_load_fold_idle(); | ||
3178 | if (delta) | ||
3179 | atomic_long_add(delta, &calc_load_tasks); | ||
3180 | |||
3181 | /* | ||
3182 | * If we were idle for multiple load cycles, apply them. | ||
3183 | */ | ||
3184 | if (ticks >= LOAD_FREQ) { | ||
3185 | n = ticks / LOAD_FREQ; | ||
3186 | |||
3187 | active = atomic_long_read(&calc_load_tasks); | ||
3188 | active = active > 0 ? active * FIXED_1 : 0; | ||
3189 | |||
3190 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
3191 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
3192 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
3193 | |||
3194 | calc_load_update += n * LOAD_FREQ; | ||
3195 | } | ||
3196 | |||
3197 | /* | ||
3198 | * Its possible the remainder of the above division also crosses | ||
3199 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
3200 | * which comes after this will take care of that. | ||
3201 | * | ||
3202 | * Consider us being 11 ticks before a cycle completion, and us | ||
3203 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
3204 | * age us 4 cycles, and the test in calc_global_load() will | ||
3205 | * pick up the final one. | ||
3206 | */ | ||
3207 | } | ||
3020 | #else | 3208 | #else |
3021 | static void calc_load_account_idle(struct rq *this_rq) | 3209 | static void calc_load_account_idle(struct rq *this_rq) |
3022 | { | 3210 | { |
@@ -3026,6 +3214,10 @@ static inline long calc_load_fold_idle(void) | |||
3026 | { | 3214 | { |
3027 | return 0; | 3215 | return 0; |
3028 | } | 3216 | } |
3217 | |||
3218 | static void calc_global_nohz(unsigned long ticks) | ||
3219 | { | ||
3220 | } | ||
3029 | #endif | 3221 | #endif |
3030 | 3222 | ||
3031 | /** | 3223 | /** |
@@ -3043,24 +3235,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
3043 | loads[2] = (avenrun[2] + offset) << shift; | 3235 | loads[2] = (avenrun[2] + offset) << shift; |
3044 | } | 3236 | } |
3045 | 3237 | ||
3046 | static unsigned long | ||
3047 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3048 | { | ||
3049 | load *= exp; | ||
3050 | load += active * (FIXED_1 - exp); | ||
3051 | return load >> FSHIFT; | ||
3052 | } | ||
3053 | |||
3054 | /* | 3238 | /* |
3055 | * calc_load - update the avenrun load estimates 10 ticks after the | 3239 | * calc_load - update the avenrun load estimates 10 ticks after the |
3056 | * CPUs have updated calc_load_tasks. | 3240 | * CPUs have updated calc_load_tasks. |
3057 | */ | 3241 | */ |
3058 | void calc_global_load(void) | 3242 | void calc_global_load(unsigned long ticks) |
3059 | { | 3243 | { |
3060 | unsigned long upd = calc_load_update + 10; | ||
3061 | long active; | 3244 | long active; |
3062 | 3245 | ||
3063 | if (time_before(jiffies, upd)) | 3246 | calc_global_nohz(ticks); |
3247 | |||
3248 | if (time_before(jiffies, calc_load_update + 10)) | ||
3064 | return; | 3249 | return; |
3065 | 3250 | ||
3066 | active = atomic_long_read(&calc_load_tasks); | 3251 | active = atomic_long_read(&calc_load_tasks); |
@@ -3714,7 +3899,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
3714 | { | 3899 | { |
3715 | if (prev->se.on_rq) | 3900 | if (prev->se.on_rq) |
3716 | update_rq_clock(rq); | 3901 | update_rq_clock(rq); |
3717 | rq->skip_clock_update = 0; | ||
3718 | prev->sched_class->put_prev_task(rq, prev); | 3902 | prev->sched_class->put_prev_task(rq, prev); |
3719 | } | 3903 | } |
3720 | 3904 | ||
@@ -3772,7 +3956,6 @@ need_resched_nonpreemptible: | |||
3772 | hrtick_clear(rq); | 3956 | hrtick_clear(rq); |
3773 | 3957 | ||
3774 | raw_spin_lock_irq(&rq->lock); | 3958 | raw_spin_lock_irq(&rq->lock); |
3775 | clear_tsk_need_resched(prev); | ||
3776 | 3959 | ||
3777 | switch_count = &prev->nivcsw; | 3960 | switch_count = &prev->nivcsw; |
3778 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3961 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
@@ -3804,6 +3987,8 @@ need_resched_nonpreemptible: | |||
3804 | 3987 | ||
3805 | put_prev_task(rq, prev); | 3988 | put_prev_task(rq, prev); |
3806 | next = pick_next_task(rq); | 3989 | next = pick_next_task(rq); |
3990 | clear_tsk_need_resched(prev); | ||
3991 | rq->skip_clock_update = 0; | ||
3807 | 3992 | ||
3808 | if (likely(prev != next)) { | 3993 | if (likely(prev != next)) { |
3809 | sched_info_switch(prev, next); | 3994 | sched_info_switch(prev, next); |