path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  557
1 file changed, 480 insertions(+), 77 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index d1e8889872a1..e6f8f1254319 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
426 */ 426 */
427 cpumask_var_t rto_mask; 427 cpumask_var_t rto_mask;
428 atomic_t rto_count; 428 atomic_t rto_count;
429#ifdef CONFIG_SMP
430 struct cpupri cpupri; 429 struct cpupri cpupri;
431#endif
432}; 430};
433 431
434/* 432/*
@@ -437,7 +435,7 @@ struct root_domain {
437 */ 435 */
438static struct root_domain def_root_domain; 436static struct root_domain def_root_domain;
439 437
440#endif 438#endif /* CONFIG_SMP */
441 439
442/* 440/*
443 * This is the main, per-CPU runqueue data structure. 441 * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
488 */ 486 */
489 unsigned long nr_uninterruptible; 487 unsigned long nr_uninterruptible;
490 488
491 struct task_struct *curr, *idle; 489 struct task_struct *curr, *idle, *stop;
492 unsigned long next_balance; 490 unsigned long next_balance;
493 struct mm_struct *prev_mm; 491 struct mm_struct *prev_mm;
494 492
495 u64 clock; 493 u64 clock;
494 u64 clock_task;
496 495
497 atomic_t nr_iowait; 496 atomic_t nr_iowait;
498 497
@@ -520,6 +519,10 @@ struct rq {
520 u64 avg_idle; 519 u64 avg_idle;
521#endif 520#endif
522 521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
523 /* calc_load related fields */ 526 /* calc_load related fields */
524 unsigned long calc_load_update; 527 unsigned long calc_load_update;
525 long calc_load_active; 528 long calc_load_active;
@@ -557,18 +560,8 @@ struct rq {
557 560
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
559 562
560static inline
561void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
562{
563 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
564 563
565 /* 564static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
566 * A queue event has occurred, and we're going to schedule. In
567 * this case, we can save a useless back to back clock update.
568 */
569 if (test_tsk_need_resched(p))
570 rq->skip_clock_update = 1;
571}
572 565
573static inline int cpu_of(struct rq *rq) 566static inline int cpu_of(struct rq *rq)
574{ 567{
@@ -643,10 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)
643 636
644#endif /* CONFIG_CGROUP_SCHED */ 637#endif /* CONFIG_CGROUP_SCHED */
645 638
646inline void update_rq_clock(struct rq *rq) 639static void update_rq_clock_task(struct rq *rq, s64 delta);
640
641static void update_rq_clock(struct rq *rq)
647{ 642{
648 if (!rq->skip_clock_update) 643 s64 delta;
649 rq->clock = sched_clock_cpu(cpu_of(rq)); 644
645 if (rq->skip_clock_update)
646 return;
647
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
649 rq->clock += delta;
650 update_rq_clock_task(rq, delta);
650} 651}
651 652
652/* 653/*
@@ -723,7 +724,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
723 size_t cnt, loff_t *ppos) 724 size_t cnt, loff_t *ppos)
724{ 725{
725 char buf[64]; 726 char buf[64];
726 char *cmp = buf; 727 char *cmp;
727 int neg = 0; 728 int neg = 0;
728 int i; 729 int i;
729 730
@@ -734,6 +735,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
734 return -EFAULT; 735 return -EFAULT;
735 736
736 buf[cnt] = 0; 737 buf[cnt] = 0;
738 cmp = strstrip(buf);
737 739
738 if (strncmp(buf, "NO_", 3) == 0) { 740 if (strncmp(buf, "NO_", 3) == 0) {
739 neg = 1; 741 neg = 1;
@@ -741,9 +743,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 } 743 }
742 744
743 for (i = 0; sched_feat_names[i]; i++) { 745 for (i = 0; sched_feat_names[i]; i++) {
744 int len = strlen(sched_feat_names[i]); 746 if (strcmp(cmp, sched_feat_names[i]) == 0) {
745
746 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
747 if (neg) 747 if (neg)
748 sysctl_sched_features &= ~(1UL << i); 748 sysctl_sched_features &= ~(1UL << i);
749 else 749 else
@@ -1840,7 +1840,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1840 1840
1841static const struct sched_class rt_sched_class; 1841static const struct sched_class rt_sched_class;
1842 1842
1843#define sched_class_highest (&rt_sched_class) 1843#define sched_class_highest (&stop_sched_class)
1844#define for_each_class(class) \ 1844#define for_each_class(class) \
1845 for (class = sched_class_highest; class; class = class->next) 1845 for (class = sched_class_highest; class; class = class->next)
1846 1846
@@ -1858,12 +1858,6 @@ static void dec_nr_running(struct rq *rq)
1858 1858
1859static void set_load_weight(struct task_struct *p) 1859static void set_load_weight(struct task_struct *p)
1860{ 1860{
1861 if (task_has_rt_policy(p)) {
1862 p->se.load.weight = 0;
1863 p->se.load.inv_weight = WMULT_CONST;
1864 return;
1865 }
1866
1867 /* 1861 /*
1868 * SCHED_IDLE tasks get minimal weight: 1862 * SCHED_IDLE tasks get minimal weight:
1869 */ 1863 */
@@ -1917,13 +1911,193 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1917 dec_nr_running(rq); 1911 dec_nr_running(rq);
1918} 1912}
1919 1913
1914#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1915
1916/*
1917 * There are no locks covering percpu hardirq/softirq time.
1918 * They are only modified in account_system_vtime, on corresponding CPU
1919 * with interrupts disabled. So, writes are safe.
1920 * They are read and saved off onto struct rq in update_rq_clock().
1921 * This may result in other CPU reading this CPU's irq time and can
1922 * race with irq/account_system_vtime on this CPU. We would either get old
1923 * or new value with a side effect of accounting a slice of irq time to wrong
1924 * task when irq is in progress while we read rq->clock. That is a worthy
1925 * compromise in place of having locks on each irq in account_system_time.
1926 */
1927static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1928static DEFINE_PER_CPU(u64, cpu_softirq_time);
1929
1930static DEFINE_PER_CPU(u64, irq_start_time);
1931static int sched_clock_irqtime;
1932
1933void enable_sched_clock_irqtime(void)
1934{
1935 sched_clock_irqtime = 1;
1936}
1937
1938void disable_sched_clock_irqtime(void)
1939{
1940 sched_clock_irqtime = 0;
1941}
1942
1943#ifndef CONFIG_64BIT
1944static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1945
1946static inline void irq_time_write_begin(void)
1947{
1948 __this_cpu_inc(irq_time_seq.sequence);
1949 smp_wmb();
1950}
1951
1952static inline void irq_time_write_end(void)
1953{
1954 smp_wmb();
1955 __this_cpu_inc(irq_time_seq.sequence);
1956}
1957
1958static inline u64 irq_time_read(int cpu)
1959{
1960 u64 irq_time;
1961 unsigned seq;
1962
1963 do {
1964 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1965 irq_time = per_cpu(cpu_softirq_time, cpu) +
1966 per_cpu(cpu_hardirq_time, cpu);
1967 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1968
1969 return irq_time;
1970}
1971#else /* CONFIG_64BIT */
1972static inline void irq_time_write_begin(void)
1973{
1974}
1975
1976static inline void irq_time_write_end(void)
1977{
1978}
1979
1980static inline u64 irq_time_read(int cpu)
1981{
1982 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1983}
1984#endif /* CONFIG_64BIT */
1985
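
The seqcount above exists because a 64-bit total cannot be loaded atomically on 32-bit: a reader could pair the low half of a new value with the high half of an old one. A minimal userspace analogue of the retry pattern, a sketch only (illustrative names, with the kernel's barrier and READ_ONCE details collapsed into C11 atomics):

#include <stdatomic.h>
#include <stdint.h>

static atomic_uint seq;                         /* even: stable, odd: write in progress */
static uint64_t hardirq_time, softirq_time;     /* 64-bit, not atomic on 32-bit CPUs */

static void irq_time_add(uint64_t delta, int hardirq)
{
        atomic_fetch_add(&seq, 1);              /* ~ irq_time_write_begin() */
        if (hardirq)
                hardirq_time += delta;
        else
                softirq_time += delta;
        atomic_fetch_add(&seq, 1);              /* ~ irq_time_write_end() */
}

static uint64_t irq_time_sum(void)
{
        unsigned int s;
        uint64_t sum;

        do {                                    /* retry until an even, unchanged sequence */
                s = atomic_load(&seq);
                sum = hardirq_time + softirq_time;
        } while ((s & 1) || s != atomic_load(&seq));

        return sum;
}

On 64-bit a single load of each counter is already atomic, which is why the #else branch above compiles the begin/end helpers away.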
1986/*
1987 * Called before incrementing preempt_count on {soft,}irq_enter
1988 * and before decrementing preempt_count on {soft,}irq_exit.
1989 */
1990void account_system_vtime(struct task_struct *curr)
1991{
1992 unsigned long flags;
1993 s64 delta;
1994 int cpu;
1995
1996 if (!sched_clock_irqtime)
1997 return;
1998
1999 local_irq_save(flags);
2000
2001 cpu = smp_processor_id();
2002 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
2003 __this_cpu_add(irq_start_time, delta);
2004
2005 irq_time_write_begin();
2006 /*
2007 * We do not account for softirq time from ksoftirqd here.
2008 * We want to continue accounting softirq time to ksoftirqd thread
2009 * in that case, so as not to confuse the scheduler with a special task
2010 * that does not consume any time, but still wants to run.
2011 */
2012 if (hardirq_count())
2013 __this_cpu_add(cpu_hardirq_time, delta);
2014 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
2015 __this_cpu_add(cpu_softirq_time, delta);
2016
2017 irq_time_write_end();
2018 local_irq_restore(flags);
2019}
2020EXPORT_SYMBOL_GPL(account_system_vtime);
2021
2022static void update_rq_clock_task(struct rq *rq, s64 delta)
2023{
2024 s64 irq_delta;
2025
2026 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2027
2028 /*
2029 * Since irq_time is only updated on {soft,}irq_exit, we might run into
2030 * this case when a previous update_rq_clock() happened inside a
2031 * {soft,}irq region.
2032 *
2033 * When this happens, we stop ->clock_task and only update the
2034 * prev_irq_time stamp to account for the part that fit, so that a next
2035 * update will consume the rest. This ensures ->clock_task is
2036 * monotonic.
2037 *
2038 * It does however cause some slight miss-attribution of {soft,}irq
2039 * time, a more accurate solution would be to update the irq_time using
2040 * the current rq->clock timestamp, except that would require using
2041 * atomic ops.
2042 */
2043 if (irq_delta > delta)
2044 irq_delta = delta;
2045
2046 rq->prev_irq_time += irq_delta;
2047 delta -= irq_delta;
2048 rq->clock_task += delta;
2049
2050 if (irq_delta && sched_feat(NONIRQ_POWER))
2051 sched_rt_avg_update(rq, irq_delta);
2052}
2053
2054#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2055
2056static void update_rq_clock_task(struct rq *rq, s64 delta)
2057{
2058 rq->clock_task += delta;
2059}
2060
2061#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2062
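
A short worked example of the clamp above, with made-up numbers rather than anything from the patch:

        delta = 10us, pending irq time = 12us   ->  irq_delta clamped to 10us
            clock_task += 0us, prev_irq_time += 10us   (2us still pending)
        next update: delta = 10us, pending irq time = 2us
            clock_task += 8us, prev_irq_time += 2us

So clock_task stays monotonic and never advances for time the CPU spent in hard or soft irq context; at worst some irq time is attributed one update late.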
1920#include "sched_idletask.c" 2063#include "sched_idletask.c"
1921#include "sched_fair.c" 2064#include "sched_fair.c"
1922#include "sched_rt.c" 2065#include "sched_rt.c"
2066#include "sched_stoptask.c"
1923#ifdef CONFIG_SCHED_DEBUG 2067#ifdef CONFIG_SCHED_DEBUG
1924# include "sched_debug.c" 2068# include "sched_debug.c"
1925#endif 2069#endif
1926 2070
2071void sched_set_stop_task(int cpu, struct task_struct *stop)
2072{
2073 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2074 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2075
2076 if (stop) {
2077 /*
2078 * Make it appear like a SCHED_FIFO task, it's something
2079 * userspace knows about and won't get confused about.
2080 *
2081 * Also, it will make PI more or less work without too
2082 * much confusion -- but then, stop work should not
2083 * rely on PI working anyway.
2084 */
2085 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2086
2087 stop->sched_class = &stop_sched_class;
2088 }
2089
2090 cpu_rq(cpu)->stop = stop;
2091
2092 if (old_stop) {
2093 /*
2094 * Reset it back to a normal scheduling class so that
2095 * it can die in pieces.
2096 */
2097 old_stop->sched_class = &rt_sched_class;
2098 }
2099}
2100
1927/* 2101/*
1928 * __normal_prio - return the priority that is based on the static prio 2102 * __normal_prio - return the priority that is based on the static prio
1929 */ 2103 */
@@ -1991,6 +2165,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1991 p->sched_class->prio_changed(rq, p, oldprio, running); 2165 p->sched_class->prio_changed(rq, p, oldprio, running);
1992} 2166}
1993 2167
2168static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2169{
2170 const struct sched_class *class;
2171
2172 if (p->sched_class == rq->curr->sched_class) {
2173 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2174 } else {
2175 for_each_class(class) {
2176 if (class == rq->curr->sched_class)
2177 break;
2178 if (class == p->sched_class) {
2179 resched_task(rq->curr);
2180 break;
2181 }
2182 }
2183 }
2184
2185 /*
2186 * A queue event has occurred, and we're going to schedule. In
2187 * this case, we can save a useless back to back clock update.
2188 */
2189 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2190 rq->skip_clock_update = 1;
2191}
2192
1994#ifdef CONFIG_SMP 2193#ifdef CONFIG_SMP
1995/* 2194/*
1996 * Is this task likely cache-hot: 2195 * Is this task likely cache-hot:
@@ -2003,6 +2202,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2003 if (p->sched_class != &fair_sched_class) 2202 if (p->sched_class != &fair_sched_class)
2004 return 0; 2203 return 0;
2005 2204
2205 if (unlikely(p->policy == SCHED_IDLE))
2206 return 0;
2207
2006 /* 2208 /*
2007 * Buddy candidates are cache hot: 2209 * Buddy candidates are cache hot:
2008 */ 2210 */
@@ -2852,14 +3054,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2852 */ 3054 */
2853 arch_start_context_switch(prev); 3055 arch_start_context_switch(prev);
2854 3056
2855 if (likely(!mm)) { 3057 if (!mm) {
2856 next->active_mm = oldmm; 3058 next->active_mm = oldmm;
2857 atomic_inc(&oldmm->mm_count); 3059 atomic_inc(&oldmm->mm_count);
2858 enter_lazy_tlb(oldmm, next); 3060 enter_lazy_tlb(oldmm, next);
2859 } else 3061 } else
2860 switch_mm(oldmm, mm, next); 3062 switch_mm(oldmm, mm, next);
2861 3063
2862 if (likely(!prev->mm)) { 3064 if (!prev->mm) {
2863 prev->active_mm = NULL; 3065 prev->active_mm = NULL;
2864 rq->prev_mm = oldmm; 3066 rq->prev_mm = oldmm;
2865 } 3067 }
@@ -2974,6 +3176,15 @@ static long calc_load_fold_active(struct rq *this_rq)
2974 return delta; 3176 return delta;
2975} 3177}
2976 3178
3179static unsigned long
3180calc_load(unsigned long load, unsigned long exp, unsigned long active)
3181{
3182 load *= exp;
3183 load += active * (FIXED_1 - exp);
3184 load += 1UL << (FSHIFT - 1);
3185 return load >> FSHIFT;
3186}
3187
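
calc_load() above is the classic load-average recurrence load = load * exp + active * (1 - exp), computed in FSHIFT (11-bit) fixed point with a half-unit added for round-to-nearest. A made-up numeric example, using the standard constants FIXED_1 = 2048 and EXP_1 = 1884 (the 1-minute decay factor):

        load = 1.00 -> 2048          active = 3 runnable tasks -> 3 * 2048 = 6144
        new  = (2048 * 1884 + 6144 * (2048 - 1884) + 1024) >> 11
             = (3858432 + 1007616 + 1024) >> 11
             = 4867072 >> 11  =  2376          (about 1.16 in /proc/loadavg terms)

so one 5-second sample pulls the 1-minute average from 1.00 a small step toward 3.00.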
2977#ifdef CONFIG_NO_HZ 3188#ifdef CONFIG_NO_HZ
2978/* 3189/*
2979 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3190 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3003,6 +3214,128 @@ static long calc_load_fold_idle(void)
3003 3214
3004 return delta; 3215 return delta;
3005} 3216}
3217
3218/**
3219 * fixed_power_int - compute: x^n, in O(log n) time
3220 *
3221 * @x: base of the power
3222 * @frac_bits: fractional bits of @x
3223 * @n: power to raise @x to.
3224 *
3225 * By exploiting the relation between the definition of the natural power
3226 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3227 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3228 * (where: n_i \elem {0, 1}, the binary vector representing n),
3229 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3230 * of course trivially computable in O(log_2 n), the length of our binary
3231 * vector.
3232 */
3233static unsigned long
3234fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3235{
3236 unsigned long result = 1UL << frac_bits;
3237
3238 if (n) for (;;) {
3239 if (n & 1) {
3240 result *= x;
3241 result += 1UL << (frac_bits - 1);
3242 result >>= frac_bits;
3243 }
3244 n >>= 1;
3245 if (!n)
3246 break;
3247 x *= x;
3248 x += 1UL << (frac_bits - 1);
3249 x >>= frac_bits;
3250 }
3251
3252 return result;
3253}
3254
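
fixed_power_int() above is plain exponentiation by squaring, renormalising with round-to-nearest after every multiply so intermediate values stay in frac_bits fixed point. A standalone copy that can be compiled and checked against floating point (illustrative harness, not kernel code):

#include <stdio.h>
#include <math.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)

static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;        /* 1.0 in fixed point */

        if (n) for (;;) {
                if (n & 1) {                            /* bit set: multiply into result */
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;                                 /* square for the next bit */
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }

        return result;
}

int main(void)
{
        unsigned int n = 12;    /* e.g. twelve missed 5-second load cycles */

        /* EXP_1 = 1884 is the per-cycle decay factor of the 1-minute average. */
        printf("fixed: %lu\n", fixed_power_int(1884, FSHIFT, n));
        printf("float: %.1f\n", pow(1884.0 / FIXED_1, n) * FIXED_1);
        return 0;
}

(Build with -lm; the two results should agree to within a few counts of rounding error.)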
3255/*
3256 * a1 = a0 * e + a * (1 - e)
3257 *
3258 * a2 = a1 * e + a * (1 - e)
3259 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3260 * = a0 * e^2 + a * (1 - e) * (1 + e)
3261 *
3262 * a3 = a2 * e + a * (1 - e)
3263 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3264 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3265 *
3266 * ...
3267 *
3268 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3269 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3270 * = a0 * e^n + a * (1 - e^n)
3271 *
3272 * [1] application of the geometric series:
3273 *
3274 * n 1 - x^(n+1)
3275 * S_n := \Sum x^i = -------------
3276 * i=0 1 - x
3277 */
3278static unsigned long
3279calc_load_n(unsigned long load, unsigned long exp,
3280 unsigned long active, unsigned int n)
3281{
3282
3283 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3284}
3285
3286/*
3287 * NO_HZ can leave us missing all per-cpu ticks calling
3288 * calc_load_account_active(), but since an idle CPU folds its delta into
3289 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3290 * in the pending idle delta if our idle period crossed a load cycle boundary.
3291 *
3292 * Once we've updated the global active value, we need to apply the exponential
3293 * weights adjusted to the number of cycles missed.
3294 */
3295static void calc_global_nohz(unsigned long ticks)
3296{
3297 long delta, active, n;
3298
3299 if (time_before(jiffies, calc_load_update))
3300 return;
3301
3302 /*
3303 * If we crossed a calc_load_update boundary, make sure to fold
3304 * any pending idle changes, the respective CPUs might have
3305 * missed the tick driven calc_load_account_active() update
3306 * due to NO_HZ.
3307 */
3308 delta = calc_load_fold_idle();
3309 if (delta)
3310 atomic_long_add(delta, &calc_load_tasks);
3311
3312 /*
3313 * If we were idle for multiple load cycles, apply them.
3314 */
3315 if (ticks >= LOAD_FREQ) {
3316 n = ticks / LOAD_FREQ;
3317
3318 active = atomic_long_read(&calc_load_tasks);
3319 active = active > 0 ? active * FIXED_1 : 0;
3320
3321 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3322 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3323 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3324
3325 calc_load_update += n * LOAD_FREQ;
3326 }
3327
3328 /*
3329 * It's possible the remainder of the above division also crosses
3330 * a LOAD_FREQ period, the regular check in calc_global_load()
3331 * which comes after this will take care of that.
3332 *
3333 * Consider us being 11 ticks before a cycle completion, and us
3334 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3335 * age us 4 cycles, and the test in calc_global_load() will
3336 * pick up the final one.
3337 */
3338}
3006#else 3339#else
3007static void calc_load_account_idle(struct rq *this_rq) 3340static void calc_load_account_idle(struct rq *this_rq)
3008{ 3341{
@@ -3012,6 +3345,10 @@ static inline long calc_load_fold_idle(void)
3012{ 3345{
3013 return 0; 3346 return 0;
3014} 3347}
3348
3349static void calc_global_nohz(unsigned long ticks)
3350{
3351}
3015#endif 3352#endif
3016 3353
3017/** 3354/**
@@ -3029,24 +3366,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3029 loads[2] = (avenrun[2] + offset) << shift; 3366 loads[2] = (avenrun[2] + offset) << shift;
3030} 3367}
3031 3368
3032static unsigned long
3033calc_load(unsigned long load, unsigned long exp, unsigned long active)
3034{
3035 load *= exp;
3036 load += active * (FIXED_1 - exp);
3037 return load >> FSHIFT;
3038}
3039
3040/* 3369/*
3041 * calc_load - update the avenrun load estimates 10 ticks after the 3370 * calc_load - update the avenrun load estimates 10 ticks after the
3042 * CPUs have updated calc_load_tasks. 3371 * CPUs have updated calc_load_tasks.
3043 */ 3372 */
3044void calc_global_load(void) 3373void calc_global_load(unsigned long ticks)
3045{ 3374{
3046 unsigned long upd = calc_load_update + 10;
3047 long active; 3375 long active;
3048 3376
3049 if (time_before(jiffies, upd)) 3377 calc_global_nohz(ticks);
3378
3379 if (time_before(jiffies, calc_load_update + 10))
3050 return; 3380 return;
3051 3381
3052 active = atomic_long_read(&calc_load_tasks); 3382 active = atomic_long_read(&calc_load_tasks);
@@ -3248,7 +3578,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3248 3578
3249 if (task_current(rq, p)) { 3579 if (task_current(rq, p)) {
3250 update_rq_clock(rq); 3580 update_rq_clock(rq);
3251 ns = rq->clock - p->se.exec_start; 3581 ns = rq->clock_task - p->se.exec_start;
3252 if ((s64)ns < 0) 3582 if ((s64)ns < 0)
3253 ns = 0; 3583 ns = 0;
3254 } 3584 }
@@ -3397,7 +3727,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3397 tmp = cputime_to_cputime64(cputime); 3727 tmp = cputime_to_cputime64(cputime);
3398 if (hardirq_count() - hardirq_offset) 3728 if (hardirq_count() - hardirq_offset)
3399 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3729 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3400 else if (softirq_count()) 3730 else if (in_serving_softirq())
3401 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3731 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3402 else 3732 else
3403 cpustat->system = cputime64_add(cpustat->system, tmp); 3733 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3584,7 +3914,7 @@ void scheduler_tick(void)
3584 curr->sched_class->task_tick(rq, curr, 0); 3914 curr->sched_class->task_tick(rq, curr, 0);
3585 raw_spin_unlock(&rq->lock); 3915 raw_spin_unlock(&rq->lock);
3586 3916
3587 perf_event_task_tick(curr); 3917 perf_event_task_tick();
3588 3918
3589#ifdef CONFIG_SMP 3919#ifdef CONFIG_SMP
3590 rq->idle_at_tick = idle_cpu(cpu); 3920 rq->idle_at_tick = idle_cpu(cpu);
@@ -3700,7 +4030,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3700{ 4030{
3701 if (prev->se.on_rq) 4031 if (prev->se.on_rq)
3702 update_rq_clock(rq); 4032 update_rq_clock(rq);
3703 rq->skip_clock_update = 0;
3704 prev->sched_class->put_prev_task(rq, prev); 4033 prev->sched_class->put_prev_task(rq, prev);
3705} 4034}
3706 4035
@@ -3723,17 +4052,13 @@ pick_next_task(struct rq *rq)
3723 return p; 4052 return p;
3724 } 4053 }
3725 4054
3726 class = sched_class_highest; 4055 for_each_class(class) {
3727 for ( ; ; ) {
3728 p = class->pick_next_task(rq); 4056 p = class->pick_next_task(rq);
3729 if (p) 4057 if (p)
3730 return p; 4058 return p;
3731 /*
3732 * Will never be NULL as the idle class always
3733 * returns a non-NULL p:
3734 */
3735 class = class->next;
3736 } 4059 }
4060
4061 BUG(); /* the idle class will always have a runnable task */
3737} 4062}
3738 4063
3739/* 4064/*
@@ -3762,7 +4087,6 @@ need_resched_nonpreemptible:
3762 hrtick_clear(rq); 4087 hrtick_clear(rq);
3763 4088
3764 raw_spin_lock_irq(&rq->lock); 4089 raw_spin_lock_irq(&rq->lock);
3765 clear_tsk_need_resched(prev);
3766 4090
3767 switch_count = &prev->nivcsw; 4091 switch_count = &prev->nivcsw;
3768 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4092 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3794,6 +4118,8 @@ need_resched_nonpreemptible:
3794 4118
3795 put_prev_task(rq, prev); 4119 put_prev_task(rq, prev);
3796 next = pick_next_task(rq); 4120 next = pick_next_task(rq);
4121 clear_tsk_need_resched(prev);
4122 rq->skip_clock_update = 0;
3797 4123
3798 if (likely(prev != next)) { 4124 if (likely(prev != next)) {
3799 sched_info_switch(prev, next); 4125 sched_info_switch(prev, next);
@@ -4358,6 +4684,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4358 4684
4359 rq = task_rq_lock(p, &flags); 4685 rq = task_rq_lock(p, &flags);
4360 4686
4687 trace_sched_pi_setprio(p, prio);
4361 oldprio = p->prio; 4688 oldprio = p->prio;
4362 prev_class = p->sched_class; 4689 prev_class = p->sched_class;
4363 on_rq = p->se.on_rq; 4690 on_rq = p->se.on_rq;
@@ -4645,7 +4972,7 @@ recheck:
4645 } 4972 }
4646 4973
4647 if (user) { 4974 if (user) {
4648 retval = security_task_setscheduler(p, policy, param); 4975 retval = security_task_setscheduler(p);
4649 if (retval) 4976 if (retval)
4650 return retval; 4977 return retval;
4651 } 4978 }
@@ -4661,6 +4988,15 @@ recheck:
4661 */ 4988 */
4662 rq = __task_rq_lock(p); 4989 rq = __task_rq_lock(p);
4663 4990
4991 /*
4992 * Changing the policy of the stop threads is a very bad idea
4993 */
4994 if (p == rq->stop) {
4995 __task_rq_unlock(rq);
4996 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4997 return -EINVAL;
4998 }
4999
4664#ifdef CONFIG_RT_GROUP_SCHED 5000#ifdef CONFIG_RT_GROUP_SCHED
4665 if (user) { 5001 if (user) {
4666 /* 5002 /*
@@ -4887,13 +5223,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4887 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5223 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4888 goto out_unlock; 5224 goto out_unlock;
4889 5225
4890 retval = security_task_setscheduler(p, 0, NULL); 5226 retval = security_task_setscheduler(p);
4891 if (retval) 5227 if (retval)
4892 goto out_unlock; 5228 goto out_unlock;
4893 5229
4894 cpuset_cpus_allowed(p, cpus_allowed); 5230 cpuset_cpus_allowed(p, cpus_allowed);
4895 cpumask_and(new_mask, in_mask, cpus_allowed); 5231 cpumask_and(new_mask, in_mask, cpus_allowed);
4896 again: 5232again:
4897 retval = set_cpus_allowed_ptr(p, new_mask); 5233 retval = set_cpus_allowed_ptr(p, new_mask);
4898 5234
4899 if (!retval) { 5235 if (!retval) {
@@ -6526,6 +6862,7 @@ struct s_data {
6526 cpumask_var_t nodemask; 6862 cpumask_var_t nodemask;
6527 cpumask_var_t this_sibling_map; 6863 cpumask_var_t this_sibling_map;
6528 cpumask_var_t this_core_map; 6864 cpumask_var_t this_core_map;
6865 cpumask_var_t this_book_map;
6529 cpumask_var_t send_covered; 6866 cpumask_var_t send_covered;
6530 cpumask_var_t tmpmask; 6867 cpumask_var_t tmpmask;
6531 struct sched_group **sched_group_nodes; 6868 struct sched_group **sched_group_nodes;
@@ -6537,6 +6874,7 @@ enum s_alloc {
6537 sa_rootdomain, 6874 sa_rootdomain,
6538 sa_tmpmask, 6875 sa_tmpmask,
6539 sa_send_covered, 6876 sa_send_covered,
6877 sa_this_book_map,
6540 sa_this_core_map, 6878 sa_this_core_map,
6541 sa_this_sibling_map, 6879 sa_this_sibling_map,
6542 sa_nodemask, 6880 sa_nodemask,
@@ -6572,31 +6910,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6572#ifdef CONFIG_SCHED_MC 6910#ifdef CONFIG_SCHED_MC
6573static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6911static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6574static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6912static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6575#endif /* CONFIG_SCHED_MC */
6576 6913
6577#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6578static int 6914static int
6579cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6915cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6580 struct sched_group **sg, struct cpumask *mask) 6916 struct sched_group **sg, struct cpumask *mask)
6581{ 6917{
6582 int group; 6918 int group;
6583 6919#ifdef CONFIG_SCHED_SMT
6584 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6920 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6585 group = cpumask_first(mask); 6921 group = cpumask_first(mask);
6922#else
6923 group = cpu;
6924#endif
6586 if (sg) 6925 if (sg)
6587 *sg = &per_cpu(sched_group_core, group).sg; 6926 *sg = &per_cpu(sched_group_core, group).sg;
6588 return group; 6927 return group;
6589} 6928}
6590#elif defined(CONFIG_SCHED_MC) 6929#endif /* CONFIG_SCHED_MC */
6930
6931/*
6932 * book sched-domains:
6933 */
6934#ifdef CONFIG_SCHED_BOOK
6935static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6936static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6937
6591static int 6938static int
6592cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6939cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6593 struct sched_group **sg, struct cpumask *unused) 6940 struct sched_group **sg, struct cpumask *mask)
6594{ 6941{
6942 int group = cpu;
6943#ifdef CONFIG_SCHED_MC
6944 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6945 group = cpumask_first(mask);
6946#elif defined(CONFIG_SCHED_SMT)
6947 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6948 group = cpumask_first(mask);
6949#endif
6595 if (sg) 6950 if (sg)
6596 *sg = &per_cpu(sched_group_core, cpu).sg; 6951 *sg = &per_cpu(sched_group_book, group).sg;
6597 return cpu; 6952 return group;
6598} 6953}
6599#endif 6954#endif /* CONFIG_SCHED_BOOK */
6600 6955
6601static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6956static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6602static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6957static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6606,7 +6961,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6606 struct sched_group **sg, struct cpumask *mask) 6961 struct sched_group **sg, struct cpumask *mask)
6607{ 6962{
6608 int group; 6963 int group;
6609#ifdef CONFIG_SCHED_MC 6964#ifdef CONFIG_SCHED_BOOK
6965 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6966 group = cpumask_first(mask);
6967#elif defined(CONFIG_SCHED_MC)
6610 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6968 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6611 group = cpumask_first(mask); 6969 group = cpumask_first(mask);
6612#elif defined(CONFIG_SCHED_SMT) 6970#elif defined(CONFIG_SCHED_SMT)
@@ -6802,6 +7160,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6802 if (cpu != group_first_cpu(sd->groups)) 7160 if (cpu != group_first_cpu(sd->groups))
6803 return; 7161 return;
6804 7162
7163 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7164
6805 child = sd->child; 7165 child = sd->child;
6806 7166
6807 sd->groups->cpu_power = 0; 7167 sd->groups->cpu_power = 0;
@@ -6867,6 +7227,9 @@ SD_INIT_FUNC(CPU)
6867#ifdef CONFIG_SCHED_MC 7227#ifdef CONFIG_SCHED_MC
6868 SD_INIT_FUNC(MC) 7228 SD_INIT_FUNC(MC)
6869#endif 7229#endif
7230#ifdef CONFIG_SCHED_BOOK
7231 SD_INIT_FUNC(BOOK)
7232#endif
6870 7233
6871static int default_relax_domain_level = -1; 7234static int default_relax_domain_level = -1;
6872 7235
@@ -6916,6 +7279,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6916 free_cpumask_var(d->tmpmask); /* fall through */ 7279 free_cpumask_var(d->tmpmask); /* fall through */
6917 case sa_send_covered: 7280 case sa_send_covered:
6918 free_cpumask_var(d->send_covered); /* fall through */ 7281 free_cpumask_var(d->send_covered); /* fall through */
7282 case sa_this_book_map:
7283 free_cpumask_var(d->this_book_map); /* fall through */
6919 case sa_this_core_map: 7284 case sa_this_core_map:
6920 free_cpumask_var(d->this_core_map); /* fall through */ 7285 free_cpumask_var(d->this_core_map); /* fall through */
6921 case sa_this_sibling_map: 7286 case sa_this_sibling_map:
@@ -6962,8 +7327,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6962 return sa_nodemask; 7327 return sa_nodemask;
6963 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7328 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6964 return sa_this_sibling_map; 7329 return sa_this_sibling_map;
6965 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7330 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6966 return sa_this_core_map; 7331 return sa_this_core_map;
7332 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7333 return sa_this_book_map;
6967 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7334 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6968 return sa_send_covered; 7335 return sa_send_covered;
6969 d->rd = alloc_rootdomain(); 7336 d->rd = alloc_rootdomain();
@@ -7021,6 +7388,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
7021 return sd; 7388 return sd;
7022} 7389}
7023 7390
7391static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7392 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7393 struct sched_domain *parent, int i)
7394{
7395 struct sched_domain *sd = parent;
7396#ifdef CONFIG_SCHED_BOOK
7397 sd = &per_cpu(book_domains, i).sd;
7398 SD_INIT(sd, BOOK);
7399 set_domain_attribute(sd, attr);
7400 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7401 sd->parent = parent;
7402 parent->child = sd;
7403 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7404#endif
7405 return sd;
7406}
7407
7024static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7408static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7025 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7409 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7026 struct sched_domain *parent, int i) 7410 struct sched_domain *parent, int i)
@@ -7078,6 +7462,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7078 d->send_covered, d->tmpmask); 7462 d->send_covered, d->tmpmask);
7079 break; 7463 break;
7080#endif 7464#endif
7465#ifdef CONFIG_SCHED_BOOK
7466 case SD_LV_BOOK: /* set up book groups */
7467 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7468 if (cpu == cpumask_first(d->this_book_map))
7469 init_sched_build_groups(d->this_book_map, cpu_map,
7470 &cpu_to_book_group,
7471 d->send_covered, d->tmpmask);
7472 break;
7473#endif
7081 case SD_LV_CPU: /* set up physical groups */ 7474 case SD_LV_CPU: /* set up physical groups */
7082 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7475 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7083 if (!cpumask_empty(d->nodemask)) 7476 if (!cpumask_empty(d->nodemask))
@@ -7125,12 +7518,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7125 7518
7126 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7519 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7127 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7520 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7521 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7128 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7522 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7129 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7523 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7130 } 7524 }
7131 7525
7132 for_each_cpu(i, cpu_map) { 7526 for_each_cpu(i, cpu_map) {
7133 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7527 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7528 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7134 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7529 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7135 } 7530 }
7136 7531
@@ -7161,6 +7556,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7161 init_sched_groups_power(i, sd); 7556 init_sched_groups_power(i, sd);
7162 } 7557 }
7163#endif 7558#endif
7559#ifdef CONFIG_SCHED_BOOK
7560 for_each_cpu(i, cpu_map) {
7561 sd = &per_cpu(book_domains, i).sd;
7562 init_sched_groups_power(i, sd);
7563 }
7564#endif
7164 7565
7165 for_each_cpu(i, cpu_map) { 7566 for_each_cpu(i, cpu_map) {
7166 sd = &per_cpu(phys_domains, i).sd; 7567 sd = &per_cpu(phys_domains, i).sd;
@@ -7186,6 +7587,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7186 sd = &per_cpu(cpu_domains, i).sd; 7587 sd = &per_cpu(cpu_domains, i).sd;
7187#elif defined(CONFIG_SCHED_MC) 7588#elif defined(CONFIG_SCHED_MC)
7188 sd = &per_cpu(core_domains, i).sd; 7589 sd = &per_cpu(core_domains, i).sd;
7590#elif defined(CONFIG_SCHED_BOOK)
7591 sd = &per_cpu(book_domains, i).sd;
7189#else 7592#else
7190 sd = &per_cpu(phys_domains, i).sd; 7593 sd = &per_cpu(phys_domains, i).sd;
7191#endif 7594#endif
@@ -8090,9 +8493,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8090 8493
8091 return 1; 8494 return 1;
8092 8495
8093 err_free_rq: 8496err_free_rq:
8094 kfree(cfs_rq); 8497 kfree(cfs_rq);
8095 err: 8498err:
8096 return 0; 8499 return 0;
8097} 8500}
8098 8501
@@ -8180,9 +8583,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8180 8583
8181 return 1; 8584 return 1;
8182 8585
8183 err_free_rq: 8586err_free_rq:
8184 kfree(rt_rq); 8587 kfree(rt_rq);
8185 err: 8588err:
8186 return 0; 8589 return 0;
8187} 8590}
8188 8591
@@ -8309,12 +8712,12 @@ void sched_move_task(struct task_struct *tsk)
8309 if (unlikely(running)) 8712 if (unlikely(running))
8310 tsk->sched_class->put_prev_task(rq, tsk); 8713 tsk->sched_class->put_prev_task(rq, tsk);
8311 8714
8312 set_task_rq(tsk, task_cpu(tsk));
8313
8314#ifdef CONFIG_FAIR_GROUP_SCHED 8715#ifdef CONFIG_FAIR_GROUP_SCHED
8315 if (tsk->sched_class->moved_group) 8716 if (tsk->sched_class->task_move_group)
8316 tsk->sched_class->moved_group(tsk, on_rq); 8717 tsk->sched_class->task_move_group(tsk, on_rq);
8718 else
8317#endif 8719#endif
8720 set_task_rq(tsk, task_cpu(tsk));
8318 8721
8319 if (unlikely(running)) 8722 if (unlikely(running))
8320 tsk->sched_class->set_curr_task(rq); 8723 tsk->sched_class->set_curr_task(rq);
@@ -8540,7 +8943,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8540 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8943 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8541 } 8944 }
8542 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8945 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8543 unlock: 8946unlock:
8544 read_unlock(&tasklist_lock); 8947 read_unlock(&tasklist_lock);
8545 mutex_unlock(&rt_constraints_mutex); 8948 mutex_unlock(&rt_constraints_mutex);
8546 8949