Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 730
1 file changed, 576 insertions, 154 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -425,9 +426,7 @@ struct root_domain {
425 */ 426 */
426 cpumask_var_t rto_mask; 427 cpumask_var_t rto_mask;
427 atomic_t rto_count; 428 atomic_t rto_count;
428#ifdef CONFIG_SMP
429 struct cpupri cpupri; 429 struct cpupri cpupri;
430#endif
431}; 430};
432 431
433/* 432/*
@@ -436,7 +435,7 @@ struct root_domain {
436 */ 435 */
437static struct root_domain def_root_domain; 436static struct root_domain def_root_domain;
438 437
439#endif 438#endif /* CONFIG_SMP */
440 439
441/* 440/*
442 * This is the main, per-CPU runqueue data structure. 441 * This is the main, per-CPU runqueue data structure.
@@ -456,9 +455,10 @@ struct rq {
456 unsigned long nr_running; 455 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5 456 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 457 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
458 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ 459#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp; 460 u64 nohz_stamp;
461 unsigned char in_nohz_recently; 461 unsigned char nohz_balance_kick;
462#endif 462#endif
463 unsigned int skip_clock_update; 463 unsigned int skip_clock_update;
464 464
@@ -486,11 +486,12 @@ struct rq {
486 */ 486 */
487 unsigned long nr_uninterruptible; 487 unsigned long nr_uninterruptible;
488 488
489 struct task_struct *curr, *idle; 489 struct task_struct *curr, *idle, *stop;
490 unsigned long next_balance; 490 unsigned long next_balance;
491 struct mm_struct *prev_mm; 491 struct mm_struct *prev_mm;
492 492
493 u64 clock; 493 u64 clock;
494 u64 clock_task;
494 495
495 atomic_t nr_iowait; 496 atomic_t nr_iowait;
496 497
@@ -518,6 +519,10 @@ struct rq {
518 u64 avg_idle; 519 u64 avg_idle;
519#endif 520#endif
520 521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
521 /* calc_load related fields */ 526 /* calc_load related fields */
522 unsigned long calc_load_update; 527 unsigned long calc_load_update;
523 long calc_load_active; 528 long calc_load_active;
@@ -641,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
641 646
642#endif /* CONFIG_CGROUP_SCHED */ 647#endif /* CONFIG_CGROUP_SCHED */
643 648
649static u64 irq_time_cpu(int cpu);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651
644inline void update_rq_clock(struct rq *rq) 652inline void update_rq_clock(struct rq *rq)
645{ 653{
646 if (!rq->skip_clock_update) 654 if (!rq->skip_clock_update) {
647 rq->clock = sched_clock_cpu(cpu_of(rq)); 655 int cpu = cpu_of(rq);
656 u64 irq_time;
657
658 rq->clock = sched_clock_cpu(cpu);
659 irq_time = irq_time_cpu(cpu);
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662
663 sched_irq_time_avg_update(rq, irq_time);
664 }
648} 665}
649 666
650/* 667/*
@@ -721,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
721 size_t cnt, loff_t *ppos) 738 size_t cnt, loff_t *ppos)
722{ 739{
723 char buf[64]; 740 char buf[64];
724 char *cmp = buf; 741 char *cmp;
725 int neg = 0; 742 int neg = 0;
726 int i; 743 int i;
727 744
@@ -732,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
732 return -EFAULT; 749 return -EFAULT;
733 750
734 buf[cnt] = 0; 751 buf[cnt] = 0;
752 cmp = strstrip(buf);
735 753
736 if (strncmp(buf, "NO_", 3) == 0) { 754 if (strncmp(buf, "NO_", 3) == 0) {
737 neg = 1; 755 neg = 1;
@@ -739,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
739 } 757 }
740 758
741 for (i = 0; sched_feat_names[i]; i++) { 759 for (i = 0; sched_feat_names[i]; i++) {
742 int len = strlen(sched_feat_names[i]); 760 if (strcmp(cmp, sched_feat_names[i]) == 0) {
743
744 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
745 if (neg) 761 if (neg)
746 sysctl_sched_features &= ~(1UL << i); 762 sysctl_sched_features &= ~(1UL << i);
747 else 763 else
@@ -1193,6 +1209,27 @@ static void resched_cpu(int cpu)
1193 1209
1194#ifdef CONFIG_NO_HZ 1210#ifdef CONFIG_NO_HZ
1195/* 1211/*
1212 * In the semi idle case, use the nearest busy cpu for migrating timers
1213 * from an idle cpu. This is good for power-savings.
1214 *
1215 * We don't do similar optimization for completely idle system, as
1216 * selecting an idle cpu will add more delays to the timers than intended
1217 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1218 */
1219int get_nohz_timer_target(void)
1220{
1221 int cpu = smp_processor_id();
1222 int i;
1223 struct sched_domain *sd;
1224
1225 for_each_domain(cpu, sd) {
1226 for_each_cpu(i, sched_domain_span(sd))
1227 if (!idle_cpu(i))
1228 return i;
1229 }
1230 return cpu;
1231}
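The search above is easiest to see on a concrete topology. The standalone sketch below is an illustration only, not part of the patch: the two-level spans[] table, the idle[] array and the helper names are invented, but the walk mirrors get_nohz_timer_target(), scanning ever-larger domains around the calling CPU and returning the first busy CPU it finds, falling back to the caller when everything is idle.

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

/*
 * spans[level][cpu] lists the CPUs sharing that domain level with 'cpu',
 * smallest domain first; -1 terminates each list.
 */
static const int spans[2][NR_CPUS][NR_CPUS + 1] = {
	/* level 0: SMT siblings */
	{ { 0, 1, -1 }, { 0, 1, -1 }, { 2, 3, -1 }, { 2, 3, -1 } },
	/* level 1: the whole package */
	{ { 0, 1, 2, 3, -1 }, { 0, 1, 2, 3, -1 },
	  { 0, 1, 2, 3, -1 }, { 0, 1, 2, 3, -1 } },
};
static const bool idle[NR_CPUS] = { true, true, false, true };

static int nohz_timer_target(int cpu)
{
	for (int level = 0; level < 2; level++)
		for (const int *i = spans[level][cpu]; *i >= 0; i++)
			if (!idle[*i])
				return *i;	/* nearest busy CPU wins */
	return cpu;				/* fully idle: keep the timer local */
}

int main(void)
{
	printf("migrate cpu0's timer to cpu%d\n", nohz_timer_target(0));
	return 0;
}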
1232/*
1196 * When add_timer_on() enqueues a timer into the timer wheel of an 1233 * When add_timer_on() enqueues a timer into the timer wheel of an
1197 * idle CPU then this timer might expire before the next timer event 1234 * idle CPU then this timer might expire before the next timer event
1198 * which is scheduled to wake up that CPU. In case of a completely 1235 * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1269,6 @@ void wake_up_idle_cpu(int cpu)
1232 smp_send_reschedule(cpu); 1269 smp_send_reschedule(cpu);
1233} 1270}
1234 1271
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1245#endif /* CONFIG_NO_HZ */ 1272#endif /* CONFIG_NO_HZ */
1246 1273
1247static u64 sched_avg_period(void) 1274static u64 sched_avg_period(void)
@@ -1281,6 +1308,10 @@ static void resched_task(struct task_struct *p)
1281static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1308static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1282{ 1309{
1283} 1310}
1311
1312static void sched_avg_update(struct rq *rq)
1313{
1314}
1284#endif /* CONFIG_SMP */ 1315#endif /* CONFIG_SMP */
1285 1316
1286#if BITS_PER_LONG == 32 1317#if BITS_PER_LONG == 32
@@ -1652,7 +1683,7 @@ static void update_shares(struct sched_domain *sd)
1652 if (root_task_group_empty()) 1683 if (root_task_group_empty())
1653 return; 1684 return;
1654 1685
1655 now = cpu_clock(raw_smp_processor_id()); 1686 now = local_clock();
1656 elapsed = now - sd->last_update; 1687 elapsed = now - sd->last_update;
1657 1688
1658 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1689 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1836,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805static void calc_load_account_idle(struct rq *this_rq); 1836static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1837static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1838static int get_update_sysctl_factor(void);
1839static void update_cpu_load(struct rq *this_rq);
1808 1840
1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1841static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810{ 1842{
@@ -1822,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1822 1854
1823static const struct sched_class rt_sched_class; 1855static const struct sched_class rt_sched_class;
1824 1856
1825#define sched_class_highest (&rt_sched_class) 1857#define sched_class_highest (&stop_sched_class)
1826#define for_each_class(class) \ 1858#define for_each_class(class) \
1827 for (class = sched_class_highest; class; class = class->next) 1859 for (class = sched_class_highest; class; class = class->next)
1828 1860
@@ -1840,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
1840 1872
1841static void set_load_weight(struct task_struct *p) 1873static void set_load_weight(struct task_struct *p)
1842{ 1874{
1843 if (task_has_rt_policy(p)) {
1844 p->se.load.weight = 0;
1845 p->se.load.inv_weight = WMULT_CONST;
1846 return;
1847 }
1848
1849 /* 1875 /*
1850 * SCHED_IDLE tasks get minimal weight: 1876 * SCHED_IDLE tasks get minimal weight:
1851 */ 1877 */
@@ -1899,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1899 dec_nr_running(rq); 1925 dec_nr_running(rq);
1900} 1926}
1901 1927
1928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1929
1930/*
1931 * There are no locks covering percpu hardirq/softirq time.
1932 * They are only modified in account_system_vtime, on corresponding CPU
1933 * with interrupts disabled. So, writes are safe.
1934 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in other CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of
1938 * accounting a slice of irq time to wrong task when irq is in progress
1939 * while we read rq->clock. That is a worthy compromise in place of having
1940 * locks on each irq in account_system_time.
1941 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time);
1944
1945static DEFINE_PER_CPU(u64, irq_start_time);
1946static int sched_clock_irqtime;
1947
1948void enable_sched_clock_irqtime(void)
1949{
1950 sched_clock_irqtime = 1;
1951}
1952
1953void disable_sched_clock_irqtime(void)
1954{
1955 sched_clock_irqtime = 0;
1956}
1957
1958static u64 irq_time_cpu(int cpu)
1959{
1960 if (!sched_clock_irqtime)
1961 return 0;
1962
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964}
1965
1966void account_system_vtime(struct task_struct *curr)
1967{
1968 unsigned long flags;
1969 int cpu;
1970 u64 now, delta;
1971
1972 if (!sched_clock_irqtime)
1973 return;
1974
1975 local_irq_save(flags);
1976
1977 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu);
1979 delta = now - per_cpu(irq_start_time, cpu);
1980 per_cpu(irq_start_time, cpu) = now;
1981 /*
1982 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to ksoftirqd thread
1984 * in that case, so as not to confuse scheduler with a special task
1985 * that do not consume any time, but still wants to run.
1986 */
1987 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta;
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta;
1991
1992 local_irq_restore(flags);
1993}
1994EXPORT_SYMBOL_GPL(account_system_vtime);
1995
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
1997{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time;
2000 rq->prev_irq_time = curr_irq_time;
2001 sched_rt_avg_update(rq, delta_irq);
2002 }
2003}
2004
2005#else
2006
2007static u64 irq_time_cpu(int cpu)
2008{
2009 return 0;
2010}
2011
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
2013
2014#endif
2015
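One property of the new rq->clock_task bookkeeping is easy to check in isolation: because the update in update_rq_clock() is guarded by a comparison, clock_task can stall while interrupt time accumulates, but it can never move backwards even though rq->clock and the per-cpu irq counters advance independently. The single-threaded model below is only a sketch with invented variable names and a made-up event loop; the guard itself mirrors the new code in update_rq_clock().

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t clock_ns;	/* models rq->clock */
static uint64_t clock_task_ns;	/* models rq->clock_task */
static uint64_t irq_time_ns;	/* models hardirq + softirq time on this CPU */

static void update_rq_clock_model(uint64_t now)
{
	clock_ns = now;
	/* clock_task advances only when (clock - irq_time) advances */
	if (clock_ns - irq_time_ns > clock_task_ns)
		clock_task_ns = clock_ns - irq_time_ns;
}

int main(void)
{
	uint64_t now = 0, prev = 0;

	srand(1);
	for (int i = 0; i < 1000; i++) {
		uint64_t delta = 1000 + rand() % 1000;	/* wall time passes     */

		now += delta;
		if (rand() % 4 == 0)			/* ... sometimes in irq */
			irq_time_ns += delta;
		update_rq_clock_model(now);
		assert(clock_task_ns >= prev);		/* never goes backwards */
		prev = clock_task_ns;
	}
	printf("clock=%" PRIu64 " clock_task=%" PRIu64 " irq=%" PRIu64 "\n",
	       clock_ns, clock_task_ns, irq_time_ns);
	return 0;
}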
1902#include "sched_idletask.c" 2016#include "sched_idletask.c"
1903#include "sched_fair.c" 2017#include "sched_fair.c"
1904#include "sched_rt.c" 2018#include "sched_rt.c"
2019#include "sched_stoptask.c"
1905#ifdef CONFIG_SCHED_DEBUG 2020#ifdef CONFIG_SCHED_DEBUG
1906# include "sched_debug.c" 2021# include "sched_debug.c"
1907#endif 2022#endif
1908 2023
2024void sched_set_stop_task(int cpu, struct task_struct *stop)
2025{
2026 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2027 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2028
2029 if (stop) {
2030 /*
2031 * Make it appear like a SCHED_FIFO task, its something
2032 * userspace knows about and won't get confused about.
2033 *
2034 * Also, it will make PI more or less work without too
2035 * much confusion -- but then, stop work should not
2036 * rely on PI working anyway.
2037 */
2038 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2039
2040 stop->sched_class = &stop_sched_class;
2041 }
2042
2043 cpu_rq(cpu)->stop = stop;
2044
2045 if (old_stop) {
2046 /*
2047 * Reset it back to a normal scheduling class so that
2048 * it can die in pieces.
2049 */
2050 old_stop->sched_class = &rt_sched_class;
2051 }
2052}
2053
1909/* 2054/*
1910 * __normal_prio - return the priority that is based on the static prio 2055 * __normal_prio - return the priority that is based on the static prio
1911 */ 2056 */
@@ -1985,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1985 if (p->sched_class != &fair_sched_class) 2130 if (p->sched_class != &fair_sched_class)
1986 return 0; 2131 return 0;
1987 2132
2133 if (unlikely(p->policy == SCHED_IDLE))
2134 return 0;
2135
1988 /* 2136 /*
1989 * Buddy candidates are cache hot: 2137 * Buddy candidates are cache hot:
1990 */ 2138 */
@@ -2267,11 +2415,55 @@ static void update_avg(u64 *avg, u64 sample)
2267} 2415}
2268#endif 2416#endif
2269 2417
2270/*** 2418static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2419 bool is_sync, bool is_migrate, bool is_local,
2420 unsigned long en_flags)
2421{
2422 schedstat_inc(p, se.statistics.nr_wakeups);
2423 if (is_sync)
2424 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2425 if (is_migrate)
2426 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2427 if (is_local)
2428 schedstat_inc(p, se.statistics.nr_wakeups_local);
2429 else
2430 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2431
2432 activate_task(rq, p, en_flags);
2433}
2434
2435static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2436 int wake_flags, bool success)
2437{
2438 trace_sched_wakeup(p, success);
2439 check_preempt_curr(rq, p, wake_flags);
2440
2441 p->state = TASK_RUNNING;
2442#ifdef CONFIG_SMP
2443 if (p->sched_class->task_woken)
2444 p->sched_class->task_woken(rq, p);
2445
2446 if (unlikely(rq->idle_stamp)) {
2447 u64 delta = rq->clock - rq->idle_stamp;
2448 u64 max = 2*sysctl_sched_migration_cost;
2449
2450 if (delta > max)
2451 rq->avg_idle = max;
2452 else
2453 update_avg(&rq->avg_idle, delta);
2454 rq->idle_stamp = 0;
2455 }
2456#endif
2457 /* if a worker is waking up, notify workqueue */
2458 if ((p->flags & PF_WQ_WORKER) && success)
2459 wq_worker_waking_up(p, cpu_of(rq));
2460}
2461
2462/**
2271 * try_to_wake_up - wake up a thread 2463 * try_to_wake_up - wake up a thread
2272 * @p: the to-be-woken-up thread 2464 * @p: the thread to be awakened
2273 * @state: the mask of task states that can be woken 2465 * @state: the mask of task states that can be woken
2274 * @sync: do a synchronous wakeup? 2466 * @wake_flags: wake modifier flags (WF_*)
2275 * 2467 *
2276 * Put it on the run-queue if it's not already there. The "current" 2468 * Put it on the run-queue if it's not already there. The "current"
2277 * thread is always on the run-queue (except when the actual 2469 * thread is always on the run-queue (except when the actual
@@ -2279,7 +2471,8 @@ static void update_avg(u64 *avg, u64 sample)
2279 * the simpler "current->state = TASK_RUNNING" to mark yourself 2471 * the simpler "current->state = TASK_RUNNING" to mark yourself
2280 * runnable without the overhead of this. 2472 * runnable without the overhead of this.
2281 * 2473 *
2282 * returns failure only if the task is already active. 2474 * Returns %true if @p was woken up, %false if it was already running
2475 * or @state didn't match @p's state.
2283 */ 2476 */
2284static int try_to_wake_up(struct task_struct *p, unsigned int state, 2477static int try_to_wake_up(struct task_struct *p, unsigned int state,
2285 int wake_flags) 2478 int wake_flags)
@@ -2359,38 +2552,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359 2552
2360out_activate: 2553out_activate:
2361#endif /* CONFIG_SMP */ 2554#endif /* CONFIG_SMP */
2362 schedstat_inc(p, se.statistics.nr_wakeups); 2555 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2363 if (wake_flags & WF_SYNC) 2556 cpu == this_cpu, en_flags);
2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2365 if (orig_cpu != cpu)
2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2367 if (cpu == this_cpu)
2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2369 else
2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2371 activate_task(rq, p, en_flags);
2372 success = 1; 2557 success = 1;
2373
2374out_running: 2558out_running:
2375 trace_sched_wakeup(p, success); 2559 ttwu_post_activation(p, rq, wake_flags, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394out: 2560out:
2395 task_rq_unlock(rq, &flags); 2561 task_rq_unlock(rq, &flags);
2396 put_cpu(); 2562 put_cpu();
@@ -2399,6 +2565,37 @@ out:
2399} 2565}
2400 2566
2401/** 2567/**
2568 * try_to_wake_up_local - try to wake up a local task with rq lock held
2569 * @p: the thread to be awakened
2570 *
2571 * Put @p on the run-queue if it's not alredy there. The caller must
2572 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2573 * the current task. this_rq() stays locked over invocation.
2574 */
2575static void try_to_wake_up_local(struct task_struct *p)
2576{
2577 struct rq *rq = task_rq(p);
2578 bool success = false;
2579
2580 BUG_ON(rq != this_rq());
2581 BUG_ON(p == current);
2582 lockdep_assert_held(&rq->lock);
2583
2584 if (!(p->state & TASK_NORMAL))
2585 return;
2586
2587 if (!p->se.on_rq) {
2588 if (likely(!task_running(rq, p))) {
2589 schedstat_inc(rq, ttwu_count);
2590 schedstat_inc(rq, ttwu_local);
2591 }
2592 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2593 success = true;
2594 }
2595 ttwu_post_activation(p, rq, 0, success);
2596}
2597
2598/**
2402 * wake_up_process - Wake up a specific process 2599 * wake_up_process - Wake up a specific process
2403 * @p: The process to be woken up. 2600 * @p: The process to be woken up.
2404 * 2601 *
@@ -2785,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2785 */ 2982 */
2786 arch_start_context_switch(prev); 2983 arch_start_context_switch(prev);
2787 2984
2788 if (likely(!mm)) { 2985 if (!mm) {
2789 next->active_mm = oldmm; 2986 next->active_mm = oldmm;
2790 atomic_inc(&oldmm->mm_count); 2987 atomic_inc(&oldmm->mm_count);
2791 enter_lazy_tlb(oldmm, next); 2988 enter_lazy_tlb(oldmm, next);
2792 } else 2989 } else
2793 switch_mm(oldmm, mm, next); 2990 switch_mm(oldmm, mm, next);
2794 2991
2795 if (likely(!prev->mm)) { 2992 if (!prev->mm) {
2796 prev->active_mm = NULL; 2993 prev->active_mm = NULL;
2797 rq->prev_mm = oldmm; 2994 rq->prev_mm = oldmm;
2798 } 2995 }
@@ -3012,23 +3209,102 @@ static void calc_load_account_active(struct rq *this_rq)
3012} 3209}
3013 3210
3014/* 3211/*
3212 * The exact cpuload at various idx values, calculated at every tick would be
3213 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3214 *
3215 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3216 * on nth tick when cpu may be busy, then we have:
3217 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3218 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3219 *
3220 * decay_load_missed() below does efficient calculation of
3221 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3222 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3223 *
3224 * The calculation is approximated on a 128 point scale.
3225 * degrade_zero_ticks is the number of ticks after which load at any
3226 * particular idx is approximated to be zero.
3227 * degrade_factor is a precomputed table, a row for each load idx.
3228 * Each column corresponds to degradation factor for a power of two ticks,
3229 * based on 128 point scale.
3230 * Example:
3231 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3232 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3233 *
3234 * With this power of 2 load factors, we can degrade the load n times
3235 * by looking at 1 bits in n and doing as many mult/shift instead of
3236 * n mult/shifts needed by the exact degradation.
3237 */
3238#define DEGRADE_SHIFT 7
3239static const unsigned char
3240 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3241static const unsigned char
3242 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3243 {0, 0, 0, 0, 0, 0, 0, 0},
3244 {64, 32, 8, 0, 0, 0, 0, 0},
3245 {96, 72, 40, 12, 1, 0, 0},
3246 {112, 98, 75, 43, 15, 1, 0},
3247 {120, 112, 98, 76, 45, 16, 2} };
3248
3249/*
3250 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3251 * would be when CPU is idle and so we just decay the old load without
3252 * adding any new load.
3253 */
3254static unsigned long
3255decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3256{
3257 int j = 0;
3258
3259 if (!missed_updates)
3260 return load;
3261
3262 if (missed_updates >= degrade_zero_ticks[idx])
3263 return 0;
3264
3265 if (idx == 1)
3266 return load >> missed_updates;
3267
3268 while (missed_updates) {
3269 if (missed_updates % 2)
3270 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3271
3272 missed_updates >>= 1;
3273 j++;
3274 }
3275 return load;
3276}
3277
3278/*
3015 * Update rq->cpu_load[] statistics. This function is usually called every 3279 * Update rq->cpu_load[] statistics. This function is usually called every
3016 * scheduler tick (TICK_NSEC). 3280 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3281 * every tick. We fix it up based on jiffies.
3017 */ 3282 */
3018static void update_cpu_load(struct rq *this_rq) 3283static void update_cpu_load(struct rq *this_rq)
3019{ 3284{
3020 unsigned long this_load = this_rq->load.weight; 3285 unsigned long this_load = this_rq->load.weight;
3286 unsigned long curr_jiffies = jiffies;
3287 unsigned long pending_updates;
3021 int i, scale; 3288 int i, scale;
3022 3289
3023 this_rq->nr_load_updates++; 3290 this_rq->nr_load_updates++;
3024 3291
3292 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3293 if (curr_jiffies == this_rq->last_load_update_tick)
3294 return;
3295
3296 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3297 this_rq->last_load_update_tick = curr_jiffies;
3298
3025 /* Update our load: */ 3299 /* Update our load: */
3026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3300 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3301 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3027 unsigned long old_load, new_load; 3302 unsigned long old_load, new_load;
3028 3303
3029 /* scale is effectively 1 << i now, and >> i divides by scale */ 3304 /* scale is effectively 1 << i now, and >> i divides by scale */
3030 3305
3031 old_load = this_rq->cpu_load[i]; 3306 old_load = this_rq->cpu_load[i];
3307 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3032 new_load = this_load; 3308 new_load = this_load;
3033 /* 3309 /*
3034 * Round up the averaging division if load is increasing. This 3310 * Round up the averaging division if load is increasing. This
@@ -3036,10 +3312,18 @@ static void update_cpu_load(struct rq *this_rq)
3036 * example. 3312 * example.
3037 */ 3313 */
3038 if (new_load > old_load) 3314 if (new_load > old_load)
3039 new_load += scale-1; 3315 new_load += scale - 1;
3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3316
3317 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3041 } 3318 }
3042 3319
3320 sched_avg_update(this_rq);
3321}
3322
3323static void update_cpu_load_active(struct rq *this_rq)
3324{
3325 update_cpu_load(this_rq);
3326
3043 calc_load_account_active(this_rq); 3327 calc_load_account_active(this_rq);
3044} 3328}
3045 3329
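The comment block above compresses a fair bit of arithmetic, so a quick way to convince yourself the table-driven shortcut matches the per-tick formula is to lift it into a standalone program. The sketch below is an illustration, not part of the patch: decay_load_missed() and its tables are copied from the hunk, while decay_load_exact() and the sample load/idx values are invented for the comparison (the exact variant simply applies load = load * (2^idx - 1) / 2^idx once per missed tick).

#include <stdio.h>

#define CPU_LOAD_IDX_MAX	5
#define DEGRADE_SHIFT		7

/* tables copied verbatim from the patch */
static const unsigned char
		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const unsigned char
		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
					{0, 0, 0, 0, 0, 0, 0, 0},
					{64, 32, 8, 0, 0, 0, 0, 0},
					{96, 72, 40, 12, 1, 0, 0},
					{112, 98, 75, 43, 15, 1, 0},
					{120, 112, 98, 76, 45, 16, 2} };

/* copied from the patch: walk the set bits of missed_updates */
static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
	int j = 0;

	if (!missed_updates)
		return load;
	if (missed_updates >= degrade_zero_ticks[idx])
		return 0;
	if (idx == 1)
		return load >> missed_updates;

	while (missed_updates) {
		if (missed_updates % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed_updates >>= 1;
		j++;
	}
	return load;
}

/* reference: one multiply/divide per missed tick */
static unsigned long
decay_load_exact(unsigned long load, unsigned long missed_updates, int idx)
{
	unsigned long scale = 1UL << idx;

	while (missed_updates--)
		load = load * (scale - 1) / scale;
	return load;
}

int main(void)
{
	unsigned long load = 2048;
	int idx = 2;

	for (unsigned long missed = 0; missed <= 16; missed++)
		printf("idx=%d missed=%2lu exact=%4lu approx=%4lu\n", idx, missed,
		       decay_load_exact(load, missed, idx),
		       decay_load_missed(load, missed, idx));
	return 0;
}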
@@ -3094,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3094 3378
3095 if (task_current(rq, p)) { 3379 if (task_current(rq, p)) {
3096 update_rq_clock(rq); 3380 update_rq_clock(rq);
3097 ns = rq->clock - p->se.exec_start; 3381 ns = rq->clock_task - p->se.exec_start;
3098 if ((s64)ns < 0) 3382 if ((s64)ns < 0)
3099 ns = 0; 3383 ns = 0;
3100 } 3384 }
@@ -3243,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3243 tmp = cputime_to_cputime64(cputime); 3527 tmp = cputime_to_cputime64(cputime);
3244 if (hardirq_count() - hardirq_offset) 3528 if (hardirq_count() - hardirq_offset)
3245 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3529 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3246 else if (softirq_count()) 3530 else if (in_serving_softirq())
3247 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3531 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3248 else 3532 else
3249 cpustat->system = cputime64_add(cpustat->system, tmp); 3533 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3359,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3359 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3643 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3360 3644
3361 if (total) { 3645 if (total) {
3362 u64 temp; 3646 u64 temp = rtime;
3363 3647
3364 temp = (u64)(rtime * utime); 3648 temp *= utime;
3365 do_div(temp, total); 3649 do_div(temp, total);
3366 utime = (cputime_t)temp; 3650 utime = (cputime_t)temp;
3367 } else 3651 } else
@@ -3392,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3392 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3676 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3393 3677
3394 if (total) { 3678 if (total) {
3395 u64 temp; 3679 u64 temp = rtime;
3396 3680
3397 temp = (u64)(rtime * cputime.utime); 3681 temp *= cputime.utime;
3398 do_div(temp, total); 3682 do_div(temp, total);
3399 utime = (cputime_t)temp; 3683 utime = (cputime_t)temp;
3400 } else 3684 } else
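The rewrite above (temp = rtime; temp *= utime;) is subtle: with a 32-bit cputime_t the old expression multiplied two 32-bit values in 32-bit arithmetic and only then widened the already-truncated product. A minimal standalone demonstration, with arbitrary sample values chosen so the product exceeds 32 bits:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t rtime = 100000, utime = 90000;		/* product > 2^32 */

	uint64_t old_way = (uint64_t)(rtime * utime);	/* 32-bit multiply, wraps */
	uint64_t new_way = rtime;
	new_way *= utime;				/* widened before multiplying */

	printf("old=%llu new=%llu\n",
	       (unsigned long long)old_way,
	       (unsigned long long)new_way);
	return 0;
}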
@@ -3426,11 +3710,11 @@ void scheduler_tick(void)
3426 3710
3427 raw_spin_lock(&rq->lock); 3711 raw_spin_lock(&rq->lock);
3428 update_rq_clock(rq); 3712 update_rq_clock(rq);
3429 update_cpu_load(rq); 3713 update_cpu_load_active(rq);
3430 curr->sched_class->task_tick(rq, curr, 0); 3714 curr->sched_class->task_tick(rq, curr, 0);
3431 raw_spin_unlock(&rq->lock); 3715 raw_spin_unlock(&rq->lock);
3432 3716
3433 perf_event_task_tick(curr); 3717 perf_event_task_tick();
3434 3718
3435#ifdef CONFIG_SMP 3719#ifdef CONFIG_SMP
3436 rq->idle_at_tick = idle_cpu(cpu); 3720 rq->idle_at_tick = idle_cpu(cpu);
@@ -3569,17 +3853,13 @@ pick_next_task(struct rq *rq)
3569 return p; 3853 return p;
3570 } 3854 }
3571 3855
3572 class = sched_class_highest; 3856 for_each_class(class) {
3573 for ( ; ; ) {
3574 p = class->pick_next_task(rq); 3857 p = class->pick_next_task(rq);
3575 if (p) 3858 if (p)
3576 return p; 3859 return p;
3577 /*
3578 * Will never be NULL as the idle class always
3579 * returns a non-NULL p:
3580 */
3581 class = class->next;
3582 } 3860 }
3861
3862 BUG(); /* the idle class will always have a runnable task */
3583} 3863}
3584 3864
3585/* 3865/*
@@ -3598,7 +3878,6 @@ need_resched:
3598 rq = cpu_rq(cpu); 3878 rq = cpu_rq(cpu);
3599 rcu_note_context_switch(cpu); 3879 rcu_note_context_switch(cpu);
3600 prev = rq->curr; 3880 prev = rq->curr;
3601 switch_count = &prev->nivcsw;
3602 3881
3603 release_kernel_lock(prev); 3882 release_kernel_lock(prev);
3604need_resched_nonpreemptible: 3883need_resched_nonpreemptible:
@@ -3611,11 +3890,26 @@ need_resched_nonpreemptible:
3611 raw_spin_lock_irq(&rq->lock); 3890 raw_spin_lock_irq(&rq->lock);
3612 clear_tsk_need_resched(prev); 3891 clear_tsk_need_resched(prev);
3613 3892
3893 switch_count = &prev->nivcsw;
3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3894 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3615 if (unlikely(signal_pending_state(prev->state, prev))) 3895 if (unlikely(signal_pending_state(prev->state, prev))) {
3616 prev->state = TASK_RUNNING; 3896 prev->state = TASK_RUNNING;
3617 else 3897 } else {
3898 /*
3899 * If a worker is going to sleep, notify and
3900 * ask workqueue whether it wants to wake up a
3901 * task to maintain concurrency. If so, wake
3902 * up the task.
3903 */
3904 if (prev->flags & PF_WQ_WORKER) {
3905 struct task_struct *to_wakeup;
3906
3907 to_wakeup = wq_worker_sleeping(prev, cpu);
3908 if (to_wakeup)
3909 try_to_wake_up_local(to_wakeup);
3910 }
3618 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3911 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3912 }
3619 switch_count = &prev->nvcsw; 3913 switch_count = &prev->nvcsw;
3620 } 3914 }
3621 3915
@@ -3637,8 +3931,10 @@ need_resched_nonpreemptible:
3637 3931
3638 context_switch(rq, prev, next); /* unlocks the rq */ 3932 context_switch(rq, prev, next); /* unlocks the rq */
3639 /* 3933 /*
3640 * the context switch might have flipped the stack from under 3934 * The context switch have flipped the stack from under us
3641 * us, hence refresh the local variables. 3935 * and restored the local variables which were saved when
3936 * this task called schedule() in the past. prev == current
3937 * is still correct, but it can be moved to another cpu/rq.
3642 */ 3938 */
3643 cpu = smp_processor_id(); 3939 cpu = smp_processor_id();
3644 rq = cpu_rq(cpu); 3940 rq = cpu_rq(cpu);
@@ -3647,11 +3943,8 @@ need_resched_nonpreemptible:
3647 3943
3648 post_schedule(rq); 3944 post_schedule(rq);
3649 3945
3650 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3946 if (unlikely(reacquire_kernel_lock(prev)))
3651 prev = rq->curr;
3652 switch_count = &prev->nivcsw;
3653 goto need_resched_nonpreemptible; 3947 goto need_resched_nonpreemptible;
3654 }
3655 3948
3656 preempt_enable_no_resched(); 3949 preempt_enable_no_resched();
3657 if (need_resched()) 3950 if (need_resched())
@@ -3704,8 +3997,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3704 /* 3997 /*
3705 * Owner changed, break to re-assess state. 3998 * Owner changed, break to re-assess state.
3706 */ 3999 */
3707 if (lock->owner != owner) 4000 if (lock->owner != owner) {
4001 /*
4002 * If the lock has switched to a different owner,
4003 * we likely have heavy contention. Return 0 to quit
4004 * optimistic spinning and not contend further:
4005 */
4006 if (lock->owner)
4007 return 0;
3708 break; 4008 break;
4009 }
3709 4010
3710 /* 4011 /*
3711 * Is that owner really running on that cpu? 4012 * Is that owner really running on that cpu?
@@ -3726,7 +4027,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3726 * off of preempt_enable. Kernel preemptions off return from interrupt 4027 * off of preempt_enable. Kernel preemptions off return from interrupt
3727 * occur there and call schedule directly. 4028 * occur there and call schedule directly.
3728 */ 4029 */
3729asmlinkage void __sched preempt_schedule(void) 4030asmlinkage void __sched notrace preempt_schedule(void)
3730{ 4031{
3731 struct thread_info *ti = current_thread_info(); 4032 struct thread_info *ti = current_thread_info();
3732 4033
@@ -3738,9 +4039,9 @@ asmlinkage void __sched preempt_schedule(void)
3738 return; 4039 return;
3739 4040
3740 do { 4041 do {
3741 add_preempt_count(PREEMPT_ACTIVE); 4042 add_preempt_count_notrace(PREEMPT_ACTIVE);
3742 schedule(); 4043 schedule();
3743 sub_preempt_count(PREEMPT_ACTIVE); 4044 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3744 4045
3745 /* 4046 /*
3746 * Check again in case we missed a preemption opportunity 4047 * Check again in case we missed a preemption opportunity
@@ -4183,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4183 4484
4184 rq = task_rq_lock(p, &flags); 4485 rq = task_rq_lock(p, &flags);
4185 4486
4487 trace_sched_pi_setprio(p, prio);
4186 oldprio = p->prio; 4488 oldprio = p->prio;
4187 prev_class = p->sched_class; 4489 prev_class = p->sched_class;
4188 on_rq = p->se.on_rq; 4490 on_rq = p->se.on_rq;
@@ -4441,12 +4743,8 @@ recheck:
4441 */ 4743 */
4442 if (user && !capable(CAP_SYS_NICE)) { 4744 if (user && !capable(CAP_SYS_NICE)) {
4443 if (rt_policy(policy)) { 4745 if (rt_policy(policy)) {
4444 unsigned long rlim_rtprio; 4746 unsigned long rlim_rtprio =
4445 4747 task_rlimit(p, RLIMIT_RTPRIO);
4446 if (!lock_task_sighand(p, &flags))
4447 return -ESRCH;
4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4449 unlock_task_sighand(p, &flags);
4450 4748
4451 /* can't set/change the rt policy */ 4749 /* can't set/change the rt policy */
4452 if (policy != p->policy && !rlim_rtprio) 4750 if (policy != p->policy && !rlim_rtprio)
@@ -4474,7 +4772,7 @@ recheck:
4474 } 4772 }
4475 4773
4476 if (user) { 4774 if (user) {
4477 retval = security_task_setscheduler(p, policy, param); 4775 retval = security_task_setscheduler(p);
4478 if (retval) 4776 if (retval)
4479 return retval; 4777 return retval;
4480 } 4778 }
@@ -4490,6 +4788,15 @@ recheck:
4490 */ 4788 */
4491 rq = __task_rq_lock(p); 4789 rq = __task_rq_lock(p);
4492 4790
4791 /*
4792 * Changing the policy of the stop threads its a very bad idea
4793 */
4794 if (p == rq->stop) {
4795 __task_rq_unlock(rq);
4796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4797 return -EINVAL;
4798 }
4799
4493#ifdef CONFIG_RT_GROUP_SCHED 4800#ifdef CONFIG_RT_GROUP_SCHED
4494 if (user) { 4801 if (user) {
4495 /* 4802 /*
@@ -4716,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4716 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5023 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4717 goto out_unlock; 5024 goto out_unlock;
4718 5025
4719 retval = security_task_setscheduler(p, 0, NULL); 5026 retval = security_task_setscheduler(p);
4720 if (retval) 5027 if (retval)
4721 goto out_unlock; 5028 goto out_unlock;
4722 5029
4723 cpuset_cpus_allowed(p, cpus_allowed); 5030 cpuset_cpus_allowed(p, cpus_allowed);
4724 cpumask_and(new_mask, in_mask, cpus_allowed); 5031 cpumask_and(new_mask, in_mask, cpus_allowed);
4725 again: 5032again:
4726 retval = set_cpus_allowed_ptr(p, new_mask); 5033 retval = set_cpus_allowed_ptr(p, new_mask);
4727 5034
4728 if (!retval) { 5035 if (!retval) {
@@ -5166,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5166 idle->se.exec_start = sched_clock(); 5473 idle->se.exec_start = sched_clock();
5167 5474
5168 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5475 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5476 /*
5477 * We're having a chicken and egg problem, even though we are
5478 * holding rq->lock, the cpu isn't yet set to this cpu so the
5479 * lockdep check in task_group() will fail.
5480 *
5481 * Similar case to sched_fork(). / Alternatively we could
5482 * use task_rq_lock() here and obtain the other rq->lock.
5483 *
5484 * Silence PROVE_RCU
5485 */
5486 rcu_read_lock();
5169 __set_task_cpu(idle, cpu); 5487 __set_task_cpu(idle, cpu);
5488 rcu_read_unlock();
5170 5489
5171 rq->curr = rq->idle = idle; 5490 rq->curr = rq->idle = idle;
5172#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5491#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -5816,20 +6135,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5816 */ 6135 */
5817static struct notifier_block __cpuinitdata migration_notifier = { 6136static struct notifier_block __cpuinitdata migration_notifier = {
5818 .notifier_call = migration_call, 6137 .notifier_call = migration_call,
5819 .priority = 10 6138 .priority = CPU_PRI_MIGRATION,
5820}; 6139};
5821 6140
6141static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6142 unsigned long action, void *hcpu)
6143{
6144 switch (action & ~CPU_TASKS_FROZEN) {
6145 case CPU_ONLINE:
6146 case CPU_DOWN_FAILED:
6147 set_cpu_active((long)hcpu, true);
6148 return NOTIFY_OK;
6149 default:
6150 return NOTIFY_DONE;
6151 }
6152}
6153
6154static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6155 unsigned long action, void *hcpu)
6156{
6157 switch (action & ~CPU_TASKS_FROZEN) {
6158 case CPU_DOWN_PREPARE:
6159 set_cpu_active((long)hcpu, false);
6160 return NOTIFY_OK;
6161 default:
6162 return NOTIFY_DONE;
6163 }
6164}
6165
5822static int __init migration_init(void) 6166static int __init migration_init(void)
5823{ 6167{
5824 void *cpu = (void *)(long)smp_processor_id(); 6168 void *cpu = (void *)(long)smp_processor_id();
5825 int err; 6169 int err;
5826 6170
5827 /* Start one for the boot CPU: */ 6171 /* Initialize migration for the boot CPU */
5828 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6172 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5829 BUG_ON(err == NOTIFY_BAD); 6173 BUG_ON(err == NOTIFY_BAD);
5830 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6174 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5831 register_cpu_notifier(&migration_notifier); 6175 register_cpu_notifier(&migration_notifier);
5832 6176
6177 /* Register cpu active notifiers */
6178 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6179 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6180
5833 return 0; 6181 return 0;
5834} 6182}
5835early_initcall(migration_init); 6183early_initcall(migration_init);
@@ -6064,23 +6412,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6064 free_rootdomain(old_rd); 6412 free_rootdomain(old_rd);
6065} 6413}
6066 6414
6067static int init_rootdomain(struct root_domain *rd, bool bootmem) 6415static int init_rootdomain(struct root_domain *rd)
6068{ 6416{
6069 gfp_t gfp = GFP_KERNEL;
6070
6071 memset(rd, 0, sizeof(*rd)); 6417 memset(rd, 0, sizeof(*rd));
6072 6418
6073 if (bootmem) 6419 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6074 gfp = GFP_NOWAIT;
6075
6076 if (!alloc_cpumask_var(&rd->span, gfp))
6077 goto out; 6420 goto out;
6078 if (!alloc_cpumask_var(&rd->online, gfp)) 6421 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6079 goto free_span; 6422 goto free_span;
6080 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6423 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6081 goto free_online; 6424 goto free_online;
6082 6425
6083 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6426 if (cpupri_init(&rd->cpupri) != 0)
6084 goto free_rto_mask; 6427 goto free_rto_mask;
6085 return 0; 6428 return 0;
6086 6429
@@ -6096,7 +6439,7 @@ out:
6096 6439
6097static void init_defrootdomain(void) 6440static void init_defrootdomain(void)
6098{ 6441{
6099 init_rootdomain(&def_root_domain, true); 6442 init_rootdomain(&def_root_domain);
6100 6443
6101 atomic_set(&def_root_domain.refcount, 1); 6444 atomic_set(&def_root_domain.refcount, 1);
6102} 6445}
@@ -6109,7 +6452,7 @@ static struct root_domain *alloc_rootdomain(void)
6109 if (!rd) 6452 if (!rd)
6110 return NULL; 6453 return NULL;
6111 6454
6112 if (init_rootdomain(rd, false) != 0) { 6455 if (init_rootdomain(rd) != 0) {
6113 kfree(rd); 6456 kfree(rd);
6114 return NULL; 6457 return NULL;
6115 } 6458 }
@@ -6319,6 +6662,7 @@ struct s_data {
6319 cpumask_var_t nodemask; 6662 cpumask_var_t nodemask;
6320 cpumask_var_t this_sibling_map; 6663 cpumask_var_t this_sibling_map;
6321 cpumask_var_t this_core_map; 6664 cpumask_var_t this_core_map;
6665 cpumask_var_t this_book_map;
6322 cpumask_var_t send_covered; 6666 cpumask_var_t send_covered;
6323 cpumask_var_t tmpmask; 6667 cpumask_var_t tmpmask;
6324 struct sched_group **sched_group_nodes; 6668 struct sched_group **sched_group_nodes;
@@ -6330,6 +6674,7 @@ enum s_alloc {
6330 sa_rootdomain, 6674 sa_rootdomain,
6331 sa_tmpmask, 6675 sa_tmpmask,
6332 sa_send_covered, 6676 sa_send_covered,
6677 sa_this_book_map,
6333 sa_this_core_map, 6678 sa_this_core_map,
6334 sa_this_sibling_map, 6679 sa_this_sibling_map,
6335 sa_nodemask, 6680 sa_nodemask,
@@ -6365,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6365#ifdef CONFIG_SCHED_MC 6710#ifdef CONFIG_SCHED_MC
6366static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6711static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6367static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6712static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6368#endif /* CONFIG_SCHED_MC */
6369 6713
6370#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6371static int 6714static int
6372cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6715cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6373 struct sched_group **sg, struct cpumask *mask) 6716 struct sched_group **sg, struct cpumask *mask)
6374{ 6717{
6375 int group; 6718 int group;
6376 6719#ifdef CONFIG_SCHED_SMT
6377 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6720 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6378 group = cpumask_first(mask); 6721 group = cpumask_first(mask);
6722#else
6723 group = cpu;
6724#endif
6379 if (sg) 6725 if (sg)
6380 *sg = &per_cpu(sched_group_core, group).sg; 6726 *sg = &per_cpu(sched_group_core, group).sg;
6381 return group; 6727 return group;
6382} 6728}
6383#elif defined(CONFIG_SCHED_MC) 6729#endif /* CONFIG_SCHED_MC */
6730
6731/*
6732 * book sched-domains:
6733 */
6734#ifdef CONFIG_SCHED_BOOK
6735static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6736static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6737
6384static int 6738static int
6385cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6739cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6386 struct sched_group **sg, struct cpumask *unused) 6740 struct sched_group **sg, struct cpumask *mask)
6387{ 6741{
6742 int group = cpu;
6743#ifdef CONFIG_SCHED_MC
6744 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6745 group = cpumask_first(mask);
6746#elif defined(CONFIG_SCHED_SMT)
6747 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6748 group = cpumask_first(mask);
6749#endif
6388 if (sg) 6750 if (sg)
6389 *sg = &per_cpu(sched_group_core, cpu).sg; 6751 *sg = &per_cpu(sched_group_book, group).sg;
6390 return cpu; 6752 return group;
6391} 6753}
6392#endif 6754#endif /* CONFIG_SCHED_BOOK */
6393 6755
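cpu_to_book_group() above follows the same pattern as the other cpu_to_*_group() helpers: intersect a child-level topology mask with the cpu_map being built, and let the first CPU of the result name the group. Below is a toy, bitmap-based rendering of that pattern; the eight-CPU layout, mask values and helper names are invented for the illustration and this is not the kernel's cpumask API.

#include <stdio.h>

#define NR_CPUS 8

/* per-CPU topology mask: which CPUs share the relevant level with 'cpu' */
static const unsigned long level_mask[NR_CPUS] = {
	0x0f, 0x0f, 0x0f, 0x0f,		/* cpus 0-3 share one unit */
	0xf0, 0xf0, 0xf0, 0xf0,		/* cpus 4-7 share another  */
};

static int first_cpu(unsigned long mask)
{
	return mask ? __builtin_ctzl(mask) : -1;
}

/* the group is named after the first CPU of (topology mask & cpu_map) */
static int cpu_to_group(int cpu, unsigned long cpu_map)
{
	return first_cpu(level_mask[cpu] & cpu_map);
}

int main(void)
{
	unsigned long cpu_map = 0xff;	/* all CPUs participate in the build */

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> group led by cpu%d\n",
		       cpu, cpu_to_group(cpu, cpu_map));
	return 0;
}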
6394static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6756static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6395static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6757static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6399,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6399 struct sched_group **sg, struct cpumask *mask) 6761 struct sched_group **sg, struct cpumask *mask)
6400{ 6762{
6401 int group; 6763 int group;
6402#ifdef CONFIG_SCHED_MC 6764#ifdef CONFIG_SCHED_BOOK
6765 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6766 group = cpumask_first(mask);
6767#elif defined(CONFIG_SCHED_MC)
6403 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6768 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6404 group = cpumask_first(mask); 6769 group = cpumask_first(mask);
6405#elif defined(CONFIG_SCHED_SMT) 6770#elif defined(CONFIG_SCHED_SMT)
@@ -6660,6 +7025,9 @@ SD_INIT_FUNC(CPU)
6660#ifdef CONFIG_SCHED_MC 7025#ifdef CONFIG_SCHED_MC
6661 SD_INIT_FUNC(MC) 7026 SD_INIT_FUNC(MC)
6662#endif 7027#endif
7028#ifdef CONFIG_SCHED_BOOK
7029 SD_INIT_FUNC(BOOK)
7030#endif
6663 7031
6664static int default_relax_domain_level = -1; 7032static int default_relax_domain_level = -1;
6665 7033
@@ -6709,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6709 free_cpumask_var(d->tmpmask); /* fall through */ 7077 free_cpumask_var(d->tmpmask); /* fall through */
6710 case sa_send_covered: 7078 case sa_send_covered:
6711 free_cpumask_var(d->send_covered); /* fall through */ 7079 free_cpumask_var(d->send_covered); /* fall through */
7080 case sa_this_book_map:
7081 free_cpumask_var(d->this_book_map); /* fall through */
6712 case sa_this_core_map: 7082 case sa_this_core_map:
6713 free_cpumask_var(d->this_core_map); /* fall through */ 7083 free_cpumask_var(d->this_core_map); /* fall through */
6714 case sa_this_sibling_map: 7084 case sa_this_sibling_map:
@@ -6755,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6755 return sa_nodemask; 7125 return sa_nodemask;
6756 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7126 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6757 return sa_this_sibling_map; 7127 return sa_this_sibling_map;
6758 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7128 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6759 return sa_this_core_map; 7129 return sa_this_core_map;
7130 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7131 return sa_this_book_map;
6760 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7132 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6761 return sa_send_covered; 7133 return sa_send_covered;
6762 d->rd = alloc_rootdomain(); 7134 d->rd = alloc_rootdomain();
@@ -6814,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
6814 return sd; 7186 return sd;
6815} 7187}
6816 7188
7189static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7190 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7191 struct sched_domain *parent, int i)
7192{
7193 struct sched_domain *sd = parent;
7194#ifdef CONFIG_SCHED_BOOK
7195 sd = &per_cpu(book_domains, i).sd;
7196 SD_INIT(sd, BOOK);
7197 set_domain_attribute(sd, attr);
7198 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7199 sd->parent = parent;
7200 parent->child = sd;
7201 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7202#endif
7203 return sd;
7204}
7205
6817static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7206static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
6818 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7207 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6819 struct sched_domain *parent, int i) 7208 struct sched_domain *parent, int i)
@@ -6871,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
6871 d->send_covered, d->tmpmask); 7260 d->send_covered, d->tmpmask);
6872 break; 7261 break;
6873#endif 7262#endif
7263#ifdef CONFIG_SCHED_BOOK
7264 case SD_LV_BOOK: /* set up book groups */
7265 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7266 if (cpu == cpumask_first(d->this_book_map))
7267 init_sched_build_groups(d->this_book_map, cpu_map,
7268 &cpu_to_book_group,
7269 d->send_covered, d->tmpmask);
7270 break;
7271#endif
6874 case SD_LV_CPU: /* set up physical groups */ 7272 case SD_LV_CPU: /* set up physical groups */
6875 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7273 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
6876 if (!cpumask_empty(d->nodemask)) 7274 if (!cpumask_empty(d->nodemask))
@@ -6918,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
6918 7316
6919 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7317 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
6920 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7318 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7319 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
6921 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7320 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
6922 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7321 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
6923 } 7322 }
6924 7323
6925 for_each_cpu(i, cpu_map) { 7324 for_each_cpu(i, cpu_map) {
6926 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7325 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7326 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
6927 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7327 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
6928 } 7328 }
6929 7329
@@ -6954,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
6954 init_sched_groups_power(i, sd); 7354 init_sched_groups_power(i, sd);
6955 } 7355 }
6956#endif 7356#endif
7357#ifdef CONFIG_SCHED_BOOK
7358 for_each_cpu(i, cpu_map) {
7359 sd = &per_cpu(book_domains, i).sd;
7360 init_sched_groups_power(i, sd);
7361 }
7362#endif
6957 7363
6958 for_each_cpu(i, cpu_map) { 7364 for_each_cpu(i, cpu_map) {
6959 sd = &per_cpu(phys_domains, i).sd; 7365 sd = &per_cpu(phys_domains, i).sd;
@@ -6979,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
6979 sd = &per_cpu(cpu_domains, i).sd; 7385 sd = &per_cpu(cpu_domains, i).sd;
6980#elif defined(CONFIG_SCHED_MC) 7386#elif defined(CONFIG_SCHED_MC)
6981 sd = &per_cpu(core_domains, i).sd; 7387 sd = &per_cpu(core_domains, i).sd;
7388#elif defined(CONFIG_SCHED_BOOK)
7389 sd = &per_cpu(book_domains, i).sd;
6982#else 7390#else
6983 sd = &per_cpu(phys_domains, i).sd; 7391 sd = &per_cpu(phys_domains, i).sd;
6984#endif 7392#endif
@@ -7288,29 +7696,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7288} 7696}
7289#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7697#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7290 7698
7291#ifndef CONFIG_CPUSETS
7292/* 7699/*
7293 * Add online and remove offline CPUs from the scheduler domains. 7700 * Update cpusets according to cpu_active mask. If cpusets are
7294 * When cpusets are enabled they take over this function. 7701 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7702 * around partition_sched_domains().
7295 */ 7703 */
7296static int update_sched_domains(struct notifier_block *nfb, 7704static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7297 unsigned long action, void *hcpu) 7705 void *hcpu)
7298{ 7706{
7299 switch (action) { 7707 switch (action & ~CPU_TASKS_FROZEN) {
7300 case CPU_ONLINE: 7708 case CPU_ONLINE:
7301 case CPU_ONLINE_FROZEN:
7302 case CPU_DOWN_PREPARE:
7303 case CPU_DOWN_PREPARE_FROZEN:
7304 case CPU_DOWN_FAILED: 7709 case CPU_DOWN_FAILED:
7305 case CPU_DOWN_FAILED_FROZEN: 7710 cpuset_update_active_cpus();
7306 partition_sched_domains(1, NULL, NULL);
7307 return NOTIFY_OK; 7711 return NOTIFY_OK;
7712 default:
7713 return NOTIFY_DONE;
7714 }
7715}
7308 7716
7717static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7718 void *hcpu)
7719{
7720 switch (action & ~CPU_TASKS_FROZEN) {
7721 case CPU_DOWN_PREPARE:
7722 cpuset_update_active_cpus();
7723 return NOTIFY_OK;
7309 default: 7724 default:
7310 return NOTIFY_DONE; 7725 return NOTIFY_DONE;
7311 } 7726 }
7312} 7727}
7313#endif
7314 7728
7315static int update_runtime(struct notifier_block *nfb, 7729static int update_runtime(struct notifier_block *nfb,
7316 unsigned long action, void *hcpu) 7730 unsigned long action, void *hcpu)
@@ -7356,10 +7770,8 @@ void __init sched_init_smp(void)
7356 mutex_unlock(&sched_domains_mutex); 7770 mutex_unlock(&sched_domains_mutex);
7357 put_online_cpus(); 7771 put_online_cpus();
7358 7772
7359#ifndef CONFIG_CPUSETS 7773 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7360 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7774 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7361 hotcpu_notifier(update_sched_domains, 0);
7362#endif
7363 7775
7364 /* RT runtime code needs to handle some hotplug events */ 7776 /* RT runtime code needs to handle some hotplug events */
7365 hotcpu_notifier(update_runtime, 0); 7777 hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +8016,9 @@ void __init sched_init(void)
7604 8016
7605 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8017 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7606 rq->cpu_load[j] = 0; 8018 rq->cpu_load[j] = 0;
8019
8020 rq->last_load_update_tick = jiffies;
8021
7607#ifdef CONFIG_SMP 8022#ifdef CONFIG_SMP
7608 rq->sd = NULL; 8023 rq->sd = NULL;
7609 rq->rd = NULL; 8024 rq->rd = NULL;
@@ -7617,6 +8032,10 @@ void __init sched_init(void)
7617 rq->idle_stamp = 0; 8032 rq->idle_stamp = 0;
7618 rq->avg_idle = 2*sysctl_sched_migration_cost; 8033 rq->avg_idle = 2*sysctl_sched_migration_cost;
7619 rq_attach_root(rq, &def_root_domain); 8034 rq_attach_root(rq, &def_root_domain);
8035#ifdef CONFIG_NO_HZ
8036 rq->nohz_balance_kick = 0;
8037 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8038#endif
7620#endif 8039#endif
7621 init_rq_hrtick(rq); 8040 init_rq_hrtick(rq);
7622 atomic_set(&rq->nr_iowait, 0); 8041 atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +8080,11 @@ void __init sched_init(void)
7661 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8080 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7662#ifdef CONFIG_SMP 8081#ifdef CONFIG_SMP
7663#ifdef CONFIG_NO_HZ 8082#ifdef CONFIG_NO_HZ
7664 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 8083 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7665 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 8084 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8085 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8086 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8087 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7666#endif 8088#endif
7667 /* May be allocated at isolcpus cmdline parse time */ 8089 /* May be allocated at isolcpus cmdline parse time */
7668 if (cpu_isolated_map == NULL) 8090 if (cpu_isolated_map == NULL)
@@ -7869,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7869 8291
7870 return 1; 8292 return 1;
7871 8293
7872 err_free_rq: 8294err_free_rq:
7873 kfree(cfs_rq); 8295 kfree(cfs_rq);
7874 err: 8296err:
7875 return 0; 8297 return 0;
7876} 8298}
7877 8299
@@ -7959,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
7959 8381
7960 return 1; 8382 return 1;
7961 8383
7962 err_free_rq: 8384err_free_rq:
7963 kfree(rt_rq); 8385 kfree(rt_rq);
7964 err: 8386err:
7965 return 0; 8387 return 0;
7966} 8388}
7967 8389
@@ -8319,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8319 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8741 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8320 } 8742 }
8321 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8743 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8322 unlock: 8744unlock:
8323 read_unlock(&tasklist_lock); 8745 read_unlock(&tasklist_lock);
8324 mutex_unlock(&rt_constraints_mutex); 8746 mutex_unlock(&rt_constraints_mutex);
8325 8747