Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  730
1 file changed, 576 insertions(+), 154 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -77,6 +77,7 @@ | |||
| 77 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
| 78 | 78 | ||
| 79 | #include "sched_cpupri.h" | 79 | #include "sched_cpupri.h" |
| 80 | #include "workqueue_sched.h" | ||
| 80 | 81 | ||
| 81 | #define CREATE_TRACE_POINTS | 82 | #define CREATE_TRACE_POINTS |
| 82 | #include <trace/events/sched.h> | 83 | #include <trace/events/sched.h> |
| @@ -425,9 +426,7 @@ struct root_domain { | |||
| 425 | */ | 426 | */ |
| 426 | cpumask_var_t rto_mask; | 427 | cpumask_var_t rto_mask; |
| 427 | atomic_t rto_count; | 428 | atomic_t rto_count; |
| 428 | #ifdef CONFIG_SMP | ||
| 429 | struct cpupri cpupri; | 429 | struct cpupri cpupri; |
| 430 | #endif | ||
| 431 | }; | 430 | }; |
| 432 | 431 | ||
| 433 | /* | 432 | /* |
| @@ -436,7 +435,7 @@ struct root_domain { | |||
| 436 | */ | 435 | */ |
| 437 | static struct root_domain def_root_domain; | 436 | static struct root_domain def_root_domain; |
| 438 | 437 | ||
| 439 | #endif | 438 | #endif /* CONFIG_SMP */ |
| 440 | 439 | ||
| 441 | /* | 440 | /* |
| 442 | * This is the main, per-CPU runqueue data structure. | 441 | * This is the main, per-CPU runqueue data structure. |
| @@ -456,9 +455,10 @@ struct rq { | |||
| 456 | unsigned long nr_running; | 455 | unsigned long nr_running; |
| 457 | #define CPU_LOAD_IDX_MAX 5 | 456 | #define CPU_LOAD_IDX_MAX 5 |
| 458 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 457 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
| 458 | unsigned long last_load_update_tick; | ||
| 459 | #ifdef CONFIG_NO_HZ | 459 | #ifdef CONFIG_NO_HZ |
| 460 | u64 nohz_stamp; | 460 | u64 nohz_stamp; |
| 461 | unsigned char in_nohz_recently; | 461 | unsigned char nohz_balance_kick; |
| 462 | #endif | 462 | #endif |
| 463 | unsigned int skip_clock_update; | 463 | unsigned int skip_clock_update; |
| 464 | 464 | ||
| @@ -486,11 +486,12 @@ struct rq { | |||
| 486 | */ | 486 | */ |
| 487 | unsigned long nr_uninterruptible; | 487 | unsigned long nr_uninterruptible; |
| 488 | 488 | ||
| 489 | struct task_struct *curr, *idle; | 489 | struct task_struct *curr, *idle, *stop; |
| 490 | unsigned long next_balance; | 490 | unsigned long next_balance; |
| 491 | struct mm_struct *prev_mm; | 491 | struct mm_struct *prev_mm; |
| 492 | 492 | ||
| 493 | u64 clock; | 493 | u64 clock; |
| 494 | u64 clock_task; | ||
| 494 | 495 | ||
| 495 | atomic_t nr_iowait; | 496 | atomic_t nr_iowait; |
| 496 | 497 | ||
| @@ -518,6 +519,10 @@ struct rq { | |||
| 518 | u64 avg_idle; | 519 | u64 avg_idle; |
| 519 | #endif | 520 | #endif |
| 520 | 521 | ||
| 522 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 523 | u64 prev_irq_time; | ||
| 524 | #endif | ||
| 525 | |||
| 521 | /* calc_load related fields */ | 526 | /* calc_load related fields */ |
| 522 | unsigned long calc_load_update; | 527 | unsigned long calc_load_update; |
| 523 | long calc_load_active; | 528 | long calc_load_active; |
| @@ -641,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 641 | 646 | ||
| 642 | #endif /* CONFIG_CGROUP_SCHED */ | 647 | #endif /* CONFIG_CGROUP_SCHED */ |
| 643 | 648 | ||
| 649 | static u64 irq_time_cpu(int cpu); | ||
| 650 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
| 651 | |||
| 644 | inline void update_rq_clock(struct rq *rq) | 652 | inline void update_rq_clock(struct rq *rq) |
| 645 | { | 653 | { |
| 646 | if (!rq->skip_clock_update) | 654 | if (!rq->skip_clock_update) { |
| 647 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 655 | int cpu = cpu_of(rq); |
| 656 | u64 irq_time; | ||
| 657 | |||
| 658 | rq->clock = sched_clock_cpu(cpu); | ||
| 659 | irq_time = irq_time_cpu(cpu); | ||
| 660 | if (rq->clock - irq_time > rq->clock_task) | ||
| 661 | rq->clock_task = rq->clock - irq_time; | ||
| 662 | |||
| 663 | sched_irq_time_avg_update(rq, irq_time); | ||
| 664 | } | ||
| 648 | } | 665 | } |
| 649 | 666 | ||
| 650 | /* | 667 | /* |
| @@ -721,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 721 | size_t cnt, loff_t *ppos) | 738 | size_t cnt, loff_t *ppos) |
| 722 | { | 739 | { |
| 723 | char buf[64]; | 740 | char buf[64]; |
| 724 | char *cmp = buf; | 741 | char *cmp; |
| 725 | int neg = 0; | 742 | int neg = 0; |
| 726 | int i; | 743 | int i; |
| 727 | 744 | ||
| @@ -732,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 732 | return -EFAULT; | 749 | return -EFAULT; |
| 733 | 750 | ||
| 734 | buf[cnt] = 0; | 751 | buf[cnt] = 0; |
| 752 | cmp = strstrip(buf); | ||
| 735 | 753 | ||
| 736 | if (strncmp(buf, "NO_", 3) == 0) { | 754 | if (strncmp(buf, "NO_", 3) == 0) { |
| 737 | neg = 1; | 755 | neg = 1; |
| @@ -739,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 739 | } | 757 | } |
| 740 | 758 | ||
| 741 | for (i = 0; sched_feat_names[i]; i++) { | 759 | for (i = 0; sched_feat_names[i]; i++) { |
| 742 | int len = strlen(sched_feat_names[i]); | 760 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
| 743 | |||
| 744 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
| 745 | if (neg) | 761 | if (neg) |
| 746 | sysctl_sched_features &= ~(1UL << i); | 762 | sysctl_sched_features &= ~(1UL << i); |
| 747 | else | 763 | else |
| @@ -1193,6 +1209,27 @@ static void resched_cpu(int cpu) | |||
| 1193 | 1209 | ||
| 1194 | #ifdef CONFIG_NO_HZ | 1210 | #ifdef CONFIG_NO_HZ |
| 1195 | /* | 1211 | /* |
| 1212 | * In the semi idle case, use the nearest busy cpu for migrating timers | ||
| 1213 | * from an idle cpu. This is good for power-savings. | ||
| 1214 | * | ||
| 1215 | * We don't do similar optimization for completely idle system, as | ||
| 1216 | * selecting an idle cpu will add more delays to the timers than intended | ||
| 1217 | * (as that cpu's timer base may not be uptodate wrt jiffies etc). | ||
| 1218 | */ | ||
| 1219 | int get_nohz_timer_target(void) | ||
| 1220 | { | ||
| 1221 | int cpu = smp_processor_id(); | ||
| 1222 | int i; | ||
| 1223 | struct sched_domain *sd; | ||
| 1224 | |||
| 1225 | for_each_domain(cpu, sd) { | ||
| 1226 | for_each_cpu(i, sched_domain_span(sd)) | ||
| 1227 | if (!idle_cpu(i)) | ||
| 1228 | return i; | ||
| 1229 | } | ||
| 1230 | return cpu; | ||
| 1231 | } | ||
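For context, the expected caller of get_nohz_timer_target() is the timer code rather than anything else in this file: when a timer is armed from a CPU that has gone tickless-idle, the timer is queued on the CPU returned here instead. A hedged sketch of such a call site, loosely based on __mod_timer() in kernel/timer.c -- the surrounding details are assumptions, not part of this hunk:

    cpu = smp_processor_id();

    #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
    if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
            cpu = get_nohz_timer_target();  /* nearest busy CPU, or self */
    #endif
    new_base = per_cpu(tvec_bases, cpu);    /* queue the timer there */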
| 1232 | /* | ||
| 1196 | * When add_timer_on() enqueues a timer into the timer wheel of an | 1233 | * When add_timer_on() enqueues a timer into the timer wheel of an |
| 1197 | * idle CPU then this timer might expire before the next timer event | 1234 | * idle CPU then this timer might expire before the next timer event |
| 1198 | * which is scheduled to wake up that CPU. In case of a completely | 1235 | * which is scheduled to wake up that CPU. In case of a completely |
| @@ -1232,16 +1269,6 @@ void wake_up_idle_cpu(int cpu) | |||
| 1232 | smp_send_reschedule(cpu); | 1269 | smp_send_reschedule(cpu); |
| 1233 | } | 1270 | } |
| 1234 | 1271 | ||
| 1235 | int nohz_ratelimit(int cpu) | ||
| 1236 | { | ||
| 1237 | struct rq *rq = cpu_rq(cpu); | ||
| 1238 | u64 diff = rq->clock - rq->nohz_stamp; | ||
| 1239 | |||
| 1240 | rq->nohz_stamp = rq->clock; | ||
| 1241 | |||
| 1242 | return diff < (NSEC_PER_SEC / HZ) >> 1; | ||
| 1243 | } | ||
| 1244 | |||
| 1245 | #endif /* CONFIG_NO_HZ */ | 1272 | #endif /* CONFIG_NO_HZ */ |
| 1246 | 1273 | ||
| 1247 | static u64 sched_avg_period(void) | 1274 | static u64 sched_avg_period(void) |
| @@ -1281,6 +1308,10 @@ static void resched_task(struct task_struct *p) | |||
| 1281 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1308 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
| 1282 | { | 1309 | { |
| 1283 | } | 1310 | } |
| 1311 | |||
| 1312 | static void sched_avg_update(struct rq *rq) | ||
| 1313 | { | ||
| 1314 | } | ||
| 1284 | #endif /* CONFIG_SMP */ | 1315 | #endif /* CONFIG_SMP */ |
| 1285 | 1316 | ||
| 1286 | #if BITS_PER_LONG == 32 | 1317 | #if BITS_PER_LONG == 32 |
| @@ -1652,7 +1683,7 @@ static void update_shares(struct sched_domain *sd) | |||
| 1652 | if (root_task_group_empty()) | 1683 | if (root_task_group_empty()) |
| 1653 | return; | 1684 | return; |
| 1654 | 1685 | ||
| 1655 | now = cpu_clock(raw_smp_processor_id()); | 1686 | now = local_clock(); |
| 1656 | elapsed = now - sd->last_update; | 1687 | elapsed = now - sd->last_update; |
| 1657 | 1688 | ||
| 1658 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1689 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
| @@ -1805,6 +1836,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
| 1805 | static void calc_load_account_idle(struct rq *this_rq); | 1836 | static void calc_load_account_idle(struct rq *this_rq); |
| 1806 | static void update_sysctl(void); | 1837 | static void update_sysctl(void); |
| 1807 | static int get_update_sysctl_factor(void); | 1838 | static int get_update_sysctl_factor(void); |
| 1839 | static void update_cpu_load(struct rq *this_rq); | ||
| 1808 | 1840 | ||
| 1809 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1841 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
| 1810 | { | 1842 | { |
| @@ -1822,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
| 1822 | 1854 | ||
| 1823 | static const struct sched_class rt_sched_class; | 1855 | static const struct sched_class rt_sched_class; |
| 1824 | 1856 | ||
| 1825 | #define sched_class_highest (&rt_sched_class) | 1857 | #define sched_class_highest (&stop_sched_class) |
| 1826 | #define for_each_class(class) \ | 1858 | #define for_each_class(class) \ |
| 1827 | for (class = sched_class_highest; class; class = class->next) | 1859 | for (class = sched_class_highest; class; class = class->next) |
| 1828 | 1860 | ||
| @@ -1840,12 +1872,6 @@ static void dec_nr_running(struct rq *rq) | |||
| 1840 | 1872 | ||
| 1841 | static void set_load_weight(struct task_struct *p) | 1873 | static void set_load_weight(struct task_struct *p) |
| 1842 | { | 1874 | { |
| 1843 | if (task_has_rt_policy(p)) { | ||
| 1844 | p->se.load.weight = 0; | ||
| 1845 | p->se.load.inv_weight = WMULT_CONST; | ||
| 1846 | return; | ||
| 1847 | } | ||
| 1848 | |||
| 1849 | /* | 1875 | /* |
| 1850 | * SCHED_IDLE tasks get minimal weight: | 1876 | * SCHED_IDLE tasks get minimal weight: |
| 1851 | */ | 1877 | */ |
| @@ -1899,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1899 | dec_nr_running(rq); | 1925 | dec_nr_running(rq); |
| 1900 | } | 1926 | } |
| 1901 | 1927 | ||
| 1928 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 1929 | |||
| 1930 | /* | ||
| 1931 | * There are no locks covering percpu hardirq/softirq time. | ||
| 1932 | * They are only modified in account_system_vtime, on corresponding CPU | ||
| 1933 | * with interrupts disabled. So, writes are safe. | ||
| 1934 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
| 1935 | * This may result in other CPU reading this CPU's irq time and can | ||
| 1936 | * race with irq/account_system_vtime on this CPU. We would either get old | ||
| 1937 | * or new value (or semi updated value on 32 bit) with a side effect of | ||
| 1938 | * accounting a slice of irq time to wrong task when irq is in progress | ||
| 1939 | * while we read rq->clock. That is a worthy compromise in place of having | ||
| 1940 | * locks on each irq in account_system_time. | ||
| 1941 | */ | ||
| 1942 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
| 1943 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
| 1944 | |||
| 1945 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
| 1946 | static int sched_clock_irqtime; | ||
| 1947 | |||
| 1948 | void enable_sched_clock_irqtime(void) | ||
| 1949 | { | ||
| 1950 | sched_clock_irqtime = 1; | ||
| 1951 | } | ||
| 1952 | |||
| 1953 | void disable_sched_clock_irqtime(void) | ||
| 1954 | { | ||
| 1955 | sched_clock_irqtime = 0; | ||
| 1956 | } | ||
| 1957 | |||
| 1958 | static u64 irq_time_cpu(int cpu) | ||
| 1959 | { | ||
| 1960 | if (!sched_clock_irqtime) | ||
| 1961 | return 0; | ||
| 1962 | |||
| 1963 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
| 1964 | } | ||
| 1965 | |||
| 1966 | void account_system_vtime(struct task_struct *curr) | ||
| 1967 | { | ||
| 1968 | unsigned long flags; | ||
| 1969 | int cpu; | ||
| 1970 | u64 now, delta; | ||
| 1971 | |||
| 1972 | if (!sched_clock_irqtime) | ||
| 1973 | return; | ||
| 1974 | |||
| 1975 | local_irq_save(flags); | ||
| 1976 | |||
| 1977 | cpu = smp_processor_id(); | ||
| 1978 | now = sched_clock_cpu(cpu); | ||
| 1979 | delta = now - per_cpu(irq_start_time, cpu); | ||
| 1980 | per_cpu(irq_start_time, cpu) = now; | ||
| 1981 | /* | ||
| 1982 | * We do not account for softirq time from ksoftirqd here. | ||
| 1983 | * We want to continue accounting softirq time to ksoftirqd thread | ||
| 1984 | * in that case, so as not to confuse scheduler with a special task | ||
| 1985 | * that do not consume any time, but still wants to run. | ||
| 1986 | */ | ||
| 1987 | if (hardirq_count()) | ||
| 1988 | per_cpu(cpu_hardirq_time, cpu) += delta; | ||
| 1989 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | ||
| 1990 | per_cpu(cpu_softirq_time, cpu) += delta; | ||
| 1991 | |||
| 1992 | local_irq_restore(flags); | ||
| 1993 | } | ||
| 1994 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
| 1995 | |||
| 1996 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | ||
| 1997 | { | ||
| 1998 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | ||
| 1999 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | ||
| 2000 | rq->prev_irq_time = curr_irq_time; | ||
| 2001 | sched_rt_avg_update(rq, delta_irq); | ||
| 2002 | } | ||
| 2003 | } | ||
| 2004 | |||
| 2005 | #else | ||
| 2006 | |||
| 2007 | static u64 irq_time_cpu(int cpu) | ||
| 2008 | { | ||
| 2009 | return 0; | ||
| 2010 | } | ||
| 2011 | |||
| 2012 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | ||
| 2013 | |||
| 2014 | #endif | ||
| 2015 | |||
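A standalone illustration (not kernel code) of what the per-cpu counters above buy once update_rq_clock(), shown earlier in this diff, subtracts them: if 10 ms of wall-clock time pass and 3 ms of that were spent in hard/soft irq context, rq->clock advances by the full 10 ms but rq->clock_task only by 7 ms. Because do_task_delta_exec() now reads clock_task (see the hunk around new line 3381), the interrupted task is not charged for those 3 ms.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t clock = 0, clock_task = 0, irq_time = 0;

            /* one update's worth: 10 ms wall time, 3 ms of it in irqs */
            clock    += 10000000;   /* what sched_clock_cpu() would add */
            irq_time +=  3000000;   /* what irq_time_cpu() would now report */

            if (clock - irq_time > clock_task)
                    clock_task = clock - irq_time;

            /* prints: clock=10000000 clock_task=7000000 */
            printf("clock=%llu clock_task=%llu\n",
                   (unsigned long long)clock, (unsigned long long)clock_task);
            return 0;
    }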
| 1902 | #include "sched_idletask.c" | 2016 | #include "sched_idletask.c" |
| 1903 | #include "sched_fair.c" | 2017 | #include "sched_fair.c" |
| 1904 | #include "sched_rt.c" | 2018 | #include "sched_rt.c" |
| 2019 | #include "sched_stoptask.c" | ||
| 1905 | #ifdef CONFIG_SCHED_DEBUG | 2020 | #ifdef CONFIG_SCHED_DEBUG |
| 1906 | # include "sched_debug.c" | 2021 | # include "sched_debug.c" |
| 1907 | #endif | 2022 | #endif |
| 1908 | 2023 | ||
| 2024 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
| 2025 | { | ||
| 2026 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
| 2027 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
| 2028 | |||
| 2029 | if (stop) { | ||
| 2030 | /* | ||
| 2031 | * Make it appear like a SCHED_FIFO task, its something | ||
| 2032 | * userspace knows about and won't get confused about. | ||
| 2033 | * | ||
| 2034 | * Also, it will make PI more or less work without too | ||
| 2035 | * much confusion -- but then, stop work should not | ||
| 2036 | * rely on PI working anyway. | ||
| 2037 | */ | ||
| 2038 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
| 2039 | |||
| 2040 | stop->sched_class = &stop_sched_class; | ||
| 2041 | } | ||
| 2042 | |||
| 2043 | cpu_rq(cpu)->stop = stop; | ||
| 2044 | |||
| 2045 | if (old_stop) { | ||
| 2046 | /* | ||
| 2047 | * Reset it back to a normal scheduling class so that | ||
| 2048 | * it can die in pieces. | ||
| 2049 | */ | ||
| 2050 | old_stop->sched_class = &rt_sched_class; | ||
| 2051 | } | ||
| 2052 | } | ||
| 2053 | |||
| 1909 | /* | 2054 | /* |
| 1910 | * __normal_prio - return the priority that is based on the static prio | 2055 | * __normal_prio - return the priority that is based on the static prio |
| 1911 | */ | 2056 | */ |
| @@ -1985,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 1985 | if (p->sched_class != &fair_sched_class) | 2130 | if (p->sched_class != &fair_sched_class) |
| 1986 | return 0; | 2131 | return 0; |
| 1987 | 2132 | ||
| 2133 | if (unlikely(p->policy == SCHED_IDLE)) | ||
| 2134 | return 0; | ||
| 2135 | |||
| 1988 | /* | 2136 | /* |
| 1989 | * Buddy candidates are cache hot: | 2137 | * Buddy candidates are cache hot: |
| 1990 | */ | 2138 | */ |
| @@ -2267,11 +2415,55 @@ static void update_avg(u64 *avg, u64 sample) | |||
| 2267 | } | 2415 | } |
| 2268 | #endif | 2416 | #endif |
| 2269 | 2417 | ||
| 2270 | /*** | 2418 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, |
| 2419 | bool is_sync, bool is_migrate, bool is_local, | ||
| 2420 | unsigned long en_flags) | ||
| 2421 | { | ||
| 2422 | schedstat_inc(p, se.statistics.nr_wakeups); | ||
| 2423 | if (is_sync) | ||
| 2424 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
| 2425 | if (is_migrate) | ||
| 2426 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
| 2427 | if (is_local) | ||
| 2428 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
| 2429 | else | ||
| 2430 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
| 2431 | |||
| 2432 | activate_task(rq, p, en_flags); | ||
| 2433 | } | ||
| 2434 | |||
| 2435 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | ||
| 2436 | int wake_flags, bool success) | ||
| 2437 | { | ||
| 2438 | trace_sched_wakeup(p, success); | ||
| 2439 | check_preempt_curr(rq, p, wake_flags); | ||
| 2440 | |||
| 2441 | p->state = TASK_RUNNING; | ||
| 2442 | #ifdef CONFIG_SMP | ||
| 2443 | if (p->sched_class->task_woken) | ||
| 2444 | p->sched_class->task_woken(rq, p); | ||
| 2445 | |||
| 2446 | if (unlikely(rq->idle_stamp)) { | ||
| 2447 | u64 delta = rq->clock - rq->idle_stamp; | ||
| 2448 | u64 max = 2*sysctl_sched_migration_cost; | ||
| 2449 | |||
| 2450 | if (delta > max) | ||
| 2451 | rq->avg_idle = max; | ||
| 2452 | else | ||
| 2453 | update_avg(&rq->avg_idle, delta); | ||
| 2454 | rq->idle_stamp = 0; | ||
| 2455 | } | ||
| 2456 | #endif | ||
| 2457 | /* if a worker is waking up, notify workqueue */ | ||
| 2458 | if ((p->flags & PF_WQ_WORKER) && success) | ||
| 2459 | wq_worker_waking_up(p, cpu_of(rq)); | ||
| 2460 | } | ||
| 2461 | |||
| 2462 | /** | ||
| 2271 | * try_to_wake_up - wake up a thread | 2463 | * try_to_wake_up - wake up a thread |
| 2272 | * @p: the to-be-woken-up thread | 2464 | * @p: the thread to be awakened |
| 2273 | * @state: the mask of task states that can be woken | 2465 | * @state: the mask of task states that can be woken |
| 2274 | * @sync: do a synchronous wakeup? | 2466 | * @wake_flags: wake modifier flags (WF_*) |
| 2275 | * | 2467 | * |
| 2276 | * Put it on the run-queue if it's not already there. The "current" | 2468 | * Put it on the run-queue if it's not already there. The "current" |
| 2277 | * thread is always on the run-queue (except when the actual | 2469 | * thread is always on the run-queue (except when the actual |
| @@ -2279,7 +2471,8 @@ static void update_avg(u64 *avg, u64 sample) | |||
| 2279 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 2471 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
| 2280 | * runnable without the overhead of this. | 2472 | * runnable without the overhead of this. |
| 2281 | * | 2473 | * |
| 2282 | * returns failure only if the task is already active. | 2474 | * Returns %true if @p was woken up, %false if it was already running |
| 2475 | * or @state didn't match @p's state. | ||
| 2283 | */ | 2476 | */ |
| 2284 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2477 | static int try_to_wake_up(struct task_struct *p, unsigned int state, |
| 2285 | int wake_flags) | 2478 | int wake_flags) |
| @@ -2359,38 +2552,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
| 2359 | 2552 | ||
| 2360 | out_activate: | 2553 | out_activate: |
| 2361 | #endif /* CONFIG_SMP */ | 2554 | #endif /* CONFIG_SMP */ |
| 2362 | schedstat_inc(p, se.statistics.nr_wakeups); | 2555 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, |
| 2363 | if (wake_flags & WF_SYNC) | 2556 | cpu == this_cpu, en_flags); |
| 2364 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
| 2365 | if (orig_cpu != cpu) | ||
| 2366 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
| 2367 | if (cpu == this_cpu) | ||
| 2368 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
| 2369 | else | ||
| 2370 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
| 2371 | activate_task(rq, p, en_flags); | ||
| 2372 | success = 1; | 2557 | success = 1; |
| 2373 | |||
| 2374 | out_running: | 2558 | out_running: |
| 2375 | trace_sched_wakeup(p, success); | 2559 | ttwu_post_activation(p, rq, wake_flags, success); |
| 2376 | check_preempt_curr(rq, p, wake_flags); | ||
| 2377 | |||
| 2378 | p->state = TASK_RUNNING; | ||
| 2379 | #ifdef CONFIG_SMP | ||
| 2380 | if (p->sched_class->task_woken) | ||
| 2381 | p->sched_class->task_woken(rq, p); | ||
| 2382 | |||
| 2383 | if (unlikely(rq->idle_stamp)) { | ||
| 2384 | u64 delta = rq->clock - rq->idle_stamp; | ||
| 2385 | u64 max = 2*sysctl_sched_migration_cost; | ||
| 2386 | |||
| 2387 | if (delta > max) | ||
| 2388 | rq->avg_idle = max; | ||
| 2389 | else | ||
| 2390 | update_avg(&rq->avg_idle, delta); | ||
| 2391 | rq->idle_stamp = 0; | ||
| 2392 | } | ||
| 2393 | #endif | ||
| 2394 | out: | 2560 | out: |
| 2395 | task_rq_unlock(rq, &flags); | 2561 | task_rq_unlock(rq, &flags); |
| 2396 | put_cpu(); | 2562 | put_cpu(); |
| @@ -2399,6 +2565,37 @@ out: | |||
| 2399 | } | 2565 | } |
| 2400 | 2566 | ||
| 2401 | /** | 2567 | /** |
| 2568 | * try_to_wake_up_local - try to wake up a local task with rq lock held | ||
| 2569 | * @p: the thread to be awakened | ||
| 2570 | * | ||
| 2571 | * Put @p on the run-queue if it's not alredy there. The caller must | ||
| 2572 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | ||
| 2573 | * the current task. this_rq() stays locked over invocation. | ||
| 2574 | */ | ||
| 2575 | static void try_to_wake_up_local(struct task_struct *p) | ||
| 2576 | { | ||
| 2577 | struct rq *rq = task_rq(p); | ||
| 2578 | bool success = false; | ||
| 2579 | |||
| 2580 | BUG_ON(rq != this_rq()); | ||
| 2581 | BUG_ON(p == current); | ||
| 2582 | lockdep_assert_held(&rq->lock); | ||
| 2583 | |||
| 2584 | if (!(p->state & TASK_NORMAL)) | ||
| 2585 | return; | ||
| 2586 | |||
| 2587 | if (!p->se.on_rq) { | ||
| 2588 | if (likely(!task_running(rq, p))) { | ||
| 2589 | schedstat_inc(rq, ttwu_count); | ||
| 2590 | schedstat_inc(rq, ttwu_local); | ||
| 2591 | } | ||
| 2592 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | ||
| 2593 | success = true; | ||
| 2594 | } | ||
| 2595 | ttwu_post_activation(p, rq, 0, success); | ||
| 2596 | } | ||
| 2597 | |||
| 2598 | /** | ||
| 2402 | * wake_up_process - Wake up a specific process | 2599 | * wake_up_process - Wake up a specific process |
| 2403 | * @p: The process to be woken up. | 2600 | * @p: The process to be woken up. |
| 2404 | * | 2601 | * |
| @@ -2785,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2785 | */ | 2982 | */ |
| 2786 | arch_start_context_switch(prev); | 2983 | arch_start_context_switch(prev); |
| 2787 | 2984 | ||
| 2788 | if (likely(!mm)) { | 2985 | if (!mm) { |
| 2789 | next->active_mm = oldmm; | 2986 | next->active_mm = oldmm; |
| 2790 | atomic_inc(&oldmm->mm_count); | 2987 | atomic_inc(&oldmm->mm_count); |
| 2791 | enter_lazy_tlb(oldmm, next); | 2988 | enter_lazy_tlb(oldmm, next); |
| 2792 | } else | 2989 | } else |
| 2793 | switch_mm(oldmm, mm, next); | 2990 | switch_mm(oldmm, mm, next); |
| 2794 | 2991 | ||
| 2795 | if (likely(!prev->mm)) { | 2992 | if (!prev->mm) { |
| 2796 | prev->active_mm = NULL; | 2993 | prev->active_mm = NULL; |
| 2797 | rq->prev_mm = oldmm; | 2994 | rq->prev_mm = oldmm; |
| 2798 | } | 2995 | } |
| @@ -3012,23 +3209,102 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 3012 | } | 3209 | } |
| 3013 | 3210 | ||
| 3014 | /* | 3211 | /* |
| 3212 | * The exact cpuload at various idx values, calculated at every tick would be | ||
| 3213 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
| 3214 | * | ||
| 3215 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
| 3216 | * on nth tick when cpu may be busy, then we have: | ||
| 3217 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
| 3218 | * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
| 3219 | * | ||
| 3220 | * decay_load_missed() below does efficient calculation of | ||
| 3221 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
| 3222 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
| 3223 | * | ||
| 3224 | * The calculation is approximated on a 128 point scale. | ||
| 3225 | * degrade_zero_ticks is the number of ticks after which load at any | ||
| 3226 | * particular idx is approximated to be zero. | ||
| 3227 | * degrade_factor is a precomputed table, a row for each load idx. | ||
| 3228 | * Each column corresponds to degradation factor for a power of two ticks, | ||
| 3229 | * based on 128 point scale. | ||
| 3230 | * Example: | ||
| 3231 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
| 3232 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
| 3233 | * | ||
| 3234 | * With this power of 2 load factors, we can degrade the load n times | ||
| 3235 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
| 3236 | * n mult/shifts needed by the exact degradation. | ||
| 3237 | */ | ||
| 3238 | #define DEGRADE_SHIFT 7 | ||
| 3239 | static const unsigned char | ||
| 3240 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
| 3241 | static const unsigned char | ||
| 3242 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
| 3243 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
| 3244 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
| 3245 | {96, 72, 40, 12, 1, 0, 0}, | ||
| 3246 | {112, 98, 75, 43, 15, 1, 0}, | ||
| 3247 | {120, 112, 98, 76, 45, 16, 2} }; | ||
| 3248 | |||
| 3249 | /* | ||
| 3250 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
| 3251 | * would be when CPU is idle and so we just decay the old load without | ||
| 3252 | * adding any new load. | ||
| 3253 | */ | ||
| 3254 | static unsigned long | ||
| 3255 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
| 3256 | { | ||
| 3257 | int j = 0; | ||
| 3258 | |||
| 3259 | if (!missed_updates) | ||
| 3260 | return load; | ||
| 3261 | |||
| 3262 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
| 3263 | return 0; | ||
| 3264 | |||
| 3265 | if (idx == 1) | ||
| 3266 | return load >> missed_updates; | ||
| 3267 | |||
| 3268 | while (missed_updates) { | ||
| 3269 | if (missed_updates % 2) | ||
| 3270 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
| 3271 | |||
| 3272 | missed_updates >>= 1; | ||
| 3273 | j++; | ||
| 3274 | } | ||
| 3275 | return load; | ||
| 3276 | } | ||
| 3277 | |||
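A worked example of the table above (a reading aid, not part of the patch): decaying cpu_load[2] after 10 missed ticks. Since 10 = 8 + 2, decay_load_missed() multiplies by degrade_factor[2][1] (72/128, the 2-tick factor) and degrade_factor[2][3] (12/128, the 8-tick factor) instead of applying 96/128 ten times. A standalone check of the arithmetic:

    #include <stdio.h>

    #define DEGRADE_SHIFT   7
    /* row 2 of degrade_factor[][], padded with zeroes as in the kernel table */
    static const unsigned char idx2[DEGRADE_SHIFT + 1] = {96, 72, 40, 12, 1, 0, 0, 0};

    int main(void)
    {
            unsigned long load = 1024, missed = 10;
            int j = 0;

            while (missed) {        /* same loop shape as decay_load_missed() */
                    if (missed % 2)
                            load = (load * idx2[j]) >> DEGRADE_SHIFT;
                    missed >>= 1;
                    j++;
            }
            /* prints 54; the exact factor (3/4)^10 would give about 58 */
            printf("%lu\n", load);
            return 0;
    }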
| 3278 | /* | ||
| 3015 | * Update rq->cpu_load[] statistics. This function is usually called every | 3279 | * Update rq->cpu_load[] statistics. This function is usually called every |
| 3016 | * scheduler tick (TICK_NSEC). | 3280 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
| 3281 | * every tick. We fix it up based on jiffies. | ||
| 3017 | */ | 3282 | */ |
| 3018 | static void update_cpu_load(struct rq *this_rq) | 3283 | static void update_cpu_load(struct rq *this_rq) |
| 3019 | { | 3284 | { |
| 3020 | unsigned long this_load = this_rq->load.weight; | 3285 | unsigned long this_load = this_rq->load.weight; |
| 3286 | unsigned long curr_jiffies = jiffies; | ||
| 3287 | unsigned long pending_updates; | ||
| 3021 | int i, scale; | 3288 | int i, scale; |
| 3022 | 3289 | ||
| 3023 | this_rq->nr_load_updates++; | 3290 | this_rq->nr_load_updates++; |
| 3024 | 3291 | ||
| 3292 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
| 3293 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
| 3294 | return; | ||
| 3295 | |||
| 3296 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
| 3297 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 3298 | |||
| 3025 | /* Update our load: */ | 3299 | /* Update our load: */ |
| 3026 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 3300 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
| 3301 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
| 3027 | unsigned long old_load, new_load; | 3302 | unsigned long old_load, new_load; |
| 3028 | 3303 | ||
| 3029 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 3304 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
| 3030 | 3305 | ||
| 3031 | old_load = this_rq->cpu_load[i]; | 3306 | old_load = this_rq->cpu_load[i]; |
| 3307 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
| 3032 | new_load = this_load; | 3308 | new_load = this_load; |
| 3033 | /* | 3309 | /* |
| 3034 | * Round up the averaging division if load is increasing. This | 3310 | * Round up the averaging division if load is increasing. This |
| @@ -3036,10 +3312,18 @@ static void update_cpu_load(struct rq *this_rq) | |||
| 3036 | * example. | 3312 | * example. |
| 3037 | */ | 3313 | */ |
| 3038 | if (new_load > old_load) | 3314 | if (new_load > old_load) |
| 3039 | new_load += scale-1; | 3315 | new_load += scale - 1; |
| 3040 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 3316 | |
| 3317 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
| 3041 | } | 3318 | } |
| 3042 | 3319 | ||
| 3320 | sched_avg_update(this_rq); | ||
| 3321 | } | ||
| 3322 | |||
| 3323 | static void update_cpu_load_active(struct rq *this_rq) | ||
| 3324 | { | ||
| 3325 | update_cpu_load(this_rq); | ||
| 3326 | |||
| 3043 | calc_load_account_active(this_rq); | 3327 | calc_load_account_active(this_rq); |
| 3044 | } | 3328 | } |
| 3045 | 3329 | ||
| @@ -3094,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
| 3094 | 3378 | ||
| 3095 | if (task_current(rq, p)) { | 3379 | if (task_current(rq, p)) { |
| 3096 | update_rq_clock(rq); | 3380 | update_rq_clock(rq); |
| 3097 | ns = rq->clock - p->se.exec_start; | 3381 | ns = rq->clock_task - p->se.exec_start; |
| 3098 | if ((s64)ns < 0) | 3382 | if ((s64)ns < 0) |
| 3099 | ns = 0; | 3383 | ns = 0; |
| 3100 | } | 3384 | } |
| @@ -3243,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 3243 | tmp = cputime_to_cputime64(cputime); | 3527 | tmp = cputime_to_cputime64(cputime); |
| 3244 | if (hardirq_count() - hardirq_offset) | 3528 | if (hardirq_count() - hardirq_offset) |
| 3245 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3529 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
| 3246 | else if (softirq_count()) | 3530 | else if (in_serving_softirq()) |
| 3247 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3531 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
| 3248 | else | 3532 | else |
| 3249 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3533 | cpustat->system = cputime64_add(cpustat->system, tmp); |
| @@ -3359,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
| 3359 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 3643 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
| 3360 | 3644 | ||
| 3361 | if (total) { | 3645 | if (total) { |
| 3362 | u64 temp; | 3646 | u64 temp = rtime; |
| 3363 | 3647 | ||
| 3364 | temp = (u64)(rtime * utime); | 3648 | temp *= utime; |
| 3365 | do_div(temp, total); | 3649 | do_div(temp, total); |
| 3366 | utime = (cputime_t)temp; | 3650 | utime = (cputime_t)temp; |
| 3367 | } else | 3651 | } else |
| @@ -3392,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
| 3392 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 3676 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
| 3393 | 3677 | ||
| 3394 | if (total) { | 3678 | if (total) { |
| 3395 | u64 temp; | 3679 | u64 temp = rtime; |
| 3396 | 3680 | ||
| 3397 | temp = (u64)(rtime * cputime.utime); | 3681 | temp *= cputime.utime; |
| 3398 | do_div(temp, total); | 3682 | do_div(temp, total); |
| 3399 | utime = (cputime_t)temp; | 3683 | utime = (cputime_t)temp; |
| 3400 | } else | 3684 | } else |
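The temp changes in task_times() and thread_group_times() above are an overflow fix, not a cleanup: with a 32-bit cputime_t, rtime * utime is evaluated in 32-bit arithmetic and only the already-truncated result was cast to u64. Initialising a u64 first and multiplying into it keeps the full product. A minimal userspace illustration, with uint32_t standing in for a 32-bit cputime_t:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t rtime = 500000, utime = 400000;        /* plausible tick counts */

            uint64_t broken = (uint64_t)(rtime * utime);    /* multiply wraps, then widen */
            uint64_t fixed = rtime;
            fixed *= utime;                                 /* widen first, then multiply */

            /* prints: broken=2431504384 fixed=200000000000 */
            printf("broken=%llu fixed=%llu\n",
                   (unsigned long long)broken, (unsigned long long)fixed);
            return 0;
    }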
| @@ -3426,11 +3710,11 @@ void scheduler_tick(void) | |||
| 3426 | 3710 | ||
| 3427 | raw_spin_lock(&rq->lock); | 3711 | raw_spin_lock(&rq->lock); |
| 3428 | update_rq_clock(rq); | 3712 | update_rq_clock(rq); |
| 3429 | update_cpu_load(rq); | 3713 | update_cpu_load_active(rq); |
| 3430 | curr->sched_class->task_tick(rq, curr, 0); | 3714 | curr->sched_class->task_tick(rq, curr, 0); |
| 3431 | raw_spin_unlock(&rq->lock); | 3715 | raw_spin_unlock(&rq->lock); |
| 3432 | 3716 | ||
| 3433 | perf_event_task_tick(curr); | 3717 | perf_event_task_tick(); |
| 3434 | 3718 | ||
| 3435 | #ifdef CONFIG_SMP | 3719 | #ifdef CONFIG_SMP |
| 3436 | rq->idle_at_tick = idle_cpu(cpu); | 3720 | rq->idle_at_tick = idle_cpu(cpu); |
| @@ -3569,17 +3853,13 @@ pick_next_task(struct rq *rq) | |||
| 3569 | return p; | 3853 | return p; |
| 3570 | } | 3854 | } |
| 3571 | 3855 | ||
| 3572 | class = sched_class_highest; | 3856 | for_each_class(class) { |
| 3573 | for ( ; ; ) { | ||
| 3574 | p = class->pick_next_task(rq); | 3857 | p = class->pick_next_task(rq); |
| 3575 | if (p) | 3858 | if (p) |
| 3576 | return p; | 3859 | return p; |
| 3577 | /* | ||
| 3578 | * Will never be NULL as the idle class always | ||
| 3579 | * returns a non-NULL p: | ||
| 3580 | */ | ||
| 3581 | class = class->next; | ||
| 3582 | } | 3860 | } |
| 3861 | |||
| 3862 | BUG(); /* the idle class will always have a runnable task */ | ||
| 3583 | } | 3863 | } |
| 3584 | 3864 | ||
| 3585 | /* | 3865 | /* |
| @@ -3598,7 +3878,6 @@ need_resched: | |||
| 3598 | rq = cpu_rq(cpu); | 3878 | rq = cpu_rq(cpu); |
| 3599 | rcu_note_context_switch(cpu); | 3879 | rcu_note_context_switch(cpu); |
| 3600 | prev = rq->curr; | 3880 | prev = rq->curr; |
| 3601 | switch_count = &prev->nivcsw; | ||
| 3602 | 3881 | ||
| 3603 | release_kernel_lock(prev); | 3882 | release_kernel_lock(prev); |
| 3604 | need_resched_nonpreemptible: | 3883 | need_resched_nonpreemptible: |
| @@ -3611,11 +3890,26 @@ need_resched_nonpreemptible: | |||
| 3611 | raw_spin_lock_irq(&rq->lock); | 3890 | raw_spin_lock_irq(&rq->lock); |
| 3612 | clear_tsk_need_resched(prev); | 3891 | clear_tsk_need_resched(prev); |
| 3613 | 3892 | ||
| 3893 | switch_count = &prev->nivcsw; | ||
| 3614 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3894 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 3615 | if (unlikely(signal_pending_state(prev->state, prev))) | 3895 | if (unlikely(signal_pending_state(prev->state, prev))) { |
| 3616 | prev->state = TASK_RUNNING; | 3896 | prev->state = TASK_RUNNING; |
| 3617 | else | 3897 | } else { |
| 3898 | /* | ||
| 3899 | * If a worker is going to sleep, notify and | ||
| 3900 | * ask workqueue whether it wants to wake up a | ||
| 3901 | * task to maintain concurrency. If so, wake | ||
| 3902 | * up the task. | ||
| 3903 | */ | ||
| 3904 | if (prev->flags & PF_WQ_WORKER) { | ||
| 3905 | struct task_struct *to_wakeup; | ||
| 3906 | |||
| 3907 | to_wakeup = wq_worker_sleeping(prev, cpu); | ||
| 3908 | if (to_wakeup) | ||
| 3909 | try_to_wake_up_local(to_wakeup); | ||
| 3910 | } | ||
| 3618 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 3911 | deactivate_task(rq, prev, DEQUEUE_SLEEP); |
| 3912 | } | ||
| 3619 | switch_count = &prev->nvcsw; | 3913 | switch_count = &prev->nvcsw; |
| 3620 | } | 3914 | } |
| 3621 | 3915 | ||
| @@ -3637,8 +3931,10 @@ need_resched_nonpreemptible: | |||
| 3637 | 3931 | ||
| 3638 | context_switch(rq, prev, next); /* unlocks the rq */ | 3932 | context_switch(rq, prev, next); /* unlocks the rq */ |
| 3639 | /* | 3933 | /* |
| 3640 | * the context switch might have flipped the stack from under | 3934 | * The context switch have flipped the stack from under us |
| 3641 | * us, hence refresh the local variables. | 3935 | * and restored the local variables which were saved when |
| 3936 | * this task called schedule() in the past. prev == current | ||
| 3937 | * is still correct, but it can be moved to another cpu/rq. | ||
| 3642 | */ | 3938 | */ |
| 3643 | cpu = smp_processor_id(); | 3939 | cpu = smp_processor_id(); |
| 3644 | rq = cpu_rq(cpu); | 3940 | rq = cpu_rq(cpu); |
| @@ -3647,11 +3943,8 @@ need_resched_nonpreemptible: | |||
| 3647 | 3943 | ||
| 3648 | post_schedule(rq); | 3944 | post_schedule(rq); |
| 3649 | 3945 | ||
| 3650 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3946 | if (unlikely(reacquire_kernel_lock(prev))) |
| 3651 | prev = rq->curr; | ||
| 3652 | switch_count = &prev->nivcsw; | ||
| 3653 | goto need_resched_nonpreemptible; | 3947 | goto need_resched_nonpreemptible; |
| 3654 | } | ||
| 3655 | 3948 | ||
| 3656 | preempt_enable_no_resched(); | 3949 | preempt_enable_no_resched(); |
| 3657 | if (need_resched()) | 3950 | if (need_resched()) |
| @@ -3704,8 +3997,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
| 3704 | /* | 3997 | /* |
| 3705 | * Owner changed, break to re-assess state. | 3998 | * Owner changed, break to re-assess state. |
| 3706 | */ | 3999 | */ |
| 3707 | if (lock->owner != owner) | 4000 | if (lock->owner != owner) { |
| 4001 | /* | ||
| 4002 | * If the lock has switched to a different owner, | ||
| 4003 | * we likely have heavy contention. Return 0 to quit | ||
| 4004 | * optimistic spinning and not contend further: | ||
| 4005 | */ | ||
| 4006 | if (lock->owner) | ||
| 4007 | return 0; | ||
| 3708 | break; | 4008 | break; |
| 4009 | } | ||
| 3709 | 4010 | ||
| 3710 | /* | 4011 | /* |
| 3711 | * Is that owner really running on that cpu? | 4012 | * Is that owner really running on that cpu? |
| @@ -3726,7 +4027,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
| 3726 | * off of preempt_enable. Kernel preemptions off return from interrupt | 4027 | * off of preempt_enable. Kernel preemptions off return from interrupt |
| 3727 | * occur there and call schedule directly. | 4028 | * occur there and call schedule directly. |
| 3728 | */ | 4029 | */ |
| 3729 | asmlinkage void __sched preempt_schedule(void) | 4030 | asmlinkage void __sched notrace preempt_schedule(void) |
| 3730 | { | 4031 | { |
| 3731 | struct thread_info *ti = current_thread_info(); | 4032 | struct thread_info *ti = current_thread_info(); |
| 3732 | 4033 | ||
| @@ -3738,9 +4039,9 @@ asmlinkage void __sched preempt_schedule(void) | |||
| 3738 | return; | 4039 | return; |
| 3739 | 4040 | ||
| 3740 | do { | 4041 | do { |
| 3741 | add_preempt_count(PREEMPT_ACTIVE); | 4042 | add_preempt_count_notrace(PREEMPT_ACTIVE); |
| 3742 | schedule(); | 4043 | schedule(); |
| 3743 | sub_preempt_count(PREEMPT_ACTIVE); | 4044 | sub_preempt_count_notrace(PREEMPT_ACTIVE); |
| 3744 | 4045 | ||
| 3745 | /* | 4046 | /* |
| 3746 | * Check again in case we missed a preemption opportunity | 4047 | * Check again in case we missed a preemption opportunity |
| @@ -4183,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 4183 | 4484 | ||
| 4184 | rq = task_rq_lock(p, &flags); | 4485 | rq = task_rq_lock(p, &flags); |
| 4185 | 4486 | ||
| 4487 | trace_sched_pi_setprio(p, prio); | ||
| 4186 | oldprio = p->prio; | 4488 | oldprio = p->prio; |
| 4187 | prev_class = p->sched_class; | 4489 | prev_class = p->sched_class; |
| 4188 | on_rq = p->se.on_rq; | 4490 | on_rq = p->se.on_rq; |
| @@ -4441,12 +4743,8 @@ recheck: | |||
| 4441 | */ | 4743 | */ |
| 4442 | if (user && !capable(CAP_SYS_NICE)) { | 4744 | if (user && !capable(CAP_SYS_NICE)) { |
| 4443 | if (rt_policy(policy)) { | 4745 | if (rt_policy(policy)) { |
| 4444 | unsigned long rlim_rtprio; | 4746 | unsigned long rlim_rtprio = |
| 4445 | 4747 | task_rlimit(p, RLIMIT_RTPRIO); | |
| 4446 | if (!lock_task_sighand(p, &flags)) | ||
| 4447 | return -ESRCH; | ||
| 4448 | rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); | ||
| 4449 | unlock_task_sighand(p, &flags); | ||
| 4450 | 4748 | ||
| 4451 | /* can't set/change the rt policy */ | 4749 | /* can't set/change the rt policy */ |
| 4452 | if (policy != p->policy && !rlim_rtprio) | 4750 | if (policy != p->policy && !rlim_rtprio) |
| @@ -4474,7 +4772,7 @@ recheck: | |||
| 4474 | } | 4772 | } |
| 4475 | 4773 | ||
| 4476 | if (user) { | 4774 | if (user) { |
| 4477 | retval = security_task_setscheduler(p, policy, param); | 4775 | retval = security_task_setscheduler(p); |
| 4478 | if (retval) | 4776 | if (retval) |
| 4479 | return retval; | 4777 | return retval; |
| 4480 | } | 4778 | } |
| @@ -4490,6 +4788,15 @@ recheck: | |||
| 4490 | */ | 4788 | */ |
| 4491 | rq = __task_rq_lock(p); | 4789 | rq = __task_rq_lock(p); |
| 4492 | 4790 | ||
| 4791 | /* | ||
| 4792 | * Changing the policy of the stop threads its a very bad idea | ||
| 4793 | */ | ||
| 4794 | if (p == rq->stop) { | ||
| 4795 | __task_rq_unlock(rq); | ||
| 4796 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 4797 | return -EINVAL; | ||
| 4798 | } | ||
| 4799 | |||
| 4493 | #ifdef CONFIG_RT_GROUP_SCHED | 4800 | #ifdef CONFIG_RT_GROUP_SCHED |
| 4494 | if (user) { | 4801 | if (user) { |
| 4495 | /* | 4802 | /* |
| @@ -4716,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 4716 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5023 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) |
| 4717 | goto out_unlock; | 5024 | goto out_unlock; |
| 4718 | 5025 | ||
| 4719 | retval = security_task_setscheduler(p, 0, NULL); | 5026 | retval = security_task_setscheduler(p); |
| 4720 | if (retval) | 5027 | if (retval) |
| 4721 | goto out_unlock; | 5028 | goto out_unlock; |
| 4722 | 5029 | ||
| 4723 | cpuset_cpus_allowed(p, cpus_allowed); | 5030 | cpuset_cpus_allowed(p, cpus_allowed); |
| 4724 | cpumask_and(new_mask, in_mask, cpus_allowed); | 5031 | cpumask_and(new_mask, in_mask, cpus_allowed); |
| 4725 | again: | 5032 | again: |
| 4726 | retval = set_cpus_allowed_ptr(p, new_mask); | 5033 | retval = set_cpus_allowed_ptr(p, new_mask); |
| 4727 | 5034 | ||
| 4728 | if (!retval) { | 5035 | if (!retval) { |
| @@ -5166,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 5166 | idle->se.exec_start = sched_clock(); | 5473 | idle->se.exec_start = sched_clock(); |
| 5167 | 5474 | ||
| 5168 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5475 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); |
| 5476 | /* | ||
| 5477 | * We're having a chicken and egg problem, even though we are | ||
| 5478 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
| 5479 | * lockdep check in task_group() will fail. | ||
| 5480 | * | ||
| 5481 | * Similar case to sched_fork(). / Alternatively we could | ||
| 5482 | * use task_rq_lock() here and obtain the other rq->lock. | ||
| 5483 | * | ||
| 5484 | * Silence PROVE_RCU | ||
| 5485 | */ | ||
| 5486 | rcu_read_lock(); | ||
| 5169 | __set_task_cpu(idle, cpu); | 5487 | __set_task_cpu(idle, cpu); |
| 5488 | rcu_read_unlock(); | ||
| 5170 | 5489 | ||
| 5171 | rq->curr = rq->idle = idle; | 5490 | rq->curr = rq->idle = idle; |
| 5172 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5491 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
| @@ -5816,20 +6135,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 5816 | */ | 6135 | */ |
| 5817 | static struct notifier_block __cpuinitdata migration_notifier = { | 6136 | static struct notifier_block __cpuinitdata migration_notifier = { |
| 5818 | .notifier_call = migration_call, | 6137 | .notifier_call = migration_call, |
| 5819 | .priority = 10 | 6138 | .priority = CPU_PRI_MIGRATION, |
| 5820 | }; | 6139 | }; |
| 5821 | 6140 | ||
| 6141 | static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | ||
| 6142 | unsigned long action, void *hcpu) | ||
| 6143 | { | ||
| 6144 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 6145 | case CPU_ONLINE: | ||
| 6146 | case CPU_DOWN_FAILED: | ||
| 6147 | set_cpu_active((long)hcpu, true); | ||
| 6148 | return NOTIFY_OK; | ||
| 6149 | default: | ||
| 6150 | return NOTIFY_DONE; | ||
| 6151 | } | ||
| 6152 | } | ||
| 6153 | |||
| 6154 | static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, | ||
| 6155 | unsigned long action, void *hcpu) | ||
| 6156 | { | ||
| 6157 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 6158 | case CPU_DOWN_PREPARE: | ||
| 6159 | set_cpu_active((long)hcpu, false); | ||
| 6160 | return NOTIFY_OK; | ||
| 6161 | default: | ||
| 6162 | return NOTIFY_DONE; | ||
| 6163 | } | ||
| 6164 | } | ||
| 6165 | |||
| 5822 | static int __init migration_init(void) | 6166 | static int __init migration_init(void) |
| 5823 | { | 6167 | { |
| 5824 | void *cpu = (void *)(long)smp_processor_id(); | 6168 | void *cpu = (void *)(long)smp_processor_id(); |
| 5825 | int err; | 6169 | int err; |
| 5826 | 6170 | ||
| 5827 | /* Start one for the boot CPU: */ | 6171 | /* Initialize migration for the boot CPU */ |
| 5828 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 6172 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
| 5829 | BUG_ON(err == NOTIFY_BAD); | 6173 | BUG_ON(err == NOTIFY_BAD); |
| 5830 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 6174 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
| 5831 | register_cpu_notifier(&migration_notifier); | 6175 | register_cpu_notifier(&migration_notifier); |
| 5832 | 6176 | ||
| 6177 | /* Register cpu active notifiers */ | ||
| 6178 | cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); | ||
| 6179 | cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); | ||
| 6180 | |||
| 5833 | return 0; | 6181 | return 0; |
| 5834 | } | 6182 | } |
| 5835 | early_initcall(migration_init); | 6183 | early_initcall(migration_init); |
| @@ -6064,23 +6412,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
| 6064 | free_rootdomain(old_rd); | 6412 | free_rootdomain(old_rd); |
| 6065 | } | 6413 | } |
| 6066 | 6414 | ||
| 6067 | static int init_rootdomain(struct root_domain *rd, bool bootmem) | 6415 | static int init_rootdomain(struct root_domain *rd) |
| 6068 | { | 6416 | { |
| 6069 | gfp_t gfp = GFP_KERNEL; | ||
| 6070 | |||
| 6071 | memset(rd, 0, sizeof(*rd)); | 6417 | memset(rd, 0, sizeof(*rd)); |
| 6072 | 6418 | ||
| 6073 | if (bootmem) | 6419 | if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) |
| 6074 | gfp = GFP_NOWAIT; | ||
| 6075 | |||
| 6076 | if (!alloc_cpumask_var(&rd->span, gfp)) | ||
| 6077 | goto out; | 6420 | goto out; |
| 6078 | if (!alloc_cpumask_var(&rd->online, gfp)) | 6421 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
| 6079 | goto free_span; | 6422 | goto free_span; |
| 6080 | if (!alloc_cpumask_var(&rd->rto_mask, gfp)) | 6423 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) |
| 6081 | goto free_online; | 6424 | goto free_online; |
| 6082 | 6425 | ||
| 6083 | if (cpupri_init(&rd->cpupri, bootmem) != 0) | 6426 | if (cpupri_init(&rd->cpupri) != 0) |
| 6084 | goto free_rto_mask; | 6427 | goto free_rto_mask; |
| 6085 | return 0; | 6428 | return 0; |
| 6086 | 6429 | ||
| @@ -6096,7 +6439,7 @@ out: | |||
| 6096 | 6439 | ||
| 6097 | static void init_defrootdomain(void) | 6440 | static void init_defrootdomain(void) |
| 6098 | { | 6441 | { |
| 6099 | init_rootdomain(&def_root_domain, true); | 6442 | init_rootdomain(&def_root_domain); |
| 6100 | 6443 | ||
| 6101 | atomic_set(&def_root_domain.refcount, 1); | 6444 | atomic_set(&def_root_domain.refcount, 1); |
| 6102 | } | 6445 | } |
| @@ -6109,7 +6452,7 @@ static struct root_domain *alloc_rootdomain(void) | |||
| 6109 | if (!rd) | 6452 | if (!rd) |
| 6110 | return NULL; | 6453 | return NULL; |
| 6111 | 6454 | ||
| 6112 | if (init_rootdomain(rd, false) != 0) { | 6455 | if (init_rootdomain(rd) != 0) { |
| 6113 | kfree(rd); | 6456 | kfree(rd); |
| 6114 | return NULL; | 6457 | return NULL; |
| 6115 | } | 6458 | } |
| @@ -6319,6 +6662,7 @@ struct s_data { | |||
| 6319 | cpumask_var_t nodemask; | 6662 | cpumask_var_t nodemask; |
| 6320 | cpumask_var_t this_sibling_map; | 6663 | cpumask_var_t this_sibling_map; |
| 6321 | cpumask_var_t this_core_map; | 6664 | cpumask_var_t this_core_map; |
| 6665 | cpumask_var_t this_book_map; | ||
| 6322 | cpumask_var_t send_covered; | 6666 | cpumask_var_t send_covered; |
| 6323 | cpumask_var_t tmpmask; | 6667 | cpumask_var_t tmpmask; |
| 6324 | struct sched_group **sched_group_nodes; | 6668 | struct sched_group **sched_group_nodes; |
| @@ -6330,6 +6674,7 @@ enum s_alloc { | |||
| 6330 | sa_rootdomain, | 6674 | sa_rootdomain, |
| 6331 | sa_tmpmask, | 6675 | sa_tmpmask, |
| 6332 | sa_send_covered, | 6676 | sa_send_covered, |
| 6677 | sa_this_book_map, | ||
| 6333 | sa_this_core_map, | 6678 | sa_this_core_map, |
| 6334 | sa_this_sibling_map, | 6679 | sa_this_sibling_map, |
| 6335 | sa_nodemask, | 6680 | sa_nodemask, |
| @@ -6365,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | |||
| 6365 | #ifdef CONFIG_SCHED_MC | 6710 | #ifdef CONFIG_SCHED_MC |
| 6366 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | 6711 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); |
| 6367 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | 6712 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); |
| 6368 | #endif /* CONFIG_SCHED_MC */ | ||
| 6369 | 6713 | ||
| 6370 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | ||
| 6371 | static int | 6714 | static int |
| 6372 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6715 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, |
| 6373 | struct sched_group **sg, struct cpumask *mask) | 6716 | struct sched_group **sg, struct cpumask *mask) |
| 6374 | { | 6717 | { |
| 6375 | int group; | 6718 | int group; |
| 6376 | 6719 | #ifdef CONFIG_SCHED_SMT | |
| 6377 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | 6720 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); |
| 6378 | group = cpumask_first(mask); | 6721 | group = cpumask_first(mask); |
| 6722 | #else | ||
| 6723 | group = cpu; | ||
| 6724 | #endif | ||
| 6379 | if (sg) | 6725 | if (sg) |
| 6380 | *sg = &per_cpu(sched_group_core, group).sg; | 6726 | *sg = &per_cpu(sched_group_core, group).sg; |
| 6381 | return group; | 6727 | return group; |
| 6382 | } | 6728 | } |
| 6383 | #elif defined(CONFIG_SCHED_MC) | 6729 | #endif /* CONFIG_SCHED_MC */ |
| 6730 | |||
| 6731 | /* | ||
| 6732 | * book sched-domains: | ||
| 6733 | */ | ||
| 6734 | #ifdef CONFIG_SCHED_BOOK | ||
| 6735 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
| 6736 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
| 6737 | |||
| 6384 | static int | 6738 | static int |
| 6385 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6739 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, |
| 6386 | struct sched_group **sg, struct cpumask *unused) | 6740 | struct sched_group **sg, struct cpumask *mask) |
| 6387 | { | 6741 | { |
| 6742 | int group = cpu; | ||
| 6743 | #ifdef CONFIG_SCHED_MC | ||
| 6744 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
| 6745 | group = cpumask_first(mask); | ||
| 6746 | #elif defined(CONFIG_SCHED_SMT) | ||
| 6747 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
| 6748 | group = cpumask_first(mask); | ||
| 6749 | #endif | ||
| 6388 | if (sg) | 6750 | if (sg) |
| 6389 | *sg = &per_cpu(sched_group_core, cpu).sg; | 6751 | *sg = &per_cpu(sched_group_book, group).sg; |
| 6390 | return cpu; | 6752 | return group; |
| 6391 | } | 6753 | } |
| 6392 | #endif | 6754 | #endif /* CONFIG_SCHED_BOOK */ |
| 6393 | 6755 | ||
| 6394 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 6756 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); |
| 6395 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 6757 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); |
| @@ -6399,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | |||
| 6399 | struct sched_group **sg, struct cpumask *mask) | 6761 | struct sched_group **sg, struct cpumask *mask) |
| 6400 | { | 6762 | { |
| 6401 | int group; | 6763 | int group; |
| 6402 | #ifdef CONFIG_SCHED_MC | 6764 | #ifdef CONFIG_SCHED_BOOK |
| 6765 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
| 6766 | group = cpumask_first(mask); | ||
| 6767 | #elif defined(CONFIG_SCHED_MC) | ||
| 6403 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | 6768 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); |
| 6404 | group = cpumask_first(mask); | 6769 | group = cpumask_first(mask); |
| 6405 | #elif defined(CONFIG_SCHED_SMT) | 6770 | #elif defined(CONFIG_SCHED_SMT) |
| @@ -6660,6 +7025,9 @@ SD_INIT_FUNC(CPU) | |||
| 6660 | #ifdef CONFIG_SCHED_MC | 7025 | #ifdef CONFIG_SCHED_MC |
| 6661 | SD_INIT_FUNC(MC) | 7026 | SD_INIT_FUNC(MC) |
| 6662 | #endif | 7027 | #endif |
| 7028 | #ifdef CONFIG_SCHED_BOOK | ||
| 7029 | SD_INIT_FUNC(BOOK) | ||
| 7030 | #endif | ||
| 6663 | 7031 | ||
| 6664 | static int default_relax_domain_level = -1; | 7032 | static int default_relax_domain_level = -1; |
| 6665 | 7033 | ||
| @@ -6709,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
| 6709 | free_cpumask_var(d->tmpmask); /* fall through */ | 7077 | free_cpumask_var(d->tmpmask); /* fall through */ |
| 6710 | case sa_send_covered: | 7078 | case sa_send_covered: |
| 6711 | free_cpumask_var(d->send_covered); /* fall through */ | 7079 | free_cpumask_var(d->send_covered); /* fall through */ |
| 7080 | case sa_this_book_map: | ||
| 7081 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
| 6712 | case sa_this_core_map: | 7082 | case sa_this_core_map: |
| 6713 | free_cpumask_var(d->this_core_map); /* fall through */ | 7083 | free_cpumask_var(d->this_core_map); /* fall through */ |
| 6714 | case sa_this_sibling_map: | 7084 | case sa_this_sibling_map: |
| @@ -6755,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | |||
| 6755 | return sa_nodemask; | 7125 | return sa_nodemask; |
| 6756 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | 7126 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) |
| 6757 | return sa_this_sibling_map; | 7127 | return sa_this_sibling_map; |
| 6758 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | 7128 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) |
| 6759 | return sa_this_core_map; | 7129 | return sa_this_core_map; |
| 7130 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
| 7131 | return sa_this_book_map; | ||
| 6760 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | 7132 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) |
| 6761 | return sa_send_covered; | 7133 | return sa_send_covered; |
| 6762 | d->rd = alloc_rootdomain(); | 7134 | d->rd = alloc_rootdomain(); |
| @@ -6814,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | |||
| 6814 | return sd; | 7186 | return sd; |
| 6815 | } | 7187 | } |
| 6816 | 7188 | ||
| 7189 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | ||
| 7190 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
| 7191 | struct sched_domain *parent, int i) | ||
| 7192 | { | ||
| 7193 | struct sched_domain *sd = parent; | ||
| 7194 | #ifdef CONFIG_SCHED_BOOK | ||
| 7195 | sd = &per_cpu(book_domains, i).sd; | ||
| 7196 | SD_INIT(sd, BOOK); | ||
| 7197 | set_domain_attribute(sd, attr); | ||
| 7198 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
| 7199 | sd->parent = parent; | ||
| 7200 | parent->child = sd; | ||
| 7201 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 7202 | #endif | ||
| 7203 | return sd; | ||
| 7204 | } | ||
| 7205 | |||
| 6817 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7206 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, |
| 6818 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7207 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
| 6819 | struct sched_domain *parent, int i) | 7208 | struct sched_domain *parent, int i) |
| @@ -6871,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | |||
| 6871 | d->send_covered, d->tmpmask); | 7260 | d->send_covered, d->tmpmask); |
| 6872 | break; | 7261 | break; |
| 6873 | #endif | 7262 | #endif |
| 7263 | #ifdef CONFIG_SCHED_BOOK | ||
| 7264 | case SD_LV_BOOK: /* set up book groups */ | ||
| 7265 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
| 7266 | if (cpu == cpumask_first(d->this_book_map)) | ||
| 7267 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
| 7268 | &cpu_to_book_group, | ||
| 7269 | d->send_covered, d->tmpmask); | ||
| 7270 | break; | ||
| 7271 | #endif | ||
| 6874 | case SD_LV_CPU: /* set up physical groups */ | 7272 | case SD_LV_CPU: /* set up physical groups */ |
| 6875 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | 7273 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
| 6876 | if (!cpumask_empty(d->nodemask)) | 7274 | if (!cpumask_empty(d->nodemask)) |
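Note: the new SD_LV_BOOK case builds one sched_group set per book rather than per CPU. Every CPU computes its book's span within cpu_map, but only the first CPU of that span calls init_sched_build_groups(), so the work runs exactly once per book; this is the same de-duplication trick the SMT and MC cases use. A standalone illustration:

	#include <stdio.h>

	/* Lowest set bit = "first CPU" in the mask, like cpumask_first(). */
	static int mask_first(unsigned int mask)
	{
		return __builtin_ctz(mask);
	}

	int main(void)
	{
		/* CPUs 0-1 share book A (mask 0x3), CPUs 2-3 share book B (0xc). */
		unsigned int book_span[4] = { 0x3, 0x3, 0xc, 0xc };
		int cpu;

		for (cpu = 0; cpu < 4; cpu++)
			if (cpu == mask_first(book_span[cpu]))
				printf("cpu%d builds the groups for book mask 0x%x\n",
				       cpu, book_span[cpu]);
		return 0;
	}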
| @@ -6918,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
| 6918 | 7316 | ||
| 6919 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | 7317 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
| 6920 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | 7318 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
| 7319 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | ||
| 6921 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | 7320 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
| 6922 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | 7321 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
| 6923 | } | 7322 | } |
| 6924 | 7323 | ||
| 6925 | for_each_cpu(i, cpu_map) { | 7324 | for_each_cpu(i, cpu_map) { |
| 6926 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | 7325 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
| 7326 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
| 6927 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | 7327 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); |
| 6928 | } | 7328 | } |
| 6929 | 7329 | ||
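Note: with the extra builder and group pass wired in, every CPU ends up with a domain chain ordered NUMA, then physical (CPU), then BOOK, then MC, then SMT, each child spanning a subset of its parent, and the groups are then built bottom-up per level. The new level's place in enum sched_domain_level, reconstructed from the call order above (exact member order is an assumption, not verbatim):

	enum sched_domain_level {
		SD_LV_NONE = 0,
		SD_LV_SIBLING,	/* SMT hardware threads               */
		SD_LV_MC,	/* cores sharing a cache              */
		SD_LV_BOOK,	/* new: books, i.e. groups of sockets */
		SD_LV_CPU,	/* physical package / node            */
		SD_LV_NODE,
		SD_LV_ALLNODES,
		SD_LV_MAX
	};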
| @@ -6954,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
| 6954 | init_sched_groups_power(i, sd); | 7354 | init_sched_groups_power(i, sd); |
| 6955 | } | 7355 | } |
| 6956 | #endif | 7356 | #endif |
| 7357 | #ifdef CONFIG_SCHED_BOOK | ||
| 7358 | for_each_cpu(i, cpu_map) { | ||
| 7359 | sd = &per_cpu(book_domains, i).sd; | ||
| 7360 | init_sched_groups_power(i, sd); | ||
| 7361 | } | ||
| 7362 | #endif | ||
| 6957 | 7363 | ||
| 6958 | for_each_cpu(i, cpu_map) { | 7364 | for_each_cpu(i, cpu_map) { |
| 6959 | sd = &per_cpu(phys_domains, i).sd; | 7365 | sd = &per_cpu(phys_domains, i).sd; |
| @@ -6979,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
| 6979 | sd = &per_cpu(cpu_domains, i).sd; | 7385 | sd = &per_cpu(cpu_domains, i).sd; |
| 6980 | #elif defined(CONFIG_SCHED_MC) | 7386 | #elif defined(CONFIG_SCHED_MC) |
| 6981 | sd = &per_cpu(core_domains, i).sd; | 7387 | sd = &per_cpu(core_domains, i).sd; |
| 7388 | #elif defined(CONFIG_SCHED_BOOK) | ||
| 7389 | sd = &per_cpu(book_domains, i).sd; | ||
| 6982 | #else | 7390 | #else |
| 6983 | sd = &per_cpu(phys_domains, i).sd; | 7391 | sd = &per_cpu(phys_domains, i).sd; |
| 6984 | #endif | 7392 | #endif |
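Note: when the final loop attaches a base domain to each CPU, the #elif chain selects the most fine-grained topology level that is configured: SMT threads if available, else cores, else (now) books, else the physical package. The same precedence written out as a hypothetical macro, purely for illustration; the kernel open-codes this choice right before handing the result to cpu_attach_domain():

	#if defined(CONFIG_SCHED_SMT)
	#define BASE_DOMAIN(i)	(&per_cpu(cpu_domains, (i)).sd)		/* HW threads */
	#elif defined(CONFIG_SCHED_MC)
	#define BASE_DOMAIN(i)	(&per_cpu(core_domains, (i)).sd)	/* cores      */
	#elif defined(CONFIG_SCHED_BOOK)
	#define BASE_DOMAIN(i)	(&per_cpu(book_domains, (i)).sd)	/* books      */
	#else
	#define BASE_DOMAIN(i)	(&per_cpu(phys_domains, (i)).sd)	/* packages   */
	#endif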
| @@ -7288,29 +7696,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
| 7288 | } | 7696 | } |
| 7289 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 7697 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
| 7290 | 7698 | ||
| 7291 | #ifndef CONFIG_CPUSETS | ||
| 7292 | /* | 7699 | /* |
| 7293 | * Add online and remove offline CPUs from the scheduler domains. | 7700 | * Update cpusets according to cpu_active mask. If cpusets are |
| 7294 | * When cpusets are enabled they take over this function. | 7701 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
| 7702 | * around partition_sched_domains(). | ||
| 7295 | */ | 7703 | */ |
| 7296 | static int update_sched_domains(struct notifier_block *nfb, | 7704 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, |
| 7297 | unsigned long action, void *hcpu) | 7705 | void *hcpu) |
| 7298 | { | 7706 | { |
| 7299 | switch (action) { | 7707 | switch (action & ~CPU_TASKS_FROZEN) { |
| 7300 | case CPU_ONLINE: | 7708 | case CPU_ONLINE: |
| 7301 | case CPU_ONLINE_FROZEN: | ||
| 7302 | case CPU_DOWN_PREPARE: | ||
| 7303 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 7304 | case CPU_DOWN_FAILED: | 7709 | case CPU_DOWN_FAILED: |
| 7305 | case CPU_DOWN_FAILED_FROZEN: | 7710 | cpuset_update_active_cpus(); |
| 7306 | partition_sched_domains(1, NULL, NULL); | ||
| 7307 | return NOTIFY_OK; | 7711 | return NOTIFY_OK; |
| 7712 | default: | ||
| 7713 | return NOTIFY_DONE; | ||
| 7714 | } | ||
| 7715 | } | ||
| 7308 | 7716 | ||
| 7717 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | ||
| 7718 | void *hcpu) | ||
| 7719 | { | ||
| 7720 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 7721 | case CPU_DOWN_PREPARE: | ||
| 7722 | cpuset_update_active_cpus(); | ||
| 7723 | return NOTIFY_OK; | ||
| 7309 | default: | 7724 | default: |
| 7310 | return NOTIFY_DONE; | 7725 | return NOTIFY_DONE; |
| 7311 | } | 7726 | } |
| 7312 | } | 7727 | } |
| 7313 | #endif | ||
| 7314 | 7728 | ||
| 7315 | static int update_runtime(struct notifier_block *nfb, | 7729 | static int update_runtime(struct notifier_block *nfb, |
| 7316 | unsigned long action, void *hcpu) | 7730 | unsigned long action, void *hcpu) |
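Note: the single update_sched_domains() notifier is replaced by two cpuset-aware callbacks split by direction: cpuset_cpu_active() handles CPU_ONLINE and CPU_DOWN_FAILED, cpuset_cpu_inactive() handles CPU_DOWN_PREPARE. Masking off CPU_TASKS_FROZEN lets one case cover both the normal and the suspend/resume variant of each event, which is why the *_FROZEN cases disappear. The idiom in isolation (hypothetical callback, real constants):

	/* CPU_ONLINE_FROZEN is CPU_ONLINE | CPU_TASKS_FROZEN, so clearing the
	 * flag folds the frozen variant into the same case. */
	static int my_cpu_callback(struct notifier_block *nfb,
				   unsigned long action, void *hcpu)
	{
		switch (action & ~CPU_TASKS_FROZEN) {
		case CPU_ONLINE:
			/* CPU came (back) online, frozen path or not */
			return NOTIFY_OK;
		default:
			return NOTIFY_DONE;
		}
	}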
| @@ -7356,10 +7770,8 @@ void __init sched_init_smp(void) | |||
| 7356 | mutex_unlock(&sched_domains_mutex); | 7770 | mutex_unlock(&sched_domains_mutex); |
| 7357 | put_online_cpus(); | 7771 | put_online_cpus(); |
| 7358 | 7772 | ||
| 7359 | #ifndef CONFIG_CPUSETS | 7773 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
| 7360 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7774 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
| 7361 | hotcpu_notifier(update_sched_domains, 0); | ||
| 7362 | #endif | ||
| 7363 | 7775 | ||
| 7364 | /* RT runtime code needs to handle some hotplug events */ | 7776 | /* RT runtime code needs to handle some hotplug events */ |
| 7365 | hotcpu_notifier(update_runtime, 0); | 7777 | hotcpu_notifier(update_runtime, 0); |
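Note: registration now passes explicit priorities instead of 0 and is no longer guarded by #ifndef CONFIG_CPUSETS, since cpuset_update_active_cpus() degrades to a partition_sched_domains() wrapper when cpusets are off. CPU_PRI_CPUSET_ACTIVE and CPU_PRI_CPUSET_INACTIVE come from include/linux/cpu.h and order these callbacks relative to the scheduler's own cpu_active bookkeeping, which is what retires the old "theoretical race" comment. Rough shape of that priority block (exact values are an assumption):

	/* Assumed layout: the scheduler's active/inactive callbacks run before
	 * the cpuset ones in each direction, so cpu_active is already
	 * consistent when the domains get rebuilt. */
	enum {
		CPU_PRI_SCHED_ACTIVE	= INT_MAX,
		CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
		CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
		CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
	};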
| @@ -7604,6 +8016,9 @@ void __init sched_init(void) | |||
| 7604 | 8016 | ||
| 7605 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8017 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
| 7606 | rq->cpu_load[j] = 0; | 8018 | rq->cpu_load[j] = 0; |
| 8019 | |||
| 8020 | rq->last_load_update_tick = jiffies; | ||
| 8021 | |||
| 7607 | #ifdef CONFIG_SMP | 8022 | #ifdef CONFIG_SMP |
| 7608 | rq->sd = NULL; | 8023 | rq->sd = NULL; |
| 7609 | rq->rd = NULL; | 8024 | rq->rd = NULL; |
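Note: last_load_update_tick records the jiffy at which rq->cpu_load[] was last folded, so a CPU returning from a long tickless idle can decay the load indexes for all the ticks it skipped in one go instead of pretending no time passed. Conceptually (illustrative only; the kernel uses a precomputed degrade table rather than this loop):

	/* cpu_load[idx] normally decays as load -= load >> idx per idle tick;
	 * replay that for the 'missed' ticks at wakeup. */
	static unsigned long decay_load_missed_sketch(unsigned long load,
						      unsigned int missed, int idx)
	{
		if (idx == 0)
			return 0;	/* index 0 tracks the instantaneous load */
		while (missed--)
			load -= load >> idx;
		return load;
	}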
| @@ -7617,6 +8032,10 @@ void __init sched_init(void) | |||
| 7617 | rq->idle_stamp = 0; | 8032 | rq->idle_stamp = 0; |
| 7618 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 8033 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
| 7619 | rq_attach_root(rq, &def_root_domain); | 8034 | rq_attach_root(rq, &def_root_domain); |
| 8035 | #ifdef CONFIG_NO_HZ | ||
| 8036 | rq->nohz_balance_kick = 0; | ||
| 8037 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
| 8038 | #endif | ||
| 7620 | #endif | 8039 | #endif |
| 7621 | init_rq_hrtick(rq); | 8040 | init_rq_hrtick(rq); |
| 7622 | atomic_set(&rq->nr_iowait, 0); | 8041 | atomic_set(&rq->nr_iowait, 0); |
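Note: these per-rq NO_HZ fields back the new push-model idle balancing: nohz_balance_kick marks that some busy CPU has asked this idle CPU to run nohz load balancing, and the per-CPU call_single_data initialized here is the IPI payload used for that kick. Its definition lives earlier in the patch; reconstructed as a sketch, the payload simply raises SCHED_SOFTIRQ on the target:

	static void trigger_sched_softirq(void *data)
	{
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}

	static inline void init_sched_softirq_csd(struct call_single_data *csd)
	{
		csd->func = trigger_sched_softirq;
		csd->info = NULL;
		csd->flags = 0;
	}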
| @@ -7661,8 +8080,11 @@ void __init sched_init(void) | |||
| 7661 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 8080 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
| 7662 | #ifdef CONFIG_SMP | 8081 | #ifdef CONFIG_SMP |
| 7663 | #ifdef CONFIG_NO_HZ | 8082 | #ifdef CONFIG_NO_HZ |
| 7664 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); | 8083 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
| 7665 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | 8084 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
| 8085 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
| 8086 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | ||
| 8087 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | ||
| 7666 | #endif | 8088 | #endif |
| 7667 | /* May be allocated at isolcpus cmdline parse time */ | 8089 | /* May be allocated at isolcpus cmdline parse time */ |
| 7668 | if (cpu_isolated_map == NULL) | 8090 | if (cpu_isolated_map == NULL) |
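Note: the global nohz bookkeeping changes shape as well: cpu_mask becomes idle_cpus_mask (the set of tickless-idle CPUs), ilb_grp_nohz_mask becomes grp_idle_mask, and the three atomics load_balancer, first_pick_cpu and second_pick_cpu start at nr_cpu_ids, which serves as the "no CPU holds this role" sentinel. CPUs then claim a role with an atomic compare-and-exchange so only one winner emerges; a sketch of that idiom (the helper name is illustrative):

	/* Returns non-zero if 'cpu' now holds, or already held, the role. */
	static int claim_nohz_role(atomic_t *role, int cpu)
	{
		int old = atomic_cmpxchg(role, nr_cpu_ids, cpu);

		return old == nr_cpu_ids || old == cpu;
	}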
| @@ -7869,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 7869 | 8291 | ||
| 7870 | return 1; | 8292 | return 1; |
| 7871 | 8293 | ||
| 7872 | err_free_rq: | 8294 | err_free_rq: |
| 7873 | kfree(cfs_rq); | 8295 | kfree(cfs_rq); |
| 7874 | err: | 8296 | err: |
| 7875 | return 0; | 8297 | return 0; |
| 7876 | } | 8298 | } |
| 7877 | 8299 | ||
| @@ -7959,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 7959 | 8381 | ||
| 7960 | return 1; | 8382 | return 1; |
| 7961 | 8383 | ||
| 7962 | err_free_rq: | 8384 | err_free_rq: |
| 7963 | kfree(rt_rq); | 8385 | kfree(rt_rq); |
| 7964 | err: | 8386 | err: |
| 7965 | return 0; | 8387 | return 0; |
| 7966 | } | 8388 | } |
| 7967 | 8389 | ||
| @@ -8319,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
| 8319 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8741 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
| 8320 | } | 8742 | } |
| 8321 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8743 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
| 8322 | unlock: | 8744 | unlock: |
| 8323 | read_unlock(&tasklist_lock); | 8745 | read_unlock(&tasklist_lock); |
| 8324 | mutex_unlock(&rt_constraints_mutex); | 8746 | mutex_unlock(&rt_constraints_mutex); |
| 8325 | 8747 | ||
