Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 356
1 file changed, 293 insertions(+), 63 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb90832..dc91a4d09ac3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain { | |||
426 | */ | 426 | */ |
427 | cpumask_var_t rto_mask; | 427 | cpumask_var_t rto_mask; |
428 | atomic_t rto_count; | 428 | atomic_t rto_count; |
429 | #ifdef CONFIG_SMP | ||
430 | struct cpupri cpupri; | 429 | struct cpupri cpupri; |
431 | #endif | ||
432 | }; | 430 | }; |
433 | 431 | ||
434 | /* | 432 | /* |
@@ -437,7 +435,7 @@ struct root_domain { | |||
437 | */ | 435 | */ |
438 | static struct root_domain def_root_domain; | 436 | static struct root_domain def_root_domain; |
439 | 437 | ||
440 | #endif | 438 | #endif /* CONFIG_SMP */ |
441 | 439 | ||
442 | /* | 440 | /* |
443 | * This is the main, per-CPU runqueue data structure. | 441 | * This is the main, per-CPU runqueue data structure. |
@@ -488,11 +486,12 @@ struct rq { | |||
488 | */ | 486 | */ |
489 | unsigned long nr_uninterruptible; | 487 | unsigned long nr_uninterruptible; |
490 | 488 | ||
491 | struct task_struct *curr, *idle; | 489 | struct task_struct *curr, *idle, *stop; |
492 | unsigned long next_balance; | 490 | unsigned long next_balance; |
493 | struct mm_struct *prev_mm; | 491 | struct mm_struct *prev_mm; |
494 | 492 | ||
495 | u64 clock; | 493 | u64 clock; |
494 | u64 clock_task; | ||
496 | 495 | ||
497 | atomic_t nr_iowait; | 496 | atomic_t nr_iowait; |
498 | 497 | ||
@@ -520,6 +519,10 @@ struct rq { | |||
520 | u64 avg_idle; | 519 | u64 avg_idle; |
521 | #endif | 520 | #endif |
522 | 521 | ||
522 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
523 | u64 prev_irq_time; | ||
524 | #endif | ||
525 | |||
523 | /* calc_load related fields */ | 526 | /* calc_load related fields */ |
524 | unsigned long calc_load_update; | 527 | unsigned long calc_load_update; |
525 | long calc_load_active; | 528 | long calc_load_active; |
@@ -557,18 +560,8 @@ struct rq { | |||
557 | 560 | ||
558 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 561 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
559 | 562 | ||
560 | static inline | ||
561 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
562 | { | ||
563 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
564 | 563 | ||
565 | /* | 564 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); |
566 | * A queue event has occurred, and we're going to schedule. In | ||
567 | * this case, we can save a useless back to back clock update. | ||
568 | */ | ||
569 | if (test_tsk_need_resched(p)) | ||
570 | rq->skip_clock_update = 1; | ||
571 | } | ||
572 | 565 | ||
573 | static inline int cpu_of(struct rq *rq) | 566 | static inline int cpu_of(struct rq *rq) |
574 | { | 567 | { |
@@ -643,10 +636,22 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
643 | 636 | ||
644 | #endif /* CONFIG_CGROUP_SCHED */ | 637 | #endif /* CONFIG_CGROUP_SCHED */ |
645 | 638 | ||
639 | static u64 irq_time_cpu(int cpu); | ||
640 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
641 | |||
646 | inline void update_rq_clock(struct rq *rq) | 642 | inline void update_rq_clock(struct rq *rq) |
647 | { | 643 | { |
648 | if (!rq->skip_clock_update) | 644 | if (!rq->skip_clock_update) { |
649 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 645 | int cpu = cpu_of(rq); |
646 | u64 irq_time; | ||
647 | |||
648 | rq->clock = sched_clock_cpu(cpu); | ||
649 | irq_time = irq_time_cpu(cpu); | ||
650 | if (rq->clock - irq_time > rq->clock_task) | ||
651 | rq->clock_task = rq->clock - irq_time; | ||
652 | |||
653 | sched_irq_time_avg_update(rq, irq_time); | ||
654 | } | ||
650 | } | 655 | } |
651 | 656 | ||
652 | /* | 657 | /* |
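
The hunk above is where the new rq->clock_task field gets its value: rq->clock still tracks raw per-CPU time, while clock_task only advances by the part of the elapsed time that was not spent in hard or soft interrupts (irq_time_cpu() returns the sum of the two per-CPU counters added later in this patch), and it is clamped so it never moves backwards. A minimal userspace sketch of that update rule, with invented sample values and names (rq_sketch, update_rq_clock_sketch) rather than real scheduler state:

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the clock_task update rule: advance clock_task by the part of
 * the wall-clock delta not consumed by interrupt handling, and never let
 * it move backwards. Values are arbitrary and purely illustrative.
 */
struct rq_sketch {
        uint64_t clock;         /* raw per-cpu scheduler clock, in ns */
        uint64_t clock_task;    /* clock minus accumulated irq time   */
};

static void update_rq_clock_sketch(struct rq_sketch *rq,
                                   uint64_t now, uint64_t irq_time)
{
        rq->clock = now;
        if (rq->clock - irq_time > rq->clock_task)      /* only move forward */
                rq->clock_task = rq->clock - irq_time;
}

int main(void)
{
        struct rq_sketch rq = { 0, 0 };
        uint64_t irq_time = 0;

        update_rq_clock_sketch(&rq, 1000, irq_time);    /* no irq time yet   */
        irq_time += 300;                                /* 300 ns in hardirq */
        update_rq_clock_sketch(&rq, 2000, irq_time);
        printf("clock=%llu clock_task=%llu\n",          /* 2000 vs 1700      */
               (unsigned long long)rq.clock,
               (unsigned long long)rq.clock_task);
        return 0;
}
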
@@ -723,7 +728,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
723 | size_t cnt, loff_t *ppos) | 728 | size_t cnt, loff_t *ppos) |
724 | { | 729 | { |
725 | char buf[64]; | 730 | char buf[64]; |
726 | char *cmp = buf; | 731 | char *cmp; |
727 | int neg = 0; | 732 | int neg = 0; |
728 | int i; | 733 | int i; |
729 | 734 | ||
@@ -734,6 +739,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
734 | return -EFAULT; | 739 | return -EFAULT; |
735 | 740 | ||
736 | buf[cnt] = 0; | 741 | buf[cnt] = 0; |
742 | cmp = strstrip(buf); | ||
737 | 743 | ||
738 | if (strncmp(buf, "NO_", 3) == 0) { | 744 | if (strncmp(buf, "NO_", 3) == 0) { |
739 | neg = 1; | 745 | neg = 1; |
@@ -741,9 +747,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
741 | } | 747 | } |
742 | 748 | ||
743 | for (i = 0; sched_feat_names[i]; i++) { | 749 | for (i = 0; sched_feat_names[i]; i++) { |
744 | int len = strlen(sched_feat_names[i]); | 750 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
745 | |||
746 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
747 | if (neg) | 751 | if (neg) |
748 | sysctl_sched_features &= ~(1UL << i); | 752 | sysctl_sched_features &= ~(1UL << i); |
749 | else | 753 | else |
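
The parser change in this hunk is worth spelling out: the old code matched each feature name with strncmp() limited to the table entry's length, so any input that merely began with a feature name was accepted (which also happened to tolerate the newline that `echo` appends); the new code strips trailing whitespace with strstrip() and then insists on an exact strcmp() match. A rough userspace illustration of the new behaviour, using a couple of example feature names and a stand-in for strstrip():

#include <stdio.h>
#include <string.h>
#include <ctype.h>

/* Example feature names, only to illustrate the exact-match parsing. */
static const char *feat_names[] = { "GENTLE_FAIR_SLEEPERS", "START_DEBIT", NULL };

/* Minimal stand-in for the kernel's strstrip(): trim trailing whitespace. */
static char *strip(char *s)
{
        size_t len = strlen(s);

        while (len && isspace((unsigned char)s[len - 1]))
                s[--len] = '\0';
        return s;
}

int main(void)
{
        char buf[64] = "START_DEBIT\n";   /* what `echo START_DEBIT > ...` writes */
        char *cmp = strip(buf);
        int i;

        for (i = 0; feat_names[i]; i++) {
                /* exact match: "START_DEBITxyz" no longer counts as a hit */
                if (strcmp(cmp, feat_names[i]) == 0) {
                        printf("matched feature bit %d: %s\n", i, feat_names[i]);
                        break;
                }
        }
        return 0;
}
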
@@ -1840,7 +1844,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1840 | 1844 | ||
1841 | static const struct sched_class rt_sched_class; | 1845 | static const struct sched_class rt_sched_class; |
1842 | 1846 | ||
1843 | #define sched_class_highest (&rt_sched_class) | 1847 | #define sched_class_highest (&stop_sched_class) |
1844 | #define for_each_class(class) \ | 1848 | #define for_each_class(class) \ |
1845 | for (class = sched_class_highest; class; class = class->next) | 1849 | for (class = sched_class_highest; class; class = class->next) |
1846 | 1850 | ||
@@ -1858,12 +1862,6 @@ static void dec_nr_running(struct rq *rq) | |||
1858 | 1862 | ||
1859 | static void set_load_weight(struct task_struct *p) | 1863 | static void set_load_weight(struct task_struct *p) |
1860 | { | 1864 | { |
1861 | if (task_has_rt_policy(p)) { | ||
1862 | p->se.load.weight = 0; | ||
1863 | p->se.load.inv_weight = WMULT_CONST; | ||
1864 | return; | ||
1865 | } | ||
1866 | |||
1867 | /* | 1865 | /* |
1868 | * SCHED_IDLE tasks get minimal weight: | 1866 | * SCHED_IDLE tasks get minimal weight: |
1869 | */ | 1867 | */ |
@@ -1917,13 +1915,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1917 | dec_nr_running(rq); | 1915 | dec_nr_running(rq); |
1918 | } | 1916 | } |
1919 | 1917 | ||
1918 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1919 | |||
1920 | /* | ||
1921 | * There are no locks covering percpu hardirq/softirq time. | ||
1922 | * They are only modified in account_system_vtime, on corresponding CPU | ||
1923 | * with interrupts disabled. So, writes are safe. | ||
1924 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
1925 | * This may result in other CPU reading this CPU's irq time and can | ||
1926 | * race with irq/account_system_vtime on this CPU. We would either get old | ||
1927 | * or new value (or semi updated value on 32 bit) with a side effect of | ||
1928 | * accounting a slice of irq time to wrong task when irq is in progress | ||
1929 | * while we read rq->clock. That is a worthy compromise in place of having | ||
1930 | * locks on each irq in account_system_time. | ||
1931 | */ | ||
1932 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
1933 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
1934 | |||
1935 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
1936 | static int sched_clock_irqtime; | ||
1937 | |||
1938 | void enable_sched_clock_irqtime(void) | ||
1939 | { | ||
1940 | sched_clock_irqtime = 1; | ||
1941 | } | ||
1942 | |||
1943 | void disable_sched_clock_irqtime(void) | ||
1944 | { | ||
1945 | sched_clock_irqtime = 0; | ||
1946 | } | ||
1947 | |||
1948 | static u64 irq_time_cpu(int cpu) | ||
1949 | { | ||
1950 | if (!sched_clock_irqtime) | ||
1951 | return 0; | ||
1952 | |||
1953 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1954 | } | ||
1955 | |||
1956 | void account_system_vtime(struct task_struct *curr) | ||
1957 | { | ||
1958 | unsigned long flags; | ||
1959 | int cpu; | ||
1960 | u64 now, delta; | ||
1961 | |||
1962 | if (!sched_clock_irqtime) | ||
1963 | return; | ||
1964 | |||
1965 | local_irq_save(flags); | ||
1966 | |||
1967 | cpu = smp_processor_id(); | ||
1968 | now = sched_clock_cpu(cpu); | ||
1969 | delta = now - per_cpu(irq_start_time, cpu); | ||
1970 | per_cpu(irq_start_time, cpu) = now; | ||
1971 | /* | ||
1972 | * We do not account for softirq time from ksoftirqd here. | ||
1973 | * We want to continue accounting softirq time to ksoftirqd thread | ||
1974 | * in that case, so as not to confuse scheduler with a special task | ||
1975 | * that do not consume any time, but still wants to run. | ||
1976 | */ | ||
1977 | if (hardirq_count()) | ||
1978 | per_cpu(cpu_hardirq_time, cpu) += delta; | ||
1979 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | ||
1980 | per_cpu(cpu_softirq_time, cpu) += delta; | ||
1981 | |||
1982 | local_irq_restore(flags); | ||
1983 | } | ||
1984 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
1985 | |||
1986 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | ||
1987 | { | ||
1988 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | ||
1989 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | ||
1990 | rq->prev_irq_time = curr_irq_time; | ||
1991 | sched_rt_avg_update(rq, delta_irq); | ||
1992 | } | ||
1993 | } | ||
1994 | |||
1995 | #else | ||
1996 | |||
1997 | static u64 irq_time_cpu(int cpu) | ||
1998 | { | ||
1999 | return 0; | ||
2000 | } | ||
2001 | |||
2002 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | ||
2003 | |||
2004 | #endif | ||
2005 | |||
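
The CONFIG_IRQ_TIME_ACCOUNTING block above works by sampling: account_system_vtime() is invoked on the interrupt entry/exit paths, measures the time elapsed since the previous sample on that CPU (irq_start_time), and credits the delta to the per-CPU hardirq or softirq counter depending on the context it finds itself in, deliberately leaving out softirq work done by ksoftirqd. A simplified single-CPU model of that bookkeeping, with invented timestamps in place of sched_clock_cpu() and an explicit context argument in place of hardirq_count()/in_serving_softirq():

#include <stdio.h>
#include <stdint.h>

/*
 * Single-CPU model of the irq-time bookkeeping: each sample credits the
 * time since the previous sample to the hardirq or softirq bucket.
 * Timestamps and the context argument are stand-ins for illustration.
 */
enum ctx { CTX_TASK, CTX_HARDIRQ, CTX_SOFTIRQ };

static uint64_t hardirq_time, softirq_time, irq_start_time;

static void account_sample(uint64_t now, enum ctx context)
{
        uint64_t delta = now - irq_start_time;

        irq_start_time = now;
        if (context == CTX_HARDIRQ)
                hardirq_time += delta;
        else if (context == CTX_SOFTIRQ)
                softirq_time += delta;
        /* task context: the delta was ordinary task time, not irq time */
}

int main(void)
{
        account_sample(100, CTX_TASK);     /* hardirq entry: 0..100 was task time */
        account_sample(130, CTX_HARDIRQ);  /* hardirq exit: 100..130 was hardirq  */
        account_sample(150, CTX_TASK);     /* softirq entry: 130..150 was task    */
        account_sample(180, CTX_SOFTIRQ);  /* softirq exit: 150..180 was softirq  */
        printf("hardirq=%llu softirq=%llu\n",
               (unsigned long long)hardirq_time,
               (unsigned long long)softirq_time);
        return 0;
}
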
1920 | #include "sched_idletask.c" | 2006 | #include "sched_idletask.c" |
1921 | #include "sched_fair.c" | 2007 | #include "sched_fair.c" |
1922 | #include "sched_rt.c" | 2008 | #include "sched_rt.c" |
2009 | #include "sched_stoptask.c" | ||
1923 | #ifdef CONFIG_SCHED_DEBUG | 2010 | #ifdef CONFIG_SCHED_DEBUG |
1924 | # include "sched_debug.c" | 2011 | # include "sched_debug.c" |
1925 | #endif | 2012 | #endif |
1926 | 2013 | ||
2014 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
2015 | { | ||
2016 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
2017 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
2018 | |||
2019 | if (stop) { | ||
2020 | /* | ||
2021 | * Make it appear like a SCHED_FIFO task, its something | ||
2022 | * userspace knows about and won't get confused about. | ||
2023 | * | ||
2024 | * Also, it will make PI more or less work without too | ||
2025 | * much confusion -- but then, stop work should not | ||
2026 | * rely on PI working anyway. | ||
2027 | */ | ||
2028 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
2029 | |||
2030 | stop->sched_class = &stop_sched_class; | ||
2031 | } | ||
2032 | |||
2033 | cpu_rq(cpu)->stop = stop; | ||
2034 | |||
2035 | if (old_stop) { | ||
2036 | /* | ||
2037 | * Reset it back to a normal scheduling class so that | ||
2038 | * it can die in pieces. | ||
2039 | */ | ||
2040 | old_stop->sched_class = &rt_sched_class; | ||
2041 | } | ||
2042 | } | ||
2043 | |||
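
sched_set_stop_task() above installs the per-CPU stop worker into the new rq->stop slot: the incoming thread is dressed up as SCHED_FIFO (so userspace sees something familiar), switched to the internal stop_sched_class, and only then published, while whatever was there before is demoted back to the rt class so it can exit normally. A stripped-down sketch of that swap pattern, with stand-in types (worker, worker_class) instead of task_struct and the real scheduling classes:

/*
 * Stand-in types; the real code uses struct task_struct and sched classes.
 * The ordering mirrors sched_set_stop_task(): fully set up the newcomer
 * before publishing it, then demote the old one afterwards.
 */
struct worker_class {
        const char *name;
};

struct worker {
        const struct worker_class *class;
};

static const struct worker_class stop_class = { "stop" };
static const struct worker_class rt_class   = { "rt"   };

static struct worker *cpu_stop_worker;          /* plays the role of rq->stop */

static void set_stop_worker(struct worker *new)
{
        struct worker *old = cpu_stop_worker;

        if (new)
                new->class = &stop_class;       /* promote before publishing  */

        cpu_stop_worker = new;

        if (old)
                old->class = &rt_class;         /* let the old one wind down  */
}

int main(void)
{
        struct worker w1 = { 0 }, w2 = { 0 };

        set_stop_worker(&w1);   /* w1 becomes the stop worker            */
        set_stop_worker(&w2);   /* w2 takes over, w1 drops back to "rt"  */
        return (w1.class == &rt_class && w2.class == &stop_class) ? 0 : 1;
}
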
1927 | /* | 2044 | /* |
1928 | * __normal_prio - return the priority that is based on the static prio | 2045 | * __normal_prio - return the priority that is based on the static prio |
1929 | */ | 2046 | */ |
@@ -1991,6 +2108,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1991 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2108 | p->sched_class->prio_changed(rq, p, oldprio, running); |
1992 | } | 2109 | } |
1993 | 2110 | ||
2111 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
2112 | { | ||
2113 | const struct sched_class *class; | ||
2114 | |||
2115 | if (p->sched_class == rq->curr->sched_class) { | ||
2116 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
2117 | } else { | ||
2118 | for_each_class(class) { | ||
2119 | if (class == rq->curr->sched_class) | ||
2120 | break; | ||
2121 | if (class == p->sched_class) { | ||
2122 | resched_task(rq->curr); | ||
2123 | break; | ||
2124 | } | ||
2125 | } | ||
2126 | } | ||
2127 | |||
2128 | /* | ||
2129 | * A queue event has occurred, and we're going to schedule. In | ||
2130 | * this case, we can save a useless back to back clock update. | ||
2131 | */ | ||
2132 | if (test_tsk_need_resched(rq->curr)) | ||
2133 | rq->skip_clock_update = 1; | ||
2134 | } | ||
2135 | |||
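
The rewritten check_preempt_curr() above only consults the class-specific hook when the waking task and the current task share a scheduling class; otherwise it walks the class list, which is linked from highest to lowest priority (stop, rt, fair, idle, matching the sched_class_highest change earlier in this diff), and reschedules the current task if the waking task's class is reached first. A compact userspace rendering of that ordered walk, with toy class objects standing in for the real sched classes:

#include <stdio.h>

/*
 * Toy scheduling classes linked from highest to lowest priority,
 * mirroring stop -> rt -> fair -> idle in the patch above.
 */
struct toy_class {
        const char *name;
        const struct toy_class *next;
};

static const struct toy_class idle_class = { "idle", NULL };
static const struct toy_class fair_class = { "fair", &idle_class };
static const struct toy_class rt_class   = { "rt",   &fair_class };
static const struct toy_class stop_class = { "stop", &rt_class };

#define class_highest (&stop_class)

/* Return 1 if a task of class wakee should preempt one of class curr. */
static int cross_class_preempt(const struct toy_class *curr,
                               const struct toy_class *wakee)
{
        const struct toy_class *class;

        for (class = class_highest; class; class = class->next) {
                if (class == curr)
                        return 0;  /* current class ranks at least as high */
                if (class == wakee)
                        return 1;  /* waking class ranks strictly higher   */
        }
        return 0;
}

int main(void)
{
        printf("rt over fair:  %d\n", cross_class_preempt(&fair_class, &rt_class));
        printf("fair over rt:  %d\n", cross_class_preempt(&rt_class, &fair_class));
        printf("stop over rt:  %d\n", cross_class_preempt(&rt_class, &stop_class));
        return 0;
}
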
1994 | #ifdef CONFIG_SMP | 2136 | #ifdef CONFIG_SMP |
1995 | /* | 2137 | /* |
1996 | * Is this task likely cache-hot: | 2138 | * Is this task likely cache-hot: |
@@ -2003,6 +2145,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
2003 | if (p->sched_class != &fair_sched_class) | 2145 | if (p->sched_class != &fair_sched_class) |
2004 | return 0; | 2146 | return 0; |
2005 | 2147 | ||
2148 | if (unlikely(p->policy == SCHED_IDLE)) | ||
2149 | return 0; | ||
2150 | |||
2006 | /* | 2151 | /* |
2007 | * Buddy candidates are cache hot: | 2152 | * Buddy candidates are cache hot: |
2008 | */ | 2153 | */ |
@@ -2852,14 +2997,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2852 | */ | 2997 | */ |
2853 | arch_start_context_switch(prev); | 2998 | arch_start_context_switch(prev); |
2854 | 2999 | ||
2855 | if (likely(!mm)) { | 3000 | if (!mm) { |
2856 | next->active_mm = oldmm; | 3001 | next->active_mm = oldmm; |
2857 | atomic_inc(&oldmm->mm_count); | 3002 | atomic_inc(&oldmm->mm_count); |
2858 | enter_lazy_tlb(oldmm, next); | 3003 | enter_lazy_tlb(oldmm, next); |
2859 | } else | 3004 | } else |
2860 | switch_mm(oldmm, mm, next); | 3005 | switch_mm(oldmm, mm, next); |
2861 | 3006 | ||
2862 | if (likely(!prev->mm)) { | 3007 | if (!prev->mm) { |
2863 | prev->active_mm = NULL; | 3008 | prev->active_mm = NULL; |
2864 | rq->prev_mm = oldmm; | 3009 | rq->prev_mm = oldmm; |
2865 | } | 3010 | } |
@@ -3248,7 +3393,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
3248 | 3393 | ||
3249 | if (task_current(rq, p)) { | 3394 | if (task_current(rq, p)) { |
3250 | update_rq_clock(rq); | 3395 | update_rq_clock(rq); |
3251 | ns = rq->clock - p->se.exec_start; | 3396 | ns = rq->clock_task - p->se.exec_start; |
3252 | if ((s64)ns < 0) | 3397 | if ((s64)ns < 0) |
3253 | ns = 0; | 3398 | ns = 0; |
3254 | } | 3399 | } |
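
With clock_task in place, the delta-exec calculation above is taken against rq->clock_task instead of rq->clock, so interrupt time that elapsed since the task's exec_start is not billed to the task; the existing (s64) cast and clamp stay as a guard against the difference ever coming out negative. In miniature, with illustrative numbers:

#include <stdio.h>
#include <stdint.h>

/* Bill a task only for clock_task time, clamping a negative delta to zero
 * as the kernel code does. The figures are invented. */
static uint64_t task_delta_ns(uint64_t clock_task, uint64_t exec_start)
{
        int64_t ns = (int64_t)(clock_task - exec_start);

        return ns < 0 ? 0 : (uint64_t)ns;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)task_delta_ns(1700, 1500));  /* 200 */
        printf("%llu\n", (unsigned long long)task_delta_ns(1700, 1800));  /* 0   */
        return 0;
}
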
@@ -3397,7 +3542,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3397 | tmp = cputime_to_cputime64(cputime); | 3542 | tmp = cputime_to_cputime64(cputime); |
3398 | if (hardirq_count() - hardirq_offset) | 3543 | if (hardirq_count() - hardirq_offset) |
3399 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3544 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3400 | else if (softirq_count()) | 3545 | else if (in_serving_softirq()) |
3401 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3546 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
3402 | else | 3547 | else |
3403 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3548 | cpustat->system = cputime64_add(cpustat->system, tmp); |
@@ -3584,7 +3729,7 @@ void scheduler_tick(void) | |||
3584 | curr->sched_class->task_tick(rq, curr, 0); | 3729 | curr->sched_class->task_tick(rq, curr, 0); |
3585 | raw_spin_unlock(&rq->lock); | 3730 | raw_spin_unlock(&rq->lock); |
3586 | 3731 | ||
3587 | perf_event_task_tick(curr); | 3732 | perf_event_task_tick(); |
3588 | 3733 | ||
3589 | #ifdef CONFIG_SMP | 3734 | #ifdef CONFIG_SMP |
3590 | rq->idle_at_tick = idle_cpu(cpu); | 3735 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -3723,17 +3868,13 @@ pick_next_task(struct rq *rq) | |||
3723 | return p; | 3868 | return p; |
3724 | } | 3869 | } |
3725 | 3870 | ||
3726 | class = sched_class_highest; | 3871 | for_each_class(class) { |
3727 | for ( ; ; ) { | ||
3728 | p = class->pick_next_task(rq); | 3872 | p = class->pick_next_task(rq); |
3729 | if (p) | 3873 | if (p) |
3730 | return p; | 3874 | return p; |
3731 | /* | ||
3732 | * Will never be NULL as the idle class always | ||
3733 | * returns a non-NULL p: | ||
3734 | */ | ||
3735 | class = class->next; | ||
3736 | } | 3875 | } |
3876 | |||
3877 | BUG(); /* the idle class will always have a runnable task */ | ||
3737 | } | 3878 | } |
3738 | 3879 | ||
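
pick_next_task() above now uses the ordinary for_each_class() walk instead of the open-coded loop: for_each_class() starts at sched_class_highest (which the earlier hunk points at the stop class) and follows the ->next chain, and since the idle class at the end of the chain always returns a runnable task, falling out of the loop can only mean corruption, hence the BUG() backstop. A small self-contained sketch of the same "try pickers in priority order, the last one never fails" shape, with invented picker functions:

#include <assert.h>
#include <stdio.h>

/*
 * Priority-ordered pickers; the last one always produces something, like
 * the idle scheduling class. Names and return values are invented.
 */
typedef const char *(*pick_fn)(void);

static const char *pick_rt(void)   { return NULL; }      /* nothing runnable  */
static const char *pick_fair(void) { return "task A"; }  /* a fair task waits */
static const char *pick_idle(void) { return "idle"; }    /* never fails       */

static pick_fn pickers[] = { pick_rt, pick_fair, pick_idle, NULL };

static const char *pick_next(void)
{
        pick_fn *f;
        const char *p;

        for (f = pickers; *f; f++) {
                p = (*f)();
                if (p)
                        return p;
        }
        /* unreachable while the idle picker is last: mirrors the BUG() */
        assert(0 && "the idle picker always returns something");
        return NULL;
}

int main(void)
{
        printf("picked: %s\n", pick_next());
        return 0;
}
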
3739 | /* | 3880 | /* |
@@ -4358,6 +4499,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4358 | 4499 | ||
4359 | rq = task_rq_lock(p, &flags); | 4500 | rq = task_rq_lock(p, &flags); |
4360 | 4501 | ||
4502 | trace_sched_pi_setprio(p, prio); | ||
4361 | oldprio = p->prio; | 4503 | oldprio = p->prio; |
4362 | prev_class = p->sched_class; | 4504 | prev_class = p->sched_class; |
4363 | on_rq = p->se.on_rq; | 4505 | on_rq = p->se.on_rq; |
@@ -4645,7 +4787,7 @@ recheck: | |||
4645 | } | 4787 | } |
4646 | 4788 | ||
4647 | if (user) { | 4789 | if (user) { |
4648 | retval = security_task_setscheduler(p, policy, param); | 4790 | retval = security_task_setscheduler(p); |
4649 | if (retval) | 4791 | if (retval) |
4650 | return retval; | 4792 | return retval; |
4651 | } | 4793 | } |
@@ -4661,6 +4803,15 @@ recheck: | |||
4661 | */ | 4803 | */ |
4662 | rq = __task_rq_lock(p); | 4804 | rq = __task_rq_lock(p); |
4663 | 4805 | ||
4806 | /* | ||
4807 | * Changing the policy of the stop threads its a very bad idea | ||
4808 | */ | ||
4809 | if (p == rq->stop) { | ||
4810 | __task_rq_unlock(rq); | ||
4811 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4812 | return -EINVAL; | ||
4813 | } | ||
4814 | |||
4664 | #ifdef CONFIG_RT_GROUP_SCHED | 4815 | #ifdef CONFIG_RT_GROUP_SCHED |
4665 | if (user) { | 4816 | if (user) { |
4666 | /* | 4817 | /* |
@@ -4887,13 +5038,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4887 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5038 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) |
4888 | goto out_unlock; | 5039 | goto out_unlock; |
4889 | 5040 | ||
4890 | retval = security_task_setscheduler(p, 0, NULL); | 5041 | retval = security_task_setscheduler(p); |
4891 | if (retval) | 5042 | if (retval) |
4892 | goto out_unlock; | 5043 | goto out_unlock; |
4893 | 5044 | ||
4894 | cpuset_cpus_allowed(p, cpus_allowed); | 5045 | cpuset_cpus_allowed(p, cpus_allowed); |
4895 | cpumask_and(new_mask, in_mask, cpus_allowed); | 5046 | cpumask_and(new_mask, in_mask, cpus_allowed); |
4896 | again: | 5047 | again: |
4897 | retval = set_cpus_allowed_ptr(p, new_mask); | 5048 | retval = set_cpus_allowed_ptr(p, new_mask); |
4898 | 5049 | ||
4899 | if (!retval) { | 5050 | if (!retval) { |
@@ -5337,7 +5488,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5337 | idle->se.exec_start = sched_clock(); | 5488 | idle->se.exec_start = sched_clock(); |
5338 | 5489 | ||
5339 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5490 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); |
5491 | /* | ||
5492 | * We're having a chicken and egg problem, even though we are | ||
5493 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
5494 | * lockdep check in task_group() will fail. | ||
5495 | * | ||
5496 | * Similar case to sched_fork(). / Alternatively we could | ||
5497 | * use task_rq_lock() here and obtain the other rq->lock. | ||
5498 | * | ||
5499 | * Silence PROVE_RCU | ||
5500 | */ | ||
5501 | rcu_read_lock(); | ||
5340 | __set_task_cpu(idle, cpu); | 5502 | __set_task_cpu(idle, cpu); |
5503 | rcu_read_unlock(); | ||
5341 | 5504 | ||
5342 | rq->curr = rq->idle = idle; | 5505 | rq->curr = rq->idle = idle; |
5343 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5506 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
@@ -6514,6 +6677,7 @@ struct s_data { | |||
6514 | cpumask_var_t nodemask; | 6677 | cpumask_var_t nodemask; |
6515 | cpumask_var_t this_sibling_map; | 6678 | cpumask_var_t this_sibling_map; |
6516 | cpumask_var_t this_core_map; | 6679 | cpumask_var_t this_core_map; |
6680 | cpumask_var_t this_book_map; | ||
6517 | cpumask_var_t send_covered; | 6681 | cpumask_var_t send_covered; |
6518 | cpumask_var_t tmpmask; | 6682 | cpumask_var_t tmpmask; |
6519 | struct sched_group **sched_group_nodes; | 6683 | struct sched_group **sched_group_nodes; |
@@ -6525,6 +6689,7 @@ enum s_alloc { | |||
6525 | sa_rootdomain, | 6689 | sa_rootdomain, |
6526 | sa_tmpmask, | 6690 | sa_tmpmask, |
6527 | sa_send_covered, | 6691 | sa_send_covered, |
6692 | sa_this_book_map, | ||
6528 | sa_this_core_map, | 6693 | sa_this_core_map, |
6529 | sa_this_sibling_map, | 6694 | sa_this_sibling_map, |
6530 | sa_nodemask, | 6695 | sa_nodemask, |
@@ -6560,31 +6725,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | |||
6560 | #ifdef CONFIG_SCHED_MC | 6725 | #ifdef CONFIG_SCHED_MC |
6561 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | 6726 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); |
6562 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | 6727 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); |
6563 | #endif /* CONFIG_SCHED_MC */ | ||
6564 | 6728 | ||
6565 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | ||
6566 | static int | 6729 | static int |
6567 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6730 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, |
6568 | struct sched_group **sg, struct cpumask *mask) | 6731 | struct sched_group **sg, struct cpumask *mask) |
6569 | { | 6732 | { |
6570 | int group; | 6733 | int group; |
6571 | 6734 | #ifdef CONFIG_SCHED_SMT | |
6572 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | 6735 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); |
6573 | group = cpumask_first(mask); | 6736 | group = cpumask_first(mask); |
6737 | #else | ||
6738 | group = cpu; | ||
6739 | #endif | ||
6574 | if (sg) | 6740 | if (sg) |
6575 | *sg = &per_cpu(sched_group_core, group).sg; | 6741 | *sg = &per_cpu(sched_group_core, group).sg; |
6576 | return group; | 6742 | return group; |
6577 | } | 6743 | } |
6578 | #elif defined(CONFIG_SCHED_MC) | 6744 | #endif /* CONFIG_SCHED_MC */ |
6745 | |||
6746 | /* | ||
6747 | * book sched-domains: | ||
6748 | */ | ||
6749 | #ifdef CONFIG_SCHED_BOOK | ||
6750 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
6751 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
6752 | |||
6579 | static int | 6753 | static int |
6580 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6754 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, |
6581 | struct sched_group **sg, struct cpumask *unused) | 6755 | struct sched_group **sg, struct cpumask *mask) |
6582 | { | 6756 | { |
6757 | int group = cpu; | ||
6758 | #ifdef CONFIG_SCHED_MC | ||
6759 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6760 | group = cpumask_first(mask); | ||
6761 | #elif defined(CONFIG_SCHED_SMT) | ||
6762 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6763 | group = cpumask_first(mask); | ||
6764 | #endif | ||
6583 | if (sg) | 6765 | if (sg) |
6584 | *sg = &per_cpu(sched_group_core, cpu).sg; | 6766 | *sg = &per_cpu(sched_group_book, group).sg; |
6585 | return cpu; | 6767 | return group; |
6586 | } | 6768 | } |
6587 | #endif | 6769 | #endif /* CONFIG_SCHED_BOOK */ |
6588 | 6770 | ||
6589 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 6771 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); |
6590 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 6772 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); |
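
CONFIG_SCHED_BOOK (introduced for s390's book topology) slots a new sched-domain level between the multi-core (MC) and physical-package levels. cpu_to_book_group() above chooses each CPU's group representative the same way the other levels do: intersect the sibling mask of the next-lower level (cores when MC is enabled, hardware threads when only SMT is, otherwise the CPU itself) with the active cpu_map and take the first CPU of the result. A toy version of that "first CPU of the masked sibling set" rule, with plain bitmasks standing in for struct cpumask:

#include <stdio.h>

/*
 * Toy group-representative choice: intersect the CPU's lower-level sibling
 * mask with the active cpu_map and use the lowest set bit as the group id.
 * Plain unsigned bitmasks stand in for struct cpumask.
 */
static int first_cpu(unsigned int mask)
{
        int cpu;

        for (cpu = 0; cpu < 32; cpu++)
                if (mask & (1u << cpu))
                        return cpu;
        return -1;
}

static int group_of(unsigned int sibling_mask, unsigned int cpu_map)
{
        return first_cpu(sibling_mask & cpu_map);
}

int main(void)
{
        /* cpus 0-3 online; cpus 0,1 share one core and cpus 2,3 another */
        unsigned int cpu_map = 0x0f;
        unsigned int core_mask[4] = { 0x03, 0x03, 0x0c, 0x0c };
        int cpu;

        for (cpu = 0; cpu < 4; cpu++)
                printf("cpu %d -> group led by cpu %d\n",
                       cpu, group_of(core_mask[cpu], cpu_map));
        return 0;
}
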
@@ -6594,7 +6776,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | |||
6594 | struct sched_group **sg, struct cpumask *mask) | 6776 | struct sched_group **sg, struct cpumask *mask) |
6595 | { | 6777 | { |
6596 | int group; | 6778 | int group; |
6597 | #ifdef CONFIG_SCHED_MC | 6779 | #ifdef CONFIG_SCHED_BOOK |
6780 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
6781 | group = cpumask_first(mask); | ||
6782 | #elif defined(CONFIG_SCHED_MC) | ||
6598 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | 6783 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); |
6599 | group = cpumask_first(mask); | 6784 | group = cpumask_first(mask); |
6600 | #elif defined(CONFIG_SCHED_SMT) | 6785 | #elif defined(CONFIG_SCHED_SMT) |
@@ -6790,6 +6975,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6790 | if (cpu != group_first_cpu(sd->groups)) | 6975 | if (cpu != group_first_cpu(sd->groups)) |
6791 | return; | 6976 | return; |
6792 | 6977 | ||
6978 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | ||
6979 | |||
6793 | child = sd->child; | 6980 | child = sd->child; |
6794 | 6981 | ||
6795 | sd->groups->cpu_power = 0; | 6982 | sd->groups->cpu_power = 0; |
@@ -6855,6 +7042,9 @@ SD_INIT_FUNC(CPU) | |||
6855 | #ifdef CONFIG_SCHED_MC | 7042 | #ifdef CONFIG_SCHED_MC |
6856 | SD_INIT_FUNC(MC) | 7043 | SD_INIT_FUNC(MC) |
6857 | #endif | 7044 | #endif |
7045 | #ifdef CONFIG_SCHED_BOOK | ||
7046 | SD_INIT_FUNC(BOOK) | ||
7047 | #endif | ||
6858 | 7048 | ||
6859 | static int default_relax_domain_level = -1; | 7049 | static int default_relax_domain_level = -1; |
6860 | 7050 | ||
@@ -6904,6 +7094,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
6904 | free_cpumask_var(d->tmpmask); /* fall through */ | 7094 | free_cpumask_var(d->tmpmask); /* fall through */ |
6905 | case sa_send_covered: | 7095 | case sa_send_covered: |
6906 | free_cpumask_var(d->send_covered); /* fall through */ | 7096 | free_cpumask_var(d->send_covered); /* fall through */ |
7097 | case sa_this_book_map: | ||
7098 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
6907 | case sa_this_core_map: | 7099 | case sa_this_core_map: |
6908 | free_cpumask_var(d->this_core_map); /* fall through */ | 7100 | free_cpumask_var(d->this_core_map); /* fall through */ |
6909 | case sa_this_sibling_map: | 7101 | case sa_this_sibling_map: |
@@ -6950,8 +7142,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | |||
6950 | return sa_nodemask; | 7142 | return sa_nodemask; |
6951 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | 7143 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) |
6952 | return sa_this_sibling_map; | 7144 | return sa_this_sibling_map; |
6953 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | 7145 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) |
6954 | return sa_this_core_map; | 7146 | return sa_this_core_map; |
7147 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7148 | return sa_this_book_map; | ||
6955 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | 7149 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) |
6956 | return sa_send_covered; | 7150 | return sa_send_covered; |
6957 | d->rd = alloc_rootdomain(); | 7151 | d->rd = alloc_rootdomain(); |
@@ -7009,6 +7203,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | |||
7009 | return sd; | 7203 | return sd; |
7010 | } | 7204 | } |
7011 | 7205 | ||
7206 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | ||
7207 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7208 | struct sched_domain *parent, int i) | ||
7209 | { | ||
7210 | struct sched_domain *sd = parent; | ||
7211 | #ifdef CONFIG_SCHED_BOOK | ||
7212 | sd = &per_cpu(book_domains, i).sd; | ||
7213 | SD_INIT(sd, BOOK); | ||
7214 | set_domain_attribute(sd, attr); | ||
7215 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
7216 | sd->parent = parent; | ||
7217 | parent->child = sd; | ||
7218 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7219 | #endif | ||
7220 | return sd; | ||
7221 | } | ||
7222 | |||
7012 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7223 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, |
7013 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7224 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
7014 | struct sched_domain *parent, int i) | 7225 | struct sched_domain *parent, int i) |
@@ -7066,6 +7277,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | |||
7066 | d->send_covered, d->tmpmask); | 7277 | d->send_covered, d->tmpmask); |
7067 | break; | 7278 | break; |
7068 | #endif | 7279 | #endif |
7280 | #ifdef CONFIG_SCHED_BOOK | ||
7281 | case SD_LV_BOOK: /* set up book groups */ | ||
7282 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
7283 | if (cpu == cpumask_first(d->this_book_map)) | ||
7284 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
7285 | &cpu_to_book_group, | ||
7286 | d->send_covered, d->tmpmask); | ||
7287 | break; | ||
7288 | #endif | ||
7069 | case SD_LV_CPU: /* set up physical groups */ | 7289 | case SD_LV_CPU: /* set up physical groups */ |
7070 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | 7290 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
7071 | if (!cpumask_empty(d->nodemask)) | 7291 | if (!cpumask_empty(d->nodemask)) |
@@ -7113,12 +7333,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7113 | 7333 | ||
7114 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | 7334 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
7115 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | 7335 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
7336 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | ||
7116 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | 7337 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
7117 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | 7338 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
7118 | } | 7339 | } |
7119 | 7340 | ||
7120 | for_each_cpu(i, cpu_map) { | 7341 | for_each_cpu(i, cpu_map) { |
7121 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | 7342 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
7343 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
7122 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | 7344 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); |
7123 | } | 7345 | } |
7124 | 7346 | ||
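
The per-CPU loop above now builds the domain chain as NUMA -> CPU (physical package) -> BOOK -> MC -> SMT, each helper taking the domain built so far as its parent; when a level's CONFIG option is off, its helper simply hands the parent back unchanged (as __build_book_sched_domain() does without CONFIG_SCHED_BOOK), so that level silently drops out of the chain. A skeletal version of that optional-level pattern, with invented types:

#include <stdio.h>
#include <stddef.h>

/*
 * Skeletal domain-building chain: each builder either inserts a new level
 * under the given parent or, when the level is compiled out, returns the
 * parent untouched. Types and names are invented for illustration.
 */
struct dom {
        const char *level;
        struct dom *parent;
};

static struct dom *build_level(struct dom *storage, const char *level,
                               struct dom *parent, int enabled)
{
        if (!enabled)
                return parent;          /* level compiled out: skip it */
        storage->level = level;
        storage->parent = parent;
        return storage;
}

int main(void)
{
        struct dom numa, cpu, book, mc, smt, *sd = NULL;

        sd = build_level(&numa, "NUMA", sd, 1);
        sd = build_level(&cpu,  "CPU",  sd, 1);
        sd = build_level(&book, "BOOK", sd, 0);  /* e.g. !CONFIG_SCHED_BOOK */
        sd = build_level(&mc,   "MC",   sd, 1);
        sd = build_level(&smt,  "SMT",  sd, 1);

        for (; sd; sd = sd->parent)
                printf("%s\n", sd->level);       /* SMT MC CPU NUMA */
        return 0;
}
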
@@ -7149,6 +7371,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7149 | init_sched_groups_power(i, sd); | 7371 | init_sched_groups_power(i, sd); |
7150 | } | 7372 | } |
7151 | #endif | 7373 | #endif |
7374 | #ifdef CONFIG_SCHED_BOOK | ||
7375 | for_each_cpu(i, cpu_map) { | ||
7376 | sd = &per_cpu(book_domains, i).sd; | ||
7377 | init_sched_groups_power(i, sd); | ||
7378 | } | ||
7379 | #endif | ||
7152 | 7380 | ||
7153 | for_each_cpu(i, cpu_map) { | 7381 | for_each_cpu(i, cpu_map) { |
7154 | sd = &per_cpu(phys_domains, i).sd; | 7382 | sd = &per_cpu(phys_domains, i).sd; |
@@ -7174,6 +7402,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7174 | sd = &per_cpu(cpu_domains, i).sd; | 7402 | sd = &per_cpu(cpu_domains, i).sd; |
7175 | #elif defined(CONFIG_SCHED_MC) | 7403 | #elif defined(CONFIG_SCHED_MC) |
7176 | sd = &per_cpu(core_domains, i).sd; | 7404 | sd = &per_cpu(core_domains, i).sd; |
7405 | #elif defined(CONFIG_SCHED_BOOK) | ||
7406 | sd = &per_cpu(book_domains, i).sd; | ||
7177 | #else | 7407 | #else |
7178 | sd = &per_cpu(phys_domains, i).sd; | 7408 | sd = &per_cpu(phys_domains, i).sd; |
7179 | #endif | 7409 | #endif |
@@ -8078,9 +8308,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8078 | 8308 | ||
8079 | return 1; | 8309 | return 1; |
8080 | 8310 | ||
8081 | err_free_rq: | 8311 | err_free_rq: |
8082 | kfree(cfs_rq); | 8312 | kfree(cfs_rq); |
8083 | err: | 8313 | err: |
8084 | return 0; | 8314 | return 0; |
8085 | } | 8315 | } |
8086 | 8316 | ||
@@ -8168,9 +8398,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8168 | 8398 | ||
8169 | return 1; | 8399 | return 1; |
8170 | 8400 | ||
8171 | err_free_rq: | 8401 | err_free_rq: |
8172 | kfree(rt_rq); | 8402 | kfree(rt_rq); |
8173 | err: | 8403 | err: |
8174 | return 0; | 8404 | return 0; |
8175 | } | 8405 | } |
8176 | 8406 | ||
@@ -8297,12 +8527,12 @@ void sched_move_task(struct task_struct *tsk) | |||
8297 | if (unlikely(running)) | 8527 | if (unlikely(running)) |
8298 | tsk->sched_class->put_prev_task(rq, tsk); | 8528 | tsk->sched_class->put_prev_task(rq, tsk); |
8299 | 8529 | ||
8300 | set_task_rq(tsk, task_cpu(tsk)); | ||
8301 | |||
8302 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8530 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8303 | if (tsk->sched_class->moved_group) | 8531 | if (tsk->sched_class->task_move_group) |
8304 | tsk->sched_class->moved_group(tsk, on_rq); | 8532 | tsk->sched_class->task_move_group(tsk, on_rq); |
8533 | else | ||
8305 | #endif | 8534 | #endif |
8535 | set_task_rq(tsk, task_cpu(tsk)); | ||
8306 | 8536 | ||
8307 | if (unlikely(running)) | 8537 | if (unlikely(running)) |
8308 | tsk->sched_class->set_curr_task(rq); | 8538 | tsk->sched_class->set_curr_task(rq); |
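
The sched_move_task() hunk above renames the fair class's moved_group() hook to task_move_group() and pushes the set_task_rq() call down into it: a class that provides the hook is now responsible for re-parenting the task itself, which lets the fair class wrap its own fixups around the move (that side of the change lives in sched_fair.c and is not part of this file's diff), while classes without the hook fall back to the plain set_task_rq(). The shape is the usual "optional per-class override with a generic fallback"; a stand-in sketch:

#include <stdio.h>

/*
 * Optional per-class hook with a generic fallback, mirroring the
 * task_move_group dispatch above. Types and names are stand-ins.
 */
struct item;

struct item_class {
        void (*move_group)(struct item *item, int queued);
};

struct item {
        const struct item_class *class;
        int group;
};

static void set_group_generic(struct item *item)
{
        item->group = 42;               /* placeholder for set_task_rq() */
}

static void fair_move_group(struct item *item, int queued)
{
        (void)queued;
        printf("class-specific fixups around the move\n");
        set_group_generic(item);        /* the hook does the re-parenting */
}

static const struct item_class fair_like = { fair_move_group };
static const struct item_class plain     = { NULL };

static void move_item(struct item *item, int queued)
{
        if (item->class->move_group)
                item->class->move_group(item, queued);
        else
                set_group_generic(item);
}

int main(void)
{
        struct item a = { &fair_like, 0 }, b = { &plain, 0 };

        move_item(&a, 1);
        move_item(&b, 0);
        printf("%d %d\n", a.group, b.group);
        return 0;
}
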
@@ -8528,7 +8758,7 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
8528 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8758 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
8529 | } | 8759 | } |
8530 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8760 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8531 | unlock: | 8761 | unlock: |
8532 | read_unlock(&tasklist_lock); | 8762 | read_unlock(&tasklist_lock); |
8533 | mutex_unlock(&rt_constraints_mutex); | 8763 | mutex_unlock(&rt_constraints_mutex); |
8534 | 8764 | ||