Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	356
1 files changed, 293 insertions, 63 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb90832..dc91a4d09ac3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
 	 */
 	cpumask_var_t rto_mask;
 	atomic_t rto_count;
-#ifdef CONFIG_SMP
 	struct cpupri cpupri;
-#endif
 };
 
 /*
@@ -437,7 +435,7 @@ struct root_domain {
  */
 static struct root_domain def_root_domain;
 
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
 	 */
 	unsigned long nr_uninterruptible;
 
-	struct task_struct *curr, *idle;
+	struct task_struct *curr, *idle, *stop;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
 	u64 clock;
+	u64 clock_task;
 
 	atomic_t nr_iowait;
 
@@ -520,6 +519,10 @@ struct rq {
 	u64 avg_idle;
 #endif
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	u64 prev_irq_time;
+#endif
+
 	/* calc_load related fields */
 	unsigned long calc_load_update;
 	long calc_load_active;
@@ -557,18 +560,8 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
-{
-	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 
-	/*
-	 * A queue event has occurred, and we're going to schedule. In
-	 * this case, we can save a useless back to back clock update.
-	 */
-	if (test_tsk_need_resched(p))
-		rq->skip_clock_update = 1;
-}
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 
 static inline int cpu_of(struct rq *rq)
 {
@@ -643,10 +636,22 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
 inline void update_rq_clock(struct rq *rq)
 {
-	if (!rq->skip_clock_update)
-		rq->clock = sched_clock_cpu(cpu_of(rq));
+	if (!rq->skip_clock_update) {
+		int cpu = cpu_of(rq);
+		u64 irq_time;
+
+		rq->clock = sched_clock_cpu(cpu);
+		irq_time = irq_time_cpu(cpu);
+		if (rq->clock - irq_time > rq->clock_task)
+			rq->clock_task = rq->clock - irq_time;
+
+		sched_irq_time_avg_update(rq, irq_time);
+	}
 }
 
 /*
@@ -723,7 +728,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
 	char buf[64];
-	char *cmp = buf;
+	char *cmp;
 	int neg = 0;
 	int i;
 
@@ -734,6 +739,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 		return -EFAULT;
 
 	buf[cnt] = 0;
+	cmp = strstrip(buf);
 
 	if (strncmp(buf, "NO_", 3) == 0) {
 		neg = 1;
@@ -741,9 +747,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	}
 
 	for (i = 0; sched_feat_names[i]; i++) {
-		int len = strlen(sched_feat_names[i]);
-
-		if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+		if (strcmp(cmp, sched_feat_names[i]) == 0) {
 			if (neg)
 				sysctl_sched_features &= ~(1UL << i);
 			else
@@ -1840,7 +1844,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 static const struct sched_class rt_sched_class;
 
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
@@ -1858,12 +1862,6 @@ static void dec_nr_running(struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
-	if (task_has_rt_policy(p)) {
-		p->se.load.weight = 0;
-		p->se.load.inv_weight = WMULT_CONST;
-		return;
-	}
-
 	/*
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
@@ -1917,13 +1915,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 	dec_nr_running(rq);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+	if (!sched_clock_irqtime)
+		return 0;
+
+	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+	unsigned long flags;
+	int cpu;
+	u64 now, delta;
+
+	if (!sched_clock_irqtime)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	now = sched_clock_cpu(cpu);
+	delta = now - per_cpu(irq_start_time, cpu);
+	per_cpu(irq_start_time, cpu) = now;
+	/*
+	 * We do not account for softirq time from ksoftirqd here.
+	 * We want to continue accounting softirq time to ksoftirqd thread
+	 * in that case, so as not to confuse scheduler with a special task
+	 * that do not consume any time, but still wants to run.
+	 */
+	if (hardirq_count())
+		per_cpu(cpu_hardirq_time, cpu) += delta;
+	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+		per_cpu(cpu_softirq_time, cpu) += delta;
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+	if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+		u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+		rq->prev_irq_time = curr_irq_time;
+		sched_rt_avg_update(rq, delta_irq);
+	}
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+	return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
 
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+	struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+	if (stop) {
+		/*
+		 * Make it appear like a SCHED_FIFO task, its something
+		 * userspace knows about and won't get confused about.
+		 *
+		 * Also, it will make PI more or less work without too
+		 * much confusion -- but then, stop work should not
+		 * rely on PI working anyway.
+		 */
+		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+		stop->sched_class = &stop_sched_class;
+	}
+
+	cpu_rq(cpu)->stop = stop;
+
+	if (old_stop) {
+		/*
+		 * Reset it back to a normal scheduling class so that
+		 * it can die in pieces.
+		 */
+		old_stop->sched_class = &rt_sched_class;
+	}
+}
+
 /*
  * __normal_prio - return the priority that is based on the static prio
  */
@@ -1991,6 +2108,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 		p->sched_class->prio_changed(rq, p, oldprio, running);
 }
 
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+{
+	const struct sched_class *class;
+
+	if (p->sched_class == rq->curr->sched_class) {
+		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+	} else {
+		for_each_class(class) {
+			if (class == rq->curr->sched_class)
+				break;
+			if (class == p->sched_class) {
+				resched_task(rq->curr);
+				break;
+			}
+		}
+	}
+
+	/*
+	 * A queue event has occurred, and we're going to schedule. In
+	 * this case, we can save a useless back to back clock update.
+	 */
+	if (test_tsk_need_resched(rq->curr))
+		rq->skip_clock_update = 1;
+}
+
 #ifdef CONFIG_SMP
 /*
  * Is this task likely cache-hot:
@@ -2003,6 +2145,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
+	if (unlikely(p->policy == SCHED_IDLE))
+		return 0;
+
 	/*
 	 * Buddy candidates are cache hot:
 	 */
@@ -2852,14 +2997,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	arch_start_context_switch(prev);
 
-	if (likely(!mm)) {
+	if (!mm) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 
-	if (likely(!prev->mm)) {
+	if (!prev->mm) {
 		prev->active_mm = NULL;
 		rq->prev_mm = oldmm;
 	}
@@ -3248,7 +3393,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 
 	if (task_current(rq, p)) {
 		update_rq_clock(rq);
-		ns = rq->clock - p->se.exec_start;
+		ns = rq->clock_task - p->se.exec_start;
 		if ((s64)ns < 0)
 			ns = 0;
 	}
@@ -3397,7 +3542,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
-	else if (softirq_count())
+	else if (in_serving_softirq())
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
 	else
 		cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3584,7 +3729,7 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	raw_spin_unlock(&rq->lock);
 
-	perf_event_task_tick(curr);
+	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
@@ -3723,17 +3868,13 @@ pick_next_task(struct rq *rq)
 		return p;
 	}
 
-	class = sched_class_highest;
-	for ( ; ; ) {
+	for_each_class(class) {
 		p = class->pick_next_task(rq);
 		if (p)
 			return p;
-		/*
-		 * Will never be NULL as the idle class always
-		 * returns a non-NULL p:
-		 */
-		class = class->next;
 	}
+
+	BUG(); /* the idle class will always have a runnable task */
 }
 
 /*
@@ -4358,6 +4499,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	rq = task_rq_lock(p, &flags);
 
+	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	on_rq = p->se.on_rq;
@@ -4645,7 +4787,7 @@ recheck:
 	}
 
 	if (user) {
-		retval = security_task_setscheduler(p, policy, param);
+		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
 	}
@@ -4661,6 +4803,15 @@ recheck:
 	 */
 	rq = __task_rq_lock(p);
 
+	/*
+	 * Changing the policy of the stop threads its a very bad idea
+	 */
+	if (p == rq->stop) {
+		__task_rq_unlock(rq);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return -EINVAL;
+	}
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
 		/*
@@ -4887,13 +5038,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
-	retval = security_task_setscheduler(p, 0, NULL);
+	retval = security_task_setscheduler(p);
 	if (retval)
 		goto out_unlock;
 
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
 	retval = set_cpus_allowed_ptr(p, new_mask);
 
 	if (!retval) {
@@ -5337,7 +5488,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	idle->se.exec_start = sched_clock();
 
 	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+	/*
+	 * We're having a chicken and egg problem, even though we are
+	 * holding rq->lock, the cpu isn't yet set to this cpu so the
+	 * lockdep check in task_group() will fail.
+	 *
+	 * Similar case to sched_fork(). / Alternatively we could
+	 * use task_rq_lock() here and obtain the other rq->lock.
+	 *
+	 * Silence PROVE_RCU
+	 */
+	rcu_read_lock();
 	__set_task_cpu(idle, cpu);
+	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -6514,6 +6677,7 @@ struct s_data {
 	cpumask_var_t nodemask;
 	cpumask_var_t this_sibling_map;
 	cpumask_var_t this_core_map;
+	cpumask_var_t this_book_map;
 	cpumask_var_t send_covered;
 	cpumask_var_t tmpmask;
 	struct sched_group **sched_group_nodes;
@@ -6525,6 +6689,7 @@ enum s_alloc {
 	sa_rootdomain,
 	sa_tmpmask,
 	sa_send_covered,
+	sa_this_book_map,
 	sa_this_core_map,
 	sa_this_sibling_map,
 	sa_nodemask,
@@ -6560,31 +6725,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
 
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
-
+#ifdef CONFIG_SCHED_SMT
 	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
 	group = cpumask_first(mask);
+#else
+	group = cpu;
+#endif
 	if (sg)
 		*sg = &per_cpu(sched_group_core, group).sg;
 	return group;
 }
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
 static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
-		  struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *mask)
 {
+	int group = cpu;
+#ifdef CONFIG_SCHED_MC
+	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#endif
 	if (sg)
-		*sg = &per_cpu(sched_group_core, cpu).sg;
-	return cpu;
+		*sg = &per_cpu(sched_group_book, group).sg;
+	return group;
 }
-#endif
+#endif /* CONFIG_SCHED_BOOK */
 
 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6594,7 +6776,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+	cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+	group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
 	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
 	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
@@ -6790,6 +6975,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 	if (cpu != group_first_cpu(sd->groups))
 		return;
 
+	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+
 	child = sd->child;
 
 	sd->groups->cpu_power = 0;
@@ -6855,6 +7042,9 @@ SD_INIT_FUNC(CPU)
 #ifdef CONFIG_SCHED_MC
  SD_INIT_FUNC(MC)
 #endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
 
 static int default_relax_domain_level = -1;
 
@@ -6904,6 +7094,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 		free_cpumask_var(d->tmpmask); /* fall through */
 	case sa_send_covered:
 		free_cpumask_var(d->send_covered); /* fall through */
+	case sa_this_book_map:
+		free_cpumask_var(d->this_book_map); /* fall through */
 	case sa_this_core_map:
 		free_cpumask_var(d->this_core_map); /* fall through */
 	case sa_this_sibling_map:
@@ -6950,8 +7142,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 		return sa_nodemask;
 	if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
 		return sa_this_sibling_map;
-	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
 		return sa_this_core_map;
+	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+		return sa_this_book_map;
 	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
 		return sa_send_covered;
 	d->rd = alloc_rootdomain();
@@ -7009,6 +7203,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
 	return sd;
 }
 
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+	sd = &per_cpu(book_domains, i).sd;
+	SD_INIT(sd, BOOK);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+	return sd;
+}
+
 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
 	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 	struct sched_domain *parent, int i)
@@ -7066,6 +7277,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 						d->send_covered, d->tmpmask);
 		break;
 #endif
+#ifdef CONFIG_SCHED_BOOK
+	case SD_LV_BOOK: /* set up book groups */
+		cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+		if (cpu == cpumask_first(d->this_book_map))
+			init_sched_build_groups(d->this_book_map, cpu_map,
+						&cpu_to_book_group,
+						d->send_covered, d->tmpmask);
+		break;
+#endif
 	case SD_LV_CPU: /* set up physical groups */
 		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
 		if (!cpumask_empty(d->nodemask))
@@ -7113,12 +7333,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
 		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+		sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
 	}
 
 	for_each_cpu(i, cpu_map) {
 		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+		build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
 		build_sched_groups(&d, SD_LV_MC, cpu_map, i);
 	}
 
@@ -7149,6 +7371,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		init_sched_groups_power(i, sd);
 	}
 #endif
+#ifdef CONFIG_SCHED_BOOK
+	for_each_cpu(i, cpu_map) {
+		sd = &per_cpu(book_domains, i).sd;
+		init_sched_groups_power(i, sd);
+	}
+#endif
 
 	for_each_cpu(i, cpu_map) {
 		sd = &per_cpu(phys_domains, i).sd;
@@ -7174,6 +7402,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
 		sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+		sd = &per_cpu(book_domains, i).sd;
 #else
 		sd = &per_cpu(phys_domains, i).sd;
 #endif
@@ -8078,9 +8308,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 	return 1;
 
- err_free_rq:
+err_free_rq:
 	kfree(cfs_rq);
- err:
+err:
 	return 0;
 }
 
@@ -8168,9 +8398,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 	return 1;
 
- err_free_rq:
+err_free_rq:
 	kfree(rt_rq);
- err:
+err:
 	return 0;
 }
 
@@ -8297,12 +8527,12 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 
-	set_task_rq(tsk, task_cpu(tsk));
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	if (tsk->sched_class->moved_group)
-		tsk->sched_class->moved_group(tsk, on_rq);
+	if (tsk->sched_class->task_move_group)
+		tsk->sched_class->task_move_group(tsk, on_rq);
+	else
 #endif
+		set_task_rq(tsk, task_cpu(tsk));
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
@@ -8528,7 +8758,7 @@ static int tg_set_bandwidth(struct task_group *tg,
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 