path: root/kernel
author    Linus Torvalds <torvalds@linux-foundation.org>  2014-06-03 17:00:15 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-06-03 17:00:15 -0400
commit    c84a1e32ee58fc1cc9d3fd42619b917cce67e30a (patch)
tree      d3e5bed273f747e7c9e399864219bea76f4c30ea /kernel
parent    3d521f9151dacab566904d1f57dcb3e7080cdd8f (diff)
parent    096aa33863a5e48de52d2ff30e0801b7487944f4 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into next
Pull scheduler updates from Ingo Molnar:
 "The main scheduling related changes in this cycle were:

   - various sched/numa updates, for better performance

   - tree wide cleanup of open coded nice levels

   - nohz fix related to rq->nr_running use

   - cpuidle changes and continued consolidation to improve the
     kernel/sched/idle.c high level idle scheduling logic.  As part of
     this effort I pulled cpuidle driver changes from Rafael as well.

   - standardized idle polling amongst architectures

   - continued work on preparing better power/energy aware scheduling

   - sched/rt updates

   - misc fixlets and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (49 commits)
  sched/numa: Decay ->wakee_flips instead of zeroing
  sched/numa: Update migrate_improves/degrades_locality()
  sched/numa: Allow task switch if load imbalance improves
  sched/rt: Fix 'struct sched_dl_entity' and dl_task_time() comments, to match the current upstream code
  sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice()
  sched: Initialize rq->age_stamp on processor start
  sched, nohz: Change rq->nr_running to always use wrappers
  sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance()
  sched: Use clamp() and clamp_val() to make sys_nice() more readable
  sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups()
  sched/numa: Fix initialization of sched_domain_topology for NUMA
  sched: Call select_idle_sibling() when not affine_sd
  sched: Simplify return logic in sched_read_attr()
  sched: Simplify return logic in sched_copy_attr()
  sched: Fix exec_start/task_hot on migrated tasks
  arm64: Remove TIF_POLLING_NRFLAG
  metag: Remove TIF_POLLING_NRFLAG
  sched/idle: Make cpuidle_idle_call() void
  sched/idle: Reflow cpuidle_idle_call()
  sched/idle: Delay clearing the polling bit
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/locking/locktorture.c |   2
-rw-r--r--  kernel/power/suspend.c       |   2
-rw-r--r--  kernel/sched/core.c          | 324
-rw-r--r--  kernel/sched/deadline.c      |   6
-rw-r--r--  kernel/sched/fair.c          | 226
-rw-r--r--  kernel/sched/idle.c          | 140
-rw-r--r--  kernel/sched/rt.c            | 119
-rw-r--r--  kernel/sched/sched.h         |  26
-rw-r--r--  kernel/sched/stop_task.c     |   4
-rw-r--r--  kernel/sys.c                 |   6
-rw-r--r--  kernel/workqueue.c           |   6
11 files changed, 522 insertions(+), 339 deletions(-)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index dbafeac18e4d..0955b885d0dc 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg)
216 static DEFINE_TORTURE_RANDOM(rand); 216 static DEFINE_TORTURE_RANDOM(rand);
217 217
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 218 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
219 set_user_nice(current, 19); 219 set_user_nice(current, MAX_NICE);
220 220
221 do { 221 do {
222 if ((torture_random(&rand) & 0xfffff) == 0) 222 if ((torture_random(&rand) & 0xfffff) == 0)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4047d7..155721f7f909 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -54,9 +54,11 @@ static void freeze_begin(void)
54 54
55static void freeze_enter(void) 55static void freeze_enter(void)
56{ 56{
57 cpuidle_use_deepest_state(true);
57 cpuidle_resume(); 58 cpuidle_resume();
58 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 59 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
59 cpuidle_pause(); 60 cpuidle_pause();
61 cpuidle_use_deepest_state(false);
60} 62}
61 63
62void freeze_wake(void) 64void freeze_wake(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a62a7dec3986..913c6d6cc2c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -522,6 +522,39 @@ static inline void init_hrtick(void)
522#endif /* CONFIG_SCHED_HRTICK */ 522#endif /* CONFIG_SCHED_HRTICK */
523 523
524/* 524/*
525 * cmpxchg based fetch_or, macro so it works for different integer types
526 */
527#define fetch_or(ptr, val) \
528({ typeof(*(ptr)) __old, __val = *(ptr); \
529 for (;;) { \
530 __old = cmpxchg((ptr), __val, __val | (val)); \
531 if (__old == __val) \
532 break; \
533 __val = __old; \
534 } \
535 __old; \
536})
537
538#ifdef TIF_POLLING_NRFLAG
539/*
540 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
541 * this avoids any races wrt polling state changes and thereby avoids
542 * spurious IPIs.
543 */
544static bool set_nr_and_not_polling(struct task_struct *p)
545{
546 struct thread_info *ti = task_thread_info(p);
547 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
548}
549#else
550static bool set_nr_and_not_polling(struct task_struct *p)
551{
552 set_tsk_need_resched(p);
553 return true;
554}
555#endif
556
557/*
525 * resched_task - mark a task 'to be rescheduled now'. 558 * resched_task - mark a task 'to be rescheduled now'.
526 * 559 *
527 * On UP this means the setting of the need_resched flag, on SMP it 560 * On UP this means the setting of the need_resched flag, on SMP it
@@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)
537 if (test_tsk_need_resched(p)) 570 if (test_tsk_need_resched(p))
538 return; 571 return;
539 572
540 set_tsk_need_resched(p);
541
542 cpu = task_cpu(p); 573 cpu = task_cpu(p);
574
543 if (cpu == smp_processor_id()) { 575 if (cpu == smp_processor_id()) {
576 set_tsk_need_resched(p);
544 set_preempt_need_resched(); 577 set_preempt_need_resched();
545 return; 578 return;
546 } 579 }
547 580
548 /* NEED_RESCHED must be visible before we test polling */ 581 if (set_nr_and_not_polling(p))
549 smp_mb();
550 if (!tsk_is_polling(p))
551 smp_send_reschedule(cpu); 582 smp_send_reschedule(cpu);
552} 583}
553 584
@@ -3018,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
3018int can_nice(const struct task_struct *p, const int nice) 3049int can_nice(const struct task_struct *p, const int nice)
3019{ 3050{
3020 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3051 /* convert nice value [19,-20] to rlimit style value [1,40] */
3021 int nice_rlim = 20 - nice; 3052 int nice_rlim = nice_to_rlimit(nice);
3022 3053
3023 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3054 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3024 capable(CAP_SYS_NICE)); 3055 capable(CAP_SYS_NICE));
@@ -3042,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
3042 * We don't have to worry. Conceptually one call occurs first 3073 * We don't have to worry. Conceptually one call occurs first
3043 * and we have a single winner. 3074 * and we have a single winner.
3044 */ 3075 */
3045 if (increment < -40) 3076 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3046 increment = -40;
3047 if (increment > 40)
3048 increment = 40;
3049
3050 nice = task_nice(current) + increment; 3077 nice = task_nice(current) + increment;
3051 if (nice < MIN_NICE)
3052 nice = MIN_NICE;
3053 if (nice > MAX_NICE)
3054 nice = MAX_NICE;
3055 3078
3079 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3056 if (increment < 0 && !can_nice(current, nice)) 3080 if (increment < 0 && !can_nice(current, nice))
3057 return -EPERM; 3081 return -EPERM;
3058 3082
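
nice_to_rlimit() and rlimit_to_nice(), introduced by the "consolidate open coded nice level frobbing" patch, hide the off-by-one-prone mapping between the nice range [-20, 19] and the RLIMIT_NICE style range [1, 40] that can_nice() used to open-code, while sys_nice() itself now reads as two clamps. A small userspace sketch of both; the constants mirror the kernel's MIN_NICE/MAX_NICE/NICE_WIDTH and clamp_long() is a local stand-in for clamp()/clamp_val():

#include <stdio.h>

#define MIN_NICE   (-20)
#define MAX_NICE   19
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)    /* 40 */

static long clamp_long(long v, long lo, long hi)
{
        return v < lo ? lo : v > hi ? hi : v;
}

/* nice 19..-20  <->  rlimit style 1..40 */
static long nice_to_rlimit(long nice)  { return MAX_NICE - nice + 1; }
static long rlimit_to_nice(long prio)  { return MAX_NICE - prio + 1; }

int main(void)
{
        long increment = 100;                    /* absurd sys_nice() argument */
        long nice;

        increment = clamp_long(increment, -NICE_WIDTH, NICE_WIDTH);
        nice = clamp_long(0 /* current nice */ + increment, MIN_NICE, MAX_NICE);

        printf("nice=%ld rlimit=%ld back=%ld\n",
               nice, nice_to_rlimit(nice), rlimit_to_nice(nice_to_rlimit(nice)));
        return 0;
}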
@@ -3642,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3642 */ 3666 */
3643 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3667 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3644 3668
3645out: 3669 return 0;
3646 return ret;
3647 3670
3648err_size: 3671err_size:
3649 put_user(sizeof(*attr), &uattr->size); 3672 put_user(sizeof(*attr), &uattr->size);
3650 ret = -E2BIG; 3673 return -E2BIG;
3651 goto out;
3652} 3674}
3653 3675
3654/** 3676/**
@@ -3808,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3808 3830
3809 for (; addr < end; addr++) { 3831 for (; addr < end; addr++) {
3810 if (*addr) 3832 if (*addr)
3811 goto err_size; 3833 return -EFBIG;
3812 } 3834 }
3813 3835
3814 attr->size = usize; 3836 attr->size = usize;
@@ -3818,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3818 if (ret) 3840 if (ret)
3819 return -EFAULT; 3841 return -EFAULT;
3820 3842
3821out: 3843 return 0;
3822 return ret;
3823
3824err_size:
3825 ret = -E2BIG;
3826 goto out;
3827} 3844}
3828 3845
3829/** 3846/**
@@ -5093,10 +5110,20 @@ static struct notifier_block migration_notifier = {
5093 .priority = CPU_PRI_MIGRATION, 5110 .priority = CPU_PRI_MIGRATION,
5094}; 5111};
5095 5112
5113static void __cpuinit set_cpu_rq_start_time(void)
5114{
5115 int cpu = smp_processor_id();
5116 struct rq *rq = cpu_rq(cpu);
5117 rq->age_stamp = sched_clock_cpu(cpu);
5118}
5119
5096static int sched_cpu_active(struct notifier_block *nfb, 5120static int sched_cpu_active(struct notifier_block *nfb,
5097 unsigned long action, void *hcpu) 5121 unsigned long action, void *hcpu)
5098{ 5122{
5099 switch (action & ~CPU_TASKS_FROZEN) { 5123 switch (action & ~CPU_TASKS_FROZEN) {
5124 case CPU_STARTING:
5125 set_cpu_rq_start_time();
5126 return NOTIFY_OK;
5100 case CPU_DOWN_FAILED: 5127 case CPU_DOWN_FAILED:
5101 set_cpu_active((long)hcpu, true); 5128 set_cpu_active((long)hcpu, true);
5102 return NOTIFY_OK; 5129 return NOTIFY_OK;
@@ -5305,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
5305 SD_BALANCE_FORK | 5332 SD_BALANCE_FORK |
5306 SD_BALANCE_EXEC | 5333 SD_BALANCE_EXEC |
5307 SD_SHARE_CPUPOWER | 5334 SD_SHARE_CPUPOWER |
5308 SD_SHARE_PKG_RESOURCES)) { 5335 SD_SHARE_PKG_RESOURCES |
5336 SD_SHARE_POWERDOMAIN)) {
5309 if (sd->groups != sd->groups->next) 5337 if (sd->groups != sd->groups->next)
5310 return 0; 5338 return 0;
5311 } 5339 }
@@ -5336,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5336 SD_BALANCE_EXEC | 5364 SD_BALANCE_EXEC |
5337 SD_SHARE_CPUPOWER | 5365 SD_SHARE_CPUPOWER |
5338 SD_SHARE_PKG_RESOURCES | 5366 SD_SHARE_PKG_RESOURCES |
5339 SD_PREFER_SIBLING); 5367 SD_PREFER_SIBLING |
5368 SD_SHARE_POWERDOMAIN);
5340 if (nr_node_ids == 1) 5369 if (nr_node_ids == 1)
5341 pflags &= ~SD_SERIALIZE; 5370 pflags &= ~SD_SERIALIZE;
5342 } 5371 }
@@ -5610,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
5610 5639
5611__setup("isolcpus=", isolated_cpu_setup); 5640__setup("isolcpus=", isolated_cpu_setup);
5612 5641
5613static const struct cpumask *cpu_cpu_mask(int cpu)
5614{
5615 return cpumask_of_node(cpu_to_node(cpu));
5616}
5617
5618struct sd_data {
5619 struct sched_domain **__percpu sd;
5620 struct sched_group **__percpu sg;
5621 struct sched_group_power **__percpu sgp;
5622};
5623
5624struct s_data { 5642struct s_data {
5625 struct sched_domain ** __percpu sd; 5643 struct sched_domain ** __percpu sd;
5626 struct root_domain *rd; 5644 struct root_domain *rd;
@@ -5633,21 +5651,6 @@ enum s_alloc {
5633 sa_none, 5651 sa_none,
5634}; 5652};
5635 5653
5636struct sched_domain_topology_level;
5637
5638typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5639typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5640
5641#define SDTL_OVERLAP 0x01
5642
5643struct sched_domain_topology_level {
5644 sched_domain_init_f init;
5645 sched_domain_mask_f mask;
5646 int flags;
5647 int numa_level;
5648 struct sd_data data;
5649};
5650
5651/* 5654/*
5652 * Build an iteration mask that can exclude certain CPUs from the upwards 5655 * Build an iteration mask that can exclude certain CPUs from the upwards
5653 * domain traversal. 5656 * domain traversal.
@@ -5815,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5815 continue; 5818 continue;
5816 5819
5817 group = get_group(i, sdd, &sg); 5820 group = get_group(i, sdd, &sg);
5818 cpumask_clear(sched_group_cpus(sg));
5819 sg->sgp->power = 0;
5820 cpumask_setall(sched_group_mask(sg)); 5821 cpumask_setall(sched_group_mask(sg));
5821 5822
5822 for_each_cpu(j, span) { 5823 for_each_cpu(j, span) {
@@ -5866,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5866 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5867 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5867} 5868}
5868 5869
5869int __weak arch_sd_sibling_asym_packing(void)
5870{
5871 return 0*SD_ASYM_PACKING;
5872}
5873
5874/* 5870/*
5875 * Initializers for schedule domains 5871 * Initializers for schedule domains
5876 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5872 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5877 */ 5873 */
5878 5874
5879#ifdef CONFIG_SCHED_DEBUG
5880# define SD_INIT_NAME(sd, type) sd->name = #type
5881#else
5882# define SD_INIT_NAME(sd, type) do { } while (0)
5883#endif
5884
5885#define SD_INIT_FUNC(type) \
5886static noinline struct sched_domain * \
5887sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5888{ \
5889 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5890 *sd = SD_##type##_INIT; \
5891 SD_INIT_NAME(sd, type); \
5892 sd->private = &tl->data; \
5893 return sd; \
5894}
5895
5896SD_INIT_FUNC(CPU)
5897#ifdef CONFIG_SCHED_SMT
5898 SD_INIT_FUNC(SIBLING)
5899#endif
5900#ifdef CONFIG_SCHED_MC
5901 SD_INIT_FUNC(MC)
5902#endif
5903#ifdef CONFIG_SCHED_BOOK
5904 SD_INIT_FUNC(BOOK)
5905#endif
5906
5907static int default_relax_domain_level = -1; 5875static int default_relax_domain_level = -1;
5908int sched_domain_level_max; 5876int sched_domain_level_max;
5909 5877
@@ -5991,99 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
5991 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5959 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5992} 5960}
5993 5961
5994#ifdef CONFIG_SCHED_SMT
5995static const struct cpumask *cpu_smt_mask(int cpu)
5996{
5997 return topology_thread_cpumask(cpu);
5998}
5999#endif
6000
6001/*
6002 * Topology list, bottom-up.
6003 */
6004static struct sched_domain_topology_level default_topology[] = {
6005#ifdef CONFIG_SCHED_SMT
6006 { sd_init_SIBLING, cpu_smt_mask, },
6007#endif
6008#ifdef CONFIG_SCHED_MC
6009 { sd_init_MC, cpu_coregroup_mask, },
6010#endif
6011#ifdef CONFIG_SCHED_BOOK
6012 { sd_init_BOOK, cpu_book_mask, },
6013#endif
6014 { sd_init_CPU, cpu_cpu_mask, },
6015 { NULL, },
6016};
6017
6018static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6019
6020#define for_each_sd_topology(tl) \
6021 for (tl = sched_domain_topology; tl->init; tl++)
6022
6023#ifdef CONFIG_NUMA 5962#ifdef CONFIG_NUMA
6024
6025static int sched_domains_numa_levels; 5963static int sched_domains_numa_levels;
6026static int *sched_domains_numa_distance; 5964static int *sched_domains_numa_distance;
6027static struct cpumask ***sched_domains_numa_masks; 5965static struct cpumask ***sched_domains_numa_masks;
6028static int sched_domains_curr_level; 5966static int sched_domains_curr_level;
5967#endif
6029 5968
6030static inline int sd_local_flags(int level) 5969/*
6031{ 5970 * SD_flags allowed in topology descriptions.
6032 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 5971 *
6033 return 0; 5972 * SD_SHARE_CPUPOWER - describes SMT topologies
6034 5973 * SD_SHARE_PKG_RESOURCES - describes shared caches
6035 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 5974 * SD_NUMA - describes NUMA topologies
6036} 5975 * SD_SHARE_POWERDOMAIN - describes shared power domain
5976 *
5977 * Odd one out:
5978 * SD_ASYM_PACKING - describes SMT quirks
5979 */
5980#define TOPOLOGY_SD_FLAGS \
5981 (SD_SHARE_CPUPOWER | \
5982 SD_SHARE_PKG_RESOURCES | \
5983 SD_NUMA | \
5984 SD_ASYM_PACKING | \
5985 SD_SHARE_POWERDOMAIN)
6037 5986
6038static struct sched_domain * 5987static struct sched_domain *
6039sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 5988sd_init(struct sched_domain_topology_level *tl, int cpu)
6040{ 5989{
6041 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 5990 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6042 int level = tl->numa_level; 5991 int sd_weight, sd_flags = 0;
6043 int sd_weight = cpumask_weight( 5992
6044 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 5993#ifdef CONFIG_NUMA
5994 /*
5995 * Ugly hack to pass state to sd_numa_mask()...
5996 */
5997 sched_domains_curr_level = tl->numa_level;
5998#endif
5999
6000 sd_weight = cpumask_weight(tl->mask(cpu));
6001
6002 if (tl->sd_flags)
6003 sd_flags = (*tl->sd_flags)();
6004 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6005 "wrong sd_flags in topology description\n"))
6006 sd_flags &= ~TOPOLOGY_SD_FLAGS;
6045 6007
6046 *sd = (struct sched_domain){ 6008 *sd = (struct sched_domain){
6047 .min_interval = sd_weight, 6009 .min_interval = sd_weight,
6048 .max_interval = 2*sd_weight, 6010 .max_interval = 2*sd_weight,
6049 .busy_factor = 32, 6011 .busy_factor = 32,
6050 .imbalance_pct = 125, 6012 .imbalance_pct = 125,
6051 .cache_nice_tries = 2, 6013
6052 .busy_idx = 3, 6014 .cache_nice_tries = 0,
6053 .idle_idx = 2, 6015 .busy_idx = 0,
6016 .idle_idx = 0,
6054 .newidle_idx = 0, 6017 .newidle_idx = 0,
6055 .wake_idx = 0, 6018 .wake_idx = 0,
6056 .forkexec_idx = 0, 6019 .forkexec_idx = 0,
6057 6020
6058 .flags = 1*SD_LOAD_BALANCE 6021 .flags = 1*SD_LOAD_BALANCE
6059 | 1*SD_BALANCE_NEWIDLE 6022 | 1*SD_BALANCE_NEWIDLE
6060 | 0*SD_BALANCE_EXEC 6023 | 1*SD_BALANCE_EXEC
6061 | 0*SD_BALANCE_FORK 6024 | 1*SD_BALANCE_FORK
6062 | 0*SD_BALANCE_WAKE 6025 | 0*SD_BALANCE_WAKE
6063 | 0*SD_WAKE_AFFINE 6026 | 1*SD_WAKE_AFFINE
6064 | 0*SD_SHARE_CPUPOWER 6027 | 0*SD_SHARE_CPUPOWER
6065 | 0*SD_SHARE_PKG_RESOURCES 6028 | 0*SD_SHARE_PKG_RESOURCES
6066 | 1*SD_SERIALIZE 6029 | 0*SD_SERIALIZE
6067 | 0*SD_PREFER_SIBLING 6030 | 0*SD_PREFER_SIBLING
6068 | 1*SD_NUMA 6031 | 0*SD_NUMA
6069 | sd_local_flags(level) 6032 | sd_flags
6070 , 6033 ,
6034
6071 .last_balance = jiffies, 6035 .last_balance = jiffies,
6072 .balance_interval = sd_weight, 6036 .balance_interval = sd_weight,
6037 .smt_gain = 0,
6073 .max_newidle_lb_cost = 0, 6038 .max_newidle_lb_cost = 0,
6074 .next_decay_max_lb_cost = jiffies, 6039 .next_decay_max_lb_cost = jiffies,
6040#ifdef CONFIG_SCHED_DEBUG
6041 .name = tl->name,
6042#endif
6075 }; 6043 };
6076 SD_INIT_NAME(sd, NUMA);
6077 sd->private = &tl->data;
6078 6044
6079 /* 6045 /*
6080 * Ugly hack to pass state to sd_numa_mask()... 6046 * Convert topological properties into behaviour.
6081 */ 6047 */
6082 sched_domains_curr_level = tl->numa_level; 6048
6049 if (sd->flags & SD_SHARE_CPUPOWER) {
6050 sd->imbalance_pct = 110;
6051 sd->smt_gain = 1178; /* ~15% */
6052
6053 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6054 sd->imbalance_pct = 117;
6055 sd->cache_nice_tries = 1;
6056 sd->busy_idx = 2;
6057
6058#ifdef CONFIG_NUMA
6059 } else if (sd->flags & SD_NUMA) {
6060 sd->cache_nice_tries = 2;
6061 sd->busy_idx = 3;
6062 sd->idle_idx = 2;
6063
6064 sd->flags |= SD_SERIALIZE;
6065 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6066 sd->flags &= ~(SD_BALANCE_EXEC |
6067 SD_BALANCE_FORK |
6068 SD_WAKE_AFFINE);
6069 }
6070
6071#endif
6072 } else {
6073 sd->flags |= SD_PREFER_SIBLING;
6074 sd->cache_nice_tries = 1;
6075 sd->busy_idx = 2;
6076 sd->idle_idx = 1;
6077 }
6078
6079 sd->private = &tl->data;
6083 6080
6084 return sd; 6081 return sd;
6085} 6082}
6086 6083
6084/*
6085 * Topology list, bottom-up.
6086 */
6087static struct sched_domain_topology_level default_topology[] = {
6088#ifdef CONFIG_SCHED_SMT
6089 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6090#endif
6091#ifdef CONFIG_SCHED_MC
6092 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6093#endif
6094 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6095 { NULL, },
6096};
6097
6098struct sched_domain_topology_level *sched_domain_topology = default_topology;
6099
6100#define for_each_sd_topology(tl) \
6101 for (tl = sched_domain_topology; tl->mask; tl++)
6102
6103void set_sched_topology(struct sched_domain_topology_level *tl)
6104{
6105 sched_domain_topology = tl;
6106}
6107
6108#ifdef CONFIG_NUMA
6109
6087static const struct cpumask *sd_numa_mask(int cpu) 6110static const struct cpumask *sd_numa_mask(int cpu)
6088{ 6111{
6089 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6112 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
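
With sd_init() and the topology-flags callbacks in place, an architecture no longer supplies SD_*_INIT templates: it only describes its levels (a cpumask function, an optional function returning topology SD_* flags, a debug name) and, if the default SMT/MC/DIE table does not fit, hands its own table to set_sched_topology(). A toy userspace mock of that table-driven shape; the struct, masks and flag values below are simplified stand-ins, not the kernel's sched_domain_topology_level or cpumask types:

#include <stdio.h>

#define SD_SHARE_CPUPOWER      0x1     /* SMT siblings */
#define SD_SHARE_PKG_RESOURCES 0x2     /* shared cache */

typedef unsigned long cpumask_t;       /* toy one-word cpumask */

struct topo_level {
        cpumask_t (*mask)(int cpu);    /* CPUs spanned at this level   */
        int (*sd_flags)(void);         /* topology flags, may be NULL  */
        const char *name;
};

static cpumask_t smt_mask(int cpu)  { return 0x3UL << (cpu & ~1); }
static cpumask_t core_mask(int cpu) { (void)cpu; return 0xfUL; }
static cpumask_t die_mask(int cpu)  { (void)cpu; return 0xffUL; }

static int smt_flags(void)  { return SD_SHARE_CPUPOWER; }
static int core_flags(void) { return SD_SHARE_PKG_RESOURCES; }

static struct topo_level my_topology[] = {
        { smt_mask,  smt_flags,  "SMT" },
        { core_mask, core_flags, "MC"  },
        { die_mask,  NULL,       "DIE" },
        { NULL, NULL, NULL },
};

int main(void)
{
        for (struct topo_level *tl = my_topology; tl->mask; tl++)
                printf("%-3s cpu0 span=%#lx flags=0x%x\n", tl->name,
                       tl->mask(0), tl->sd_flags ? tl->sd_flags() : 0);
        return 0;
}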
@@ -6227,7 +6250,10 @@ static void sched_init_numa(void)
6227 } 6250 }
6228 } 6251 }
6229 6252
6230 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6253 /* Compute default topology size */
6254 for (i = 0; sched_domain_topology[i].mask; i++);
6255
6256 tl = kzalloc((i + level + 1) *
6231 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6257 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6232 if (!tl) 6258 if (!tl)
6233 return; 6259 return;
@@ -6235,18 +6261,19 @@ static void sched_init_numa(void)
6235 /* 6261 /*
6236 * Copy the default topology bits.. 6262 * Copy the default topology bits..
6237 */ 6263 */
6238 for (i = 0; default_topology[i].init; i++) 6264 for (i = 0; sched_domain_topology[i].mask; i++)
6239 tl[i] = default_topology[i]; 6265 tl[i] = sched_domain_topology[i];
6240 6266
6241 /* 6267 /*
6242 * .. and append 'j' levels of NUMA goodness. 6268 * .. and append 'j' levels of NUMA goodness.
6243 */ 6269 */
6244 for (j = 0; j < level; i++, j++) { 6270 for (j = 0; j < level; i++, j++) {
6245 tl[i] = (struct sched_domain_topology_level){ 6271 tl[i] = (struct sched_domain_topology_level){
6246 .init = sd_numa_init,
6247 .mask = sd_numa_mask, 6272 .mask = sd_numa_mask,
6273 .sd_flags = cpu_numa_flags,
6248 .flags = SDTL_OVERLAP, 6274 .flags = SDTL_OVERLAP,
6249 .numa_level = j, 6275 .numa_level = j,
6276 SD_INIT_NAME(NUMA)
6250 }; 6277 };
6251 } 6278 }
6252 6279
@@ -6404,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6404 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6431 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6405 struct sched_domain *child, int cpu) 6432 struct sched_domain *child, int cpu)
6406{ 6433{
6407 struct sched_domain *sd = tl->init(tl, cpu); 6434 struct sched_domain *sd = sd_init(tl, cpu);
6408 if (!sd) 6435 if (!sd)
6409 return child; 6436 return child;
6410 6437
@@ -6974,6 +7001,7 @@ void __init sched_init(void)
6974 if (cpu_isolated_map == NULL) 7001 if (cpu_isolated_map == NULL)
6975 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7002 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6976 idle_thread_set_boot_cpu(); 7003 idle_thread_set_boot_cpu();
7004 set_cpu_rq_start_time();
6977#endif 7005#endif
6978 init_sched_fair_class(); 7006 init_sched_fair_class();
6979 7007
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 800e99b99075..f9ca7d19781a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
520 * We need to take care of a possible races here. In fact, the 520 * We need to take care of a possible races here. In fact, the
521 * task might have changed its scheduling policy to something 521 * task might have changed its scheduling policy to something
522 * different from SCHED_DEADLINE or changed its reservation 522 * different from SCHED_DEADLINE or changed its reservation
523 * parameters (through sched_setscheduler()). 523 * parameters (through sched_setattr()).
524 */ 524 */
525 if (!dl_task(p) || dl_se->dl_new) 525 if (!dl_task(p) || dl_se->dl_new)
526 goto unlock; 526 goto unlock;
@@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
741 741
742 WARN_ON(!dl_prio(prio)); 742 WARN_ON(!dl_prio(prio));
743 dl_rq->dl_nr_running++; 743 dl_rq->dl_nr_running++;
744 inc_nr_running(rq_of_dl_rq(dl_rq)); 744 add_nr_running(rq_of_dl_rq(dl_rq), 1);
745 745
746 inc_dl_deadline(dl_rq, deadline); 746 inc_dl_deadline(dl_rq, deadline);
747 inc_dl_migration(dl_se, dl_rq); 747 inc_dl_migration(dl_se, dl_rq);
@@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
755 WARN_ON(!dl_prio(prio)); 755 WARN_ON(!dl_prio(prio));
756 WARN_ON(!dl_rq->dl_nr_running); 756 WARN_ON(!dl_rq->dl_nr_running);
757 dl_rq->dl_nr_running--; 757 dl_rq->dl_nr_running--;
758 dec_nr_running(rq_of_dl_rq(dl_rq)); 758 sub_nr_running(rq_of_dl_rq(dl_rq), 1);
759 759
760 dec_dl_deadline(dl_rq, dl_se->deadline); 760 dec_dl_deadline(dl_rq, dl_se->deadline);
761 dec_dl_migration(dl_se, dl_rq); 761 dec_dl_migration(dl_se, dl_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fdb96de81a5..c9617b73bcc0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
1095 env->best_cpu = env->dst_cpu; 1095 env->best_cpu = env->dst_cpu;
1096} 1096}
1097 1097
1098static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1099 long src_load, long dst_load,
1100 struct task_numa_env *env)
1101{
1102 long imb, old_imb;
1103
1104 /* We care about the slope of the imbalance, not the direction. */
1105 if (dst_load < src_load)
1106 swap(dst_load, src_load);
1107
1108 /* Is the difference below the threshold? */
1109 imb = dst_load * 100 - src_load * env->imbalance_pct;
1110 if (imb <= 0)
1111 return false;
1112
1113 /*
1114 * The imbalance is above the allowed threshold.
1115 * Compare it with the old imbalance.
1116 */
1117 if (orig_dst_load < orig_src_load)
1118 swap(orig_dst_load, orig_src_load);
1119
1120 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
1121
1122 /* Would this change make things worse? */
1123 return (old_imb > imb);
1124}
1125
1098/* 1126/*
1099 * This checks if the overall compute and NUMA accesses of the system would 1127 * This checks if the overall compute and NUMA accesses of the system would
1100 * be improved if the source tasks was migrated to the target dst_cpu taking 1128 * be improved if the source tasks was migrated to the target dst_cpu taking
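
load_too_imbalanced() is what lets "Allow task switch if load imbalance improves" do its job: a candidate NUMA move that leaves the node pair above the imbalance threshold is still accepted when it shrinks the imbalance that already existed. A self-contained sketch with made-up load figures; the 112 imbalance percentage is only an illustrative value, not a claim about the kernel's tuning:

#include <stdbool.h>
#include <stdio.h>

static void swap_long(long *a, long *b) { long t = *a; *a = *b; *b = t; }

static bool load_too_imbalanced(long orig_src, long orig_dst,
                                long src, long dst, int imbalance_pct)
{
        long imb, old_imb;

        if (dst < src)                          /* slope, not direction */
                swap_long(&dst, &src);
        imb = dst * 100 - src * imbalance_pct;
        if (imb <= 0)
                return false;                   /* within the threshold */

        if (orig_dst < orig_src)
                swap_long(&orig_dst, &orig_src);
        old_imb = orig_dst * 100 - orig_src * imbalance_pct;

        return imb > old_imb;                   /* only veto if it got worse */
}

int main(void)
{
        /* Invented loads: moving a task of weight 200 from a node with load
         * 2000 to one with load 500 still leaves the pair above the
         * threshold, but the gap shrank from 1500 to 1100, so the move is
         * no longer refused. */
        printf("refuse move? %d\n",
               load_too_imbalanced(2000, 500, 1800, 700, 112));
        return 0;
}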
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
1107 struct rq *src_rq = cpu_rq(env->src_cpu); 1135 struct rq *src_rq = cpu_rq(env->src_cpu);
1108 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1136 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1109 struct task_struct *cur; 1137 struct task_struct *cur;
1110 long dst_load, src_load; 1138 long orig_src_load, src_load;
1139 long orig_dst_load, dst_load;
1111 long load; 1140 long load;
1112 long imp = (groupimp > 0) ? groupimp : taskimp; 1141 long imp = (groupimp > 0) ? groupimp : taskimp;
1113 1142
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
1181 * In the overloaded case, try and keep the load balanced. 1210 * In the overloaded case, try and keep the load balanced.
1182 */ 1211 */
1183balance: 1212balance:
1184 dst_load = env->dst_stats.load; 1213 orig_dst_load = env->dst_stats.load;
1185 src_load = env->src_stats.load; 1214 orig_src_load = env->src_stats.load;
1186 1215
1187 /* XXX missing power terms */ 1216 /* XXX missing power terms */
1188 load = task_h_load(env->p); 1217 load = task_h_load(env->p);
1189 dst_load += load; 1218 dst_load = orig_dst_load + load;
1190 src_load -= load; 1219 src_load = orig_src_load - load;
1191 1220
1192 if (cur) { 1221 if (cur) {
1193 load = task_h_load(cur); 1222 load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
1195 src_load += load; 1224 src_load += load;
1196 } 1225 }
1197 1226
1198 /* make src_load the smaller */ 1227 if (load_too_imbalanced(orig_src_load, orig_dst_load,
1199 if (dst_load < src_load) 1228 src_load, dst_load, env))
1200 swap(dst_load, src_load);
1201
1202 if (src_load * env->imbalance_pct < dst_load * 100)
1203 goto unlock; 1229 goto unlock;
1204 1230
1205assign: 1231assign:
@@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)
1301 if (env.best_cpu == -1) 1327 if (env.best_cpu == -1)
1302 return -EAGAIN; 1328 return -EAGAIN;
1303 1329
1304 sched_setnuma(p, env.dst_nid); 1330 /*
1331 * If the task is part of a workload that spans multiple NUMA nodes,
1332 * and is migrating into one of the workload's active nodes, remember
1333 * this node as the task's preferred numa node, so the workload can
1334 * settle down.
1335 * A task that migrated to a second choice node will be better off
1336 * trying for a better one later. Do not set the preferred node here.
1337 */
1338 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
1339 sched_setnuma(p, env.dst_nid);
1305 1340
1306 /* 1341 /*
1307 * Reset the scan period if the task is being rescheduled on an 1342 * Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)
1326/* Attempt to migrate a task to a CPU on the preferred node. */ 1361/* Attempt to migrate a task to a CPU on the preferred node. */
1327static void numa_migrate_preferred(struct task_struct *p) 1362static void numa_migrate_preferred(struct task_struct *p)
1328{ 1363{
1364 unsigned long interval = HZ;
1365
1329 /* This task has no NUMA fault statistics yet */ 1366 /* This task has no NUMA fault statistics yet */
1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1367 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1331 return; 1368 return;
1332 1369
1333 /* Periodically retry migrating the task to the preferred node */ 1370 /* Periodically retry migrating the task to the preferred node */
1334 p->numa_migrate_retry = jiffies + HZ; 1371 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1372 p->numa_migrate_retry = jiffies + interval;
1335 1373
1336 /* Success if task is already running on preferred CPU */ 1374 /* Success if task is already running on preferred CPU */
1337 if (task_node(p) == p->numa_preferred_nid) 1375 if (task_node(p) == p->numa_preferred_nid)
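
The retry interval in numa_migrate_preferred() now tracks the task's NUMA scan rate instead of being a flat second, so tasks that scan quickly also get more frequent chances to reach their preferred node. A quick arithmetic sketch; HZ and the scan period below are assumed values:

#include <stdio.h>

#define HZ 1000                                  /* assumed tick rate */

static unsigned long msecs_to_jiffies(unsigned long ms) { return ms * HZ / 1000; }

int main(void)
{
        unsigned long numa_scan_period = 1000;   /* ms, assumed */
        unsigned long interval = HZ;             /* old behaviour: fixed 1s */
        unsigned long scaled = msecs_to_jiffies(numa_scan_period) / 16;

        if (scaled < interval)
                interval = scaled;               /* new: follow the scan rate */
        printf("retry preferred-node migration in %lu jiffies\n", interval);
        return 0;
}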
@@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1738 struct task_struct *p = current; 1776 struct task_struct *p = current;
1739 bool migrated = flags & TNF_MIGRATED; 1777 bool migrated = flags & TNF_MIGRATED;
1740 int cpu_node = task_node(current); 1778 int cpu_node = task_node(current);
1779 int local = !!(flags & TNF_FAULT_LOCAL);
1741 int priv; 1780 int priv;
1742 1781
1743 if (!numabalancing_enabled) 1782 if (!numabalancing_enabled)
@@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1786 task_numa_group(p, last_cpupid, flags, &priv); 1825 task_numa_group(p, last_cpupid, flags, &priv);
1787 } 1826 }
1788 1827
1828 /*
1829 * If a workload spans multiple NUMA nodes, a shared fault that
1830 * occurs wholly within the set of nodes that the workload is
1831 * actively using should be counted as local. This allows the
1832 * scan rate to slow down when a workload has settled down.
1833 */
1834 if (!priv && !local && p->numa_group &&
1835 node_isset(cpu_node, p->numa_group->active_nodes) &&
1836 node_isset(mem_node, p->numa_group->active_nodes))
1837 local = 1;
1838
1789 task_numa_placement(p); 1839 task_numa_placement(p);
1790 1840
1791 /* 1841 /*
@@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1800 1850
1801 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 1851 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1802 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 1852 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1803 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1853 p->numa_faults_locality[local] += pages;
1804} 1854}
1805 1855
1806static void reset_ptenuma_scan(struct task_struct *p) 1856static void reset_ptenuma_scan(struct task_struct *p)
@@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3301 } 3351 }
3302 3352
3303 if (!se) 3353 if (!se)
3304 rq->nr_running -= task_delta; 3354 sub_nr_running(rq, task_delta);
3305 3355
3306 cfs_rq->throttled = 1; 3356 cfs_rq->throttled = 1;
3307 cfs_rq->throttled_clock = rq_clock(rq); 3357 cfs_rq->throttled_clock = rq_clock(rq);
@@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3352 } 3402 }
3353 3403
3354 if (!se) 3404 if (!se)
3355 rq->nr_running += task_delta; 3405 add_nr_running(rq, task_delta);
3356 3406
3357 /* determine whether we need to wake up potentially idle cpu */ 3407 /* determine whether we need to wake up potentially idle cpu */
3358 if (rq->curr == rq->idle && rq->cfs.nr_running) 3408 if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3884 3934
3885 if (!se) { 3935 if (!se) {
3886 update_rq_runnable_avg(rq, rq->nr_running); 3936 update_rq_runnable_avg(rq, rq->nr_running);
3887 inc_nr_running(rq); 3937 add_nr_running(rq, 1);
3888 } 3938 }
3889 hrtick_update(rq); 3939 hrtick_update(rq);
3890} 3940}
@@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3944 } 3994 }
3945 3995
3946 if (!se) { 3996 if (!se) {
3947 dec_nr_running(rq); 3997 sub_nr_running(rq, 1);
3948 update_rq_runnable_avg(rq, 1); 3998 update_rq_runnable_avg(rq, 1);
3949 } 3999 }
3950 hrtick_update(rq); 4000 hrtick_update(rq);
@@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
4015 * about the loss. 4065 * about the loss.
4016 */ 4066 */
4017 if (jiffies > current->wakee_flip_decay_ts + HZ) { 4067 if (jiffies > current->wakee_flip_decay_ts + HZ) {
4018 current->wakee_flips = 0; 4068 current->wakee_flips >>= 1;
4019 current->wakee_flip_decay_ts = jiffies; 4069 current->wakee_flip_decay_ts = jiffies;
4020 } 4070 }
4021 4071
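
"Decay ->wakee_flips instead of zeroing" is exactly what the one-line change above does: rather than resetting the wakeup-flip counter once per second, it halves it, so a burst of flips fades out gradually and the heuristics built on top of it see a smoother signal. A tiny illustration with an invented starting value:

#include <stdio.h>

int main(void)
{
        unsigned int wakee_flips = 64;  /* flips accumulated this second */

        for (int second = 0; second < 8; second++) {
                printf("t=%ds wakee_flips=%u\n", second, wakee_flips);
                wakee_flips >>= 1;      /* decay instead of reset */
        }
        return 0;
}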
@@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4449 sd = tmp; 4499 sd = tmp;
4450 } 4500 }
4451 4501
4452 if (affine_sd) { 4502 if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4453 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4503 prev_cpu = cpu;
4454 prev_cpu = cpu;
4455 4504
4505 if (sd_flag & SD_BALANCE_WAKE) {
4456 new_cpu = select_idle_sibling(p, prev_cpu); 4506 new_cpu = select_idle_sibling(p, prev_cpu);
4457 goto unlock; 4507 goto unlock;
4458 } 4508 }
@@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4520 atomic_long_add(se->avg.load_avg_contrib, 4570 atomic_long_add(se->avg.load_avg_contrib,
4521 &cfs_rq->removed_load); 4571 &cfs_rq->removed_load);
4522 } 4572 }
4573
4574 /* We have migrated, no longer consider this task hot */
4575 se->exec_start = 0;
4523} 4576}
4524#endif /* CONFIG_SMP */ 4577#endif /* CONFIG_SMP */
4525 4578
@@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
5070/* Returns true if the destination node has incurred more faults */ 5123/* Returns true if the destination node has incurred more faults */
5071static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5124static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5072{ 5125{
5126 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5073 int src_nid, dst_nid; 5127 int src_nid, dst_nid;
5074 5128
5075 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5129 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5083 if (src_nid == dst_nid) 5137 if (src_nid == dst_nid)
5084 return false; 5138 return false;
5085 5139
5086 /* Always encourage migration to the preferred node. */ 5140 if (numa_group) {
5087 if (dst_nid == p->numa_preferred_nid) 5141 /* Task is already in the group's interleave set. */
5088 return true; 5142 if (node_isset(src_nid, numa_group->active_nodes))
5143 return false;
5144
5145 /* Task is moving into the group's interleave set. */
5146 if (node_isset(dst_nid, numa_group->active_nodes))
5147 return true;
5089 5148
5090 /* If both task and group weight improve, this move is a winner. */ 5149 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5091 if (task_weight(p, dst_nid) > task_weight(p, src_nid) && 5150 }
5092 group_weight(p, dst_nid) > group_weight(p, src_nid)) 5151
5152 /* Encourage migration to the preferred node. */
5153 if (dst_nid == p->numa_preferred_nid)
5093 return true; 5154 return true;
5094 5155
5095 return false; 5156 return task_faults(p, dst_nid) > task_faults(p, src_nid);
5096} 5157}
5097 5158
5098 5159
5099static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5160static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5100{ 5161{
5162 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5101 int src_nid, dst_nid; 5163 int src_nid, dst_nid;
5102 5164
5103 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5165 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5112 if (src_nid == dst_nid) 5174 if (src_nid == dst_nid)
5113 return false; 5175 return false;
5114 5176
5177 if (numa_group) {
5178 /* Task is moving within/into the group's interleave set. */
5179 if (node_isset(dst_nid, numa_group->active_nodes))
5180 return false;
5181
5182 /* Task is moving out of the group's interleave set. */
5183 if (node_isset(src_nid, numa_group->active_nodes))
5184 return true;
5185
5186 return group_faults(p, dst_nid) < group_faults(p, src_nid);
5187 }
5188
5115 /* Migrating away from the preferred node is always bad. */ 5189 /* Migrating away from the preferred node is always bad. */
5116 if (src_nid == p->numa_preferred_nid) 5190 if (src_nid == p->numa_preferred_nid)
5117 return true; 5191 return true;
5118 5192
5119 /* If either task or group weight get worse, don't do it. */ 5193 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5120 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
5121 group_weight(p, dst_nid) < group_weight(p, src_nid))
5122 return true;
5123
5124 return false;
5125} 5194}
5126 5195
5127#else 5196#else
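
With shared numa_groups, migrate_improves_locality()/migrate_degrades_locality() stop comparing per-task and per-group weights and key off the group's active_nodes mask instead: a move inside the set is neutral, a move into it is a win, and only moves outside the set fall back to fault counts. A toy decision function; the bitmask nodemask, the fault table and the node numbers are invented, where the real code uses nodemask_t and the group_faults() statistics:

#include <stdbool.h>
#include <stdio.h>

#define NODE_BIT(n) (1u << (n))

struct numa_group { unsigned int active_nodes; };   /* toy nodemask */

static unsigned long group_faults(int nid)          /* invented data */
{
        static const unsigned long faults[] = { 40, 10, 90, 5 };
        return faults[nid];
}

static bool improves_locality(struct numa_group *ng, int src, int dst)
{
        if (src == dst)
                return false;
        if (ng) {
                if (ng->active_nodes & NODE_BIT(src))
                        return false;               /* already in the set */
                if (ng->active_nodes & NODE_BIT(dst))
                        return true;                /* moving into the set */
                return group_faults(dst) > group_faults(src);
        }
        return false;                               /* task-only path omitted */
}

int main(void)
{
        struct numa_group ng = { .active_nodes = NODE_BIT(0) | NODE_BIT(2) };

        printf("1->2: %d, 0->1: %d, 1->3: %d\n",
               improves_locality(&ng, 1, 2),        /* into the set   */
               improves_locality(&ng, 0, 1),        /* already inside */
               improves_locality(&ng, 1, 3));       /* fewer faults   */
        return 0;
}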
@@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)
5564{ 5633{
5565 struct rq *rq = cpu_rq(cpu); 5634 struct rq *rq = cpu_rq(cpu);
5566 u64 total, available, age_stamp, avg; 5635 u64 total, available, age_stamp, avg;
5636 s64 delta;
5567 5637
5568 /* 5638 /*
5569 * Since we're reading these variables without serialization make sure 5639 * Since we're reading these variables without serialization make sure
@@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)
5572 age_stamp = ACCESS_ONCE(rq->age_stamp); 5642 age_stamp = ACCESS_ONCE(rq->age_stamp);
5573 avg = ACCESS_ONCE(rq->rt_avg); 5643 avg = ACCESS_ONCE(rq->rt_avg);
5574 5644
5575 total = sched_avg_period() + (rq_clock(rq) - age_stamp); 5645 delta = rq_clock(rq) - age_stamp;
5646 if (unlikely(delta < 0))
5647 delta = 0;
5648
5649 total = sched_avg_period() + delta;
5576 5650
5577 if (unlikely(total < avg)) { 5651 if (unlikely(total < avg)) {
5578 /* Ensures that power won't end up being negative */ 5652 /* Ensures that power won't end up being negative */
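
Computing the clock-vs-age_stamp difference as a signed value and clamping it pairs with the new set_cpu_rq_start_time() earlier in this merge: if a CPU comes up with an age_stamp slightly ahead of its rq clock, the old unsigned subtraction wrapped around and wildly inflated the total. Illustrative numbers only:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t clock = 1000, age_stamp = 1500;    /* stamp slightly ahead */
        uint64_t wrapped = clock - age_stamp;       /* old math: ~1.8e19    */
        int64_t delta = (int64_t)clock - (int64_t)age_stamp;

        if (delta < 0)                              /* new math: clamp to 0 */
                delta = 0;

        printf("unsigned term: %llu, clamped delta: %lld\n",
               (unsigned long long)wrapped, (long long)delta);
        return 0;
}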
@@ -6640,17 +6714,44 @@ out:
6640 return ld_moved; 6714 return ld_moved;
6641} 6715}
6642 6716
6717static inline unsigned long
6718get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
6719{
6720 unsigned long interval = sd->balance_interval;
6721
6722 if (cpu_busy)
6723 interval *= sd->busy_factor;
6724
6725 /* scale ms to jiffies */
6726 interval = msecs_to_jiffies(interval);
6727 interval = clamp(interval, 1UL, max_load_balance_interval);
6728
6729 return interval;
6730}
6731
6732static inline void
6733update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
6734{
6735 unsigned long interval, next;
6736
6737 interval = get_sd_balance_interval(sd, cpu_busy);
6738 next = sd->last_balance + interval;
6739
6740 if (time_after(*next_balance, next))
6741 *next_balance = next;
6742}
6743
6643/* 6744/*
6644 * idle_balance is called by schedule() if this_cpu is about to become 6745 * idle_balance is called by schedule() if this_cpu is about to become
6645 * idle. Attempts to pull tasks from other CPUs. 6746 * idle. Attempts to pull tasks from other CPUs.
6646 */ 6747 */
6647static int idle_balance(struct rq *this_rq) 6748static int idle_balance(struct rq *this_rq)
6648{ 6749{
6750 unsigned long next_balance = jiffies + HZ;
6751 int this_cpu = this_rq->cpu;
6649 struct sched_domain *sd; 6752 struct sched_domain *sd;
6650 int pulled_task = 0; 6753 int pulled_task = 0;
6651 unsigned long next_balance = jiffies + HZ;
6652 u64 curr_cost = 0; 6754 u64 curr_cost = 0;
6653 int this_cpu = this_rq->cpu;
6654 6755
6655 idle_enter_fair(this_rq); 6756 idle_enter_fair(this_rq);
6656 6757
@@ -6660,8 +6761,15 @@ static int idle_balance(struct rq *this_rq)
6660 */ 6761 */
6661 this_rq->idle_stamp = rq_clock(this_rq); 6762 this_rq->idle_stamp = rq_clock(this_rq);
6662 6763
6663 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6764 if (this_rq->avg_idle < sysctl_sched_migration_cost) {
6765 rcu_read_lock();
6766 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6767 if (sd)
6768 update_next_balance(sd, 0, &next_balance);
6769 rcu_read_unlock();
6770
6664 goto out; 6771 goto out;
6772 }
6665 6773
6666 /* 6774 /*
6667 * Drop the rq->lock, but keep IRQ/preempt disabled. 6775 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6671,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)
6671 update_blocked_averages(this_cpu); 6779 update_blocked_averages(this_cpu);
6672 rcu_read_lock(); 6780 rcu_read_lock();
6673 for_each_domain(this_cpu, sd) { 6781 for_each_domain(this_cpu, sd) {
6674 unsigned long interval;
6675 int continue_balancing = 1; 6782 int continue_balancing = 1;
6676 u64 t0, domain_cost; 6783 u64 t0, domain_cost;
6677 6784
6678 if (!(sd->flags & SD_LOAD_BALANCE)) 6785 if (!(sd->flags & SD_LOAD_BALANCE))
6679 continue; 6786 continue;
6680 6787
6681 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) 6788 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
6789 update_next_balance(sd, 0, &next_balance);
6682 break; 6790 break;
6791 }
6683 6792
6684 if (sd->flags & SD_BALANCE_NEWIDLE) { 6793 if (sd->flags & SD_BALANCE_NEWIDLE) {
6685 t0 = sched_clock_cpu(this_cpu); 6794 t0 = sched_clock_cpu(this_cpu);
6686 6795
6687 /* If we've pulled tasks over stop searching: */
6688 pulled_task = load_balance(this_cpu, this_rq, 6796 pulled_task = load_balance(this_cpu, this_rq,
6689 sd, CPU_NEWLY_IDLE, 6797 sd, CPU_NEWLY_IDLE,
6690 &continue_balancing); 6798 &continue_balancing);
@@ -6696,10 +6804,13 @@ static int idle_balance(struct rq *this_rq)
6696 curr_cost += domain_cost; 6804 curr_cost += domain_cost;
6697 } 6805 }
6698 6806
6699 interval = msecs_to_jiffies(sd->balance_interval); 6807 update_next_balance(sd, 0, &next_balance);
6700 if (time_after(next_balance, sd->last_balance + interval)) 6808
6701 next_balance = sd->last_balance + interval; 6809 /*
6702 if (pulled_task) 6810 * Stop searching for tasks to pull if there are
6811 * now runnable tasks on this rq.
6812 */
6813 if (pulled_task || this_rq->nr_running > 0)
6703 break; 6814 break;
6704 } 6815 }
6705 rcu_read_unlock(); 6816 rcu_read_unlock();
@@ -6717,20 +6828,13 @@ static int idle_balance(struct rq *this_rq)
6717 if (this_rq->cfs.h_nr_running && !pulled_task) 6828 if (this_rq->cfs.h_nr_running && !pulled_task)
6718 pulled_task = 1; 6829 pulled_task = 1;
6719 6830
6720 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6831out:
6721 /* 6832 /* Move the next balance forward */
6722 * We are going idle. next_balance may be set based on 6833 if (time_after(this_rq->next_balance, next_balance))
6723 * a busy processor. So reset next_balance.
6724 */
6725 this_rq->next_balance = next_balance; 6834 this_rq->next_balance = next_balance;
6726 }
6727 6835
6728out:
6729 /* Is there a task of a high priority class? */ 6836 /* Is there a task of a high priority class? */
6730 if (this_rq->nr_running != this_rq->cfs.h_nr_running && 6837 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
6731 ((this_rq->stop && this_rq->stop->on_rq) ||
6732 this_rq->dl.dl_nr_running ||
6733 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6734 pulled_task = -1; 6838 pulled_task = -1;
6735 6839
6736 if (pulled_task) { 6840 if (pulled_task) {
@@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7011 break; 7115 break;
7012 } 7116 }
7013 7117
7014 interval = sd->balance_interval; 7118 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7015 if (idle != CPU_IDLE)
7016 interval *= sd->busy_factor;
7017
7018 /* scale ms to jiffies */
7019 interval = msecs_to_jiffies(interval);
7020 interval = clamp(interval, 1UL, max_load_balance_interval);
7021 7119
7022 need_serialize = sd->flags & SD_SERIALIZE; 7120 need_serialize = sd->flags & SD_SERIALIZE;
7023
7024 if (need_serialize) { 7121 if (need_serialize) {
7025 if (!spin_trylock(&balancing)) 7122 if (!spin_trylock(&balancing))
7026 goto out; 7123 goto out;
@@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7036 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; 7133 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7037 } 7134 }
7038 sd->last_balance = jiffies; 7135 sd->last_balance = jiffies;
7136 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7039 } 7137 }
7040 if (need_serialize) 7138 if (need_serialize)
7041 spin_unlock(&balancing); 7139 spin_unlock(&balancing);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a079c7..25b9423abce9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)
67 * cpuidle_idle_call - the main idle function 67 * cpuidle_idle_call - the main idle function
68 * 68 *
69 * NOTE: no locks or semaphores should be used here 69 * NOTE: no locks or semaphores should be used here
70 * return non-zero on failure
71 */ 70 */
72static int cpuidle_idle_call(void) 71static void cpuidle_idle_call(void)
73{ 72{
74 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 73 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
75 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 74 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
76 int next_state, entered_state, ret; 75 int next_state, entered_state;
77 bool broadcast; 76 bool broadcast;
78 77
79 /* 78 /*
80 * Check if the idle task must be rescheduled. If it is the 79 * Check if the idle task must be rescheduled. If it is the
81 * case, exit the function after re-enabling the local irq and 80 * case, exit the function after re-enabling the local irq.
82 * set again the polling flag
83 */ 81 */
84 if (current_clr_polling_and_test()) { 82 if (need_resched()) {
85 local_irq_enable(); 83 local_irq_enable();
86 __current_set_polling(); 84 return;
87 return 0;
88 } 85 }
89 86
90 /* 87 /*
@@ -101,96 +98,79 @@ static int cpuidle_idle_call(void)
101 rcu_idle_enter(); 98 rcu_idle_enter();
102 99
103 /* 100 /*
104 * Check if the cpuidle framework is ready, otherwise fallback 101 * Ask the cpuidle framework to choose a convenient idle state.
105 * to the default arch specific idle method 102 * Fall back to the default arch idle method on errors.
106 */ 103 */
107 ret = cpuidle_enabled(drv, dev); 104 next_state = cpuidle_select(drv, dev);
108 105 if (next_state < 0) {
109 if (!ret) { 106use_default:
110 /* 107 /*
111 * Ask the governor to choose an idle state it thinks 108 * We can't use the cpuidle framework, let's use the default
112 * it is convenient to go to. There is *always* a 109 * idle routine.
113 * convenient idle state
114 */ 110 */
115 next_state = cpuidle_select(drv, dev); 111 if (current_clr_polling_and_test())
116
117 /*
118 * The idle task must be scheduled, it is pointless to
119 * go to idle, just update no idle residency and get
120 * out of this function
121 */
122 if (current_clr_polling_and_test()) {
123 dev->last_residency = 0;
124 entered_state = next_state;
125 local_irq_enable(); 112 local_irq_enable();
126 } else { 113 else
127 broadcast = !!(drv->states[next_state].flags & 114 arch_cpu_idle();
128 CPUIDLE_FLAG_TIMER_STOP); 115
129 116 goto exit_idle;
130 if (broadcast)
131 /*
132 * Tell the time framework to switch
133 * to a broadcast timer because our
134 * local timer will be shutdown. If a
135 * local timer is used from another
136 * cpu as a broadcast timer, this call
137 * may fail if it is not available
138 */
139 ret = clockevents_notify(
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu);
142
143 if (!ret) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously
148 * returned by the governor
149 * decision. This function will block
150 * until an interrupt occurs and will
151 * take care of re-enabling the local
152 * interrupts
153 */
154 entered_state = cpuidle_enter(drv, dev,
155 next_state);
156
157 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
158 dev->cpu);
159
160 if (broadcast)
161 clockevents_notify(
162 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
163 &dev->cpu);
164
165 /*
166 * Give the governor an opportunity to reflect on the
167 * outcome
168 */
169 cpuidle_reflect(dev, entered_state);
170 }
171 }
172 } 117 }
173 118
119
174 /* 120 /*
175 * We can't use the cpuidle framework, let's use the default 121 * The idle task must be scheduled, it is pointless to
176 * idle routine 122 * go to idle, just update no idle residency and get
123 * out of this function
177 */ 124 */
178 if (ret) 125 if (current_clr_polling_and_test()) {
179 arch_cpu_idle(); 126 dev->last_residency = 0;
127 entered_state = next_state;
128 local_irq_enable();
129 goto exit_idle;
130 }
131
132 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
180 133
134 /*
135 * Tell the time framework to switch to a broadcast timer
136 * because our local timer will be shutdown. If a local timer
137 * is used from another cpu as a broadcast timer, this call may
138 * fail if it is not available
139 */
140 if (broadcast &&
141 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
142 goto use_default;
143
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously returned by the governor decision.
148 * This function will block until an interrupt occurs and will take
149 * care of re-enabling the local interrupts
150 */
151 entered_state = cpuidle_enter(drv, dev, next_state);
152
153 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
154
155 if (broadcast)
156 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
157
158 /*
159 * Give the governor an opportunity to reflect on the outcome
160 */
161 cpuidle_reflect(dev, entered_state);
162
163exit_idle:
181 __current_set_polling(); 164 __current_set_polling();
182 165
183 /* 166 /*
184 * It is up to the idle functions to enable back the local 167 * It is up to the idle functions to reenable local interrupts
185 * interrupt
186 */ 168 */
187 if (WARN_ON_ONCE(irqs_disabled())) 169 if (WARN_ON_ONCE(irqs_disabled()))
188 local_irq_enable(); 170 local_irq_enable();
189 171
190 rcu_idle_exit(); 172 rcu_idle_exit();
191 start_critical_timings(); 173 start_critical_timings();
192
193 return 0;
194} 174}
195 175
196/* 176/*
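
The idle.c rewrite turns the deeply nested cpuidle_idle_call() into straight-line code with two labels: use_default for any path where the cpuidle framework cannot be used (state selection failure, broadcast timer unavailable) and exit_idle as the single place that restores the polling bit. A userspace mock of just that control flow, with stub functions standing in for the cpuidle, polling-flag and clockevents calls, and hard-coded return values chosen only for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Stubs: stand-ins for need_resched(), cpuidle_select(), cpuidle_enter(),
 * current_clr_polling_and_test() and the clockevents broadcast notify. */
static bool need_resched(void)          { return false; }
static int  select_state(void)          { return 2; }    /* chosen state */
static bool clr_polling_and_test(void)  { return false; }
static bool broadcast_enter_fails(void) { return false; }
static void arch_cpu_idle(void)         { puts("default arch idle"); }
static void enter_state(int state)      { printf("enter state %d\n", state); }
static void set_polling(void)           { puts("polling flag restored"); }

static void idle_call(void)
{
        int next_state;

        if (need_resched())
                return;                 /* work arrived: don't idle at all */

        next_state = select_state();
        if (next_state < 0) {
use_default:
                /* cpuidle unusable: fall back to the arch idle routine */
                if (!clr_polling_and_test())
                        arch_cpu_idle();
                goto exit_idle;
        }

        if (clr_polling_and_test())
                goto exit_idle;         /* a reschedule raced in, bail out */

        if (broadcast_enter_fails())
                goto use_default;       /* no broadcast timer available */

        enter_state(next_state);

exit_idle:
        set_polling();                  /* single exit restores polling */
}

int main(void)
{
        idle_call();
        return 0;
}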
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bd2267ad404f..0ebfd7a29472 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
79 rt_rq->overloaded = 0; 79 rt_rq->overloaded = 0;
80 plist_head_init(&rt_rq->pushable_tasks); 80 plist_head_init(&rt_rq->pushable_tasks);
81#endif 81#endif
82 /* We start is dequeued state, because no RT tasks are queued */
83 rt_rq->rt_queued = 0;
82 84
83 rt_rq->rt_time = 0; 85 rt_rq->rt_time = 0;
84 rt_rq->rt_throttled = 0; 86 rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
112 return rt_se->rt_rq; 114 return rt_se->rt_rq;
113} 115}
114 116
117static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
118{
119 struct rt_rq *rt_rq = rt_se->rt_rq;
120
121 return rt_rq->rq;
122}
123
115void free_rt_sched_group(struct task_group *tg) 124void free_rt_sched_group(struct task_group *tg)
116{ 125{
117 int i; 126 int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
211 return container_of(rt_rq, struct rq, rt); 220 return container_of(rt_rq, struct rq, rt);
212} 221}
213 222
214static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 223static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
215{ 224{
216 struct task_struct *p = rt_task_of(rt_se); 225 struct task_struct *p = rt_task_of(rt_se);
217 struct rq *rq = task_rq(p); 226
227 return task_rq(p);
228}
229
230static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
231{
232 struct rq *rq = rq_of_rt_se(rt_se);
218 233
219 return &rq->rt; 234 return &rq->rt;
220} 235}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
391} 406}
392#endif /* CONFIG_SMP */ 407#endif /* CONFIG_SMP */
393 408
409static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
410static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
411
394static inline int on_rt_rq(struct sched_rt_entity *rt_se) 412static inline int on_rt_rq(struct sched_rt_entity *rt_se)
395{ 413{
396 return !list_empty(&rt_se->run_list); 414 return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
452 rt_se = rt_rq->tg->rt_se[cpu]; 470 rt_se = rt_rq->tg->rt_se[cpu];
453 471
454 if (rt_rq->rt_nr_running) { 472 if (rt_rq->rt_nr_running) {
455 if (rt_se && !on_rt_rq(rt_se)) 473 if (!rt_se)
474 enqueue_top_rt_rq(rt_rq);
475 else if (!on_rt_rq(rt_se))
456 enqueue_rt_entity(rt_se, false); 476 enqueue_rt_entity(rt_se, false);
477
457 if (rt_rq->highest_prio.curr < curr->prio) 478 if (rt_rq->highest_prio.curr < curr->prio)
458 resched_task(curr); 479 resched_task(curr);
459 } 480 }
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
466 487
467 rt_se = rt_rq->tg->rt_se[cpu]; 488 rt_se = rt_rq->tg->rt_se[cpu];
468 489
469 if (rt_se && on_rt_rq(rt_se)) 490 if (!rt_se)
491 dequeue_top_rt_rq(rt_rq);
492 else if (on_rt_rq(rt_se))
470 dequeue_rt_entity(rt_se); 493 dequeue_rt_entity(rt_se);
471} 494}
472 495
496static inline int rt_rq_throttled(struct rt_rq *rt_rq)
497{
498 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
499}
500
473static int rt_se_boosted(struct sched_rt_entity *rt_se) 501static int rt_se_boosted(struct sched_rt_entity *rt_se)
474{ 502{
475 struct rt_rq *rt_rq = group_rt_rq(rt_se); 503 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
532 560
533static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 561static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
534{ 562{
535 if (rt_rq->rt_nr_running) 563 struct rq *rq = rq_of_rt_rq(rt_rq);
536 resched_task(rq_of_rt_rq(rt_rq)->curr); 564
565 if (!rt_rq->rt_nr_running)
566 return;
567
568 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr);
537} 570}
538 571
539static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
540{ 573{
574 dequeue_top_rt_rq(rt_rq);
575}
576
577static inline int rt_rq_throttled(struct rt_rq *rt_rq)
578{
579 return rt_rq->rt_throttled;
541} 580}
542 581
543static inline const struct cpumask *sched_rt_period_mask(void) 582static inline const struct cpumask *sched_rt_period_mask(void)
@@ -922,6 +961,38 @@ static void update_curr_rt(struct rq *rq)
922 } 961 }
923} 962}
924 963
964static void
965dequeue_top_rt_rq(struct rt_rq *rt_rq)
966{
967 struct rq *rq = rq_of_rt_rq(rt_rq);
968
969 BUG_ON(&rq->rt != rt_rq);
970
971 if (!rt_rq->rt_queued)
972 return;
973
974 BUG_ON(!rq->nr_running);
975
976 sub_nr_running(rq, rt_rq->rt_nr_running);
977 rt_rq->rt_queued = 0;
978}
979
980static void
981enqueue_top_rt_rq(struct rt_rq *rt_rq)
982{
983 struct rq *rq = rq_of_rt_rq(rt_rq);
984
985 BUG_ON(&rq->rt != rt_rq);
986
987 if (rt_rq->rt_queued)
988 return;
989 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
990 return;
991
992 add_nr_running(rq, rt_rq->rt_nr_running);
993 rt_rq->rt_queued = 1;
994}
995
925#if defined CONFIG_SMP 996#if defined CONFIG_SMP
926 997
927static void 998static void
@@ -1045,12 +1116,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1045#endif /* CONFIG_RT_GROUP_SCHED */ 1116#endif /* CONFIG_RT_GROUP_SCHED */
1046 1117
1047static inline 1118static inline
1119unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1120{
1121 struct rt_rq *group_rq = group_rt_rq(rt_se);
1122
1123 if (group_rq)
1124 return group_rq->rt_nr_running;
1125 else
1126 return 1;
1127}
1128
1129static inline
1048void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1130void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1049{ 1131{
1050 int prio = rt_se_prio(rt_se); 1132 int prio = rt_se_prio(rt_se);
1051 1133
1052 WARN_ON(!rt_prio(prio)); 1134 WARN_ON(!rt_prio(prio));
1053 rt_rq->rt_nr_running++; 1135 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1054 1136
1055 inc_rt_prio(rt_rq, prio); 1137 inc_rt_prio(rt_rq, prio);
1056 inc_rt_migration(rt_se, rt_rq); 1138 inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1144,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1062{ 1144{
1063 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1145 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1064 WARN_ON(!rt_rq->rt_nr_running); 1146 WARN_ON(!rt_rq->rt_nr_running);
1065 rt_rq->rt_nr_running--; 1147 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1066 1148
1067 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1149 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1068 dec_rt_migration(rt_se, rt_rq); 1150 dec_rt_migration(rt_se, rt_rq);
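rt_se_nr_running() makes the counter task-based rather than entity-based: a group entity is weighted by the number of runnable tasks in its child runqueue, a plain task entity counts as one, so the root rt_rq->rt_nr_running (and hence the rq->nr_running contribution above) reflects actual runnable RT tasks. A small stand-alone illustration under an assumed two-level hierarchy (names and layout are hypothetical, not the kernel's):

#include <assert.h>

/* A group entity is weighted by its child runqueue's task count,
 * a task entity counts as one.
 */
struct model_rt_se {
	unsigned int group_nr_running;	/* unused for a plain task entity */
	int is_group;
};

static unsigned int model_rt_se_nr_running(const struct model_rt_se *se)
{
	return se->is_group ? se->group_nr_running : 1;
}

int main(void)
{
	struct model_rt_se group = { .group_nr_running = 3, .is_group = 1 };
	struct model_rt_se task = { 0 };
	unsigned int root_rt_nr_running = 0;

	root_rt_nr_running += model_rt_se_nr_running(&group);
	root_rt_nr_running += model_rt_se_nr_running(&task);
	assert(root_rt_nr_running == 4);	/* counts tasks, not entities */
	return 0;
}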
@@ -1119,6 +1201,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1119 back = rt_se; 1201 back = rt_se;
1120 } 1202 }
1121 1203
1204 dequeue_top_rt_rq(rt_rq_of_se(back));
1205
1122 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1206 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1123 if (on_rt_rq(rt_se)) 1207 if (on_rt_rq(rt_se))
1124 __dequeue_rt_entity(rt_se); 1208 __dequeue_rt_entity(rt_se);
@@ -1127,13 +1211,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1127 1211
1128static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1212static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1129{ 1213{
1214 struct rq *rq = rq_of_rt_se(rt_se);
1215
1130 dequeue_rt_stack(rt_se); 1216 dequeue_rt_stack(rt_se);
1131 for_each_sched_rt_entity(rt_se) 1217 for_each_sched_rt_entity(rt_se)
1132 __enqueue_rt_entity(rt_se, head); 1218 __enqueue_rt_entity(rt_se, head);
1219 enqueue_top_rt_rq(&rq->rt);
1133} 1220}
1134 1221
1135static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1222static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1136{ 1223{
1224 struct rq *rq = rq_of_rt_se(rt_se);
1225
1137 dequeue_rt_stack(rt_se); 1226 dequeue_rt_stack(rt_se);
1138 1227
1139 for_each_sched_rt_entity(rt_se) { 1228 for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1231,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1142 if (rt_rq && rt_rq->rt_nr_running) 1231 if (rt_rq && rt_rq->rt_nr_running)
1143 __enqueue_rt_entity(rt_se, false); 1232 __enqueue_rt_entity(rt_se, false);
1144 } 1233 }
1234 enqueue_top_rt_rq(&rq->rt);
1145} 1235}
1146 1236
1147/* 1237/*
@@ -1159,8 +1249,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1159 1249
1160 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1250 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1161 enqueue_pushable_task(rq, p); 1251 enqueue_pushable_task(rq, p);
1162
1163 inc_nr_running(rq);
1164} 1252}
1165 1253
1166static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1254static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1259,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1171 dequeue_rt_entity(rt_se); 1259 dequeue_rt_entity(rt_se);
1172 1260
1173 dequeue_pushable_task(rq, p); 1261 dequeue_pushable_task(rq, p);
1174
1175 dec_nr_running(rq);
1176} 1262}
1177 1263
1178/* 1264/*
@@ -1377,10 +1463,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1377 if (prev->sched_class == &rt_sched_class) 1463 if (prev->sched_class == &rt_sched_class)
1378 update_curr_rt(rq); 1464 update_curr_rt(rq);
1379 1465
1380 if (!rt_rq->rt_nr_running) 1466 if (!rt_rq->rt_queued)
1381 return NULL;
1382
1383 if (rt_rq_throttled(rt_rq))
1384 return NULL; 1467 return NULL;
1385 1468
1386 put_prev_task(rq, prev); 1469 put_prev_task(rq, prev);
@@ -1892,9 +1975,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1892 */ 1975 */
1893 if (p->on_rq && rq->curr != p) { 1976 if (p->on_rq && rq->curr != p) {
1894#ifdef CONFIG_SMP 1977#ifdef CONFIG_SMP
1895 if (rq->rt.overloaded && push_rt_task(rq) && 1978 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1896 /* Don't resched if we changed runqueues */ 1979 /* Don't resched if we changed runqueues */
1897 rq != task_rq(p)) 1980 push_rt_task(rq) && rq != task_rq(p))
1898 check_resched = 0; 1981 check_resched = 0;
1899#endif /* CONFIG_SMP */ 1982#endif /* CONFIG_SMP */
1900 if (check_resched && p->prio < rq->curr->prio) 1983 if (check_resched && p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 456e492a3dca..600e2291a75c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,8 @@ struct rt_rq {
409 int overloaded; 409 int overloaded;
410 struct plist_head pushable_tasks; 410 struct plist_head pushable_tasks;
411#endif 411#endif
412 int rt_queued;
413
412 int rt_throttled; 414 int rt_throttled;
413 u64 rt_time; 415 u64 rt_time;
414 u64 rt_runtime; 416 u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
423#endif 425#endif
424}; 426};
425 427
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
438/* Deadline class' related fields in a runqueue */ 428/* Deadline class' related fields in a runqueue */
439struct dl_rq { 429struct dl_rq {
440 /* runqueue is an rbtree, ordered by deadline */ 430 /* runqueue is an rbtree, ordered by deadline */
@@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
1216 1206
1217extern void init_task_runnable_average(struct task_struct *p); 1207extern void init_task_runnable_average(struct task_struct *p);
1218 1208
1219static inline void inc_nr_running(struct rq *rq) 1209static inline void add_nr_running(struct rq *rq, unsigned count)
1220{ 1210{
1221 rq->nr_running++; 1211 unsigned prev_nr = rq->nr_running;
1212
1213 rq->nr_running = prev_nr + count;
1222 1214
1223#ifdef CONFIG_NO_HZ_FULL 1215#ifdef CONFIG_NO_HZ_FULL
1224 if (rq->nr_running == 2) { 1216 if (prev_nr < 2 && rq->nr_running >= 2) {
1225 if (tick_nohz_full_cpu(rq->cpu)) { 1217 if (tick_nohz_full_cpu(rq->cpu)) {
1226 /* Order rq->nr_running write against the IPI */ 1218 /* Order rq->nr_running write against the IPI */
1227 smp_wmb(); 1219 smp_wmb();
@@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)
1231#endif 1223#endif
1232} 1224}
1233 1225
1234static inline void dec_nr_running(struct rq *rq) 1226static inline void sub_nr_running(struct rq *rq, unsigned count)
1235{ 1227{
1236 rq->nr_running--; 1228 rq->nr_running -= count;
1237} 1229}
1238 1230
1239static inline void rq_last_tick_reset(struct rq *rq) 1231static inline void rq_last_tick_reset(struct rq *rq)
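add_nr_running() keeps the old nohz-full behaviour while accepting batch updates: the full-dynticks tick must be restarted exactly when a CPU goes from a single runnable task to more than one, so the check compares prev_nr against the post-update count instead of testing for == 2. A stand-alone model of that boundary test (printf stands in for the IPI/tick kick; everything here is illustrative):

#include <stdio.h>

static unsigned int nr_running;

static void add_nr_running_model(unsigned int count)
{
	unsigned int prev_nr = nr_running;

	nr_running = prev_nr + count;
	/* Kick only when crossing the 1 -> 2+ boundary, even in one jump. */
	if (prev_nr < 2 && nr_running >= 2)
		printf("kick nohz-full tick (now %u runnable)\n", nr_running);
}

int main(void)
{
	add_nr_running_model(1);	/* 0 -> 1: no kick            */
	add_nr_running_model(3);	/* 1 -> 4: kick, crossed once */
	add_nr_running_model(1);	/* 4 -> 5: already ticking    */
	return 0;
}

With the RT class now adding its whole rt_nr_running in one go via enqueue_top_rt_rq(), an exact == 2 test could be skipped over entirely; the prev_nr comparison guards against that.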
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d6ce65dde541..bfe0edadbfbb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
41static void 41static void
42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
43{ 43{
44 inc_nr_running(rq); 44 add_nr_running(rq, 1);
45} 45}
46 46
47static void 47static void
48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
49{ 49{
50 dec_nr_running(rq); 50 sub_nr_running(rq, 1);
51} 51}
52 52
53static void yield_task_stop(struct rq *rq) 53static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sys.c b/kernel/sys.c
index fba0f29401ea..66a751ebf9d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
250 else 250 else
251 p = current; 251 p = current;
252 if (p) { 252 if (p) {
253 niceval = 20 - task_nice(p); 253 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 254 if (niceval > retval)
255 retval = niceval; 255 retval = niceval;
256 } 256 }
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
261 else 261 else
262 pgrp = task_pgrp(current); 262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = 20 - task_nice(p); 264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval) 265 if (niceval > retval)
266 retval = niceval; 266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
277 277
278 do_each_thread(g, p) { 278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) { 279 if (uid_eq(task_uid(p), uid)) {
280 niceval = 20 - task_nice(p); 280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval) 281 if (niceval > retval)
282 retval = niceval; 282 retval = niceval;
283 } 283 }
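getpriority() reports nice levels shifted into the positive 1..40 range, and the three open-coded "20 - nice" conversions now go through nice_to_rlimit(). The helper's body is not part of this hunk; the following user-space sketch of the expected mapping assumes the usual MIN_NICE/MAX_NICE limits:

#include <assert.h>

#define MIN_NICE	(-20)
#define MAX_NICE	19

/* Sketch of nice_to_rlimit(): nice -20..19 maps onto rlimit-style 40..1,
 * giving the same result as the old open-coded "20 - nice".
 */
static long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;
}

int main(void)
{
	assert(nice_to_rlimit(MIN_NICE) == 40);	/* highest priority */
	assert(nice_to_rlimit(0) == 20);	/* default */
	assert(nice_to_rlimit(MAX_NICE) == 1);	/* lowest priority */
	return 0;
}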
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8edc87185427..a4bab46cd38e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -100,10 +100,10 @@ enum {
100 100
101 /* 101 /*
102 * Rescue workers are used only on emergencies and shared by 102 * Rescue workers are used only on emergencies and shared by
103 * all cpus. Give -20. 103 * all cpus. Give MIN_NICE.
104 */ 104 */
105 RESCUER_NICE_LEVEL = -20, 105 RESCUER_NICE_LEVEL = MIN_NICE,
106 HIGHPRI_NICE_LEVEL = -20, 106 HIGHPRI_NICE_LEVEL = MIN_NICE,
107 107
108 WQ_NAME_LEN = 24, 108 WQ_NAME_LEN = 24,
109}; 109};
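Here the raw -20 nice literals become the named limit, part of the tree-wide cleanup of open-coded nice levels. For orientation, the values those names are expected to carry (a reminder of the scheduler's nice range as defined in its headers, not part of this diff):

#define MAX_NICE	19
#define MIN_NICE	(-20)
#define NICE_WIDTH	(MAX_NICE - MIN_NICE + 1)	/* 40 distinct nice levels */

Rescuer and high-priority workers therefore keep their most-favourable nice level, spelled in a way that stays correct if the range ever changes.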