path: root/kernel
author    Linus Torvalds <torvalds@linux-foundation.org>  2014-06-03 17:00:15 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-06-03 17:00:15 -0400
commit    c84a1e32ee58fc1cc9d3fd42619b917cce67e30a (patch)
tree      d3e5bed273f747e7c9e399864219bea76f4c30ea /kernel
parent    3d521f9151dacab566904d1f57dcb3e7080cdd8f (diff)
parent    096aa33863a5e48de52d2ff30e0801b7487944f4 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into next
Pull scheduler updates from Ingo Molnar:
 "The main scheduling related changes in this cycle were:

   - various sched/numa updates, for better performance

   - tree wide cleanup of open coded nice levels

   - nohz fix related to rq->nr_running use

   - cpuidle changes and continued consolidation to improve the
     kernel/sched/idle.c high level idle scheduling logic.  As part of
     this effort I pulled cpuidle driver changes from Rafael as well.

   - standardized idle polling amongst architectures

   - continued work on preparing better power/energy aware scheduling

   - sched/rt updates

   - misc fixlets and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (49 commits)
  sched/numa: Decay ->wakee_flips instead of zeroing
  sched/numa: Update migrate_improves/degrades_locality()
  sched/numa: Allow task switch if load imbalance improves
  sched/rt: Fix 'struct sched_dl_entity' and dl_task_time() comments, to match the current upstream code
  sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice()
  sched: Initialize rq->age_stamp on processor start
  sched, nohz: Change rq->nr_running to always use wrappers
  sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance()
  sched: Use clamp() and clamp_val() to make sys_nice() more readable
  sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups()
  sched/numa: Fix initialization of sched_domain_topology for NUMA
  sched: Call select_idle_sibling() when not affine_sd
  sched: Simplify return logic in sched_read_attr()
  sched: Simplify return logic in sched_copy_attr()
  sched: Fix exec_start/task_hot on migrated tasks
  arm64: Remove TIF_POLLING_NRFLAG
  metag: Remove TIF_POLLING_NRFLAG
  sched/idle: Make cpuidle_idle_call() void
  sched/idle: Reflow cpuidle_idle_call()
  sched/idle: Delay clearing the polling bit
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/locking/locktorture.c |   2
-rw-r--r--  kernel/power/suspend.c       |   2
-rw-r--r--  kernel/sched/core.c          | 324
-rw-r--r--  kernel/sched/deadline.c      |   6
-rw-r--r--  kernel/sched/fair.c          | 226
-rw-r--r--  kernel/sched/idle.c          | 140
-rw-r--r--  kernel/sched/rt.c            | 119
-rw-r--r--  kernel/sched/sched.h         |  26
-rw-r--r--  kernel/sched/stop_task.c     |   4
-rw-r--r--  kernel/sys.c                 |   6
-rw-r--r--  kernel/workqueue.c           |   6
11 files changed, 522 insertions(+), 339 deletions(-)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index dbafeac18e4d..0955b885d0dc 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg)
216 static DEFINE_TORTURE_RANDOM(rand); 216 static DEFINE_TORTURE_RANDOM(rand);
217 217
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 218 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
219 set_user_nice(current, 19); 219 set_user_nice(current, MAX_NICE);
220 220
221 do { 221 do {
222 if ((torture_random(&rand) & 0xfffff) == 0) 222 if ((torture_random(&rand) & 0xfffff) == 0)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4047d7..155721f7f909 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -54,9 +54,11 @@ static void freeze_begin(void)
54 54
55static void freeze_enter(void) 55static void freeze_enter(void)
56{ 56{
57 cpuidle_use_deepest_state(true);
57 cpuidle_resume(); 58 cpuidle_resume();
58 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 59 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
59 cpuidle_pause(); 60 cpuidle_pause();
61 cpuidle_use_deepest_state(false);
60} 62}
61 63
62void freeze_wake(void) 64void freeze_wake(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a62a7dec3986..913c6d6cc2c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -522,6 +522,39 @@ static inline void init_hrtick(void)
522#endif /* CONFIG_SCHED_HRTICK */ 522#endif /* CONFIG_SCHED_HRTICK */
523 523
524/* 524/*
525 * cmpxchg based fetch_or, macro so it works for different integer types
526 */
527#define fetch_or(ptr, val) \
528({ typeof(*(ptr)) __old, __val = *(ptr); \
529 for (;;) { \
530 __old = cmpxchg((ptr), __val, __val | (val)); \
531 if (__old == __val) \
532 break; \
533 __val = __old; \
534 } \
535 __old; \
536})
537
538#ifdef TIF_POLLING_NRFLAG
539/*
540 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
541 * this avoids any races wrt polling state changes and thereby avoids
542 * spurious IPIs.
543 */
544static bool set_nr_and_not_polling(struct task_struct *p)
545{
546 struct thread_info *ti = task_thread_info(p);
547 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
548}
549#else
550static bool set_nr_and_not_polling(struct task_struct *p)
551{
552 set_tsk_need_resched(p);
553 return true;
554}
555#endif
556
557/*
525 * resched_task - mark a task 'to be rescheduled now'. 558 * resched_task - mark a task 'to be rescheduled now'.
526 * 559 *
527 * On UP this means the setting of the need_resched flag, on SMP it 560 * On UP this means the setting of the need_resched flag, on SMP it
@@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)
537 if (test_tsk_need_resched(p)) 570 if (test_tsk_need_resched(p))
538 return; 571 return;
539 572
540 set_tsk_need_resched(p);
541
542 cpu = task_cpu(p); 573 cpu = task_cpu(p);
574
543 if (cpu == smp_processor_id()) { 575 if (cpu == smp_processor_id()) {
576 set_tsk_need_resched(p);
544 set_preempt_need_resched(); 577 set_preempt_need_resched();
545 return; 578 return;
546 } 579 }
547 580
548 /* NEED_RESCHED must be visible before we test polling */ 581 if (set_nr_and_not_polling(p))
549 smp_mb();
550 if (!tsk_is_polling(p))
551 smp_send_reschedule(cpu); 582 smp_send_reschedule(cpu);
552} 583}
553 584
@@ -3018,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
3018int can_nice(const struct task_struct *p, const int nice) 3049int can_nice(const struct task_struct *p, const int nice)
3019{ 3050{
3020 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3051 /* convert nice value [19,-20] to rlimit style value [1,40] */
3021 int nice_rlim = 20 - nice; 3052 int nice_rlim = nice_to_rlimit(nice);
3022 3053
3023 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3054 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3024 capable(CAP_SYS_NICE)); 3055 capable(CAP_SYS_NICE));
@@ -3042,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
3042 * We don't have to worry. Conceptually one call occurs first 3073 * We don't have to worry. Conceptually one call occurs first
3043 * and we have a single winner. 3074 * and we have a single winner.
3044 */ 3075 */
3045 if (increment < -40) 3076 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3046 increment = -40;
3047 if (increment > 40)
3048 increment = 40;
3049
3050 nice = task_nice(current) + increment; 3077 nice = task_nice(current) + increment;
3051 if (nice < MIN_NICE)
3052 nice = MIN_NICE;
3053 if (nice > MAX_NICE)
3054 nice = MAX_NICE;
3055 3078
3079 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3056 if (increment < 0 && !can_nice(current, nice)) 3080 if (increment < 0 && !can_nice(current, nice))
3057 return -EPERM; 3081 return -EPERM;
3058 3082
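
nice_to_rlimit() and rlimit_to_nice(), introduced by the "consolidate open coded nice level frobbing" patch, hide the off-by-one-prone mapping between the nice range [-20, 19] and the RLIMIT_NICE style range [1, 40] that can_nice() used to open-code, while sys_nice() itself now reads as two clamps. A small userspace sketch of both; the constants mirror the kernel's MIN_NICE/MAX_NICE/NICE_WIDTH and clamp_long() is a local stand-in for clamp()/clamp_val():

#include <stdio.h>

#define MIN_NICE   (-20)
#define MAX_NICE   19
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)    /* 40 */

static long clamp_long(long v, long lo, long hi)
{
        return v < lo ? lo : v > hi ? hi : v;
}

/* nice 19..-20  <->  rlimit style 1..40 */
static long nice_to_rlimit(long nice)  { return MAX_NICE - nice + 1; }
static long rlimit_to_nice(long prio)  { return MAX_NICE - prio + 1; }

int main(void)
{
        long increment = 100;                    /* absurd sys_nice() argument */
        long nice;

        increment = clamp_long(increment, -NICE_WIDTH, NICE_WIDTH);
        nice = clamp_long(0 /* current nice */ + increment, MIN_NICE, MAX_NICE);

        printf("nice=%ld rlimit=%ld back=%ld\n",
               nice, nice_to_rlimit(nice), rlimit_to_nice(nice_to_rlimit(nice)));
        return 0;
}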
@@ -3642,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3642 */ 3666 */
3643 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3667 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3644 3668
3645out: 3669 return 0;
3646 return ret;
3647 3670
3648err_size: 3671err_size:
3649 put_user(sizeof(*attr), &uattr->size); 3672 put_user(sizeof(*attr), &uattr->size);
3650 ret = -E2BIG; 3673 return -E2BIG;
3651 goto out;
3652} 3674}
3653 3675
3654/** 3676/**
@@ -3808,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3808 3830
3809 for (; addr < end; addr++) { 3831 for (; addr < end; addr++) {
3810 if (*addr) 3832 if (*addr)
3811 goto err_size; 3833 return -EFBIG;
3812 } 3834 }
3813 3835
3814 attr->size = usize; 3836 attr->size = usize;
@@ -3818,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3818 if (ret) 3840 if (ret)
3819 return -EFAULT; 3841 return -EFAULT;
3820 3842
3821out: 3843 return 0;
3822 return ret;
3823
3824err_size:
3825 ret = -E2BIG;
3826 goto out;
3827} 3844}
3828 3845
3829/** 3846/**
@@ -5093,10 +5110,20 @@ static struct notifier_block migration_notifier = {
5093 .priority = CPU_PRI_MIGRATION, 5110 .priority = CPU_PRI_MIGRATION,
5094}; 5111};
5095 5112
5113static void __cpuinit set_cpu_rq_start_time(void)
5114{
5115 int cpu = smp_processor_id();
5116 struct rq *rq = cpu_rq(cpu);
5117 rq->age_stamp = sched_clock_cpu(cpu);
5118}
5119
5096static int sched_cpu_active(struct notifier_block *nfb, 5120static int sched_cpu_active(struct notifier_block *nfb,
5097 unsigned long action, void *hcpu) 5121 unsigned long action, void *hcpu)
5098{ 5122{
5099 switch (action & ~CPU_TASKS_FROZEN) { 5123 switch (action & ~CPU_TASKS_FROZEN) {
5124 case CPU_STARTING:
5125 set_cpu_rq_start_time();
5126 return NOTIFY_OK;
5100 case CPU_DOWN_FAILED: 5127 case CPU_DOWN_FAILED:
5101 set_cpu_active((long)hcpu, true); 5128 set_cpu_active((long)hcpu, true);
5102 return NOTIFY_OK; 5129 return NOTIFY_OK;
@@ -5305,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
5305 SD_BALANCE_FORK | 5332 SD_BALANCE_FORK |
5306 SD_BALANCE_EXEC | 5333 SD_BALANCE_EXEC |
5307 SD_SHARE_CPUPOWER | 5334 SD_SHARE_CPUPOWER |
5308 SD_SHARE_PKG_RESOURCES)) { 5335 SD_SHARE_PKG_RESOURCES |
5336 SD_SHARE_POWERDOMAIN)) {
5309 if (sd->groups != sd->groups->next) 5337 if (sd->groups != sd->groups->next)
5310 return 0; 5338 return 0;
5311 } 5339 }
@@ -5336,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5336 SD_BALANCE_EXEC | 5364 SD_BALANCE_EXEC |
5337 SD_SHARE_CPUPOWER | 5365 SD_SHARE_CPUPOWER |
5338 SD_SHARE_PKG_RESOURCES | 5366 SD_SHARE_PKG_RESOURCES |
5339 SD_PREFER_SIBLING); 5367 SD_PREFER_SIBLING |
5368 SD_SHARE_POWERDOMAIN);
5340 if (nr_node_ids == 1) 5369 if (nr_node_ids == 1)
5341 pflags &= ~SD_SERIALIZE; 5370 pflags &= ~SD_SERIALIZE;
5342 } 5371 }
@@ -5610,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
5610 5639
5611__setup("isolcpus=", isolated_cpu_setup); 5640__setup("isolcpus=", isolated_cpu_setup);
5612 5641
5613static const struct cpumask *cpu_cpu_mask(int cpu)
5614{
5615 return cpumask_of_node(cpu_to_node(cpu));
5616}
5617
5618struct sd_data {
5619 struct sched_domain **__percpu sd;
5620 struct sched_group **__percpu sg;
5621 struct sched_group_power **__percpu sgp;
5622};
5623
5624struct s_data { 5642struct s_data {
5625 struct sched_domain ** __percpu sd; 5643 struct sched_domain ** __percpu sd;
5626 struct root_domain *rd; 5644 struct root_domain *rd;
@@ -5633,21 +5651,6 @@ enum s_alloc {
5633 sa_none, 5651 sa_none,
5634}; 5652};
5635 5653
5636struct sched_domain_topology_level;
5637
5638typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5639typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5640
5641#define SDTL_OVERLAP 0x01
5642
5643struct sched_domain_topology_level {
5644 sched_domain_init_f init;
5645 sched_domain_mask_f mask;
5646 int flags;
5647 int numa_level;
5648 struct sd_data data;
5649};
5650
5651/* 5654/*
5652 * Build an iteration mask that can exclude certain CPUs from the upwards 5655 * Build an iteration mask that can exclude certain CPUs from the upwards
5653 * domain traversal. 5656 * domain traversal.
@@ -5815,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5815 continue; 5818 continue;
5816 5819
5817 group = get_group(i, sdd, &sg); 5820 group = get_group(i, sdd, &sg);
5818 cpumask_clear(sched_group_cpus(sg));
5819 sg->sgp->power = 0;
5820 cpumask_setall(sched_group_mask(sg)); 5821 cpumask_setall(sched_group_mask(sg));
5821 5822
5822 for_each_cpu(j, span) { 5823 for_each_cpu(j, span) {
@@ -5866,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5866 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5867 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5867} 5868}
5868 5869
5869int __weak arch_sd_sibling_asym_packing(void)
5870{
5871 return 0*SD_ASYM_PACKING;
5872}
5873
5874/* 5870/*
5875 * Initializers for schedule domains 5871 * Initializers for schedule domains
5876 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5872 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5877 */ 5873 */
5878 5874
5879#ifdef CONFIG_SCHED_DEBUG
5880# define SD_INIT_NAME(sd, type) sd->name = #type
5881#else
5882# define SD_INIT_NAME(sd, type) do { } while (0)
5883#endif
5884
5885#define SD_INIT_FUNC(type) \
5886static noinline struct sched_domain * \
5887sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5888{ \
5889 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5890 *sd = SD_##type##_INIT; \
5891 SD_INIT_NAME(sd, type); \
5892 sd->private = &tl->data; \
5893 return sd; \
5894}
5895
5896SD_INIT_FUNC(CPU)
5897#ifdef CONFIG_SCHED_SMT
5898 SD_INIT_FUNC(SIBLING)
5899#endif
5900#ifdef CONFIG_SCHED_MC
5901 SD_INIT_FUNC(MC)
5902#endif
5903#ifdef CONFIG_SCHED_BOOK
5904 SD_INIT_FUNC(BOOK)
5905#endif
5906
5907static int default_relax_domain_level = -1; 5875static int default_relax_domain_level = -1;
5908int sched_domain_level_max; 5876int sched_domain_level_max;
5909 5877
@@ -5991,99 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
5991 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5959 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5992} 5960}
5993 5961
5994#ifdef CONFIG_SCHED_SMT
5995static const struct cpumask *cpu_smt_mask(int cpu)
5996{
5997 return topology_thread_cpumask(cpu);
5998}
5999#endif
6000
6001/*
6002 * Topology list, bottom-up.
6003 */
6004static struct sched_domain_topology_level default_topology[] = {
6005#ifdef CONFIG_SCHED_SMT
6006 { sd_init_SIBLING, cpu_smt_mask, },
6007#endif
6008#ifdef CONFIG_SCHED_MC
6009 { sd_init_MC, cpu_coregroup_mask, },
6010#endif
6011#ifdef CONFIG_SCHED_BOOK
6012 { sd_init_BOOK, cpu_book_mask, },
6013#endif
6014 { sd_init_CPU, cpu_cpu_mask, },
6015 { NULL, },
6016};
6017
6018static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6019
6020#define for_each_sd_topology(tl) \
6021 for (tl = sched_domain_topology; tl->init; tl++)
6022
6023#ifdef CONFIG_NUMA 5962#ifdef CONFIG_NUMA
6024
6025static int sched_domains_numa_levels; 5963static int sched_domains_numa_levels;
6026static int *sched_domains_numa_distance; 5964static int *sched_domains_numa_distance;
6027static struct cpumask ***sched_domains_numa_masks; 5965static struct cpumask ***sched_domains_numa_masks;
6028static int sched_domains_curr_level; 5966static int sched_domains_curr_level;
5967#endif
6029 5968
6030static inline int sd_local_flags(int level) 5969/*
6031{ 5970 * SD_flags allowed in topology descriptions.
6032 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 5971 *
6033 return 0; 5972 * SD_SHARE_CPUPOWER - describes SMT topologies
6034 5973 * SD_SHARE_PKG_RESOURCES - describes shared caches
6035 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 5974 * SD_NUMA - describes NUMA topologies
6036} 5975 * SD_SHARE_POWERDOMAIN - describes shared power domain
5976 *
5977 * Odd one out:
5978 * SD_ASYM_PACKING - describes SMT quirks
5979 */
5980#define TOPOLOGY_SD_FLAGS \
5981 (SD_SHARE_CPUPOWER | \
5982 SD_SHARE_PKG_RESOURCES | \
5983 SD_NUMA | \
5984 SD_ASYM_PACKING | \
5985 SD_SHARE_POWERDOMAIN)
6037 5986
6038static struct sched_domain * 5987static struct sched_domain *
6039sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 5988sd_init(struct sched_domain_topology_level *tl, int cpu)
6040{ 5989{
6041 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 5990 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6042 int level = tl->numa_level; 5991 int sd_weight, sd_flags = 0;
6043 int sd_weight = cpumask_weight( 5992
6044 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 5993#ifdef CONFIG_NUMA
5994 /*
5995 * Ugly hack to pass state to sd_numa_mask()...
5996 */
5997 sched_domains_curr_level = tl->numa_level;
5998#endif
5999
6000 sd_weight = cpumask_weight(tl->mask(cpu));
6001
6002 if (tl->sd_flags)
6003 sd_flags = (*tl->sd_flags)();
6004 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6005 "wrong sd_flags in topology description\n"))
6006 sd_flags &= ~TOPOLOGY_SD_FLAGS;
6045 6007
6046 *sd = (struct sched_domain){ 6008 *sd = (struct sched_domain){
6047 .min_interval = sd_weight, 6009 .min_interval = sd_weight,
6048 .max_interval = 2*sd_weight, 6010 .max_interval = 2*sd_weight,
6049 .busy_factor = 32, 6011 .busy_factor = 32,
6050 .imbalance_pct = 125, 6012 .imbalance_pct = 125,
6051 .cache_nice_tries = 2, 6013
6052 .busy_idx = 3, 6014 .cache_nice_tries = 0,
6053 .idle_idx = 2, 6015 .busy_idx = 0,
6016 .idle_idx = 0,
6054 .newidle_idx = 0, 6017 .newidle_idx = 0,
6055 .wake_idx = 0, 6018 .wake_idx = 0,
6056 .forkexec_idx = 0, 6019 .forkexec_idx = 0,
6057 6020
6058 .flags = 1*SD_LOAD_BALANCE 6021 .flags = 1*SD_LOAD_BALANCE
6059 | 1*SD_BALANCE_NEWIDLE 6022 | 1*SD_BALANCE_NEWIDLE
6060 | 0*SD_BALANCE_EXEC 6023 | 1*SD_BALANCE_EXEC
6061 | 0*SD_BALANCE_FORK 6024 | 1*SD_BALANCE_FORK
6062 | 0*SD_BALANCE_WAKE 6025 | 0*SD_BALANCE_WAKE
6063 | 0*SD_WAKE_AFFINE 6026 | 1*SD_WAKE_AFFINE
6064 | 0*SD_SHARE_CPUPOWER 6027 | 0*SD_SHARE_CPUPOWER
6065 | 0*SD_SHARE_PKG_RESOURCES 6028 | 0*SD_SHARE_PKG_RESOURCES
6066 | 1*SD_SERIALIZE 6029 | 0*SD_SERIALIZE
6067 | 0*SD_PREFER_SIBLING 6030 | 0*SD_PREFER_SIBLING
6068 | 1*SD_NUMA 6031 | 0*SD_NUMA
6069 | sd_local_flags(level) 6032 | sd_flags
6070 , 6033 ,
6034
6071 .last_balance = jiffies, 6035 .last_balance = jiffies,
6072 .balance_interval = sd_weight, 6036 .balance_interval = sd_weight,
6037 .smt_gain = 0,
6073 .max_newidle_lb_cost = 0, 6038 .max_newidle_lb_cost = 0,
6074 .next_decay_max_lb_cost = jiffies, 6039 .next_decay_max_lb_cost = jiffies,
6040#ifdef CONFIG_SCHED_DEBUG
6041 .name = tl->name,
6042#endif
6075 }; 6043 };
6076 SD_INIT_NAME(sd, NUMA);
6077 sd->private = &tl->data;
6078 6044
6079 /* 6045 /*
6080 * Ugly hack to pass state to sd_numa_mask()... 6046 * Convert topological properties into behaviour.
6081 */ 6047 */
6082 sched_domains_curr_level = tl->numa_level; 6048
6049 if (sd->flags & SD_SHARE_CPUPOWER) {
6050 sd->imbalance_pct = 110;
6051 sd->smt_gain = 1178; /* ~15% */
6052
6053 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6054 sd->imbalance_pct = 117;
6055 sd->cache_nice_tries = 1;
6056 sd->busy_idx = 2;
6057
6058#ifdef CONFIG_NUMA
6059 } else if (sd->flags & SD_NUMA) {
6060 sd->cache_nice_tries = 2;
6061 sd->busy_idx = 3;
6062 sd->idle_idx = 2;
6063
6064 sd->flags |= SD_SERIALIZE;
6065 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6066 sd->flags &= ~(SD_BALANCE_EXEC |
6067 SD_BALANCE_FORK |
6068 SD_WAKE_AFFINE);
6069 }
6070
6071#endif
6072 } else {
6073 sd->flags |= SD_PREFER_SIBLING;
6074 sd->cache_nice_tries = 1;
6075 sd->busy_idx = 2;
6076 sd->idle_idx = 1;
6077 }
6078
6079 sd->private = &tl->data;
6083 6080
6084 return sd; 6081 return sd;
6085} 6082}
6086 6083
6084/*
6085 * Topology list, bottom-up.
6086 */
6087static struct sched_domain_topology_level default_topology[] = {
6088#ifdef CONFIG_SCHED_SMT
6089 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6090#endif
6091#ifdef CONFIG_SCHED_MC
6092 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6093#endif
6094 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6095 { NULL, },
6096};
6097
6098struct sched_domain_topology_level *sched_domain_topology = default_topology;
6099
6100#define for_each_sd_topology(tl) \
6101 for (tl = sched_domain_topology; tl->mask; tl++)
6102
6103void set_sched_topology(struct sched_domain_topology_level *tl)
6104{
6105 sched_domain_topology = tl;
6106}
6107
6108#ifdef CONFIG_NUMA
6109
6087static const struct cpumask *sd_numa_mask(int cpu) 6110static const struct cpumask *sd_numa_mask(int cpu)
6088{ 6111{
6089 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6112 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
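
With sd_init() and the topology-flags callbacks in place, an architecture no longer supplies SD_*_INIT templates: it only describes its levels (a cpumask function, an optional function returning topology SD_* flags, a debug name) and, if the default SMT/MC/DIE table does not fit, hands its own table to set_sched_topology(). A toy userspace mock of that table-driven shape; the struct, masks and flag values below are simplified stand-ins, not the kernel's sched_domain_topology_level or cpumask types:

#include <stdio.h>

#define SD_SHARE_CPUPOWER      0x1     /* SMT siblings */
#define SD_SHARE_PKG_RESOURCES 0x2     /* shared cache */

typedef unsigned long cpumask_t;       /* toy one-word cpumask */

struct topo_level {
        cpumask_t (*mask)(int cpu);    /* CPUs spanned at this level   */
        int (*sd_flags)(void);         /* topology flags, may be NULL  */
        const char *name;
};

static cpumask_t smt_mask(int cpu)  { return 0x3UL << (cpu & ~1); }
static cpumask_t core_mask(int cpu) { (void)cpu; return 0xfUL; }
static cpumask_t die_mask(int cpu)  { (void)cpu; return 0xffUL; }

static int smt_flags(void)  { return SD_SHARE_CPUPOWER; }
static int core_flags(void) { return SD_SHARE_PKG_RESOURCES; }

static struct topo_level my_topology[] = {
        { smt_mask,  smt_flags,  "SMT" },
        { core_mask, core_flags, "MC"  },
        { die_mask,  NULL,       "DIE" },
        { NULL, NULL, NULL },
};

int main(void)
{
        for (struct topo_level *tl = my_topology; tl->mask; tl++)
                printf("%-3s cpu0 span=%#lx flags=0x%x\n", tl->name,
                       tl->mask(0), tl->sd_flags ? tl->sd_flags() : 0);
        return 0;
}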
@@ -6227,7 +6250,10 @@ static void sched_init_numa(void)
6227 } 6250 }
6228 } 6251 }
6229 6252
6230 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6253 /* Compute default topology size */
6254 for (i = 0; sched_domain_topology[i].mask; i++);
6255
6256 tl = kzalloc((i + level + 1) *
6231 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6257 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6232 if (!tl) 6258 if (!tl)
6233 return; 6259 return;
@@ -6235,18 +6261,19 @@ static void sched_init_numa(void)
6235 /* 6261 /*
6236 * Copy the default topology bits.. 6262 * Copy the default topology bits..
6237 */ 6263 */
6238 for (i = 0; default_topology[i].init; i++) 6264 for (i = 0; sched_domain_topology[i].mask; i++)
6239 tl[i] = default_topology[i]; 6265 tl[i] = sched_domain_topology[i];
6240 6266
6241 /* 6267 /*
6242 * .. and append 'j' levels of NUMA goodness. 6268 * .. and append 'j' levels of NUMA goodness.
6243 */ 6269 */
6244 for (j = 0; j < level; i++, j++) { 6270 for (j = 0; j < level; i++, j++) {
6245 tl[i] = (struct sched_domain_topology_level){ 6271 tl[i] = (struct sched_domain_topology_level){
6246 .init = sd_numa_init,
6247 .mask = sd_numa_mask, 6272 .mask = sd_numa_mask,
6273 .sd_flags = cpu_numa_flags,
6248 .flags = SDTL_OVERLAP, 6274 .flags = SDTL_OVERLAP,
6249 .numa_level = j, 6275 .numa_level = j,
6276 SD_INIT_NAME(NUMA)
6250 }; 6277 };
6251 } 6278 }
6252 6279
@@ -6404,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6404 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6431 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6405 struct sched_domain *child, int cpu) 6432 struct sched_domain *child, int cpu)
6406{ 6433{
6407 struct sched_domain *sd = tl->init(tl, cpu); 6434 struct sched_domain *sd = sd_init(tl, cpu);
6408 if (!sd) 6435 if (!sd)
6409 return child; 6436 return child;
6410 6437
@@ -6974,6 +7001,7 @@ void __init sched_init(void)
6974 if (cpu_isolated_map == NULL) 7001 if (cpu_isolated_map == NULL)
6975 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7002 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6976 idle_thread_set_boot_cpu(); 7003 idle_thread_set_boot_cpu();
7004 set_cpu_rq_start_time();
6977#endif 7005#endif
6978 init_sched_fair_class(); 7006 init_sched_fair_class();
6979 7007
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 800e99b99075..f9ca7d19781a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
520 * We need to take care of a possible races here. In fact, the 520 * We need to take care of a possible races here. In fact, the
521 * task might have changed its scheduling policy to something 521 * task might have changed its scheduling policy to something
522 * different from SCHED_DEADLINE or changed its reservation 522 * different from SCHED_DEADLINE or changed its reservation
523 * parameters (through sched_setscheduler()). 523 * parameters (through sched_setattr()).
524 */ 524 */
525 if (!dl_task(p) || dl_se->dl_new) 525 if (!dl_task(p) || dl_se->dl_new)
526 goto unlock; 526 goto unlock;
@@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
741 741
742 WARN_ON(!dl_prio(prio)); 742 WARN_ON(!dl_prio(prio));
743 dl_rq->dl_nr_running++; 743 dl_rq->dl_nr_running++;
744 inc_nr_running(rq_of_dl_rq(dl_rq)); 744 add_nr_running(rq_of_dl_rq(dl_rq), 1);
745 745
746 inc_dl_deadline(dl_rq, deadline); 746 inc_dl_deadline(dl_rq, deadline);
747 inc_dl_migration(dl_se, dl_rq); 747 inc_dl_migration(dl_se, dl_rq);
@@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
755 WARN_ON(!dl_prio(prio)); 755 WARN_ON(!dl_prio(prio));
756 WARN_ON(!dl_rq->dl_nr_running); 756 WARN_ON(!dl_rq->dl_nr_running);
757 dl_rq->dl_nr_running--; 757 dl_rq->dl_nr_running--;
758 dec_nr_running(rq_of_dl_rq(dl_rq)); 758 sub_nr_running(rq_of_dl_rq(dl_rq), 1);
759 759
760 dec_dl_deadline(dl_rq, dl_se->deadline); 760 dec_dl_deadline(dl_rq, dl_se->deadline);
761 dec_dl_migration(dl_se, dl_rq); 761 dec_dl_migration(dl_se, dl_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fdb96de81a5..c9617b73bcc0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
1095 env->best_cpu = env->dst_cpu; 1095 env->best_cpu = env->dst_cpu;
1096} 1096}
1097 1097
1098static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1099 long src_load, long dst_load,
1100 struct task_numa_env *env)
1101{
1102 long imb, old_imb;
1103
1104 /* We care about the slope of the imbalance, not the direction. */
1105 if (dst_load < src_load)
1106 swap(dst_load, src_load);
1107
1108 /* Is the difference below the threshold? */
1109 imb = dst_load * 100 - src_load * env->imbalance_pct;
1110 if (imb <= 0)
1111 return false;
1112
1113 /*
1114 * The imbalance is above the allowed threshold.
1115 * Compare it with the old imbalance.
1116 */
1117 if (orig_dst_load < orig_src_load)
1118 swap(orig_dst_load, orig_src_load);
1119
1120 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
1121
1122 /* Would this change make things worse? */
1123 return (old_imb > imb);
1124}
1125
1098/* 1126/*
1099 * This checks if the overall compute and NUMA accesses of the system would 1127 * This checks if the overall compute and NUMA accesses of the system would
1100 * be improved if the source tasks was migrated to the target dst_cpu taking 1128 * be improved if the source tasks was migrated to the target dst_cpu taking
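
load_too_imbalanced() is what lets "Allow task switch if load imbalance improves" do its job: a candidate NUMA move that leaves the node pair above the imbalance threshold is still accepted when it shrinks the imbalance that already existed. A self-contained sketch with made-up load figures; the 112 imbalance percentage is only an illustrative value, not a claim about the kernel's tuning:

#include <stdbool.h>
#include <stdio.h>

static void swap_long(long *a, long *b) { long t = *a; *a = *b; *b = t; }

static bool load_too_imbalanced(long orig_src, long orig_dst,
                                long src, long dst, int imbalance_pct)
{
        long imb, old_imb;

        if (dst < src)                          /* slope, not direction */
                swap_long(&dst, &src);
        imb = dst * 100 - src * imbalance_pct;
        if (imb <= 0)
                return false;                   /* within the threshold */

        if (orig_dst < orig_src)
                swap_long(&orig_dst, &orig_src);
        old_imb = orig_dst * 100 - orig_src * imbalance_pct;

        return imb > old_imb;                   /* only veto if it got worse */
}

int main(void)
{
        /* Invented loads: moving a task of weight 200 from a node with load
         * 2000 to one with load 500 still leaves the pair above the
         * threshold, but the gap shrank from 1500 to 1100, so the move is
         * no longer refused. */
        printf("refuse move? %d\n",
               load_too_imbalanced(2000, 500, 1800, 700, 112));
        return 0;
}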
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
1107 struct rq *src_rq = cpu_rq(env->src_cpu); 1135 struct rq *src_rq = cpu_rq(env->src_cpu);
1108 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1136 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1109 struct task_struct *cur; 1137 struct task_struct *cur;
1110 long dst_load, src_load; 1138 long orig_src_load, src_load;
1139 long orig_dst_load, dst_load;
1111 long load; 1140 long load;
1112 long imp = (groupimp > 0) ? groupimp : taskimp; 1141 long imp = (groupimp > 0) ? groupimp : taskimp;
1113 1142
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
1181 * In the overloaded case, try and keep the load balanced. 1210 * In the overloaded case, try and keep the load balanced.
1182 */ 1211 */
1183balance: 1212balance:
1184 dst_load = env->dst_stats.load; 1213 orig_dst_load = env->dst_stats.load;
1185 src_load = env->src_stats.load; 1214 orig_src_load = env->src_stats.load;
1186 1215
1187 /* XXX missing power terms */ 1216 /* XXX missing power terms */
1188 load = task_h_load(env->p); 1217 load = task_h_load(env->p);
1189 dst_load += load; 1218 dst_load = orig_dst_load + load;
1190 src_load -= load; 1219 src_load = orig_src_load - load;
1191 1220
1192 if (cur) { 1221 if (cur) {
1193 load = task_h_load(cur); 1222 load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
1195 src_load += load; 1224 src_load += load;
1196 } 1225 }
1197 1226
1198 /* make src_load the smaller */ 1227 if (load_too_imbalanced(orig_src_load, orig_dst_load,
1199 if (dst_load < src_load) 1228 src_load, dst_load, env))
1200 swap(dst_load, src_load);
1201
1202 if (src_load * env->imbalance_pct < dst_load * 100)
1203 goto unlock; 1229 goto unlock;
1204 1230
1205assign: 1231assign:
@@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)
1301 if (env.best_cpu == -1) 1327 if (env.best_cpu == -1)
1302 return -EAGAIN; 1328 return -EAGAIN;
1303 1329
1304 sched_setnuma(p, env.dst_nid); 1330 /*
1331 * If the task is part of a workload that spans multiple NUMA nodes,
1332 * and is migrating into one of the workload's active nodes, remember
1333 * this node as the task's preferred numa node, so the workload can
1334 * settle down.
1335 * A task that migrated to a second choice node will be better off
1336 * trying for a better one later. Do not set the preferred node here.
1337 */
1338 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
1339 sched_setnuma(p, env.dst_nid);
1305 1340
1306 /* 1341 /*
1307 * Reset the scan period if the task is being rescheduled on an 1342 * Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)
1326/* Attempt to migrate a task to a CPU on the preferred node. */ 1361/* Attempt to migrate a task to a CPU on the preferred node. */
1327static void numa_migrate_preferred(struct task_struct *p) 1362static void numa_migrate_preferred(struct task_struct *p)
1328{ 1363{
1364 unsigned long interval = HZ;
1365
1329 /* This task has no NUMA fault statistics yet */ 1366 /* This task has no NUMA fault statistics yet */
1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1367 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1331 return; 1368 return;
1332 1369
1333 /* Periodically retry migrating the task to the preferred node */ 1370 /* Periodically retry migrating the task to the preferred node */
1334 p->numa_migrate_retry = jiffies + HZ; 1371 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1372 p->numa_migrate_retry = jiffies + interval;
1335 1373
1336 /* Success if task is already running on preferred CPU */ 1374 /* Success if task is already running on preferred CPU */
1337 if (task_node(p) == p->numa_preferred_nid) 1375 if (task_node(p) == p->numa_preferred_nid)
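
The retry interval in numa_migrate_preferred() now tracks the task's NUMA scan rate instead of being a flat second, so tasks that scan quickly also get more frequent chances to reach their preferred node. A quick arithmetic sketch; HZ and the scan period below are assumed values:

#include <stdio.h>

#define HZ 1000                                  /* assumed tick rate */

static unsigned long msecs_to_jiffies(unsigned long ms) { return ms * HZ / 1000; }

int main(void)
{
        unsigned long numa_scan_period = 1000;   /* ms, assumed */
        unsigned long interval = HZ;             /* old behaviour: fixed 1s */
        unsigned long scaled = msecs_to_jiffies(numa_scan_period) / 16;

        if (scaled < interval)
                interval = scaled;               /* new: follow the scan rate */
        printf("retry preferred-node migration in %lu jiffies\n", interval);
        return 0;
}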
@@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1738 struct task_struct *p = current; 1776 struct task_struct *p = current;
1739 bool migrated = flags & TNF_MIGRATED; 1777 bool migrated = flags & TNF_MIGRATED;
1740 int cpu_node = task_node(current); 1778 int cpu_node = task_node(current);
1779 int local = !!(flags & TNF_FAULT_LOCAL);
1741 int priv; 1780 int priv;
1742 1781
1743 if (!numabalancing_enabled) 1782 if (!numabalancing_enabled)
@@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1786 task_numa_group(p, last_cpupid, flags, &priv); 1825 task_numa_group(p, last_cpupid, flags, &priv);
1787 } 1826 }
1788 1827
1828 /*
1829 * If a workload spans multiple NUMA nodes, a shared fault that
1830 * occurs wholly within the set of nodes that the workload is
1831 * actively using should be counted as local. This allows the
1832 * scan rate to slow down when a workload has settled down.
1833 */
1834 if (!priv && !local && p->numa_group &&
1835 node_isset(cpu_node, p->numa_group->active_nodes) &&
1836 node_isset(mem_node, p->numa_group->active_nodes))
1837 local = 1;
1838
1789 task_numa_placement(p); 1839 task_numa_placement(p);
1790 1840
1791 /* 1841 /*
@@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1800 1850
1801 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 1851 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1802 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 1852 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1803 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1853 p->numa_faults_locality[local] += pages;
1804} 1854}
1805 1855
1806static void reset_ptenuma_scan(struct task_struct *p) 1856static void reset_ptenuma_scan(struct task_struct *p)
@@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3301 } 3351 }
3302 3352
3303 if (!se) 3353 if (!se)
3304 rq->nr_running -= task_delta; 3354 sub_nr_running(rq, task_delta);
3305 3355
3306 cfs_rq->throttled = 1; 3356 cfs_rq->throttled = 1;
3307 cfs_rq->throttled_clock = rq_clock(rq); 3357 cfs_rq->throttled_clock = rq_clock(rq);
@@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3352 } 3402 }
3353 3403
3354 if (!se) 3404 if (!se)
3355 rq->nr_running += task_delta; 3405 add_nr_running(rq, task_delta);
3356 3406
3357 /* determine whether we need to wake up potentially idle cpu */ 3407 /* determine whether we need to wake up potentially idle cpu */
3358 if (rq->curr == rq->idle && rq->cfs.nr_running) 3408 if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3884 3934
3885 if (!se) { 3935 if (!se) {
3886 update_rq_runnable_avg(rq, rq->nr_running); 3936 update_rq_runnable_avg(rq, rq->nr_running);
3887 inc_nr_running(rq); 3937 add_nr_running(rq, 1);
3888 } 3938 }
3889 hrtick_update(rq); 3939 hrtick_update(rq);
3890} 3940}
@@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3944 } 3994 }
3945 3995
3946 if (!se) { 3996 if (!se) {
3947 dec_nr_running(rq); 3997 sub_nr_running(rq, 1);
3948 update_rq_runnable_avg(rq, 1); 3998 update_rq_runnable_avg(rq, 1);
3949 } 3999 }
3950 hrtick_update(rq); 4000 hrtick_update(rq);
@@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
4015 * about the loss. 4065 * about the loss.
4016 */ 4066 */
4017 if (jiffies > current->wakee_flip_decay_ts + HZ) { 4067 if (jiffies > current->wakee_flip_decay_ts + HZ) {
4018 current->wakee_flips = 0; 4068 current->wakee_flips >>= 1;
4019 current->wakee_flip_decay_ts = jiffies; 4069 current->wakee_flip_decay_ts = jiffies;
4020 } 4070 }
4021 4071
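
"Decay ->wakee_flips instead of zeroing" is exactly what the one-line change above does: rather than resetting the wakeup-flip counter once per second, it halves it, so a burst of flips fades out gradually and the heuristics built on top of it see a smoother signal. A tiny illustration with an invented starting value:

#include <stdio.h>

int main(void)
{
        unsigned int wakee_flips = 64;  /* flips accumulated this second */

        for (int second = 0; second < 8; second++) {
                printf("t=%ds wakee_flips=%u\n", second, wakee_flips);
                wakee_flips >>= 1;      /* decay instead of reset */
        }
        return 0;
}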
@@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4449 sd = tmp; 4499 sd = tmp;
4450 } 4500 }
4451 4501
4452 if (affine_sd) { 4502 if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4453 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4503 prev_cpu = cpu;
4454 prev_cpu = cpu;
4455 4504
4505 if (sd_flag & SD_BALANCE_WAKE) {
4456 new_cpu = select_idle_sibling(p, prev_cpu); 4506 new_cpu = select_idle_sibling(p, prev_cpu);
4457 goto unlock; 4507 goto unlock;
4458 } 4508 }
@@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4520 atomic_long_add(se->avg.load_avg_contrib, 4570 atomic_long_add(se->avg.load_avg_contrib,
4521 &cfs_rq->removed_load); 4571 &cfs_rq->removed_load);
4522 } 4572 }
4573
4574 /* We have migrated, no longer consider this task hot */
4575 se->exec_start = 0;
4523} 4576}
4524#endif /* CONFIG_SMP */ 4577#endif /* CONFIG_SMP */
4525 4578
@@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
5070/* Returns true if the destination node has incurred more faults */ 5123/* Returns true if the destination node has incurred more faults */
5071static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5124static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5072{ 5125{
5126 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5073 int src_nid, dst_nid; 5127 int src_nid, dst_nid;
5074 5128
5075 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5129 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5083 if (src_nid == dst_nid) 5137 if (src_nid == dst_nid)
5084 return false; 5138 return false;
5085 5139
5086 /* Always encourage migration to the preferred node. */ 5140 if (numa_group) {
5087 if (dst_nid == p->numa_preferred_nid) 5141 /* Task is already in the group's interleave set. */
5088 return true; 5142 if (node_isset(src_nid, numa_group->active_nodes))
5143 return false;
5144
5145 /* Task is moving into the group's interleave set. */
5146 if (node_isset(dst_nid, numa_group->active_nodes))
5147 return true;
5089 5148
5090 /* If both task and group weight improve, this move is a winner. */ 5149 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5091 if (task_weight(p, dst_nid) > task_weight(p, src_nid) && 5150 }
5092 group_weight(p, dst_nid) > group_weight(p, src_nid)) 5151
5152 /* Encourage migration to the preferred node. */
5153 if (dst_nid == p->numa_preferred_nid)
5093 return true; 5154 return true;
5094 5155
5095 return false; 5156 return task_faults(p, dst_nid) > task_faults(p, src_nid);
5096} 5157}
5097 5158
5098 5159
5099static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5160static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5100{ 5161{
5162 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5101 int src_nid, dst_nid; 5163 int src_nid, dst_nid;
5102 5164
5103 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5165 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5112 if (src_nid == dst_nid) 5174 if (src_nid == dst_nid)
5113 return false; 5175 return false;
5114 5176
5177 if (numa_group) {
5178 /* Task is moving within/into the group's interleave set. */
5179 if (node_isset(dst_nid, numa_group->active_nodes))
5180 return false;
5181
5182 /* Task is moving out of the group's interleave set. */
5183 if (node_isset(src_nid, numa_group->active_nodes))
5184 return true;
5185
5186 return group_faults(p, dst_nid) < group_faults(p, src_nid);
5187 }
5188
5115 /* Migrating away from the preferred node is always bad. */ 5189 /* Migrating away from the preferred node is always bad. */
5116 if (src_nid == p->numa_preferred_nid) 5190 if (src_nid == p->numa_preferred_nid)
5117 return true; 5191 return true;
5118 5192
5119 /* If either task or group weight get worse, don't do it. */ 5193 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5120 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
5121 group_weight(p, dst_nid) < group_weight(p, src_nid))
5122 return true;
5123
5124 return false;
5125} 5194}
5126 5195
5127#else 5196#else
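
With shared numa_groups, migrate_improves_locality()/migrate_degrades_locality() stop comparing per-task and per-group weights and key off the group's active_nodes mask instead: a move inside the set is neutral, a move into it is a win, and only moves outside the set fall back to fault counts. A toy decision function; the bitmask nodemask, the fault table and the node numbers are invented, where the real code uses nodemask_t and the group_faults() statistics:

#include <stdbool.h>
#include <stdio.h>

#define NODE_BIT(n) (1u << (n))

struct numa_group { unsigned int active_nodes; };   /* toy nodemask */

static unsigned long group_faults(int nid)          /* invented data */
{
        static const unsigned long faults[] = { 40, 10, 90, 5 };
        return faults[nid];
}

static bool improves_locality(struct numa_group *ng, int src, int dst)
{
        if (src == dst)
                return false;
        if (ng) {
                if (ng->active_nodes & NODE_BIT(src))
                        return false;               /* already in the set */
                if (ng->active_nodes & NODE_BIT(dst))
                        return true;                /* moving into the set */
                return group_faults(dst) > group_faults(src);
        }
        return false;                               /* task-only path omitted */
}

int main(void)
{
        struct numa_group ng = { .active_nodes = NODE_BIT(0) | NODE_BIT(2) };

        printf("1->2: %d, 0->1: %d, 1->3: %d\n",
               improves_locality(&ng, 1, 2),        /* into the set   */
               improves_locality(&ng, 0, 1),        /* already inside */
               improves_locality(&ng, 1, 3));       /* fewer faults   */
        return 0;
}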
@@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)
5564{ 5633{
5565 struct rq *rq = cpu_rq(cpu); 5634 struct rq *rq = cpu_rq(cpu);
5566 u64 total, available, age_stamp, avg; 5635 u64 total, available, age_stamp, avg;
5636 s64 delta;
5567 5637
5568 /* 5638 /*
5569 * Since we're reading these variables without serialization make sure 5639 * Since we're reading these variables without serialization make sure
@@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)
5572 age_stamp = ACCESS_ONCE(rq->age_stamp); 5642 age_stamp = ACCESS_ONCE(rq->age_stamp);
5573 avg = ACCESS_ONCE(rq->rt_avg); 5643 avg = ACCESS_ONCE(rq->rt_avg);
5574 5644
5575 total = sched_avg_period() + (rq_clock(rq) - age_stamp); 5645 delta = rq_clock(rq) - age_stamp;
5646 if (unlikely(delta < 0))
5647 delta = 0;
5648
5649 total = sched_avg_period() + delta;
5576 5650
5577 if (unlikely(total < avg)) { 5651 if (unlikely(total < avg)) {
5578 /* Ensures that power won't end up being negative */ 5652 /* Ensures that power won't end up being negative */
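
Computing the clock-vs-age_stamp difference as a signed value and clamping it pairs with the new set_cpu_rq_start_time() earlier in this merge: if a CPU comes up with an age_stamp slightly ahead of its rq clock, the old unsigned subtraction wrapped around and wildly inflated the total. Illustrative numbers only:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t clock = 1000, age_stamp = 1500;    /* stamp slightly ahead */
        uint64_t wrapped = clock - age_stamp;       /* old math: ~1.8e19    */
        int64_t delta = (int64_t)clock - (int64_t)age_stamp;

        if (delta < 0)                              /* new math: clamp to 0 */
                delta = 0;

        printf("unsigned term: %llu, clamped delta: %lld\n",
               (unsigned long long)wrapped, (long long)delta);
        return 0;
}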
@@ -6640,17 +6714,44 @@ out:
6640 return ld_moved; 6714 return ld_moved;
6641} 6715}
6642 6716
6717static inline unsigned long
6718get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
6719{
6720 unsigned long interval = sd->balance_interval;
6721
6722 if (cpu_busy)
6723 interval *= sd->busy_factor;
6724
6725 /* scale ms to jiffies */
6726 interval = msecs_to_jiffies(interval);
6727 interval = clamp(interval, 1UL, max_load_balance_interval);
6728
6729 return interval;
6730}
6731
6732static inline void
6733update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
6734{
6735 unsigned long interval, next;
6736
6737 interval = get_sd_balance_interval(sd, cpu_busy);
6738 next = sd->last_balance + interval;
6739
6740 if (time_after(*next_balance, next))
6741 *next_balance = next;
6742}
6743
6643/* 6744/*
6644 * idle_balance is called by schedule() if this_cpu is about to become 6745 * idle_balance is called by schedule() if this_cpu is about to become
6645 * idle. Attempts to pull tasks from other CPUs. 6746 * idle. Attempts to pull tasks from other CPUs.
6646 */ 6747 */
6647static int idle_balance(struct rq *this_rq) 6748static int idle_balance(struct rq *this_rq)
6648{ 6749{
6750 unsigned long next_balance = jiffies + HZ;
6751 int this_cpu = this_rq->cpu;
6649 struct sched_domain *sd; 6752 struct sched_domain *sd;
6650 int pulled_task = 0; 6753 int pulled_task = 0;
6651 unsigned long next_balance = jiffies + HZ;
6652 u64 curr_cost = 0; 6754 u64 curr_cost = 0;
6653 int this_cpu = this_rq->cpu;
6654 6755
6655 idle_enter_fair(this_rq); 6756 idle_enter_fair(this_rq);
6656 6757
@@ -6660,8 +6761,15 @@ static int idle_balance(struct rq *this_rq)
6660 */ 6761 */
6661 this_rq->idle_stamp = rq_clock(this_rq); 6762 this_rq->idle_stamp = rq_clock(this_rq);
6662 6763
6663 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6764 if (this_rq->avg_idle < sysctl_sched_migration_cost) {
6765 rcu_read_lock();
6766 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6767 if (sd)
6768 update_next_balance(sd, 0, &next_balance);
6769 rcu_read_unlock();
6770
6664 goto out; 6771 goto out;
6772 }
6665 6773
6666 /* 6774 /*
6667 * Drop the rq->lock, but keep IRQ/preempt disabled. 6775 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6671,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)
6671 update_blocked_averages(this_cpu); 6779 update_blocked_averages(this_cpu);
6672 rcu_read_lock(); 6780 rcu_read_lock();
6673 for_each_domain(this_cpu, sd) { 6781 for_each_domain(this_cpu, sd) {
6674 unsigned long interval;
6675 int continue_balancing = 1; 6782 int continue_balancing = 1;
6676 u64 t0, domain_cost; 6783 u64 t0, domain_cost;
6677 6784
6678 if (!(sd->flags & SD_LOAD_BALANCE)) 6785 if (!(sd->flags & SD_LOAD_BALANCE))
6679 continue; 6786 continue;
6680 6787
6681 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) 6788 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
6789 update_next_balance(sd, 0, &next_balance);
6682 break; 6790 break;
6791 }
6683 6792
6684 if (sd->flags & SD_BALANCE_NEWIDLE) { 6793 if (sd->flags & SD_BALANCE_NEWIDLE) {
6685 t0 = sched_clock_cpu(this_cpu); 6794 t0 = sched_clock_cpu(this_cpu);
6686 6795
6687 /* If we've pulled tasks over stop searching: */
6688 pulled_task = load_balance(this_cpu, this_rq, 6796 pulled_task = load_balance(this_cpu, this_rq,
6689 sd, CPU_NEWLY_IDLE, 6797 sd, CPU_NEWLY_IDLE,
6690 &continue_balancing); 6798 &continue_balancing);
@@ -6696,10 +6804,13 @@ static int idle_balance(struct rq *this_rq)
6696 curr_cost += domain_cost; 6804 curr_cost += domain_cost;
6697 } 6805 }
6698 6806
6699 interval = msecs_to_jiffies(sd->balance_interval); 6807 update_next_balance(sd, 0, &next_balance);
6700 if (time_after(next_balance, sd->last_balance + interval)) 6808
6701 next_balance = sd->last_balance + interval; 6809 /*
6702 if (pulled_task) 6810 * Stop searching for tasks to pull if there are
6811 * now runnable tasks on this rq.
6812 */
6813 if (pulled_task || this_rq->nr_running > 0)
6703 break; 6814 break;
6704 } 6815 }
6705 rcu_read_unlock(); 6816 rcu_read_unlock();
@@ -6717,20 +6828,13 @@ static int idle_balance(struct rq *this_rq)
6717 if (this_rq->cfs.h_nr_running && !pulled_task) 6828 if (this_rq->cfs.h_nr_running && !pulled_task)
6718 pulled_task = 1; 6829 pulled_task = 1;
6719 6830
6720 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6831out:
6721 /* 6832 /* Move the next balance forward */
6722 * We are going idle. next_balance may be set based on 6833 if (time_after(this_rq->next_balance, next_balance))
6723 * a busy processor. So reset next_balance.
6724 */
6725 this_rq->next_balance = next_balance; 6834 this_rq->next_balance = next_balance;
6726 }
6727 6835
6728out:
6729 /* Is there a task of a high priority class? */ 6836 /* Is there a task of a high priority class? */
6730 if (this_rq->nr_running != this_rq->cfs.h_nr_running && 6837 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
6731 ((this_rq->stop && this_rq->stop->on_rq) ||
6732 this_rq->dl.dl_nr_running ||
6733 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6734 pulled_task = -1; 6838 pulled_task = -1;
6735 6839
6736 if (pulled_task) { 6840 if (pulled_task) {
@@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7011 break; 7115 break;
7012 } 7116 }
7013 7117
7014 interval = sd->balance_interval; 7118 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7015 if (idle != CPU_IDLE)
7016 interval *= sd->busy_factor;
7017
7018 /* scale ms to jiffies */
7019 interval = msecs_to_jiffies(interval);
7020 interval = clamp(interval, 1UL, max_load_balance_interval);
7021 7119
7022 need_serialize = sd->flags & SD_SERIALIZE; 7120 need_serialize = sd->flags & SD_SERIALIZE;
7023
7024 if (need_serialize) { 7121 if (need_serialize) {
7025 if (!spin_trylock(&balancing)) 7122 if (!spin_trylock(&balancing))
7026 goto out; 7123 goto out;
@@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7036 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; 7133 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7037 } 7134 }
7038 sd->last_balance = jiffies; 7135 sd->last_balance = jiffies;
7136 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7039 } 7137 }
7040 if (need_serialize) 7138 if (need_serialize)
7041 spin_unlock(&balancing); 7139 spin_unlock(&balancing);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a079c7..25b9423abce9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)
67 * cpuidle_idle_call - the main idle function 67 * cpuidle_idle_call - the main idle function
68 * 68 *
69 * NOTE: no locks or semaphores should be used here 69 * NOTE: no locks or semaphores should be used here
70 * return non-zero on failure
71 */ 70 */
72static int cpuidle_idle_call(void) 71static void cpuidle_idle_call(void)
73{ 72{
74 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 73 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
75 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 74 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
76 int next_state, entered_state, ret; 75 int next_state, entered_state;
77 bool broadcast; 76 bool broadcast;
78 77
79 /* 78 /*
80 * Check if the idle task must be rescheduled. If it is the 79 * Check if the idle task must be rescheduled. If it is the
81 * case, exit the function after re-enabling the local irq and 80 * case, exit the function after re-enabling the local irq.
82 * set again the polling flag
83 */ 81 */
84 if (current_clr_polling_and_test()) { 82 if (need_resched()) {
85 local_irq_enable(); 83 local_irq_enable();
86 __current_set_polling(); 84 return;
87 return 0;
88 } 85 }
89 86
90 /* 87 /*
@@ -101,96 +98,79 @@ static int cpuidle_idle_call(void)
101 rcu_idle_enter(); 98 rcu_idle_enter();
102 99
103 /* 100 /*
104 * Check if the cpuidle framework is ready, otherwise fallback 101 * Ask the cpuidle framework to choose a convenient idle state.
105 * to the default arch specific idle method 102 * Fall back to the default arch idle method on errors.
106 */ 103 */
107 ret = cpuidle_enabled(drv, dev); 104 next_state = cpuidle_select(drv, dev);
108 105 if (next_state < 0) {
109 if (!ret) { 106use_default:
110 /* 107 /*
111 * Ask the governor to choose an idle state it thinks 108 * We can't use the cpuidle framework, let's use the default
112 * it is convenient to go to. There is *always* a 109 * idle routine.
113 * convenient idle state
114 */ 110 */
115 next_state = cpuidle_select(drv, dev); 111 if (current_clr_polling_and_test())
116
117 /*
118 * The idle task must be scheduled, it is pointless to
119 * go to idle, just update no idle residency and get
120 * out of this function
121 */
122 if (current_clr_polling_and_test()) {
123 dev->last_residency = 0;
124 entered_state = next_state;
125 local_irq_enable(); 112 local_irq_enable();
126 } else { 113 else
127 broadcast = !!(drv->states[next_state].flags & 114 arch_cpu_idle();
128 CPUIDLE_FLAG_TIMER_STOP); 115
129 116 goto exit_idle;
130 if (broadcast)
131 /*
132 * Tell the time framework to switch
133 * to a broadcast timer because our
134 * local timer will be shutdown. If a
135 * local timer is used from another
136 * cpu as a broadcast timer, this call
137 * may fail if it is not available
138 */
139 ret = clockevents_notify(
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu);
142
143 if (!ret) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously
148 * returned by the governor
149 * decision. This function will block
150 * until an interrupt occurs and will
151 * take care of re-enabling the local
152 * interrupts
153 */
154 entered_state = cpuidle_enter(drv, dev,
155 next_state);
156
157 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
158 dev->cpu);
159
160 if (broadcast)
161 clockevents_notify(
162 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
163 &dev->cpu);
164
165 /*
166 * Give the governor an opportunity to reflect on the
167 * outcome
168 */
169 cpuidle_reflect(dev, entered_state);
170 }
171 }
172 } 117 }
173 118
119
174 /* 120 /*
175 * We can't use the cpuidle framework, let's use the default 121 * The idle task must be scheduled, it is pointless to
176 * idle routine 122 * go to idle, just update no idle residency and get
123 * out of this function
177 */ 124 */
178 if (ret) 125 if (current_clr_polling_and_test()) {
179 arch_cpu_idle(); 126 dev->last_residency = 0;
127 entered_state = next_state;
128 local_irq_enable();
129 goto exit_idle;
130 }
131
132 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
180 133
134 /*
135 * Tell the time framework to switch to a broadcast timer
136 * because our local timer will be shutdown. If a local timer
137 * is used from another cpu as a broadcast timer, this call may
138 * fail if it is not available
139 */
140 if (broadcast &&
141 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
142 goto use_default;
143
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously returned by the governor decision.
148 * This function will block until an interrupt occurs and will take
149 * care of re-enabling the local interrupts
150 */
151 entered_state = cpuidle_enter(drv, dev, next_state);
152
153 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
154
155 if (broadcast)
156 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
157
158 /*
159 * Give the governor an opportunity to reflect on the outcome
160 */
161 cpuidle_reflect(dev, entered_state);
162
163exit_idle:
181 __current_set_polling(); 164 __current_set_polling();
182 165
183 /* 166 /*
184 * It is up to the idle functions to enable back the local 167 * It is up to the idle functions to reenable local interrupts
185 * interrupt
186 */ 168 */
187 if (WARN_ON_ONCE(irqs_disabled())) 169 if (WARN_ON_ONCE(irqs_disabled()))
188 local_irq_enable(); 170 local_irq_enable();
189 171
190 rcu_idle_exit(); 172 rcu_idle_exit();
191 start_critical_timings(); 173 start_critical_timings();
192
193 return 0;
194} 174}
195 175
196/* 176/*
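
The idle.c rewrite turns the deeply nested cpuidle_idle_call() into straight-line code with two labels: use_default for any path where the cpuidle framework cannot be used (state selection failure, broadcast timer unavailable) and exit_idle as the single place that restores the polling bit. A userspace mock of just that control flow, with stub functions standing in for the cpuidle, polling-flag and clockevents calls, and hard-coded return values chosen only for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Stubs: stand-ins for need_resched(), cpuidle_select(), cpuidle_enter(),
 * current_clr_polling_and_test() and the clockevents broadcast notify. */
static bool need_resched(void)          { return false; }
static int  select_state(void)          { return 2; }    /* chosen state */
static bool clr_polling_and_test(void)  { return false; }
static bool broadcast_enter_fails(void) { return false; }
static void arch_cpu_idle(void)         { puts("default arch idle"); }
static void enter_state(int state)      { printf("enter state %d\n", state); }
static void set_polling(void)           { puts("polling flag restored"); }

static void idle_call(void)
{
        int next_state;

        if (need_resched())
                return;                 /* work arrived: don't idle at all */

        next_state = select_state();
        if (next_state < 0) {
use_default:
                /* cpuidle unusable: fall back to the arch idle routine */
                if (!clr_polling_and_test())
                        arch_cpu_idle();
                goto exit_idle;
        }

        if (clr_polling_and_test())
                goto exit_idle;         /* a reschedule raced in, bail out */

        if (broadcast_enter_fails())
                goto use_default;       /* no broadcast timer available */

        enter_state(next_state);

exit_idle:
        set_polling();                  /* single exit restores polling */
}

int main(void)
{
        idle_call();
        return 0;
}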
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bd2267ad404f..0ebfd7a29472 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
79 rt_rq->overloaded = 0; 79 rt_rq->overloaded = 0;
80 plist_head_init(&rt_rq->pushable_tasks); 80 plist_head_init(&rt_rq->pushable_tasks);
81#endif 81#endif
82 /* We start is dequeued state, because no RT tasks are queued */
83 rt_rq->rt_queued = 0;
82 84
83 rt_rq->rt_time = 0; 85 rt_rq->rt_time = 0;
84 rt_rq->rt_throttled = 0; 86 rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
112 return rt_se->rt_rq; 114 return rt_se->rt_rq;
113} 115}
114 116
117static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
118{
119 struct rt_rq *rt_rq = rt_se->rt_rq;
120
121 return rt_rq->rq;
122}
123
115void free_rt_sched_group(struct task_group *tg) 124void free_rt_sched_group(struct task_group *tg)
116{ 125{
117 int i; 126 int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
211 return container_of(rt_rq, struct rq, rt); 220 return container_of(rt_rq, struct rq, rt);
212} 221}
213 222
214static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 223static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
215{ 224{
216 struct task_struct *p = rt_task_of(rt_se); 225 struct task_struct *p = rt_task_of(rt_se);
217 struct rq *rq = task_rq(p); 226
227 return task_rq(p);
228}
229
230static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
231{
232 struct rq *rq = rq_of_rt_se(rt_se);
218 233
219 return &rq->rt; 234 return &rq->rt;
220} 235}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
391} 406}
392#endif /* CONFIG_SMP */ 407#endif /* CONFIG_SMP */
393 408
409static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
410static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
411
394static inline int on_rt_rq(struct sched_rt_entity *rt_se) 412static inline int on_rt_rq(struct sched_rt_entity *rt_se)
395{ 413{
396 return !list_empty(&rt_se->run_list); 414 return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
452 rt_se = rt_rq->tg->rt_se[cpu]; 470 rt_se = rt_rq->tg->rt_se[cpu];
453 471
454 if (rt_rq->rt_nr_running) { 472 if (rt_rq->rt_nr_running) {
455 if (rt_se && !on_rt_rq(rt_se)) 473 if (!rt_se)
474 enqueue_top_rt_rq(rt_rq);
475 else if (!on_rt_rq(rt_se))
456 enqueue_rt_entity(rt_se, false); 476 enqueue_rt_entity(rt_se, false);
477
457 if (rt_rq->highest_prio.curr < curr->prio) 478 if (rt_rq->highest_prio.curr < curr->prio)
458 resched_task(curr); 479 resched_task(curr);
459 } 480 }
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
466 487
467 rt_se = rt_rq->tg->rt_se[cpu]; 488 rt_se = rt_rq->tg->rt_se[cpu];
468 489
469 if (rt_se && on_rt_rq(rt_se)) 490 if (!rt_se)
491 dequeue_top_rt_rq(rt_rq);
492 else if (on_rt_rq(rt_se))
470 dequeue_rt_entity(rt_se); 493 dequeue_rt_entity(rt_se);
471} 494}
472 495
496static inline int rt_rq_throttled(struct rt_rq *rt_rq)
497{
498 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
499}
500
473static int rt_se_boosted(struct sched_rt_entity *rt_se) 501static int rt_se_boosted(struct sched_rt_entity *rt_se)
474{ 502{
475 struct rt_rq *rt_rq = group_rt_rq(rt_se); 503 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
532 560
533static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 561static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
534{ 562{
535 if (rt_rq->rt_nr_running) 563 struct rq *rq = rq_of_rt_rq(rt_rq);
536 resched_task(rq_of_rt_rq(rt_rq)->curr); 564
565 if (!rt_rq->rt_nr_running)
566 return;
567
568 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr);
537} 570}
538 571
539static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
540{ 573{
574 dequeue_top_rt_rq(rt_rq);
575}
576
577static inline int rt_rq_throttled(struct rt_rq *rt_rq)
578{
579 return rt_rq->rt_throttled;
541} 580}
542 581
543static inline const struct cpumask *sched_rt_period_mask(void) 582static inline const struct cpumask *sched_rt_period_mask(void)
@@ -922,6 +961,38 @@ static void update_curr_rt(struct rq *rq)
922 } 961 }
923} 962}
924 963
964static void
965dequeue_top_rt_rq(struct rt_rq *rt_rq)
966{
967 struct rq *rq = rq_of_rt_rq(rt_rq);
968
969 BUG_ON(&rq->rt != rt_rq);
970
971 if (!rt_rq->rt_queued)
972 return;
973
974 BUG_ON(!rq->nr_running);
975
976 sub_nr_running(rq, rt_rq->rt_nr_running);
977 rt_rq->rt_queued = 0;
978}
979
980static void
981enqueue_top_rt_rq(struct rt_rq *rt_rq)
982{
983 struct rq *rq = rq_of_rt_rq(rt_rq);
984
985 BUG_ON(&rq->rt != rt_rq);
986
987 if (rt_rq->rt_queued)
988 return;
989 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
990 return;
991
992 add_nr_running(rq, rt_rq->rt_nr_running);
993 rt_rq->rt_queued = 1;
994}
995
925#if defined CONFIG_SMP 996#if defined CONFIG_SMP
926 997
927static void 998static void
@@ -1045,12 +1116,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1045#endif /* CONFIG_RT_GROUP_SCHED */ 1116#endif /* CONFIG_RT_GROUP_SCHED */
1046 1117
1047static inline 1118static inline
1119unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1120{
1121 struct rt_rq *group_rq = group_rt_rq(rt_se);
1122
1123 if (group_rq)
1124 return group_rq->rt_nr_running;
1125 else
1126 return 1;
1127}
1128
1129static inline
1048void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1130void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1049{ 1131{
1050 int prio = rt_se_prio(rt_se); 1132 int prio = rt_se_prio(rt_se);
1051 1133
1052 WARN_ON(!rt_prio(prio)); 1134 WARN_ON(!rt_prio(prio));
1053 rt_rq->rt_nr_running++; 1135 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1054 1136
1055 inc_rt_prio(rt_rq, prio); 1137 inc_rt_prio(rt_rq, prio);
1056 inc_rt_migration(rt_se, rt_rq); 1138 inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1144,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1062{ 1144{
1063 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1145 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1064 WARN_ON(!rt_rq->rt_nr_running); 1146 WARN_ON(!rt_rq->rt_nr_running);
1065 rt_rq->rt_nr_running--; 1147 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1066 1148
1067 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1149 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1068 dec_rt_migration(rt_se, rt_rq); 1150 dec_rt_migration(rt_se, rt_rq);
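rt_se_nr_running() makes the counter task-based rather than entity-based: a group entity is weighted by the number of runnable tasks in its child runqueue, a plain task entity counts as one, so the root rt_rq->rt_nr_running (and hence the rq->nr_running contribution above) reflects actual runnable RT tasks. A small stand-alone illustration under an assumed two-level hierarchy (names and layout are hypothetical, not the kernel's):

#include <assert.h>

/* A group entity is weighted by its child runqueue's task count,
 * a task entity counts as one.
 */
struct model_rt_se {
	unsigned int group_nr_running;	/* unused for a plain task entity */
	int is_group;
};

static unsigned int model_rt_se_nr_running(const struct model_rt_se *se)
{
	return se->is_group ? se->group_nr_running : 1;
}

int main(void)
{
	struct model_rt_se group = { .group_nr_running = 3, .is_group = 1 };
	struct model_rt_se task = { 0 };
	unsigned int root_rt_nr_running = 0;

	root_rt_nr_running += model_rt_se_nr_running(&group);
	root_rt_nr_running += model_rt_se_nr_running(&task);
	assert(root_rt_nr_running == 4);	/* counts tasks, not entities */
	return 0;
}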
@@ -1119,6 +1201,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1119 back = rt_se; 1201 back = rt_se;
1120 } 1202 }
1121 1203
1204 dequeue_top_rt_rq(rt_rq_of_se(back));
1205
1122 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1206 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1123 if (on_rt_rq(rt_se)) 1207 if (on_rt_rq(rt_se))
1124 __dequeue_rt_entity(rt_se); 1208 __dequeue_rt_entity(rt_se);
@@ -1127,13 +1211,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1127 1211
1128static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1212static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1129{ 1213{
1214 struct rq *rq = rq_of_rt_se(rt_se);
1215
1130 dequeue_rt_stack(rt_se); 1216 dequeue_rt_stack(rt_se);
1131 for_each_sched_rt_entity(rt_se) 1217 for_each_sched_rt_entity(rt_se)
1132 __enqueue_rt_entity(rt_se, head); 1218 __enqueue_rt_entity(rt_se, head);
1219 enqueue_top_rt_rq(&rq->rt);
1133} 1220}
1134 1221
1135static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1222static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1136{ 1223{
1224 struct rq *rq = rq_of_rt_se(rt_se);
1225
1137 dequeue_rt_stack(rt_se); 1226 dequeue_rt_stack(rt_se);
1138 1227
1139 for_each_sched_rt_entity(rt_se) { 1228 for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1231,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1142 if (rt_rq && rt_rq->rt_nr_running) 1231 if (rt_rq && rt_rq->rt_nr_running)
1143 __enqueue_rt_entity(rt_se, false); 1232 __enqueue_rt_entity(rt_se, false);
1144 } 1233 }
1234 enqueue_top_rt_rq(&rq->rt);
1145} 1235}
1146 1236
1147/* 1237/*
@@ -1159,8 +1249,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1159 1249
1160 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1250 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1161 enqueue_pushable_task(rq, p); 1251 enqueue_pushable_task(rq, p);
1162
1163 inc_nr_running(rq);
1164} 1252}
1165 1253
1166static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1254static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1259,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1171 dequeue_rt_entity(rt_se); 1259 dequeue_rt_entity(rt_se);
1172 1260
1173 dequeue_pushable_task(rq, p); 1261 dequeue_pushable_task(rq, p);
1174
1175 dec_nr_running(rq);
1176} 1262}
1177 1263
1178/* 1264/*
@@ -1377,10 +1463,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1377 if (prev->sched_class == &rt_sched_class) 1463 if (prev->sched_class == &rt_sched_class)
1378 update_curr_rt(rq); 1464 update_curr_rt(rq);
1379 1465
1380 if (!rt_rq->rt_nr_running) 1466 if (!rt_rq->rt_queued)
1381 return NULL;
1382
1383 if (rt_rq_throttled(rt_rq))
1384 return NULL; 1467 return NULL;
1385 1468
1386 put_prev_task(rq, prev); 1469 put_prev_task(rq, prev);
@@ -1892,9 +1975,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1892 */ 1975 */
1893 if (p->on_rq && rq->curr != p) { 1976 if (p->on_rq && rq->curr != p) {
1894#ifdef CONFIG_SMP 1977#ifdef CONFIG_SMP
1895 if (rq->rt.overloaded && push_rt_task(rq) && 1978 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1896 /* Don't resched if we changed runqueues */ 1979 /* Don't resched if we changed runqueues */
1897 rq != task_rq(p)) 1980 push_rt_task(rq) && rq != task_rq(p))
1898 check_resched = 0; 1981 check_resched = 0;
1899#endif /* CONFIG_SMP */ 1982#endif /* CONFIG_SMP */
1900 if (check_resched && p->prio < rq->curr->prio) 1983 if (check_resched && p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 456e492a3dca..600e2291a75c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,8 @@ struct rt_rq {
409 int overloaded; 409 int overloaded;
410 struct plist_head pushable_tasks; 410 struct plist_head pushable_tasks;
411#endif 411#endif
412 int rt_queued;
413
412 int rt_throttled; 414 int rt_throttled;
413 u64 rt_time; 415 u64 rt_time;
414 u64 rt_runtime; 416 u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
423#endif 425#endif
424}; 426};
425 427
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
438/* Deadline class' related fields in a runqueue */ 428/* Deadline class' related fields in a runqueue */
439struct dl_rq { 429struct dl_rq {
440 /* runqueue is an rbtree, ordered by deadline */ 430 /* runqueue is an rbtree, ordered by deadline */
@@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
1216 1206
1217extern void init_task_runnable_average(struct task_struct *p); 1207extern void init_task_runnable_average(struct task_struct *p);
1218 1208
1219static inline void inc_nr_running(struct rq *rq) 1209static inline void add_nr_running(struct rq *rq, unsigned count)
1220{ 1210{
1221 rq->nr_running++; 1211 unsigned prev_nr = rq->nr_running;
1212
1213 rq->nr_running = prev_nr + count;
1222 1214
1223#ifdef CONFIG_NO_HZ_FULL 1215#ifdef CONFIG_NO_HZ_FULL
1224 if (rq->nr_running == 2) { 1216 if (prev_nr < 2 && rq->nr_running >= 2) {
1225 if (tick_nohz_full_cpu(rq->cpu)) { 1217 if (tick_nohz_full_cpu(rq->cpu)) {
1226 /* Order rq->nr_running write against the IPI */ 1218 /* Order rq->nr_running write against the IPI */
1227 smp_wmb(); 1219 smp_wmb();
@@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)
1231#endif 1223#endif
1232} 1224}
1233 1225
1234static inline void dec_nr_running(struct rq *rq) 1226static inline void sub_nr_running(struct rq *rq, unsigned count)
1235{ 1227{
1236 rq->nr_running--; 1228 rq->nr_running -= count;
1237} 1229}
1238 1230
1239static inline void rq_last_tick_reset(struct rq *rq) 1231static inline void rq_last_tick_reset(struct rq *rq)
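add_nr_running() keeps the old nohz-full behaviour while accepting batch updates: the full-dynticks tick must be restarted exactly when a CPU goes from a single runnable task to more than one, so the check compares prev_nr against the post-update count instead of testing for == 2. A stand-alone model of that boundary test (printf stands in for the IPI/tick kick; everything here is illustrative):

#include <stdio.h>

static unsigned int nr_running;

static void add_nr_running_model(unsigned int count)
{
	unsigned int prev_nr = nr_running;

	nr_running = prev_nr + count;
	/* Kick only when crossing the 1 -> 2+ boundary, even in one jump. */
	if (prev_nr < 2 && nr_running >= 2)
		printf("kick nohz-full tick (now %u runnable)\n", nr_running);
}

int main(void)
{
	add_nr_running_model(1);	/* 0 -> 1: no kick            */
	add_nr_running_model(3);	/* 1 -> 4: kick, crossed once */
	add_nr_running_model(1);	/* 4 -> 5: already ticking    */
	return 0;
}

With the RT class now adding its whole rt_nr_running in one go via enqueue_top_rt_rq(), an exact == 2 test could be skipped over entirely; the prev_nr comparison guards against that.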
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d6ce65dde541..bfe0edadbfbb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
41static void 41static void
42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
43{ 43{
44 inc_nr_running(rq); 44 add_nr_running(rq, 1);
45} 45}
46 46
47static void 47static void
48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
49{ 49{
50 dec_nr_running(rq); 50 sub_nr_running(rq, 1);
51} 51}
52 52
53static void yield_task_stop(struct rq *rq) 53static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sys.c b/kernel/sys.c
index fba0f29401ea..66a751ebf9d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
250 else 250 else
251 p = current; 251 p = current;
252 if (p) { 252 if (p) {
253 niceval = 20 - task_nice(p); 253 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 254 if (niceval > retval)
255 retval = niceval; 255 retval = niceval;
256 } 256 }
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
261 else 261 else
262 pgrp = task_pgrp(current); 262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = 20 - task_nice(p); 264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval) 265 if (niceval > retval)
266 retval = niceval; 266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
277 277
278 do_each_thread(g, p) { 278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) { 279 if (uid_eq(task_uid(p), uid)) {
280 niceval = 20 - task_nice(p); 280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval) 281 if (niceval > retval)
282 retval = niceval; 282 retval = niceval;
283 } 283 }
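getpriority() reports nice levels shifted into the positive 1..40 range, and the three open-coded "20 - nice" conversions now go through nice_to_rlimit(). The helper's body is not part of this hunk; the following user-space sketch of the expected mapping assumes the usual MIN_NICE/MAX_NICE limits:

#include <assert.h>

#define MIN_NICE	(-20)
#define MAX_NICE	19

/* Sketch of nice_to_rlimit(): nice -20..19 maps onto rlimit-style 40..1,
 * giving the same result as the old open-coded "20 - nice".
 */
static long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;
}

int main(void)
{
	assert(nice_to_rlimit(MIN_NICE) == 40);	/* highest priority */
	assert(nice_to_rlimit(0) == 20);	/* default */
	assert(nice_to_rlimit(MAX_NICE) == 1);	/* lowest priority */
	return 0;
}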
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8edc87185427..a4bab46cd38e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -100,10 +100,10 @@ enum {
100 100
101 /* 101 /*
102 * Rescue workers are used only on emergencies and shared by 102 * Rescue workers are used only on emergencies and shared by
103 * all cpus. Give -20. 103 * all cpus. Give MIN_NICE.
104 */ 104 */
105 RESCUER_NICE_LEVEL = -20, 105 RESCUER_NICE_LEVEL = MIN_NICE,
106 HIGHPRI_NICE_LEVEL = -20, 106 HIGHPRI_NICE_LEVEL = MIN_NICE,
107 107
108 WQ_NAME_LEN = 24, 108 WQ_NAME_LEN = 24,
109}; 109};
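Here the raw -20 nice literals become the named limit, part of the tree-wide cleanup of open-coded nice levels. For orientation, the values those names are expected to carry (a reminder of the scheduler's nice range as defined in its headers, not part of this diff):

#define MAX_NICE	19
#define MIN_NICE	(-20)
#define NICE_WIDTH	(MAX_NICE - MIN_NICE + 1)	/* 40 distinct nice levels */

Rescuer and high-priority workers therefore keep their most-favourable nice level, spelled in a way that stays correct if the range ever changes.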