author	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-12 14:34:10 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-12 14:34:10 -0500
commit	702a7c7609bec3a940b6a46b0d6ab9ce45274580 (patch)
tree	6c169691449259410b9b51a146acb0e837dae96a /kernel
parent	053fe57ac249a9531c396175778160d9e9509399 (diff)
parent	b9889ed1ddeca5a3f3569c8de7354e9e97d803ae (diff)
Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (21 commits)
  sched: Remove forced2_migrations stats
  sched: Fix memory leak in two error corner cases
  sched: Fix build warning in get_update_sysctl_factor()
  sched: Update normalized values on user updates via proc
  sched: Make tunable scaling style configurable
  sched: Fix missing sched tunable recalculation on cpu add/remove
  sched: Fix task priority bug
  sched: cgroup: Implement different treatment for idle shares
  sched: Remove unnecessary RCU exclusion
  sched: Discard some old bits
  sched: Clean up check_preempt_wakeup()
  sched: Move update_curr() in check_preempt_wakeup() to avoid redundant call
  sched: Sanitize fork() handling
  sched: Clean up ttwu() rq locking
  sched: Remove rq->clock coupling from set_task_cpu()
  sched: Consolidate select_task_rq() callers
  sched: Remove sysctl.sched_features
  sched: Protect sched_rr_get_param() access to task->sched_class
  sched: Protect task->cpus_allowed access in sched_getaffinity()
  sched: Fix balance vs hotplug race
  ...

Fixed up conflicts in kernel/sysctl.c (due to sysctl cleanup)
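[Editorial note, not part of the commit] Of the changes above, the tunable-scaling work is the one with user-visible behaviour: instead of always pre-scaling the latency/granularity tunables by 1 + ilog2(ncpus), the scaling style becomes selectable. A small userspace C sketch of the factor computation that get_update_sysctl_factor() introduces further down in this diff; the enum values and the ilog2 helper here are local stand-ins, not the kernel's definitions:

#include <stdio.h>

/* Illustrative stand-in for the kernel's enum sched_tunable_scaling. */
enum sched_tunable_scaling {
	SCHED_TUNABLESCALING_NONE,	/* always *1 */
	SCHED_TUNABLESCALING_LOG,	/* *(1 + ilog2(ncpus)), the default */
	SCHED_TUNABLESCALING_LINEAR,	/* *ncpus */
	SCHED_TUNABLESCALING_END,
};

/* Integer log2, like the kernel's ilog2() for the values used here. */
static unsigned int ilog2_u(unsigned int v)
{
	unsigned int r = 0;
	while (v >>= 1)
		r++;
	return r;
}

/* Mirrors get_update_sysctl_factor(): the cpu count is capped at 8. */
static unsigned int scaling_factor(enum sched_tunable_scaling style,
				   unsigned int online_cpus)
{
	unsigned int cpus = online_cpus < 8 ? online_cpus : 8;

	switch (style) {
	case SCHED_TUNABLESCALING_NONE:
		return 1;
	case SCHED_TUNABLESCALING_LINEAR:
		return cpus;
	case SCHED_TUNABLESCALING_LOG:
	default:
		return 1 + ilog2_u(cpus);
	}
}

int main(void)
{
	/* e.g. 16 online CPUs: log -> 4 (cpus capped at 8), linear -> 8 */
	printf("log: %u, linear: %u, none: %u\n",
	       scaling_factor(SCHED_TUNABLESCALING_LOG, 16),
	       scaling_factor(SCHED_TUNABLESCALING_LINEAR, 16),
	       scaling_factor(SCHED_TUNABLESCALING_NONE, 16));
	return 0;
}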
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/cpu.c	18
-rw-r--r--	kernel/cpuset.c	18
-rw-r--r--	kernel/sched.c	218
-rw-r--r--	kernel/sched_debug.c	13
-rw-r--r--	kernel/sched_fair.c	155
-rw-r--r--	kernel/sched_features.h	5
-rw-r--r--	kernel/sched_idletask.c	2
-rw-r--r--	kernel/sched_rt.c	2
-rw-r--r--	kernel/sysctl.c	30
9 files changed, 256 insertions, 205 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7c4e2713df0..291ac586f37 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 					hcpu, -1, &nr_calls);
 	if (err == NOTIFY_BAD) {
+		set_cpu_active(cpu, true);
+
 		nr_calls--;
 		__raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	/* Ensure that we are not runnable on dying cpu */
 	cpumask_copy(old_allowed, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current,
-		     cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
+	set_cpus_allowed_ptr(current, cpu_active_mask);
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
+		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone. Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					    hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)
 
 	err = _cpu_down(cpu, 0);
 
-	if (cpu_online(cpu))
-		set_cpu_active(cpu, true);
-
 out:
 	cpu_maps_update_done();
 	stop_machine_destroy();
@@ -387,6 +386,15 @@ int disable_nonboot_cpus(void)
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
 	cpumask_clear(frozen_cpus);
+
+	for_each_online_cpu(cpu) {
+		if (cpu == first_cpu)
+			continue;
+		set_cpu_active(cpu, false);
+	}
+
+	synchronize_sched();
+
 	printk("Disabling non-boot CPUs ...\n");
 	for_each_online_cpu(cpu) {
 		if (cpu == first_cpu)
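[Editorial note, not part of the commit] A reading aid for the kernel/cpu.c hunks above: a CPU is now dropped from cpu_active_mask (followed by synchronize_sched()) before stop_machine tears it down, and it is re-activated on every failure path, so the scheduler stops picking an about-to-die CPU as a migration target while it is still online. A toy userspace sketch of that mask discipline, with plain bitmasks standing in for the kernel's cpumasks and the helper names chosen only to mirror the diff:

#include <stdio.h>

/* Plain bitmasks standing in for cpu_online_mask / cpu_active_mask. */
static unsigned long online_mask;
static unsigned long active_mask;

static void set_cpu_active(int cpu, int active)
{
	if (active)
		active_mask |= 1UL << cpu;
	else
		active_mask &= ~(1UL << cpu);
}

/* Affinity/migration targets are validated against the *active* mask,
 * as set_cpus_allowed_ptr() does after this commit. */
static int allowed_target(unsigned long requested)
{
	return (requested & active_mask) != 0;
}

int main(void)
{
	online_mask = active_mask = 0xfUL;	/* cpus 0-3 online and active */

	set_cpu_active(2, 0);			/* cpu2 going down: deactivate first */
	printf("cpu2 allowed as target? %d\n", allowed_target(1UL << 2)); /* 0 */

	set_cpu_active(2, 1);			/* take_cpu_down failed: re-activate */
	printf("cpu2 allowed as target? %d\n", allowed_target(1UL << 2)); /* 1 */
	return 0;
}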
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3cf2183b472..ba401fab459 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 }
 
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
 			struct sched_domain_attr **attributes)
 {
 	*domains = NULL;
@@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		if (retval < 0)
 			return retval;
 
-		if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
+		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
 			return -EINVAL;
 	}
 	retval = validate_change(cs, trialcs);
@@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 		}
 
 		/* Continue past cpusets with all cpus, mems online */
-		if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
+		if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
@@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
 		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-			    cpu_online_mask);
+			    cpu_active_mask);
 		nodes_and(cp->mems_allowed, cp->mems_allowed,
 			  node_states[N_HIGH_MEMORY]);
 		mutex_unlock(&callback_mutex);
@@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 	switch (phase) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
 		break;
 
 	default:
@@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
-	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	mutex_unlock(&callback_mutex);
 	scan_for_empty_cpusets(&top_cpuset);
 	ndoms = generate_sched_domains(&doms, &attr);
@@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 
 void __init cpuset_init_smp(void)
 {
-	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
 	hotcpu_notifier(cpuset_track_online_cpus, 0);
diff --git a/kernel/sched.c b/kernel/sched.c
index e7f2cfa6a25..ff39cadf621 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  * default: 0.25ms
  */
 unsigned int sysctl_sched_shares_ratelimit = 250000;
+unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
 
 /*
  * Inject some fuzzyness into changing the per-cpu group shares
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0, shares = 0;
+	unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
 	unsigned long *usd_rq_weight;
 	struct sched_domain *sd = data;
 	unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		weight = tg->cfs_rq[i]->load.weight;
 		usd_rq_weight[i] = weight;
 
+		rq_weight += weight;
 		/*
 		 * If there are currently no tasks on the cpu pretend there
 		 * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		if (!weight)
 			weight = NICE_0_LOAD;
 
-		rq_weight += weight;
+		sum_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
+	if (!rq_weight)
+		rq_weight = sum_weight;
+
 	if ((!shares && rq_weight) || shares > tg->shares)
 		shares = tg->shares;
 
@@ -1810,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #endif
 
 static void calc_load_account_active(struct rq *this_rq);
+static void update_sysctl(void);
+static int get_update_sysctl_factor(void);
+
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+	set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+	/*
+	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+	 * successfuly executed on another CPU. We must ensure that updates of
+	 * per-task data have been completed by this moment.
+	 */
+	smp_wmb();
+	task_thread_info(p)->cpu = cpu;
+#endif
+}
 
 #include "sched_stats.h"
 #include "sched_idletask.c"
@@ -1967,20 +1988,6 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-	set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
-	/*
-	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-	 * successfuly executed on another CPU. We must ensure that updates of
-	 * per-task data have been completed by this moment.
-	 */
-	smp_wmb();
-	task_thread_info(p)->cpu = cpu;
-#endif
-}
-
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
 				       int oldprio, int running)
@@ -2060,29 +2067,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 	int old_cpu = task_cpu(p);
-	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
 	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
 		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
-	u64 clock_offset;
-
-	clock_offset = old_rq->clock - new_rq->clock;
 
 	trace_sched_migrate_task(p, new_cpu);
 
-#ifdef CONFIG_SCHEDSTATS
-	if (p->se.wait_start)
-		p->se.wait_start -= clock_offset;
-	if (p->se.sleep_start)
-		p->se.sleep_start -= clock_offset;
-	if (p->se.block_start)
-		p->se.block_start -= clock_offset;
-#endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
-#ifdef CONFIG_SCHEDSTATS
-		if (task_hot(p, old_rq->clock, NULL))
-			schedstat_inc(p, se.nr_forced2_migrations);
-#endif
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
 				     1, 1, NULL, 0);
 	}
@@ -2323,6 +2314,14 @@ void task_oncpu_function_call(struct task_struct *p,
 	preempt_enable();
 }
 
+#ifdef CONFIG_SMP
+static inline
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+}
+#endif
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2374,17 +2373,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible--;
 	p->state = TASK_WAKING;
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
 
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu) {
-		local_irq_save(flags);
-		rq = cpu_rq(cpu);
-		update_rq_clock(rq);
+	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+	if (cpu != orig_cpu)
 		set_task_cpu(p, cpu);
-		local_irq_restore(flags);
-	}
-	rq = task_rq_lock(p, &flags);
+
+	rq = __task_rq_lock(p);
+	update_rq_clock(rq);
 
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
@@ -2499,7 +2495,6 @@ static void __sched_fork(struct task_struct *p)
 	p->se.avg_overlap = 0;
 	p->se.start_runtime = 0;
 	p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
-	p->se.avg_running = 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start = 0;
@@ -2521,7 +2516,6 @@ static void __sched_fork(struct task_struct *p)
 	p->se.nr_failed_migrations_running = 0;
 	p->se.nr_failed_migrations_hot = 0;
 	p->se.nr_forced_migrations = 0;
-	p->se.nr_forced2_migrations = 0;
 
 	p->se.nr_wakeups = 0;
 	p->se.nr_wakeups_sync = 0;
@@ -2558,7 +2552,6 @@ static void __sched_fork(struct task_struct *p)
 void sched_fork(struct task_struct *p, int clone_flags)
 {
 	int cpu = get_cpu();
-	unsigned long flags;
 
 	__sched_fork(p);
 
@@ -2592,13 +2585,13 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
+	if (p->sched_class->task_fork)
+		p->sched_class->task_fork(p);
+
 #ifdef CONFIG_SMP
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
 #endif
-	local_irq_save(flags);
-	update_rq_clock(cpu_rq(cpu));
 	set_task_cpu(p, cpu);
-	local_irq_restore(flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
@@ -2631,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
 	update_rq_clock(rq);
-
-	if (!p->sched_class->task_new || !current->se.on_rq) {
-		activate_task(rq, p, 0);
-	} else {
-		/*
-		 * Let the scheduling class do new task startup
-		 * management (if any):
-		 */
-		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
-	}
+	activate_task(rq, p, 0);
 	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -3156,7 +3139,7 @@ out:
 void sched_exec(void)
 {
 	int new_cpu, this_cpu = get_cpu();
-	new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
+	new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
 	put_cpu();
 	if (new_cpu != this_cpu)
 		sched_migrate_task(current, new_cpu);
@@ -3172,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	deactivate_task(src_rq, p, 0);
 	set_task_cpu(p, this_cpu);
 	activate_task(this_rq, p, 0);
-	/*
-	 * Note that idle threads have a prio of MAX_PRIO, for this test
-	 * to be always true for them.
-	 */
 	check_preempt_curr(this_rq, p, 0);
 }
 
@@ -4134,7 +4113,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_copy(cpus, cpu_online_mask);
+	cpumask_copy(cpus, cpu_active_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4297,7 +4276,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int all_pinned = 0;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_copy(cpus, cpu_online_mask);
+	cpumask_copy(cpus, cpu_active_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4694,7 +4673,7 @@ int select_nohz_load_balancer(int stop_tick)
 		cpumask_set_cpu(cpu, nohz.cpu_mask);
 
 		/* time for ilb owner also to sleep */
-		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
+		if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
 			if (atomic_read(&nohz.load_balancer) == cpu)
 				atomic_set(&nohz.load_balancer, -1);
 			return 0;
@@ -5396,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *p)
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-	u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
+	if (prev->state == TASK_RUNNING) {
+		u64 runtime = prev->se.sum_exec_runtime;
 
-	update_avg(&p->se.avg_running, runtime);
+		runtime -= prev->se.prev_sum_exec_runtime;
+		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
 
-	if (p->state == TASK_RUNNING) {
 		/*
 		 * In order to avoid avg_overlap growing stale when we are
 		 * indeed overlapping and hence not getting put to sleep, grow
@@ -5412,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
 		 * correlates to the amount of cache footprint a task can
 		 * build up.
 		 */
-		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-		update_avg(&p->se.avg_overlap, runtime);
-	} else {
-		update_avg(&p->se.avg_running, 0);
+		update_avg(&prev->se.avg_overlap, runtime);
 	}
-	p->sched_class->put_prev_task(rq, p);
+	prev->sched_class->put_prev_task(rq, prev);
 }
 
 /*
@@ -6631,6 +6608,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *p;
+	unsigned long flags;
+	struct rq *rq;
 	int retval;
 
 	get_online_cpus();
@@ -6645,7 +6624,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 	if (retval)
 		goto out_unlock;
 
+	rq = task_rq_lock(p, &flags);
 	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+	task_rq_unlock(rq, &flags);
 
 out_unlock:
 	read_unlock(&tasklist_lock);
@@ -6883,6 +6864,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 {
 	struct task_struct *p;
 	unsigned int time_slice;
+	unsigned long flags;
+	struct rq *rq;
 	int retval;
 	struct timespec t;
 
@@ -6899,7 +6882,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 	if (retval)
 		goto out_unlock;
 
-	time_slice = p->sched_class->get_rr_interval(p);
+	rq = task_rq_lock(p, &flags);
+	time_slice = p->sched_class->get_rr_interval(rq, p);
+	task_rq_unlock(rq, &flags);
 
 	read_unlock(&tasklist_lock);
 	jiffies_to_timespec(time_slice, &t);
@@ -7000,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	__sched_fork(idle);
 	idle->se.exec_start = sched_clock();
 
-	idle->prio = idle->normal_prio = MAX_PRIO;
 	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
 	__set_task_cpu(idle, cpu);
 
@@ -7041,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask;
  *
  * This idea comes from the SD scheduler of Con Kolivas:
  */
-static inline void sched_init_granularity(void)
+static int get_update_sysctl_factor(void)
 {
-	unsigned int factor = 1 + ilog2(num_online_cpus());
-	const unsigned long limit = 200000000;
+	unsigned int cpus = min_t(int, num_online_cpus(), 8);
+	unsigned int factor;
+
+	switch (sysctl_sched_tunable_scaling) {
+	case SCHED_TUNABLESCALING_NONE:
+		factor = 1;
+		break;
+	case SCHED_TUNABLESCALING_LINEAR:
+		factor = cpus;
+		break;
+	case SCHED_TUNABLESCALING_LOG:
+	default:
+		factor = 1 + ilog2(cpus);
+		break;
+	}
 
-	sysctl_sched_min_granularity *= factor;
-	if (sysctl_sched_min_granularity > limit)
-		sysctl_sched_min_granularity = limit;
+	return factor;
+}
 
-	sysctl_sched_latency *= factor;
-	if (sysctl_sched_latency > limit)
-		sysctl_sched_latency = limit;
+static void update_sysctl(void)
+{
+	unsigned int factor = get_update_sysctl_factor();
 
-	sysctl_sched_wakeup_granularity *= factor;
+#define SET_SYSCTL(name) \
+	(sysctl_##name = (factor) * normalized_sysctl_##name)
+	SET_SYSCTL(sched_min_granularity);
+	SET_SYSCTL(sched_latency);
+	SET_SYSCTL(sched_wakeup_granularity);
+	SET_SYSCTL(sched_shares_ratelimit);
+#undef SET_SYSCTL
+}
 
-	sysctl_sched_shares_ratelimit *= factor;
+static inline void sched_init_granularity(void)
+{
+	update_sysctl();
 }
 
 #ifdef CONFIG_SMP
@@ -7093,7 +7098,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 	int ret = 0;
 
 	rq = task_rq_lock(p, &flags);
-	if (!cpumask_intersects(new_mask, cpu_online_mask)) {
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -7115,7 +7120,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
-	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
+	if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
 		/* Need help from migration thread: drop lock and wait. */
 		struct task_struct *mt = rq->migration_thread;
 
@@ -7269,19 +7274,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 
 again:
 	/* Look for allowed, online CPU in same node. */
-	for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
 		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 			goto move;
 
 	/* Any allowed, online CPU? */
-	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
 	if (dest_cpu < nr_cpu_ids)
 		goto move;
 
 	/* No more Mr. Nice Guy. */
 	if (dest_cpu >= nr_cpu_ids) {
 		cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-		dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
+		dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
 
 		/*
 		 * Don't tell them about moving exiting tasks or
@@ -7310,7 +7315,7 @@ move:
  */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
+	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -7563,7 +7568,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 static struct ctl_table_header *sd_sysctl_header;
 static void register_sched_domain_sysctl(void)
 {
-	int i, cpu_num = num_online_cpus();
+	int i, cpu_num = num_possible_cpus();
 	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
 	char buf[32];
 
@@ -7573,7 +7578,7 @@ static void register_sched_domain_sysctl(void)
 	if (entry == NULL)
 		return;
 
-	for_each_online_cpu(i) {
+	for_each_possible_cpu(i) {
 		snprintf(buf, 32, "cpu%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
 		entry->mode = 0555;
@@ -7703,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_lock_irq(&rq->lock);
 		update_rq_clock(rq);
 		deactivate_task(rq, rq->idle, 0);
-		rq->idle->static_prio = MAX_PRIO;
 		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
 		rq->idle->sched_class = &idle_sched_class;
 		migrate_dead_tasks(cpu);
@@ -9099,7 +9103,7 @@ match1:
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
 		doms_new = &fallback_doms;
-		cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
+		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 
@@ -9230,8 +9234,10 @@ static int update_sched_domains(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
 		partition_sched_domains(1, NULL, NULL);
 		return NOTIFY_OK;
 
@@ -9278,7 +9284,7 @@ void __init sched_init_smp(void)
 #endif
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
-	arch_init_sched_domains(cpu_online_mask);
+	arch_init_sched_domains(cpu_active_mask);
 	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
 	if (cpumask_empty(non_isolated_cpus))
 		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9842,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		se = kzalloc_node(sizeof(struct sched_entity),
 				  GFP_KERNEL, cpu_to_node(i));
 		if (!se)
-			goto err;
+			goto err_free_rq;
 
 		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
 	}
 
 	return 1;
 
+ err_free_rq:
+	kfree(cfs_rq);
  err:
 	return 0;
 }
@@ -9930,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_se)
-			goto err;
+			goto err_free_rq;
 
 		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
 	}
 
 	return 1;
 
+ err_free_rq:
+	kfree(rt_rq);
  err:
 	return 0;
 }
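[Editorial note, not part of the commit] The sched_getaffinity() and sched_rr_get_interval() hunks above only change kernel-internal locking — the affinity mask and the task's scheduling class are now read under the task's runqueue lock — so userspace callers see no interface change. For reference, a minimal glibc caller of the affected affinity syscall (pid 0 means the calling process):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	/* The kernel-side read of p->cpus_allowed behind this call is what
	 * the diff above now performs under the runqueue lock. */
	if (sched_getaffinity(0, sizeof(set), &set) != 0) {
		perror("sched_getaffinity");
		return 1;
	}

	for (int cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &set))
			printf("allowed cpu %d\n", cpu);
	return 0;
}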
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6988cf08f70..5ae24fc65d7 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
 	print_rq(m, rq, cpu);
 }
 
+static const char *sched_tunable_scaling_names[] = {
+	"none",
+	"logaritmic",
+	"linear"
+};
+
 static int sched_debug_show(struct seq_file *m, void *v)
 {
 	u64 now = ktime_to_ns(ktime_get());
@@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
 
+	SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+		sysctl_sched_tunable_scaling,
+		sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+
 	for_each_online_cpu(cpu)
 		print_cpu(m, cpu);
 
@@ -399,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.sum_exec_runtime);
 	PN(se.avg_overlap);
 	PN(se.avg_wakeup);
-	PN(se.avg_running);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
@@ -423,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	P(se.nr_failed_migrations_running);
 	P(se.nr_failed_migrations_hot);
 	P(se.nr_forced_migrations);
-	P(se.nr_forced2_migrations);
 	P(se.nr_wakeups);
 	P(se.nr_wakeups_sync);
 	P(se.nr_wakeups_migrate);
@@ -499,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
 	p->se.nr_failed_migrations_running = 0;
 	p->se.nr_failed_migrations_hot = 0;
 	p->se.nr_forced_migrations = 0;
-	p->se.nr_forced2_migrations = 0;
 	p->se.nr_wakeups = 0;
 	p->se.nr_wakeups_sync = 0;
 	p->se.nr_wakeups_migrate = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f61837ad336..804a411838f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/latencytop.h>
+#include <linux/sched.h>
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
  * run vmstat and monitor the context-switches (cs) field)
  */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+	= SCHED_TUNABLESCALING_LOG;
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
  * have immediate wakeup/sleep latencies.
  */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  */
 
 #ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	int factor = get_update_sysctl_factor();
 
 	if (ret || !write)
 		return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 					sysctl_sched_min_granularity);
 
+#define WRT_SYSCTL(name) \
+	(normalized_sysctl_##name = sysctl_##name / (factor))
+	WRT_SYSCTL(sched_min_granularity);
+	WRT_SYSCTL(sched_latency);
+	WRT_SYSCTL(sched_wakeup_granularity);
+	WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
+
 	return 0;
 }
 #endif
@@ -1403,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		new_cpu = prev_cpu;
 	}
 
-	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If power savings logic is enabled for a domain, see if we
@@ -1484,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			update_shares(tmp);
 	}
 
-	if (affine_sd && wake_affine(affine_sd, p, sync)) {
-		new_cpu = cpu;
-		goto out;
-	}
+	if (affine_sd && wake_affine(affine_sd, p, sync))
+		return cpu;
 
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
@@ -1528,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		/* while loop will break here if sd == NULL */
 	}
 
-out:
-	rcu_read_unlock();
 	return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1651,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	int sync = wake_flags & WF_SYNC;
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
 
-	update_curr(cfs_rq);
-
-	if (unlikely(rt_prio(p->prio))) {
-		resched_task(curr);
-		return;
-	}
+	if (unlikely(rt_prio(p->prio)))
+		goto preempt;
 
 	if (unlikely(p->sched_class != &fair_sched_class))
 		return;
@@ -1682,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/* Idle tasks are by definition preempted by everybody. */
-	if (unlikely(curr->policy == SCHED_IDLE)) {
-		resched_task(curr);
-		return;
-	}
+	if (unlikely(curr->policy == SCHED_IDLE))
+		goto preempt;
 
-	if ((sched_feat(WAKEUP_SYNC) && sync) ||
-	    (sched_feat(WAKEUP_OVERLAP) &&
-	     (se->avg_overlap < sysctl_sched_migration_cost &&
-	      pse->avg_overlap < sysctl_sched_migration_cost))) {
-		resched_task(curr);
-		return;
-	}
+	if (sched_feat(WAKEUP_SYNC) && sync)
+		goto preempt;
 
-	if (sched_feat(WAKEUP_RUNNING)) {
-		if (pse->avg_running < se->avg_running) {
-			set_next_buddy(pse);
-			resched_task(curr);
-			return;
-		}
-	}
+	if (sched_feat(WAKEUP_OVERLAP) &&
+			se->avg_overlap < sysctl_sched_migration_cost &&
+			pse->avg_overlap < sysctl_sched_migration_cost)
+		goto preempt;
 
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
+	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
-
 	BUG_ON(!pse);
+	if (wakeup_preempt_entity(se, pse) == 1)
+		goto preempt;
 
-	if (wakeup_preempt_entity(se, pse) == 1) {
-		resched_task(curr);
-		/*
-		 * Only set the backward buddy when the current task is still
-		 * on the rq. This can happen when a wakeup gets interleaved
-		 * with schedule on the ->pre_schedule() or idle_balance()
-		 * point, either of which can * drop the rq lock.
-		 *
-		 * Also, during early boot the idle thread is in the fair class,
-		 * for obvious reasons its a bad idea to schedule back to it.
-		 */
-		if (unlikely(!se->on_rq || curr == rq->idle))
-			return;
-		if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
-			set_last_buddy(se);
-	}
+	return;
+
+preempt:
+	resched_task(curr);
+	/*
+	 * Only set the backward buddy when the current task is still
+	 * on the rq. This can happen when a wakeup gets interleaved
+	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can * drop the rq lock.
+	 *
+	 * Also, during early boot the idle thread is in the fair class,
+	 * for obvious reasons its a bad idea to schedule back to it.
+	 */
+	if (unlikely(!se->on_rq || curr == rq->idle))
+		return;
+
+	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+		set_last_buddy(se);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1905,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
+
+static void rq_online_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -1922,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 
 /*
- * Share the fairness runtime between parent and child, thus the
- * total amount of pressure for CPU stays equal - new tasks
- * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * called on fork with the child task as argument from the parent's context
+ *  - child not yet on the tasklist
+ *  - preemption disabled
  */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_fork_fair(struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq = task_cfs_rq(current);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 	int this_cpu = smp_processor_id();
+	struct rq *rq = this_rq();
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
 
-	sched_info_queued(p);
+	if (unlikely(task_cpu(p) != this_cpu))
+		__set_task_cpu(p, this_cpu);
 
 	update_curr(cfs_rq);
+
 	if (curr)
 		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 
-	/* 'curr' will be NULL if the child belongs to a different group */
-	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-			curr && entity_before(curr, se)) {
+	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1952,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		resched_task(rq->curr);
 	}
 
-	enqueue_task_fair(rq, p, 0);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 /*
@@ -2014,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
 }
 #endif
 
-unsigned int get_rr_interval_fair(struct task_struct *task)
+unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
 	struct sched_entity *se = &task->se;
-	unsigned long flags;
-	struct rq *rq;
 	unsigned int rr_interval = 0;
 
 	/*
 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
 	 * idle runqueue:
 	 */
-	rq = task_rq_lock(task, &flags);
 	if (rq->cfs.load.weight)
 		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-	task_rq_unlock(rq, &flags);
 
 	return rr_interval;
 }
@@ -2052,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
 
 	.load_balance = load_balance_fair,
 	.move_one_task = move_one_task_fair,
+	.rq_online = rq_online_fair,
+	.rq_offline = rq_offline_fair,
 #endif
 
 	.set_curr_task = set_curr_task_fair,
 	.task_tick = task_tick_fair,
-	.task_new = task_new_fair,
+	.task_fork = task_fork_fair,
 
 	.prio_changed = prio_changed_fair,
 	.switched_to = switched_to_fair,
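[Editorial note, not part of the commit] A reading aid for the normalized-tunables machinery added in kernel/sched.c and kernel/sched_fair.c above: update_sysctl() derives each effective tunable as factor * normalized value (SET_SYSCTL), while sched_proc_update_handler() stores user writes back as value / factor (WRT_SYSCTL), so a value set via /proc survives later factor changes on CPU add/remove. A small userspace C sketch of that round trip; the variable names merely mirror the diff, the code itself is illustrative, not kernel code:

#include <stdio.h>

/* Effective and normalized values, as for sysctl_sched_latency. */
static unsigned int sysctl_sched_latency;
static unsigned int normalized_sysctl_sched_latency = 5000000U; /* 5 ms */

/* update_sysctl(): effective = factor * normalized (SET_SYSCTL). */
static void update_sysctl(unsigned int factor)
{
	sysctl_sched_latency = factor * normalized_sysctl_sched_latency;
}

/* proc write path: normalized = effective / factor (WRT_SYSCTL), so the
 * user-visible value is re-derived correctly when the factor changes. */
static void user_writes_latency(unsigned int value, unsigned int factor)
{
	sysctl_sched_latency = value;
	normalized_sysctl_sched_latency = value / factor;
}

int main(void)
{
	update_sysctl(3);		/* e.g. 4 cpus, log scaling: factor 3 */
	printf("effective: %u\n", sysctl_sched_latency);	/* 15000000 */

	user_writes_latency(24000000U, 3);	/* user sets 24 ms via /proc */
	update_sysctl(4);			/* a cpu comes online: factor 4 */
	printf("effective: %u\n", sysctl_sched_latency);	/* 32000000 */
	return 0;
}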
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c..d5059fd761d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 
 /*
- * Wakeup preemption towards tasks that run short
- */
-SCHED_FEAT(WAKEUP_RUNNING, 0)
-
-/*
  * Use the SYNC wakeup hint, pipes and the likes use this to indicate
  * the remote end is likely to consume the data we just wrote, and
  * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde..33d5384a73a 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
 	check_preempt_curr(rq, p, 0);
 }
 
-unsigned int get_rr_interval_idle(struct task_struct *task)
+unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
 {
 	return 0;
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5c5fef37841..aecbd9c6b20 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1721,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
 	dequeue_pushable_task(rq, p);
 }
 
-unsigned int get_rr_interval_rt(struct task_struct *task)
+unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 {
 	/*
 	 * Time slice is 0 for SCHED_FIFO tasks
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9327a26765c..554ac4894f0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -244,6 +244,10 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+static int min_sched_shares_ratelimit = 100000; /* 100 usec */
+static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
 #endif
 
 static struct ctl_table kern_table[] = {
@@ -260,7 +264,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_min_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= sched_nr_latency_handler,
+		.proc_handler	= sched_proc_update_handler,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
 	},
@@ -269,7 +273,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_latency,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= sched_nr_latency_handler,
+		.proc_handler	= sched_proc_update_handler,
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
 	},
@@ -278,7 +282,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_wakeup_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= sched_proc_update_handler,
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
@@ -287,7 +291,18 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_shares_ratelimit,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= sched_proc_update_handler,
+		.extra1		= &min_sched_shares_ratelimit,
+		.extra2		= &max_sched_shares_ratelimit,
+	},
+	{
+		.procname	= "sched_tunable_scaling",
+		.data		= &sysctl_sched_tunable_scaling,
+		.maxlen		= sizeof(enum sched_tunable_scaling),
+		.mode		= 0644,
+		.proc_handler	= sched_proc_update_handler,
+		.extra1		= &min_sched_tunable_scaling,
+		.extra2		= &max_sched_tunable_scaling,
 	},
 	{
 		.procname	= "sched_shares_thresh",
@@ -298,13 +313,6 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 	},
 	{
-		.procname	= "sched_features",
-		.data		= &sysctl_sched_features,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
 		.procname	= "sched_migration_cost",
 		.data		= &sysctl_sched_migration_cost,
 		.maxlen		= sizeof(unsigned int),
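[Editorial note, not part of the commit] On a kernel carrying this merge, the new knob appears as /proc/sys/kernel/sched_tunable_scaling, with 0 = none, 1 = log (the default), 2 = linear, matching the enum order in kernel/sched_fair.c above. A hedged userspace sketch for reading and (as root) changing it; the path only exists once these patches are applied:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_tunable_scaling";
	FILE *f;
	int style;

	f = fopen(path, "r");
	if (!f) {
		perror(path);	/* kernel without this patch, or no procfs */
		return 1;
	}
	if (fscanf(f, "%d", &style) == 1)
		printf("current scaling style: %d\n", style);
	fclose(f);

	/* Switching to "none" (0) needs root; the accepted range is enforced
	 * by the extra1/extra2 bounds added in the sysctl.c hunks above. */
	f = fopen(path, "w");
	if (f) {
		fprintf(f, "0\n");
		fclose(f);
	}
	return 0;
}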