-rw-r--r--  arch/x86/include/asm/topology.h  |   47
-rw-r--r--  fs/dcache.c                      |    1
-rw-r--r--  fs/locks.c                       |    2
-rw-r--r--  include/linux/hardirq.h          |    6
-rw-r--r--  include/linux/kernel.h           |    5
-rw-r--r--  include/linux/sched.h            |   94
-rw-r--r--  include/linux/topology.h         |  168
-rw-r--r--  include/trace/events/sched.h     |   95
-rw-r--r--  init/main.c                      |    2
-rw-r--r--  kernel/kthread.c                 |    4
-rw-r--r--  kernel/sched.c                   | 1099
-rw-r--r--  kernel/sched_cpupri.c            |   30
-rw-r--r--  kernel/sched_debug.c             |    4
-rw-r--r--  kernel/sched_fair.c              |   84
-rw-r--r--  kernel/sched_features.h          |    2
-rw-r--r--  kernel/sched_rt.c                |   62
-rw-r--r--  kernel/sysctl.c                  |   24
-rw-r--r--  kernel/workqueue.c               |    2
18 files changed, 1117 insertions, 614 deletions
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 066ef590d7e0..26d06e052a18 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[];
129#endif 129#endif
130 130
131/* sched_domains SD_NODE_INIT for NUMA machines */ 131/* sched_domains SD_NODE_INIT for NUMA machines */
132#define SD_NODE_INIT (struct sched_domain) { \ 132#define SD_NODE_INIT (struct sched_domain) { \
133 .min_interval = 8, \ 133 .min_interval = 8, \
134 .max_interval = 32, \ 134 .max_interval = 32, \
135 .busy_factor = 32, \ 135 .busy_factor = 32, \
136 .imbalance_pct = 125, \ 136 .imbalance_pct = 125, \
137 .cache_nice_tries = SD_CACHE_NICE_TRIES, \ 137 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
138 .busy_idx = 3, \ 138 .busy_idx = 3, \
139 .idle_idx = SD_IDLE_IDX, \ 139 .idle_idx = SD_IDLE_IDX, \
140 .newidle_idx = SD_NEWIDLE_IDX, \ 140 .newidle_idx = SD_NEWIDLE_IDX, \
141 .wake_idx = 1, \ 141 .wake_idx = 1, \
142 .forkexec_idx = SD_FORKEXEC_IDX, \ 142 .forkexec_idx = SD_FORKEXEC_IDX, \
143 .flags = SD_LOAD_BALANCE \ 143 \
144 | SD_BALANCE_EXEC \ 144 .flags = 1*SD_LOAD_BALANCE \
145 | SD_BALANCE_FORK \ 145 | 1*SD_BALANCE_NEWIDLE \
146 | SD_WAKE_AFFINE \ 146 | 1*SD_BALANCE_EXEC \
147 | SD_WAKE_BALANCE \ 147 | 1*SD_BALANCE_FORK \
148 | SD_SERIALIZE, \ 148 | 0*SD_WAKE_IDLE \
149 .last_balance = jiffies, \ 149 | 1*SD_WAKE_AFFINE \
150 .balance_interval = 1, \ 150 | 1*SD_WAKE_BALANCE \
151 | 0*SD_SHARE_CPUPOWER \
152 | 0*SD_POWERSAVINGS_BALANCE \
153 | 0*SD_SHARE_PKG_RESOURCES \
154 | 1*SD_SERIALIZE \
155 | 1*SD_WAKE_IDLE_FAR \
156 | 0*SD_PREFER_SIBLING \
157 , \
158 .last_balance = jiffies, \
159 .balance_interval = 1, \
151} 160}
152 161
153#ifdef CONFIG_X86_64_ACPI_NUMA 162#ifdef CONFIG_X86_64_ACPI_NUMA
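For illustration, the initializer above switches from naming only the flags that are set to spelling out every SD_* flag with a 0 or 1 multiplier, so the whole menu of flags stays visible while the disabled ones contribute nothing to the mask. A minimal standalone sketch (not part of the patch; the flag values are copied from the include/linux/sched.h hunk further down) showing that the two styles fold to the same constant:

#include <assert.h>

#define SD_LOAD_BALANCE         0x0001
#define SD_BALANCE_EXEC         0x0004
#define SD_WAKE_IDLE            0x0010
#define SD_WAKE_AFFINE          0x0020

int main(void)
{
        /* old style: name only the flags that are set */
        int old_style = SD_LOAD_BALANCE | SD_BALANCE_EXEC | SD_WAKE_AFFINE;

        /* new style: every flag appears, multiplied by 0 or 1 */
        int new_style = 1*SD_LOAD_BALANCE
                      | 1*SD_BALANCE_EXEC
                      | 0*SD_WAKE_IDLE          /* documented, but disabled */
                      | 1*SD_WAKE_AFFINE;

        assert(old_style == new_style);         /* both evaluate to 0x25 */
        return 0;
}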
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
35#include <linux/hardirq.h>
35#include "internal.h" 36#include "internal.h"
36 37
37int sysctl_vfs_cache_pressure __read_mostly = 100; 38int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/locks.c b/fs/locks.c
index 52366e877d76..19ee18a6829b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
768 * give it the opportunity to lock the file. 768 * give it the opportunity to lock the file.
769 */ 769 */
770 if (found) 770 if (found)
771 cond_resched_bkl(); 771 cond_resched();
772 772
773find_conflict: 773find_conflict:
774 for_each_lock(inode, before) { 774 for_each_lock(inode, before) {
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 330cb31bb496..6d527ee82b2b 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,12 @@
64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) 64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
65#define NMI_OFFSET (1UL << NMI_SHIFT) 65#define NMI_OFFSET (1UL << NMI_SHIFT)
66 66
67#ifndef PREEMPT_ACTIVE
68#define PREEMPT_ACTIVE_BITS 1
69#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
70#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
71#endif
72
67#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) 73#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
68#error PREEMPT_ACTIVE is too low! 74#error PREEMPT_ACTIVE is too low!
69#endif 75#endif
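Worked out with the generic preempt_count widths of the time (8 preempt bits, 8 softirq bits, 10 hardirq bits, 1 NMI bit; these widths are an assumption here, and architectures may still supply their own PREEMPT_ACTIVE), the new fallback lands just above the NMI field:

        NMI_SHIFT            = 8 + 8 + 10              = 26
        PREEMPT_ACTIVE_SHIFT = NMI_SHIFT + NMI_BITS    = 27
        PREEMPT_ACTIVE       = ((1UL << 1) - 1) << 27  = 0x08000000

which equals (1 << (NMI_SHIFT + NMI_BITS)), so the existing "#error PREEMPT_ACTIVE is too low!" check is satisfied.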
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6320a3e8def..2b5b1e0899a8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -125,7 +125,7 @@ extern int _cond_resched(void);
125#endif 125#endif
126 126
127#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 127#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
128 void __might_sleep(char *file, int line); 128 void __might_sleep(char *file, int line, int preempt_offset);
129/** 129/**
130 * might_sleep - annotation for functions that can sleep 130 * might_sleep - annotation for functions that can sleep
131 * 131 *
@@ -137,8 +137,9 @@ extern int _cond_resched(void);
137 * supposed to. 137 * supposed to.
138 */ 138 */
139# define might_sleep() \ 139# define might_sleep() \
140 do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) 140 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
141#else 141#else
142 static inline void __might_sleep(char *file, int line, int preempt_offset) { }
142# define might_sleep() do { might_resched(); } while (0) 143# define might_sleep() do { might_resched(); } while (0)
143#endif 144#endif
144 145
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 379531c08975..f3d74bd04d18 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -38,6 +38,8 @@
38#define SCHED_BATCH 3 38#define SCHED_BATCH 3
39/* SCHED_ISO: reserved but not implemented yet */ 39/* SCHED_ISO: reserved but not implemented yet */
40#define SCHED_IDLE 5 40#define SCHED_IDLE 5
41/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
42#define SCHED_RESET_ON_FORK 0x40000000
41 43
42#ifdef __KERNEL__ 44#ifdef __KERNEL__
43 45
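A short userspace sketch of how the new flag is meant to be used (illustrative only; the fallback define mirrors the value above, and the kernel-side handling is in the __sched_setscheduler() hunk of kernel/sched.c further down). The flag is ORed into the policy argument, and children later forked by the task start out with default policy and priority:

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK             /* not yet in older libc headers */
#define SCHED_RESET_ON_FORK     0x40000000
#endif

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /* make this task SCHED_FIFO, but have its children revert to SCHED_NORMAL */
        if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
                perror("sched_setscheduler");

        return 0;
}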
@@ -796,18 +798,19 @@ enum cpu_idle_type {
796#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE 798#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
797 799
798#ifdef CONFIG_SMP 800#ifdef CONFIG_SMP
799#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ 801#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
800#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ 802#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
801#define SD_BALANCE_EXEC 4 /* Balance on exec */ 803#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
802#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ 804#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
803#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ 805#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */
804#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ 806#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
805#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ 807#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */
806#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ 808#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
807#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ 809#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
808#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ 810#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
809#define SD_SERIALIZE 1024 /* Only a single load balancing instance */ 811#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
810#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ 812#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */
813#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
811 814
812enum powersavings_balance_level { 815enum powersavings_balance_level {
813 POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ 816 POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
@@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void)
827 if (sched_smt_power_savings) 830 if (sched_smt_power_savings)
828 return SD_POWERSAVINGS_BALANCE; 831 return SD_POWERSAVINGS_BALANCE;
829 832
830 return 0; 833 return SD_PREFER_SIBLING;
831} 834}
832 835
833static inline int sd_balance_for_package_power(void) 836static inline int sd_balance_for_package_power(void)
@@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void)
835 if (sched_mc_power_savings | sched_smt_power_savings) 838 if (sched_mc_power_savings | sched_smt_power_savings)
836 return SD_POWERSAVINGS_BALANCE; 839 return SD_POWERSAVINGS_BALANCE;
837 840
838 return 0; 841 return SD_PREFER_SIBLING;
839} 842}
840 843
841/* 844/*
@@ -857,15 +860,9 @@ struct sched_group {
857 860
858 /* 861 /*
859 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 862 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
860 * single CPU. This is read only (except for setup, hotplug CPU). 863 * single CPU.
861 * Note : Never change cpu_power without recompute its reciprocal
862 */
863 unsigned int __cpu_power;
864 /*
865 * reciprocal value of cpu_power to avoid expensive divides
866 * (see include/linux/reciprocal_div.h)
867 */ 864 */
868 u32 reciprocal_cpu_power; 865 unsigned int cpu_power;
869 866
870 /* 867 /*
871 * The CPUs this group covers. 868 * The CPUs this group covers.
@@ -918,6 +915,7 @@ struct sched_domain {
918 unsigned int newidle_idx; 915 unsigned int newidle_idx;
919 unsigned int wake_idx; 916 unsigned int wake_idx;
920 unsigned int forkexec_idx; 917 unsigned int forkexec_idx;
918 unsigned int smt_gain;
921 int flags; /* See SD_* */ 919 int flags; /* See SD_* */
922 enum sched_domain_level level; 920 enum sched_domain_level level;
923 921
@@ -1045,7 +1043,6 @@ struct sched_class {
1045 struct rq *busiest, struct sched_domain *sd, 1043 struct rq *busiest, struct sched_domain *sd,
1046 enum cpu_idle_type idle); 1044 enum cpu_idle_type idle);
1047 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1045 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1048 int (*needs_post_schedule) (struct rq *this_rq);
1049 void (*post_schedule) (struct rq *this_rq); 1046 void (*post_schedule) (struct rq *this_rq);
1050 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); 1047 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
1051 1048
@@ -1110,6 +1107,8 @@ struct sched_entity {
1110 u64 wait_max; 1107 u64 wait_max;
1111 u64 wait_count; 1108 u64 wait_count;
1112 u64 wait_sum; 1109 u64 wait_sum;
1110 u64 iowait_count;
1111 u64 iowait_sum;
1113 1112
1114 u64 sleep_start; 1113 u64 sleep_start;
1115 u64 sleep_max; 1114 u64 sleep_max;
@@ -1234,11 +1233,19 @@ struct task_struct {
1234 unsigned did_exec:1; 1233 unsigned did_exec:1;
1235 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1234 unsigned in_execve:1; /* Tell the LSMs that the process is doing an
1236 * execve */ 1235 * execve */
1236 unsigned in_iowait:1;
1237
1238
1239 /* Revert to default priority/policy when forking */
1240 unsigned sched_reset_on_fork:1;
1241
1237 pid_t pid; 1242 pid_t pid;
1238 pid_t tgid; 1243 pid_t tgid;
1239 1244
1245#ifdef CONFIG_CC_STACKPROTECTOR
1240 /* Canary value for the -fstack-protector gcc feature */ 1246 /* Canary value for the -fstack-protector gcc feature */
1241 unsigned long stack_canary; 1247 unsigned long stack_canary;
1248#endif
1242 1249
1243 /* 1250 /*
1244 * pointers to (original) parent process, youngest child, younger sibling, 1251 * pointers to (original) parent process, youngest child, younger sibling,
@@ -1840,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity;
1840extern unsigned int sysctl_sched_wakeup_granularity; 1847extern unsigned int sysctl_sched_wakeup_granularity;
1841extern unsigned int sysctl_sched_shares_ratelimit; 1848extern unsigned int sysctl_sched_shares_ratelimit;
1842extern unsigned int sysctl_sched_shares_thresh; 1849extern unsigned int sysctl_sched_shares_thresh;
1843#ifdef CONFIG_SCHED_DEBUG
1844extern unsigned int sysctl_sched_child_runs_first; 1850extern unsigned int sysctl_sched_child_runs_first;
1851#ifdef CONFIG_SCHED_DEBUG
1845extern unsigned int sysctl_sched_features; 1852extern unsigned int sysctl_sched_features;
1846extern unsigned int sysctl_sched_migration_cost; 1853extern unsigned int sysctl_sched_migration_cost;
1847extern unsigned int sysctl_sched_nr_migrate; 1854extern unsigned int sysctl_sched_nr_migrate;
1855extern unsigned int sysctl_sched_time_avg;
1848extern unsigned int sysctl_timer_migration; 1856extern unsigned int sysctl_timer_migration;
1849 1857
1850int sched_nr_latency_handler(struct ctl_table *table, int write, 1858int sched_nr_latency_handler(struct ctl_table *table, int write,
@@ -2308,23 +2316,31 @@ static inline int need_resched(void)
2308 * cond_resched_softirq() will enable bhs before scheduling. 2316 * cond_resched_softirq() will enable bhs before scheduling.
2309 */ 2317 */
2310extern int _cond_resched(void); 2318extern int _cond_resched(void);
2311#ifdef CONFIG_PREEMPT_BKL 2319
2312static inline int cond_resched(void) 2320#define cond_resched() ({ \
2313{ 2321 __might_sleep(__FILE__, __LINE__, 0); \
2314 return 0; 2322 _cond_resched(); \
2315} 2323})
2324
2325extern int __cond_resched_lock(spinlock_t *lock);
2326
2327#ifdef CONFIG_PREEMPT
2328#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
2316#else 2329#else
2317static inline int cond_resched(void) 2330#define PREEMPT_LOCK_OFFSET 0
2318{
2319 return _cond_resched();
2320}
2321#endif 2331#endif
2322extern int cond_resched_lock(spinlock_t * lock); 2332
2323extern int cond_resched_softirq(void); 2333#define cond_resched_lock(lock) ({ \
2324static inline int cond_resched_bkl(void) 2334 __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
2325{ 2335 __cond_resched_lock(lock); \
2326 return _cond_resched(); 2336})
2327} 2337
2338extern int __cond_resched_softirq(void);
2339
2340#define cond_resched_softirq() ({ \
2341 __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \
2342 __cond_resched_softirq(); \
2343})
2328 2344
2329/* 2345/*
2330 * Does a critical section need to be broken due to another 2346 * Does a critical section need to be broken due to another
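The reworked macros above fold the debug check into the call sites: __might_sleep() now takes a preempt_offset, so cond_resched_lock() can legitimately be called with exactly the one spinlock it is about to drop still held (PREEMPT_LOCK_OFFSET accounts for the preempt_count that lock contributes under CONFIG_PREEMPT, and is 0 otherwise). A sketch of the intended calling pattern; widget_lock, struct widget and zap_one() are made up for the illustration:

static void zap_all(struct widget *table, int nr)
{
        int i;

        spin_lock(&widget_lock);
        for (i = 0; i < nr; i++) {
                zap_one(&table[i]);
                /*
                 * Long loop with a spinlock held: cond_resched_lock() may
                 * drop and retake widget_lock if a reschedule is pending,
                 * and the __might_sleep() check tolerates exactly this one
                 * held lock via PREEMPT_LOCK_OFFSET.
                 */
                cond_resched_lock(&widget_lock);
        }
        spin_unlock(&widget_lock);
}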
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7402c1a27c4f..85e8cf7d393c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -85,20 +85,29 @@ int arch_update_cpu_topology(void);
85#define ARCH_HAS_SCHED_WAKE_IDLE 85#define ARCH_HAS_SCHED_WAKE_IDLE
86/* Common values for SMT siblings */ 86/* Common values for SMT siblings */
87#ifndef SD_SIBLING_INIT 87#ifndef SD_SIBLING_INIT
88#define SD_SIBLING_INIT (struct sched_domain) { \ 88#define SD_SIBLING_INIT (struct sched_domain) { \
89 .min_interval = 1, \ 89 .min_interval = 1, \
90 .max_interval = 2, \ 90 .max_interval = 2, \
91 .busy_factor = 64, \ 91 .busy_factor = 64, \
92 .imbalance_pct = 110, \ 92 .imbalance_pct = 110, \
93 .flags = SD_LOAD_BALANCE \ 93 \
94 | SD_BALANCE_NEWIDLE \ 94 .flags = 1*SD_LOAD_BALANCE \
95 | SD_BALANCE_FORK \ 95 | 1*SD_BALANCE_NEWIDLE \
96 | SD_BALANCE_EXEC \ 96 | 1*SD_BALANCE_EXEC \
97 | SD_WAKE_AFFINE \ 97 | 1*SD_BALANCE_FORK \
98 | SD_WAKE_BALANCE \ 98 | 0*SD_WAKE_IDLE \
99 | SD_SHARE_CPUPOWER, \ 99 | 1*SD_WAKE_AFFINE \
100 .last_balance = jiffies, \ 100 | 1*SD_WAKE_BALANCE \
101 .balance_interval = 1, \ 101 | 1*SD_SHARE_CPUPOWER \
102 | 0*SD_POWERSAVINGS_BALANCE \
103 | 0*SD_SHARE_PKG_RESOURCES \
104 | 0*SD_SERIALIZE \
105 | 0*SD_WAKE_IDLE_FAR \
106 | 0*SD_PREFER_SIBLING \
107 , \
108 .last_balance = jiffies, \
109 .balance_interval = 1, \
110 .smt_gain = 1178, /* 15% */ \
102} 111}
103#endif 112#endif
104#endif /* CONFIG_SCHED_SMT */ 113#endif /* CONFIG_SCHED_SMT */
@@ -106,69 +115,94 @@ int arch_update_cpu_topology(void);
106#ifdef CONFIG_SCHED_MC 115#ifdef CONFIG_SCHED_MC
107/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ 116/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
108#ifndef SD_MC_INIT 117#ifndef SD_MC_INIT
109#define SD_MC_INIT (struct sched_domain) { \ 118#define SD_MC_INIT (struct sched_domain) { \
110 .min_interval = 1, \ 119 .min_interval = 1, \
111 .max_interval = 4, \ 120 .max_interval = 4, \
112 .busy_factor = 64, \ 121 .busy_factor = 64, \
113 .imbalance_pct = 125, \ 122 .imbalance_pct = 125, \
114 .cache_nice_tries = 1, \ 123 .cache_nice_tries = 1, \
115 .busy_idx = 2, \ 124 .busy_idx = 2, \
116 .wake_idx = 1, \ 125 .wake_idx = 1, \
117 .forkexec_idx = 1, \ 126 .forkexec_idx = 1, \
118 .flags = SD_LOAD_BALANCE \ 127 \
119 | SD_BALANCE_FORK \ 128 .flags = 1*SD_LOAD_BALANCE \
120 | SD_BALANCE_EXEC \ 129 | 1*SD_BALANCE_NEWIDLE \
121 | SD_WAKE_AFFINE \ 130 | 1*SD_BALANCE_EXEC \
122 | SD_WAKE_BALANCE \ 131 | 1*SD_BALANCE_FORK \
123 | SD_SHARE_PKG_RESOURCES\ 132 | 1*SD_WAKE_IDLE \
124 | sd_balance_for_mc_power()\ 133 | 1*SD_WAKE_AFFINE \
125 | sd_power_saving_flags(),\ 134 | 1*SD_WAKE_BALANCE \
126 .last_balance = jiffies, \ 135 | 0*SD_SHARE_CPUPOWER \
127 .balance_interval = 1, \ 136 | 1*SD_SHARE_PKG_RESOURCES \
137 | 0*SD_SERIALIZE \
138 | 0*SD_WAKE_IDLE_FAR \
139 | sd_balance_for_mc_power() \
140 | sd_power_saving_flags() \
141 , \
142 .last_balance = jiffies, \
143 .balance_interval = 1, \
128} 144}
129#endif 145#endif
130#endif /* CONFIG_SCHED_MC */ 146#endif /* CONFIG_SCHED_MC */
131 147
132/* Common values for CPUs */ 148/* Common values for CPUs */
133#ifndef SD_CPU_INIT 149#ifndef SD_CPU_INIT
134#define SD_CPU_INIT (struct sched_domain) { \ 150#define SD_CPU_INIT (struct sched_domain) { \
135 .min_interval = 1, \ 151 .min_interval = 1, \
136 .max_interval = 4, \ 152 .max_interval = 4, \
137 .busy_factor = 64, \ 153 .busy_factor = 64, \
138 .imbalance_pct = 125, \ 154 .imbalance_pct = 125, \
139 .cache_nice_tries = 1, \ 155 .cache_nice_tries = 1, \
140 .busy_idx = 2, \ 156 .busy_idx = 2, \
141 .idle_idx = 1, \ 157 .idle_idx = 1, \
142 .newidle_idx = 2, \ 158 .newidle_idx = 2, \
143 .wake_idx = 1, \ 159 .wake_idx = 1, \
144 .forkexec_idx = 1, \ 160 .forkexec_idx = 1, \
145 .flags = SD_LOAD_BALANCE \ 161 \
146 | SD_BALANCE_EXEC \ 162 .flags = 1*SD_LOAD_BALANCE \
147 | SD_BALANCE_FORK \ 163 | 1*SD_BALANCE_NEWIDLE \
148 | SD_WAKE_AFFINE \ 164 | 1*SD_BALANCE_EXEC \
149 | SD_WAKE_BALANCE \ 165 | 1*SD_BALANCE_FORK \
150 | sd_balance_for_package_power()\ 166 | 1*SD_WAKE_IDLE \
151 | sd_power_saving_flags(),\ 167 | 0*SD_WAKE_AFFINE \
152 .last_balance = jiffies, \ 168 | 1*SD_WAKE_BALANCE \
153 .balance_interval = 1, \ 169 | 0*SD_SHARE_CPUPOWER \
170 | 0*SD_SHARE_PKG_RESOURCES \
171 | 0*SD_SERIALIZE \
172 | 0*SD_WAKE_IDLE_FAR \
173 | sd_balance_for_package_power() \
174 | sd_power_saving_flags() \
175 , \
176 .last_balance = jiffies, \
177 .balance_interval = 1, \
154} 178}
155#endif 179#endif
156 180
157/* sched_domains SD_ALLNODES_INIT for NUMA machines */ 181/* sched_domains SD_ALLNODES_INIT for NUMA machines */
158#define SD_ALLNODES_INIT (struct sched_domain) { \ 182#define SD_ALLNODES_INIT (struct sched_domain) { \
159 .min_interval = 64, \ 183 .min_interval = 64, \
160 .max_interval = 64*num_online_cpus(), \ 184 .max_interval = 64*num_online_cpus(), \
161 .busy_factor = 128, \ 185 .busy_factor = 128, \
162 .imbalance_pct = 133, \ 186 .imbalance_pct = 133, \
163 .cache_nice_tries = 1, \ 187 .cache_nice_tries = 1, \
164 .busy_idx = 3, \ 188 .busy_idx = 3, \
165 .idle_idx = 3, \ 189 .idle_idx = 3, \
166 .flags = SD_LOAD_BALANCE \ 190 .flags = 1*SD_LOAD_BALANCE \
167 | SD_BALANCE_NEWIDLE \ 191 | 1*SD_BALANCE_NEWIDLE \
168 | SD_WAKE_AFFINE \ 192 | 0*SD_BALANCE_EXEC \
169 | SD_SERIALIZE, \ 193 | 0*SD_BALANCE_FORK \
170 .last_balance = jiffies, \ 194 | 0*SD_WAKE_IDLE \
171 .balance_interval = 64, \ 195 | 1*SD_WAKE_AFFINE \
196 | 0*SD_WAKE_BALANCE \
197 | 0*SD_SHARE_CPUPOWER \
198 | 0*SD_POWERSAVINGS_BALANCE \
199 | 0*SD_SHARE_PKG_RESOURCES \
200 | 1*SD_SERIALIZE \
201 | 1*SD_WAKE_IDLE_FAR \
202 | 0*SD_PREFER_SIBLING \
203 , \
204 .last_balance = jiffies, \
205 .balance_interval = 64, \
172} 206}
173 207
174#ifdef CONFIG_NUMA 208#ifdef CONFIG_NUMA
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 8949bb7eb082..a4c369ec328f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send,
340 __entry->sig, __entry->comm, __entry->pid) 340 __entry->sig, __entry->comm, __entry->pid)
341); 341);
342 342
343/*
344 * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
345 * adding sched_stat support to SCHED_FIFO/RR would be welcome.
346 */
347
348/*
349 * Tracepoint for accounting wait time (time the task is runnable
350 * but not actually running due to scheduler contention).
351 */
352TRACE_EVENT(sched_stat_wait,
353
354 TP_PROTO(struct task_struct *tsk, u64 delay),
355
356 TP_ARGS(tsk, delay),
357
358 TP_STRUCT__entry(
359 __array( char, comm, TASK_COMM_LEN )
360 __field( pid_t, pid )
361 __field( u64, delay )
362 ),
363
364 TP_fast_assign(
365 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
366 __entry->pid = tsk->pid;
367 __entry->delay = delay;
368 )
369 TP_perf_assign(
370 __perf_count(delay);
371 ),
372
373 TP_printk("task: %s:%d wait: %Lu [ns]",
374 __entry->comm, __entry->pid,
375 (unsigned long long)__entry->delay)
376);
377
378/*
379 * Tracepoint for accounting sleep time (time the task is not runnable,
380 * including iowait, see below).
381 */
382TRACE_EVENT(sched_stat_sleep,
383
384 TP_PROTO(struct task_struct *tsk, u64 delay),
385
386 TP_ARGS(tsk, delay),
387
388 TP_STRUCT__entry(
389 __array( char, comm, TASK_COMM_LEN )
390 __field( pid_t, pid )
391 __field( u64, delay )
392 ),
393
394 TP_fast_assign(
395 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
396 __entry->pid = tsk->pid;
397 __entry->delay = delay;
398 )
399 TP_perf_assign(
400 __perf_count(delay);
401 ),
402
403 TP_printk("task: %s:%d sleep: %Lu [ns]",
404 __entry->comm, __entry->pid,
405 (unsigned long long)__entry->delay)
406);
407
408/*
409 * Tracepoint for accounting iowait time (time the task is not runnable
410 * due to waiting on IO to complete).
411 */
412TRACE_EVENT(sched_stat_iowait,
413
414 TP_PROTO(struct task_struct *tsk, u64 delay),
415
416 TP_ARGS(tsk, delay),
417
418 TP_STRUCT__entry(
419 __array( char, comm, TASK_COMM_LEN )
420 __field( pid_t, pid )
421 __field( u64, delay )
422 ),
423
424 TP_fast_assign(
425 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
426 __entry->pid = tsk->pid;
427 __entry->delay = delay;
428 )
429 TP_perf_assign(
430 __perf_count(delay);
431 ),
432
433 TP_printk("task: %s:%d iowait: %Lu [ns]",
434 __entry->comm, __entry->pid,
435 (unsigned long long)__entry->delay)
436);
437
343#endif /* _TRACE_SCHED_H */ 438#endif /* _TRACE_SCHED_H */
344 439
345/* This part must be outside protection */ 440/* This part must be outside protection */
diff --git a/init/main.c b/init/main.c
index 525f6fb2bd22..b34fd8e5edef 100644
--- a/init/main.c
+++ b/init/main.c
@@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void)
631 softirq_init(); 631 softirq_init();
632 timekeeping_init(); 632 timekeeping_init();
633 time_init(); 633 time_init();
634 sched_clock_init();
635 profile_init(); 634 profile_init();
636 if (!irqs_disabled()) 635 if (!irqs_disabled())
637 printk(KERN_CRIT "start_kernel(): bug: interrupts were " 636 printk(KERN_CRIT "start_kernel(): bug: interrupts were "
@@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void)
682 numa_policy_init(); 681 numa_policy_init();
683 if (late_time_init) 682 if (late_time_init)
684 late_time_init(); 683 late_time_init();
684 sched_clock_init();
685 calibrate_delay(); 685 calibrate_delay();
686 pidmap_init(); 686 pidmap_init();
687 anon_vma_init(); 687 anon_vma_init();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751aa0418..5fe709982caa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <trace/events/sched.h> 17#include <trace/events/sched.h>
18 18
19#define KTHREAD_NICE_LEVEL (-5)
20
21static DEFINE_SPINLOCK(kthread_create_lock); 19static DEFINE_SPINLOCK(kthread_create_lock);
22static LIST_HEAD(kthread_create_list); 20static LIST_HEAD(kthread_create_list);
23struct task_struct *kthreadd_task; 21struct task_struct *kthreadd_task;
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
145 * The kernel thread should not inherit these properties. 143 * The kernel thread should not inherit these properties.
146 */ 144 */
147 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 145 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
148 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
149 set_cpus_allowed_ptr(create.result, cpu_all_mask); 146 set_cpus_allowed_ptr(create.result, cpu_all_mask);
150 } 147 }
151 return create.result; 148 return create.result;
@@ -221,7 +218,6 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 218 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 220 ignore_signals(tsk);
224 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
226 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_possible_map);
227 223
diff --git a/kernel/sched.c b/kernel/sched.c
index 4066241ae9f4..e27a53685ed9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,8 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2); 122static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126 123
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 124static inline int rt_policy(int policy)
148{ 125{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user)
309 286
310/* 287/*
311 * Root task group. 288 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 289 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 290 * be a child to this group.
314 */ 291 */
315struct task_group root_task_group; 292struct task_group root_task_group;
316 293
@@ -318,7 +295,7 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 295/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 296static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 297/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 298static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 299#endif /* CONFIG_FAIR_GROUP_SCHED */
323 300
324#ifdef CONFIG_RT_GROUP_SCHED 301#ifdef CONFIG_RT_GROUP_SCHED
@@ -616,6 +593,7 @@ struct rq {
616 593
617 unsigned char idle_at_tick; 594 unsigned char idle_at_tick;
618 /* For active balancing */ 595 /* For active balancing */
596 int post_schedule;
619 int active_balance; 597 int active_balance;
620 int push_cpu; 598 int push_cpu;
621 /* cpu of this runqueue: */ 599 /* cpu of this runqueue: */
@@ -626,6 +604,9 @@ struct rq {
626 604
627 struct task_struct *migration_thread; 605 struct task_struct *migration_thread;
628 struct list_head migration_queue; 606 struct list_head migration_queue;
607
608 u64 rt_avg;
609 u64 age_stamp;
629#endif 610#endif
630 611
631 /* calc_load related fields */ 612 /* calc_load related fields */
@@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq)
693#define this_rq() (&__get_cpu_var(runqueues)) 674#define this_rq() (&__get_cpu_var(runqueues))
694#define task_rq(p) cpu_rq(task_cpu(p)) 675#define task_rq(p) cpu_rq(task_cpu(p))
695#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 676#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
677#define raw_rq() (&__raw_get_cpu_var(runqueues))
696 678
697inline void update_rq_clock(struct rq *rq) 679inline void update_rq_clock(struct rq *rq)
698{ 680{
@@ -861,6 +843,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
861unsigned int sysctl_sched_shares_thresh = 4; 843unsigned int sysctl_sched_shares_thresh = 4;
862 844
863/* 845/*
846 * period over which we average the RT time consumption, measured
847 * in ms.
848 *
849 * default: 1s
850 */
851const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
852
853/*
864 * period over which we measure -rt task cpu usage in us. 854 * period over which we measure -rt task cpu usage in us.
865 * default: 1s 855 * default: 1s
866 */ 856 */
@@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu)
1278} 1268}
1279#endif /* CONFIG_NO_HZ */ 1269#endif /* CONFIG_NO_HZ */
1280 1270
1271static u64 sched_avg_period(void)
1272{
1273 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1274}
1275
1276static void sched_avg_update(struct rq *rq)
1277{
1278 s64 period = sched_avg_period();
1279
1280 while ((s64)(rq->clock - rq->age_stamp) > period) {
1281 rq->age_stamp += period;
1282 rq->rt_avg /= 2;
1283 }
1284}
1285
1286static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1287{
1288 rq->rt_avg += rt_delta;
1289 sched_avg_update(rq);
1290}
1291
1281#else /* !CONFIG_SMP */ 1292#else /* !CONFIG_SMP */
1282static void resched_task(struct task_struct *p) 1293static void resched_task(struct task_struct *p)
1283{ 1294{
1284 assert_spin_locked(&task_rq(p)->lock); 1295 assert_spin_locked(&task_rq(p)->lock);
1285 set_tsk_need_resched(p); 1296 set_tsk_need_resched(p);
1286} 1297}
1298
1299static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1300{
1301}
1287#endif /* CONFIG_SMP */ 1302#endif /* CONFIG_SMP */
1288 1303
1289#if BITS_PER_LONG == 32 1304#if BITS_PER_LONG == 32
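A small userspace re-implementation of the decay in sched_avg_update() (illustrative only; the clock and rt_avg values are made up) showing how rq->rt_avg is halved once per half-window, with sysctl_sched_time_avg at its 1000 ms default:

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

static const unsigned long long sysctl_sched_time_avg = 1000;   /* ms, the default */

static unsigned long long sched_avg_period(void)
{
        return sysctl_sched_time_avg * NSEC_PER_MSEC / 2;       /* 0.5 s in ns */
}

int main(void)
{
        unsigned long long rt_avg = 300 * NSEC_PER_MSEC;        /* 300 ms of rt runtime */
        unsigned long long age_stamp = 0;
        unsigned long long clock = 3 * sched_avg_period();      /* pretend 1.5 s passed */

        /* same loop as sched_avg_update() above */
        while (clock - age_stamp > sched_avg_period()) {
                age_stamp += sched_avg_period();
                rt_avg /= 2;
        }

        /* prints 75000000: halved twice, since two full periods have aged out */
        printf("rt_avg = %llu ns\n", rt_avg);
        return 0;
}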
@@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1513 1528
1514#ifdef CONFIG_FAIR_GROUP_SCHED 1529#ifdef CONFIG_FAIR_GROUP_SCHED
1515 1530
1531struct update_shares_data {
1532 unsigned long rq_weight[NR_CPUS];
1533};
1534
1535static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1536
1516static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1537static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1517 1538
1518/* 1539/*
1519 * Calculate and set the cpu's group shares. 1540 * Calculate and set the cpu's group shares.
1520 */ 1541 */
1521static void 1542static void update_group_shares_cpu(struct task_group *tg, int cpu,
1522update_group_shares_cpu(struct task_group *tg, int cpu, 1543 unsigned long sd_shares,
1523 unsigned long sd_shares, unsigned long sd_rq_weight) 1544 unsigned long sd_rq_weight,
1545 struct update_shares_data *usd)
1524{ 1546{
1525 unsigned long shares; 1547 unsigned long shares, rq_weight;
1526 unsigned long rq_weight; 1548 int boost = 0;
1527 1549
1528 if (!tg->se[cpu]) 1550 rq_weight = usd->rq_weight[cpu];
1529 return; 1551 if (!rq_weight) {
1530 1552 boost = 1;
1531 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1553 rq_weight = NICE_0_LOAD;
1554 }
1532 1555
1533 /* 1556 /*
1534 * \Sum shares * rq_weight 1557 * \Sum_j shares_j * rq_weight_i
1535 * shares = ----------------------- 1558 * shares_i = -----------------------------
1536 * \Sum rq_weight 1559 * \Sum_j rq_weight_j
1537 *
1538 */ 1560 */
1539 shares = (sd_shares * rq_weight) / sd_rq_weight; 1561 shares = (sd_shares * rq_weight) / sd_rq_weight;
1540 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1562 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1545 unsigned long flags; 1567 unsigned long flags;
1546 1568
1547 spin_lock_irqsave(&rq->lock, flags); 1569 spin_lock_irqsave(&rq->lock, flags);
1548 tg->cfs_rq[cpu]->shares = shares; 1570 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1549 1571 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1550 __set_se_shares(tg->se[cpu], shares); 1572 __set_se_shares(tg->se[cpu], shares);
1551 spin_unlock_irqrestore(&rq->lock, flags); 1573 spin_unlock_irqrestore(&rq->lock, flags);
1552 } 1574 }
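Concretely, with made-up numbers and the MIN_SHARES/MAX_SHARES clamp ignored, the per-cpu split computed above works out as:

        sd_shares = 1024,  rq_weight = { cpu0: 2048, cpu1: 1024 },  sum = 3072

        shares_cpu0 = 1024 * 2048 / 3072 = 682
        shares_cpu1 = 1024 * 1024 / 3072 = 341

A cpu where the group currently has no runnable tasks is treated as if it carried NICE_0_LOAD for this computation (the boost case), but its stored rq_weight and shares are then written back as 0.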
@@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1559 */ 1581 */
1560static int tg_shares_up(struct task_group *tg, void *data) 1582static int tg_shares_up(struct task_group *tg, void *data)
1561{ 1583{
1562 unsigned long weight, rq_weight = 0; 1584 unsigned long weight, rq_weight = 0, shares = 0;
1563 unsigned long shares = 0; 1585 struct update_shares_data *usd;
1564 struct sched_domain *sd = data; 1586 struct sched_domain *sd = data;
1587 unsigned long flags;
1565 int i; 1588 int i;
1566 1589
1590 if (!tg->se[0])
1591 return 0;
1592
1593 local_irq_save(flags);
1594 usd = &__get_cpu_var(update_shares_data);
1595
1567 for_each_cpu(i, sched_domain_span(sd)) { 1596 for_each_cpu(i, sched_domain_span(sd)) {
1597 weight = tg->cfs_rq[i]->load.weight;
1598 usd->rq_weight[i] = weight;
1599
1568 /* 1600 /*
1569 * If there are currently no tasks on the cpu pretend there 1601 * If there are currently no tasks on the cpu pretend there
1570 * is one of average load so that when a new task gets to 1602 * is one of average load so that when a new task gets to
1571 * run here it will not get delayed by group starvation. 1603 * run here it will not get delayed by group starvation.
1572 */ 1604 */
1573 weight = tg->cfs_rq[i]->load.weight;
1574 if (!weight) 1605 if (!weight)
1575 weight = NICE_0_LOAD; 1606 weight = NICE_0_LOAD;
1576 1607
1577 tg->cfs_rq[i]->rq_weight = weight;
1578 rq_weight += weight; 1608 rq_weight += weight;
1579 shares += tg->cfs_rq[i]->shares; 1609 shares += tg->cfs_rq[i]->shares;
1580 } 1610 }
@@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1586 shares = tg->shares; 1616 shares = tg->shares;
1587 1617
1588 for_each_cpu(i, sched_domain_span(sd)) 1618 for_each_cpu(i, sched_domain_span(sd))
1589 update_group_shares_cpu(tg, i, shares, rq_weight); 1619 update_group_shares_cpu(tg, i, shares, rq_weight, usd);
1620
1621 local_irq_restore(flags);
1590 1622
1591 return 0; 1623 return 0;
1592} 1624}
@@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1616 1648
1617static void update_shares(struct sched_domain *sd) 1649static void update_shares(struct sched_domain *sd)
1618{ 1650{
1619 u64 now = cpu_clock(raw_smp_processor_id()); 1651 s64 elapsed;
1620 s64 elapsed = now - sd->last_update; 1652 u64 now;
1653
1654 if (root_task_group_empty())
1655 return;
1656
1657 now = cpu_clock(raw_smp_processor_id());
1658 elapsed = now - sd->last_update;
1621 1659
1622 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1660 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1623 sd->last_update = now; 1661 sd->last_update = now;
@@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd)
1627 1665
1628static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1666static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1629{ 1667{
1668 if (root_task_group_empty())
1669 return;
1670
1630 spin_unlock(&rq->lock); 1671 spin_unlock(&rq->lock);
1631 update_shares(sd); 1672 update_shares(sd);
1632 spin_lock(&rq->lock); 1673 spin_lock(&rq->lock);
@@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1634 1675
1635static void update_h_load(long cpu) 1676static void update_h_load(long cpu)
1636{ 1677{
1678 if (root_task_group_empty())
1679 return;
1680
1637 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1681 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1638} 1682}
1639 1683
@@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2268 } 2312 }
2269 2313
2270 /* Adjust by relative CPU power of the group */ 2314 /* Adjust by relative CPU power of the group */
2271 avg_load = sg_div_cpu_power(group, 2315 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2272 avg_load * SCHED_LOAD_SCALE);
2273 2316
2274 if (local_group) { 2317 if (local_group) {
2275 this_load = avg_load; 2318 this_load = avg_load;
@@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
2637 set_task_cpu(p, cpu); 2680 set_task_cpu(p, cpu);
2638 2681
2639 /* 2682 /*
2640 * Make sure we do not leak PI boosting priority to the child: 2683 * Make sure we do not leak PI boosting priority to the child.
2641 */ 2684 */
2642 p->prio = current->normal_prio; 2685 p->prio = current->normal_prio;
2686
2687 /*
2688 * Revert to default priority/policy on fork if requested.
2689 */
2690 if (unlikely(p->sched_reset_on_fork)) {
2691 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
2692 p->policy = SCHED_NORMAL;
2693
2694 if (p->normal_prio < DEFAULT_PRIO)
2695 p->prio = DEFAULT_PRIO;
2696
2697 if (PRIO_TO_NICE(p->static_prio) < 0) {
2698 p->static_prio = NICE_TO_PRIO(0);
2699 set_load_weight(p);
2700 }
2701
2702 /*
2703 * We don't need the reset flag anymore after the fork. It has
2704 * fulfilled its duty:
2705 */
2706 p->sched_reset_on_fork = 0;
2707 }
2708
2643 if (!rt_prio(p->prio)) 2709 if (!rt_prio(p->prio))
2644 p->sched_class = &fair_sched_class; 2710 p->sched_class = &fair_sched_class;
2645 2711
@@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2796{ 2862{
2797 struct mm_struct *mm = rq->prev_mm; 2863 struct mm_struct *mm = rq->prev_mm;
2798 long prev_state; 2864 long prev_state;
2799#ifdef CONFIG_SMP
2800 int post_schedule = 0;
2801
2802 if (current->sched_class->needs_post_schedule)
2803 post_schedule = current->sched_class->needs_post_schedule(rq);
2804#endif
2805 2865
2806 rq->prev_mm = NULL; 2866 rq->prev_mm = NULL;
2807 2867
@@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2820 finish_arch_switch(prev); 2880 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq)); 2881 perf_counter_task_sched_in(current, cpu_of(rq));
2822 finish_lock_switch(rq, prev); 2882 finish_lock_switch(rq, prev);
2823#ifdef CONFIG_SMP
2824 if (post_schedule)
2825 current->sched_class->post_schedule(rq);
2826#endif
2827 2883
2828 fire_sched_in_preempt_notifiers(current); 2884 fire_sched_in_preempt_notifiers(current);
2829 if (mm) 2885 if (mm)
@@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2838 } 2894 }
2839} 2895}
2840 2896
2897#ifdef CONFIG_SMP
2898
2899/* assumes rq->lock is held */
2900static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2901{
2902 if (prev->sched_class->pre_schedule)
2903 prev->sched_class->pre_schedule(rq, prev);
2904}
2905
2906/* rq->lock is NOT held, but preemption is disabled */
2907static inline void post_schedule(struct rq *rq)
2908{
2909 if (rq->post_schedule) {
2910 unsigned long flags;
2911
2912 spin_lock_irqsave(&rq->lock, flags);
2913 if (rq->curr->sched_class->post_schedule)
2914 rq->curr->sched_class->post_schedule(rq);
2915 spin_unlock_irqrestore(&rq->lock, flags);
2916
2917 rq->post_schedule = 0;
2918 }
2919}
2920
2921#else
2922
2923static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2924{
2925}
2926
2927static inline void post_schedule(struct rq *rq)
2928{
2929}
2930
2931#endif
2932
2841/** 2933/**
2842 * schedule_tail - first thing a freshly forked thread must call. 2934 * schedule_tail - first thing a freshly forked thread must call.
2843 * @prev: the thread we just switched away from. 2935 * @prev: the thread we just switched away from.
@@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2848 struct rq *rq = this_rq(); 2940 struct rq *rq = this_rq();
2849 2941
2850 finish_task_switch(rq, prev); 2942 finish_task_switch(rq, prev);
2943
2944 /*
2945 * FIXME: do we need to worry about rq being invalidated by the
2946 * task_switch?
2947 */
2948 post_schedule(rq);
2949
2851#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2950#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2852 /* In this case, finish_task_switch does not reenable preemption */ 2951 /* In this case, finish_task_switch does not reenable preemption */
2853 preempt_enable(); 2952 preempt_enable();
@@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3379{ 3478{
3380 const struct sched_class *class; 3479 const struct sched_class *class;
3381 3480
3382 for (class = sched_class_highest; class; class = class->next) 3481 for_each_class(class) {
3383 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3482 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3384 return 1; 3483 return 1;
3484 }
3385 3485
3386 return 0; 3486 return 0;
3387} 3487}
@@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3544 * capacity but still has some space to pick up some load 3644 * capacity but still has some space to pick up some load
3545 * from other group and save more power 3645 * from other group and save more power
3546 */ 3646 */
3547 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3647 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3548 return; 3648 return;
3549 3649
3550 if (sgs->sum_nr_running > sds->leader_nr_running || 3650 if (sgs->sum_nr_running > sds->leader_nr_running ||
@@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3611} 3711}
3612#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3712#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613 3713
3714unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3715{
3716 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3717 unsigned long smt_gain = sd->smt_gain;
3718
3719 smt_gain /= weight;
3720
3721 return smt_gain;
3722}
3723
3724unsigned long scale_rt_power(int cpu)
3725{
3726 struct rq *rq = cpu_rq(cpu);
3727 u64 total, available;
3728
3729 sched_avg_update(rq);
3730
3731 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3732 available = total - rq->rt_avg;
3733
3734 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3735 total = SCHED_LOAD_SCALE;
3736
3737 total >>= SCHED_LOAD_SHIFT;
3738
3739 return div_u64(available, total);
3740}
3741
3742static void update_cpu_power(struct sched_domain *sd, int cpu)
3743{
3744 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3745 unsigned long power = SCHED_LOAD_SCALE;
3746 struct sched_group *sdg = sd->groups;
3747
3748 /* here we could scale based on cpufreq */
3749
3750 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3751 power *= arch_scale_smt_power(sd, cpu);
3752 power >>= SCHED_LOAD_SHIFT;
3753 }
3754
3755 power *= scale_rt_power(cpu);
3756 power >>= SCHED_LOAD_SHIFT;
3757
3758 if (!power)
3759 power = 1;
3760
3761 sdg->cpu_power = power;
3762}
3763
3764static void update_group_power(struct sched_domain *sd, int cpu)
3765{
3766 struct sched_domain *child = sd->child;
3767 struct sched_group *group, *sdg = sd->groups;
3768 unsigned long power;
3769
3770 if (!child) {
3771 update_cpu_power(sd, cpu);
3772 return;
3773 }
3774
3775 power = 0;
3776
3777 group = child->groups;
3778 do {
3779 power += group->cpu_power;
3780 group = group->next;
3781 } while (group != child->groups);
3782
3783 sdg->cpu_power = power;
3784}
3614 3785
3615/** 3786/**
3616 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3787 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
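Putting the two scale factors above together with made-up numbers: for one thread of a 2-way SMT pair whose runqueue spent about a quarter of the recent averaging window on rt tasks (so scale_rt_power() returns roughly 768 out of 1024):

        base                           : power = SCHED_LOAD_SCALE   = 1024
        SMT (smt_gain 1178, weight 2)  : power = 1024 * 589 >> 10   =  589
        rt time removed (~75% left)    : power =  589 * 768 >> 10   =  441

so sdg->cpu_power ends up at 441 instead of the nominal 1024, and the group looks correspondingly smaller to the load balancer.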
@@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 * @balance: Should we balance. 3795 * @balance: Should we balance.
3625 * @sgs: variable to hold the statistics for this group. 3796 * @sgs: variable to hold the statistics for this group.
3626 */ 3797 */
3627static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3798static inline void update_sg_lb_stats(struct sched_domain *sd,
3799 struct sched_group *group, int this_cpu,
3628 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3800 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3629 int local_group, const struct cpumask *cpus, 3801 int local_group, const struct cpumask *cpus,
3630 int *balance, struct sg_lb_stats *sgs) 3802 int *balance, struct sg_lb_stats *sgs)
@@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3635 unsigned long sum_avg_load_per_task; 3807 unsigned long sum_avg_load_per_task;
3636 unsigned long avg_load_per_task; 3808 unsigned long avg_load_per_task;
3637 3809
3638 if (local_group) 3810 if (local_group) {
3639 balance_cpu = group_first_cpu(group); 3811 balance_cpu = group_first_cpu(group);
3812 if (balance_cpu == this_cpu)
3813 update_group_power(sd, this_cpu);
3814 }
3640 3815
3641 /* Tally up the load of all CPUs in the group */ 3816 /* Tally up the load of all CPUs in the group */
3642 sum_avg_load_per_task = avg_load_per_task = 0; 3817 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3685 } 3860 }
3686 3861
3687 /* Adjust by relative CPU power of the group */ 3862 /* Adjust by relative CPU power of the group */
3688 sgs->avg_load = sg_div_cpu_power(group, 3863 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3689 sgs->group_load * SCHED_LOAD_SCALE);
3690 3864
3691 3865
3692 /* 3866 /*
@@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3698 * normalized nr_running number somewhere that negates 3872 * normalized nr_running number somewhere that negates
3699 * the hierarchy? 3873 * the hierarchy?
3700 */ 3874 */
3701 avg_load_per_task = sg_div_cpu_power(group, 3875 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3702 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3876 group->cpu_power;
3703 3877
3704 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3878 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3705 sgs->group_imb = 1; 3879 sgs->group_imb = 1;
3706 3880
3707 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3881 sgs->group_capacity =
3708 3882 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3709} 3883}
3710 3884
3711/** 3885/**
@@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3723 const struct cpumask *cpus, int *balance, 3897 const struct cpumask *cpus, int *balance,
3724 struct sd_lb_stats *sds) 3898 struct sd_lb_stats *sds)
3725{ 3899{
3900 struct sched_domain *child = sd->child;
3726 struct sched_group *group = sd->groups; 3901 struct sched_group *group = sd->groups;
3727 struct sg_lb_stats sgs; 3902 struct sg_lb_stats sgs;
3728 int load_idx; 3903 int load_idx, prefer_sibling = 0;
3904
3905 if (child && child->flags & SD_PREFER_SIBLING)
3906 prefer_sibling = 1;
3729 3907
3730 init_sd_power_savings_stats(sd, sds, idle); 3908 init_sd_power_savings_stats(sd, sds, idle);
3731 load_idx = get_sd_load_idx(sd, idle); 3909 load_idx = get_sd_load_idx(sd, idle);
@@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3736 local_group = cpumask_test_cpu(this_cpu, 3914 local_group = cpumask_test_cpu(this_cpu,
3737 sched_group_cpus(group)); 3915 sched_group_cpus(group));
3738 memset(&sgs, 0, sizeof(sgs)); 3916 memset(&sgs, 0, sizeof(sgs));
3739 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3917 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3740 local_group, cpus, balance, &sgs); 3918 local_group, cpus, balance, &sgs);
3741 3919
3742 if (local_group && balance && !(*balance)) 3920 if (local_group && balance && !(*balance))
3743 return; 3921 return;
3744 3922
3745 sds->total_load += sgs.group_load; 3923 sds->total_load += sgs.group_load;
3746 sds->total_pwr += group->__cpu_power; 3924 sds->total_pwr += group->cpu_power;
3925
3926 /*
3927 * In case the child domain prefers tasks go to siblings
3928 * first, lower the group capacity to one so that we'll try
3929 * and move all the excess tasks away.
3930 */
3931 if (prefer_sibling)
3932 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3747 3933
3748 if (local_group) { 3934 if (local_group) {
3749 sds->this_load = sgs.avg_load; 3935 sds->this_load = sgs.avg_load;
@@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3763 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3949 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3764 group = group->next; 3950 group = group->next;
3765 } while (group != sd->groups); 3951 } while (group != sd->groups);
3766
3767} 3952}
3768 3953
3769/** 3954/**
@@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3801 * moving them. 3986 * moving them.
3802 */ 3987 */
3803 3988
3804 pwr_now += sds->busiest->__cpu_power * 3989 pwr_now += sds->busiest->cpu_power *
3805 min(sds->busiest_load_per_task, sds->max_load); 3990 min(sds->busiest_load_per_task, sds->max_load);
3806 pwr_now += sds->this->__cpu_power * 3991 pwr_now += sds->this->cpu_power *
3807 min(sds->this_load_per_task, sds->this_load); 3992 min(sds->this_load_per_task, sds->this_load);
3808 pwr_now /= SCHED_LOAD_SCALE; 3993 pwr_now /= SCHED_LOAD_SCALE;
3809 3994
3810 /* Amount of load we'd subtract */ 3995 /* Amount of load we'd subtract */
3811 tmp = sg_div_cpu_power(sds->busiest, 3996 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3812 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3997 sds->busiest->cpu_power;
3813 if (sds->max_load > tmp) 3998 if (sds->max_load > tmp)
3814 pwr_move += sds->busiest->__cpu_power * 3999 pwr_move += sds->busiest->cpu_power *
3815 min(sds->busiest_load_per_task, sds->max_load - tmp); 4000 min(sds->busiest_load_per_task, sds->max_load - tmp);
3816 4001
3817 /* Amount of load we'd add */ 4002 /* Amount of load we'd add */
3818 if (sds->max_load * sds->busiest->__cpu_power < 4003 if (sds->max_load * sds->busiest->cpu_power <
3819 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 4004 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3820 tmp = sg_div_cpu_power(sds->this, 4005 tmp = (sds->max_load * sds->busiest->cpu_power) /
3821 sds->max_load * sds->busiest->__cpu_power); 4006 sds->this->cpu_power;
3822 else 4007 else
3823 tmp = sg_div_cpu_power(sds->this, 4008 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3824 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 4009 sds->this->cpu_power;
3825 pwr_move += sds->this->__cpu_power * 4010 pwr_move += sds->this->cpu_power *
3826 min(sds->this_load_per_task, sds->this_load + tmp); 4011 min(sds->this_load_per_task, sds->this_load + tmp);
3827 pwr_move /= SCHED_LOAD_SCALE; 4012 pwr_move /= SCHED_LOAD_SCALE;
3828 4013
@@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3857 sds->max_load - sds->busiest_load_per_task); 4042 sds->max_load - sds->busiest_load_per_task);
3858 4043
3859 /* How much load to actually move to equalise the imbalance */ 4044 /* How much load to actually move to equalise the imbalance */
3860 *imbalance = min(max_pull * sds->busiest->__cpu_power, 4045 *imbalance = min(max_pull * sds->busiest->cpu_power,
3861 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 4046 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3862 / SCHED_LOAD_SCALE; 4047 / SCHED_LOAD_SCALE;
3863 4048
3864 /* 4049 /*
@@ -3976,6 +4161,26 @@ ret:
3976 return NULL; 4161 return NULL;
3977} 4162}
3978 4163
4164static struct sched_group *group_of(int cpu)
4165{
4166 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
4167
4168 if (!sd)
4169 return NULL;
4170
4171 return sd->groups;
4172}
4173
4174static unsigned long power_of(int cpu)
4175{
4176 struct sched_group *group = group_of(cpu);
4177
4178 if (!group)
4179 return SCHED_LOAD_SCALE;
4180
4181 return group->cpu_power;
4182}
4183
3979/* 4184/*
3980 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4185 * find_busiest_queue - find the busiest runqueue among the cpus in group.
3981 */ 4186 */
@@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3988 int i; 4193 int i;
3989 4194
3990 for_each_cpu(i, sched_group_cpus(group)) { 4195 for_each_cpu(i, sched_group_cpus(group)) {
4196 unsigned long power = power_of(i);
4197 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3991 unsigned long wl; 4198 unsigned long wl;
3992 4199
3993 if (!cpumask_test_cpu(i, cpus)) 4200 if (!cpumask_test_cpu(i, cpus))
3994 continue; 4201 continue;
3995 4202
3996 rq = cpu_rq(i); 4203 rq = cpu_rq(i);
3997 wl = weighted_cpuload(i); 4204 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4205 wl /= power;
3998 4206
3999 if (rq->nr_running == 1 && wl > imbalance) 4207 if (capacity && rq->nr_running == 1 && wl > imbalance)
4000 continue; 4208 continue;
4001 4209
4002 if (wl > max_load) { 4210 if (wl > max_load) {
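The effect of dividing the raw load by cpu_power in find_busiest_queue() above, again with made-up numbers: a runqueue holding one nice-0 task (weighted load 1024) on a cpu whose power was scaled down to 589 is reported as

        wl = 1024 * SCHED_LOAD_SCALE / 589 = 1024 * 1024 / 589 = 1780 (approx.)

so it compares as busier than the same load on a full-power cpu (wl = 1024), which is the intent: pull work away from cpus that have less capacity to spare.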
@@ -5349,10 +5557,7 @@ need_resched_nonpreemptible:
5349 switch_count = &prev->nvcsw; 5557 switch_count = &prev->nvcsw;
5350 } 5558 }
5351 5559
5352#ifdef CONFIG_SMP 5560 pre_schedule(rq, prev);
5353 if (prev->sched_class->pre_schedule)
5354 prev->sched_class->pre_schedule(rq, prev);
5355#endif
5356 5561
5357 if (unlikely(!rq->nr_running)) 5562 if (unlikely(!rq->nr_running))
5358 idle_balance(cpu, rq); 5563 idle_balance(cpu, rq);
@@ -5378,6 +5583,8 @@ need_resched_nonpreemptible:
5378 } else 5583 } else
5379 spin_unlock_irq(&rq->lock); 5584 spin_unlock_irq(&rq->lock);
5380 5585
5586 post_schedule(rq);
5587
5381 if (unlikely(reacquire_kernel_lock(current) < 0)) 5588 if (unlikely(reacquire_kernel_lock(current) < 0))
5382 goto need_resched_nonpreemptible; 5589 goto need_resched_nonpreemptible;
5383 5590
@@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6123 unsigned long flags; 6330 unsigned long flags;
6124 const struct sched_class *prev_class = p->sched_class; 6331 const struct sched_class *prev_class = p->sched_class;
6125 struct rq *rq; 6332 struct rq *rq;
6333 int reset_on_fork;
6126 6334
6127 /* may grab non-irq protected spin_locks */ 6335 /* may grab non-irq protected spin_locks */
6128 BUG_ON(in_interrupt()); 6336 BUG_ON(in_interrupt());
6129recheck: 6337recheck:
6130 /* double check policy once rq lock held */ 6338 /* double check policy once rq lock held */
6131 if (policy < 0) 6339 if (policy < 0) {
6340 reset_on_fork = p->sched_reset_on_fork;
6132 policy = oldpolicy = p->policy; 6341 policy = oldpolicy = p->policy;
6133 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6342 } else {
6134 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6343 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6135 policy != SCHED_IDLE) 6344 policy &= ~SCHED_RESET_ON_FORK;
6136 return -EINVAL; 6345
6346 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6347 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6348 policy != SCHED_IDLE)
6349 return -EINVAL;
6350 }
6351
6137 /* 6352 /*
6138 * Valid priorities for SCHED_FIFO and SCHED_RR are 6353 * Valid priorities for SCHED_FIFO and SCHED_RR are
6139 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6354 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6392,10 @@ recheck:
6177 /* can't change other user's priorities */ 6392 /* can't change other user's priorities */
6178 if (!check_same_owner(p)) 6393 if (!check_same_owner(p))
6179 return -EPERM; 6394 return -EPERM;
6395
6396 /* Normal users shall not reset the sched_reset_on_fork flag */
6397 if (p->sched_reset_on_fork && !reset_on_fork)
6398 return -EPERM;
6180 } 6399 }
6181 6400
6182 if (user) { 6401 if (user) {
@@ -6220,6 +6439,8 @@ recheck:
6220 if (running) 6439 if (running)
6221 p->sched_class->put_prev_task(rq, p); 6440 p->sched_class->put_prev_task(rq, p);
6222 6441
6442 p->sched_reset_on_fork = reset_on_fork;
6443
6223 oldprio = p->prio; 6444 oldprio = p->prio;
6224 __setscheduler(rq, p, policy, param->sched_priority); 6445 __setscheduler(rq, p, policy, param->sched_priority);
6225 6446
@@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6336 if (p) { 6557 if (p) {
6337 retval = security_task_getscheduler(p); 6558 retval = security_task_getscheduler(p);
6338 if (!retval) 6559 if (!retval)
6339 retval = p->policy; 6560 retval = p->policy
6561 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6340 } 6562 }
6341 read_unlock(&tasklist_lock); 6563 read_unlock(&tasklist_lock);
6342 return retval; 6564 return retval;
6343} 6565}
6344 6566
6345/** 6567/**
6346 * sys_sched_getscheduler - get the RT priority of a thread 6568 * sys_sched_getparam - get the RT priority of a thread
6347 * @pid: the pid in question. 6569 * @pid: the pid in question.
6348 * @param: structure containing the RT priority. 6570 * @param: structure containing the RT priority.
6349 */ 6571 */
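Taken together, the hunks above let user space pass a SCHED_RESET_ON_FORK bit OR-ed into the policy: __sched_setscheduler strips it and remembers it, unprivileged callers may set but not clear it, and sched_getscheduler reports it back OR-ed into the returned policy. A hedged user-space sketch of how a caller would use this; the 0x40000000 fallback define mirrors the kernel header value of this period, and running it as shown needs RT privileges:

#include <stdio.h>
#include <sched.h>
#include <string.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK 0x40000000  /* kernel header value for this flag */
#endif

int main(void)
{
        struct sched_param sp;
        int policy;

        memset(&sp, 0, sizeof(sp));
        sp.sched_priority = 1;

        /* Ask for SCHED_FIFO, but have children fall back to SCHED_NORMAL.
         * Needs the usual RT privileges (e.g. CAP_SYS_NICE / root). */
        if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
                perror("sched_setscheduler");

        /* The flag is reported back OR-ed into the policy. */
        policy = sched_getscheduler(0);
        if (policy >= 0)
                printf("policy=%d reset_on_fork=%d\n",
                       policy & ~SCHED_RESET_ON_FORK,
                       !!(policy & SCHED_RESET_ON_FORK));
        return 0;
}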
@@ -6571,19 +6793,9 @@ static inline int should_resched(void)
6571 6793
6572static void __cond_resched(void) 6794static void __cond_resched(void)
6573{ 6795{
6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6796 add_preempt_count(PREEMPT_ACTIVE);
6575 __might_sleep(__FILE__, __LINE__); 6797 schedule();
6576#endif 6798 sub_preempt_count(PREEMPT_ACTIVE);
6577 /*
6578 * The BKS might be reacquired before we have dropped
6579 * PREEMPT_ACTIVE, which could trigger a second
6580 * cond_resched() call.
6581 */
6582 do {
6583 add_preempt_count(PREEMPT_ACTIVE);
6584 schedule();
6585 sub_preempt_count(PREEMPT_ACTIVE);
6586 } while (need_resched());
6587} 6799}
6588 6800
6589int __sched _cond_resched(void) 6801int __sched _cond_resched(void)
@@ -6597,14 +6809,14 @@ int __sched _cond_resched(void)
6597EXPORT_SYMBOL(_cond_resched); 6809EXPORT_SYMBOL(_cond_resched);
6598 6810
6599/* 6811/*
6600 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6812 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6601 * call schedule, and on return reacquire the lock. 6813 * call schedule, and on return reacquire the lock.
6602 * 6814 *
6603 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6815 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6604 * operations here to prevent schedule() from being called twice (once via 6816 * operations here to prevent schedule() from being called twice (once via
6605 * spin_unlock(), once by hand). 6817 * spin_unlock(), once by hand).
6606 */ 6818 */
6607int cond_resched_lock(spinlock_t *lock) 6819int __cond_resched_lock(spinlock_t *lock)
6608{ 6820{
6609 int resched = should_resched(); 6821 int resched = should_resched();
6610 int ret = 0; 6822 int ret = 0;
@@ -6622,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock)
6622 } 6834 }
6623 return ret; 6835 return ret;
6624} 6836}
6625EXPORT_SYMBOL(cond_resched_lock); 6837EXPORT_SYMBOL(__cond_resched_lock);
6626 6838
6627int __sched cond_resched_softirq(void) 6839int __sched __cond_resched_softirq(void)
6628{ 6840{
6629 BUG_ON(!in_softirq()); 6841 BUG_ON(!in_softirq());
6630 6842
@@ -6636,7 +6848,7 @@ int __sched cond_resched_softirq(void)
6636 } 6848 }
6637 return 0; 6849 return 0;
6638} 6850}
6639EXPORT_SYMBOL(cond_resched_softirq); 6851EXPORT_SYMBOL(__cond_resched_softirq);
6640 6852
6641/** 6853/**
6642 * yield - yield the current processor to other threads. 6854 * yield - yield the current processor to other threads.
@@ -6660,11 +6872,13 @@ EXPORT_SYMBOL(yield);
6660 */ 6872 */
6661void __sched io_schedule(void) 6873void __sched io_schedule(void)
6662{ 6874{
6663 struct rq *rq = &__raw_get_cpu_var(runqueues); 6875 struct rq *rq = raw_rq();
6664 6876
6665 delayacct_blkio_start(); 6877 delayacct_blkio_start();
6666 atomic_inc(&rq->nr_iowait); 6878 atomic_inc(&rq->nr_iowait);
6879 current->in_iowait = 1;
6667 schedule(); 6880 schedule();
6881 current->in_iowait = 0;
6668 atomic_dec(&rq->nr_iowait); 6882 atomic_dec(&rq->nr_iowait);
6669 delayacct_blkio_end(); 6883 delayacct_blkio_end();
6670} 6884}
@@ -6672,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule);
6672 6886
6673long __sched io_schedule_timeout(long timeout) 6887long __sched io_schedule_timeout(long timeout)
6674{ 6888{
6675 struct rq *rq = &__raw_get_cpu_var(runqueues); 6889 struct rq *rq = raw_rq();
6676 long ret; 6890 long ret;
6677 6891
6678 delayacct_blkio_start(); 6892 delayacct_blkio_start();
6679 atomic_inc(&rq->nr_iowait); 6893 atomic_inc(&rq->nr_iowait);
6894 current->in_iowait = 1;
6680 ret = schedule_timeout(timeout); 6895 ret = schedule_timeout(timeout);
6896 current->in_iowait = 0;
6681 atomic_dec(&rq->nr_iowait); 6897 atomic_dec(&rq->nr_iowait);
6682 delayacct_blkio_end(); 6898 delayacct_blkio_end();
6683 return ret; 6899 return ret;
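These two hunks bracket the actual sleep with current->in_iowait = 1/0 so the sleeper-accounting path (see the iowait_sum/iowait_count and trace_sched_stat_iowait additions in kernel/sched_fair.c below) can attribute the time to I/O wait rather than ordinary sleep. A toy user-space model of that flag-and-account split, with made-up nanosecond values:

#include <stdio.h>

struct task_like {
        int in_iowait;
        unsigned long long iowait_sum;   /* ns spent sleeping on I/O */
        unsigned long long sleep_sum;    /* ns spent in other sleeps */
};

/* The accounting side looks at the flag when the sleep ends, the way
 * enqueue_sleeper() does with tsk->in_iowait further down in this patch. */
static void account_sleep(struct task_like *t, unsigned long long delta_ns)
{
        if (t->in_iowait)
                t->iowait_sum += delta_ns;
        else
                t->sleep_sum += delta_ns;
}

/* The blocking side brackets the sleep, the way io_schedule() now does. */
static void io_block(struct task_like *t, unsigned long long delta_ns)
{
        t->in_iowait = 1;
        account_sleep(t, delta_ns);     /* stand-in for schedule() + wakeup */
        t->in_iowait = 0;
}

int main(void)
{
        struct task_like t = { 0, 0, 0 };

        io_block(&t, 2000000);          /* 2ms waiting for a disk */
        account_sleep(&t, 5000000);     /* 5ms ordinary sleep */
        printf("iowait=%lluns other=%lluns\n", t.iowait_sum, t.sleep_sum);
        return 0;
}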
@@ -6994,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6994 7210
6995 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7211 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6996 /* Need help from migration thread: drop lock and wait. */ 7212 /* Need help from migration thread: drop lock and wait. */
7213 struct task_struct *mt = rq->migration_thread;
7214
7215 get_task_struct(mt);
6997 task_rq_unlock(rq, &flags); 7216 task_rq_unlock(rq, &flags);
6998 wake_up_process(rq->migration_thread); 7217 wake_up_process(rq->migration_thread);
7218 put_task_struct(mt);
6999 wait_for_completion(&req.done); 7219 wait_for_completion(&req.done);
7000 tlb_migrate_finish(p->mm); 7220 tlb_migrate_finish(p->mm);
7001 return 0; 7221 return 0;
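The hunk above pins rq->migration_thread with get_task_struct() before the runqueue lock is dropped, so the kthread cannot be freed between task_rq_unlock() and wake_up_process(). A stripped-down user-space sketch of that take-a-reference-across-the-unlock pattern, where plain integers stand in for the kernel's atomic usage counts and locking:

#include <stdio.h>
#include <stdlib.h>

/* A refcounted object: whoever drops the count to zero frees it. */
struct obj {
        int refs;
        const char *name;
};

static struct obj *obj_get(struct obj *o) { o->refs++; return o; }

static void obj_put(struct obj *o)
{
        if (--o->refs == 0) {
                printf("freeing %s\n", o->name);
                free(o);
        }
}

int main(void)
{
        struct obj *thread = malloc(sizeof(*thread));

        if (!thread)
                return 1;
        thread->refs = 1;               /* owner's reference */
        thread->name = "migration/0";

        /* lock held: pin the object before letting go of the lock */
        struct obj *mt = obj_get(thread);
        /* unlocked: the owner could drop its reference at any time now */
        obj_put(thread);                /* simulates the owner going away */
        printf("still safe to poke %s\n", mt->name);
        obj_put(mt);                    /* our pin; this one frees it */
        return 0;
}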
@@ -7642,7 +7862,7 @@ static int __init migration_init(void)
7642 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7862 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7643 register_cpu_notifier(&migration_notifier); 7863 register_cpu_notifier(&migration_notifier);
7644 7864
7645 return err; 7865 return 0;
7646} 7866}
7647early_initcall(migration_init); 7867early_initcall(migration_init);
7648#endif 7868#endif
@@ -7689,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7689 break; 7909 break;
7690 } 7910 }
7691 7911
7692 if (!group->__cpu_power) { 7912 if (!group->cpu_power) {
7693 printk(KERN_CONT "\n"); 7913 printk(KERN_CONT "\n");
7694 printk(KERN_ERR "ERROR: domain->cpu_power not " 7914 printk(KERN_ERR "ERROR: domain->cpu_power not "
7695 "set\n"); 7915 "set\n");
@@ -7713,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7713 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7933 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7714 7934
7715 printk(KERN_CONT " %s", str); 7935 printk(KERN_CONT " %s", str);
7716 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7936 if (group->cpu_power != SCHED_LOAD_SCALE) {
7717 printk(KERN_CONT " (__cpu_power = %d)", 7937 printk(KERN_CONT " (cpu_power = %d)",
7718 group->__cpu_power); 7938 group->cpu_power);
7719 } 7939 }
7720 7940
7721 group = group->next; 7941 group = group->next;
@@ -7858,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7858 rq->rd = rd; 8078 rq->rd = rd;
7859 8079
7860 cpumask_set_cpu(rq->cpu, rd->span); 8080 cpumask_set_cpu(rq->cpu, rd->span);
7861 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 8081 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7862 set_rq_online(rq); 8082 set_rq_online(rq);
7863 8083
7864 spin_unlock_irqrestore(&rq->lock, flags); 8084 spin_unlock_irqrestore(&rq->lock, flags);
@@ -8000,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span,
8000 continue; 8220 continue;
8001 8221
8002 cpumask_clear(sched_group_cpus(sg)); 8222 cpumask_clear(sched_group_cpus(sg));
8003 sg->__cpu_power = 0; 8223 sg->cpu_power = 0;
8004 8224
8005 for_each_cpu(j, span) { 8225 for_each_cpu(j, span) {
8006 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8226 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8108,6 +8328,39 @@ struct static_sched_domain {
8108 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8328 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8109}; 8329};
8110 8330
8331struct s_data {
8332#ifdef CONFIG_NUMA
8333 int sd_allnodes;
8334 cpumask_var_t domainspan;
8335 cpumask_var_t covered;
8336 cpumask_var_t notcovered;
8337#endif
8338 cpumask_var_t nodemask;
8339 cpumask_var_t this_sibling_map;
8340 cpumask_var_t this_core_map;
8341 cpumask_var_t send_covered;
8342 cpumask_var_t tmpmask;
8343 struct sched_group **sched_group_nodes;
8344 struct root_domain *rd;
8345};
8346
8347enum s_alloc {
8348 sa_sched_groups = 0,
8349 sa_rootdomain,
8350 sa_tmpmask,
8351 sa_send_covered,
8352 sa_this_core_map,
8353 sa_this_sibling_map,
8354 sa_nodemask,
8355 sa_sched_group_nodes,
8356#ifdef CONFIG_NUMA
8357 sa_notcovered,
8358 sa_covered,
8359 sa_domainspan,
8360#endif
8361 sa_none,
8362};
8363
8111/* 8364/*
8112 * SMT sched-domains: 8365 * SMT sched-domains:
8113 */ 8366 */
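The struct s_data and enum s_alloc introduced here replace the long chain of goto-based error labels in __build_sched_domains (removed further down): allocation proceeds stage by stage, reports the last stage it reached, and teardown is a single switch that falls through from that stage. A minimal user-space sketch of the same idiom, with three dummy allocations standing in for the cpumasks and root domain:

#include <stdio.h>
#include <stdlib.h>

/* Stages in reverse order of allocation: freeing starts at the stage
 * that was reached and falls through to everything allocated earlier. */
enum stage { st_all, st_b, st_a, st_none };

struct data { void *a, *b, *c; };

static void free_stages(struct data *d, enum stage what)
{
        switch (what) {
        case st_all:
                free(d->c);     /* fall through */
        case st_b:
                free(d->b);     /* fall through */
        case st_a:
                free(d->a);     /* fall through */
        case st_none:
                break;
        }
}

/* Returns the last stage that succeeded, so the caller knows what to undo. */
static enum stage alloc_stages(struct data *d)
{
        if (!(d->a = malloc(16)))
                return st_none;
        if (!(d->b = malloc(16)))
                return st_a;
        if (!(d->c = malloc(16)))
                return st_b;
        return st_all;
}

int main(void)
{
        struct data d = { 0 };
        enum stage got = alloc_stages(&d);

        if (got != st_all)
                printf("partial allocation, unwinding\n");
        free_stages(&d, got);
        return 0;
}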
@@ -8225,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8225 continue; 8478 continue;
8226 } 8479 }
8227 8480
8228 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8481 sg->cpu_power += sd->groups->cpu_power;
8229 } 8482 }
8230 sg = sg->next; 8483 sg = sg->next;
8231 } while (sg != group_head); 8484 } while (sg != group_head);
8232} 8485}
8486
8487static int build_numa_sched_groups(struct s_data *d,
8488 const struct cpumask *cpu_map, int num)
8489{
8490 struct sched_domain *sd;
8491 struct sched_group *sg, *prev;
8492 int n, j;
8493
8494 cpumask_clear(d->covered);
8495 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8496 if (cpumask_empty(d->nodemask)) {
8497 d->sched_group_nodes[num] = NULL;
8498 goto out;
8499 }
8500
8501 sched_domain_node_span(num, d->domainspan);
8502 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8503
8504 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8505 GFP_KERNEL, num);
8506 if (!sg) {
8507 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8508 num);
8509 return -ENOMEM;
8510 }
8511 d->sched_group_nodes[num] = sg;
8512
8513 for_each_cpu(j, d->nodemask) {
8514 sd = &per_cpu(node_domains, j).sd;
8515 sd->groups = sg;
8516 }
8517
8518 sg->cpu_power = 0;
8519 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8520 sg->next = sg;
8521 cpumask_or(d->covered, d->covered, d->nodemask);
8522
8523 prev = sg;
8524 for (j = 0; j < nr_node_ids; j++) {
8525 n = (num + j) % nr_node_ids;
8526 cpumask_complement(d->notcovered, d->covered);
8527 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8528 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8529 if (cpumask_empty(d->tmpmask))
8530 break;
8531 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8532 if (cpumask_empty(d->tmpmask))
8533 continue;
8534 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8535 GFP_KERNEL, num);
8536 if (!sg) {
8537 printk(KERN_WARNING
8538 "Can not alloc domain group for node %d\n", j);
8539 return -ENOMEM;
8540 }
8541 sg->cpu_power = 0;
8542 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8543 sg->next = prev->next;
8544 cpumask_or(d->covered, d->covered, d->tmpmask);
8545 prev->next = sg;
8546 prev = sg;
8547 }
8548out:
8549 return 0;
8550}
8233#endif /* CONFIG_NUMA */ 8551#endif /* CONFIG_NUMA */
8234 8552
8235#ifdef CONFIG_NUMA 8553#ifdef CONFIG_NUMA
@@ -8283,15 +8601,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8283 * there are asymmetries in the topology. If there are asymmetries, group 8601 * there are asymmetries in the topology. If there are asymmetries, group
8284 * having more cpu_power will pickup more load compared to the group having 8602 * having more cpu_power will pickup more load compared to the group having
8285 * less cpu_power. 8603 * less cpu_power.
8286 *
8287 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8288 * the maximum number of tasks a group can handle in the presence of other idle
8289 * or lightly loaded groups in the same sched domain.
8290 */ 8604 */
8291static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8605static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8292{ 8606{
8293 struct sched_domain *child; 8607 struct sched_domain *child;
8294 struct sched_group *group; 8608 struct sched_group *group;
8609 long power;
8610 int weight;
8295 8611
8296 WARN_ON(!sd || !sd->groups); 8612 WARN_ON(!sd || !sd->groups);
8297 8613
@@ -8300,28 +8616,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8300 8616
8301 child = sd->child; 8617 child = sd->child;
8302 8618
8303 sd->groups->__cpu_power = 0; 8619 sd->groups->cpu_power = 0;
8304 8620
8305 /* 8621 if (!child) {
8306 * For perf policy, if the groups in child domain share resources 8622 power = SCHED_LOAD_SCALE;
8307 * (for example cores sharing some portions of the cache hierarchy 8623 weight = cpumask_weight(sched_domain_span(sd));
8308 * or SMT), then set this domain groups cpu_power such that each group 8624 /*
8309 * can handle only one task, when there are other idle groups in the 8625 * SMT siblings share the power of a single core.
8310 * same sched domain. 8626 * Usually multiple threads get a better yield out of
8311 */ 8627 * that one core than a single thread would have,
8312 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8628 * reflect that in sd->smt_gain.
8313 (child->flags & 8629 */
8314 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8630 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8315 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8631 power *= sd->smt_gain;
8632 power /= weight;
8633 power >>= SCHED_LOAD_SHIFT;
8634 }
8635 sd->groups->cpu_power += power;
8316 return; 8636 return;
8317 } 8637 }
8318 8638
8319 /* 8639 /*
8320 * add cpu_power of each child group to this groups cpu_power 8640 * Add cpu_power of each child group to this groups cpu_power.
8321 */ 8641 */
8322 group = child->groups; 8642 group = child->groups;
8323 do { 8643 do {
8324 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8644 sd->groups->cpu_power += group->cpu_power;
8325 group = group->next; 8645 group = group->next;
8326 } while (group != child->groups); 8646 } while (group != child->groups);
8327} 8647}
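The rewritten leaf case above gives each SMT sibling a fraction of one core's power instead of a full SCHED_LOAD_SCALE: the scaled power is multiplied by sd->smt_gain and divided by the number of threads. A worked user-space version of the arithmetic, assuming the roughly 15% gain value (1178) the sibling domain used around this time:

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/* Per-thread power of an SMT domain, as computed in the hunk above:
 * the whole core is worth smt_gain (scaled), split across its threads. */
static unsigned long smt_thread_power(unsigned long smt_gain, int weight)
{
        unsigned long power = SCHED_LOAD_SCALE;

        power *= smt_gain;
        power /= weight;
        power >>= SCHED_LOAD_SHIFT;
        return power;
}

int main(void)
{
        /* 1178 ~= 1.15 * SCHED_LOAD_SCALE: two threads of one core are
         * assumed to be worth about 15% more than a single thread. */
        unsigned long per_thread = smt_thread_power(1178, 2);

        printf("per-thread power: %lu, core total: %lu (scale %lu)\n",
               per_thread, 2 * per_thread, SCHED_LOAD_SCALE);
        return 0;
}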
@@ -8395,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd,
8395 } 8715 }
8396} 8716}
8397 8717
8398/* 8718static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8399 * Build sched domains for a given set of cpus and attach the sched domains 8719 const struct cpumask *cpu_map)
8400 * to the individual cpus 8720{
8401 */ 8721 switch (what) {
8402static int __build_sched_domains(const struct cpumask *cpu_map, 8722 case sa_sched_groups:
8403 struct sched_domain_attr *attr) 8723 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8404{ 8724 d->sched_group_nodes = NULL;
8405 int i, err = -ENOMEM; 8725 case sa_rootdomain:
8406 struct root_domain *rd; 8726 free_rootdomain(d->rd); /* fall through */
8407 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, 8727 case sa_tmpmask:
8408 tmpmask; 8728 free_cpumask_var(d->tmpmask); /* fall through */
8729 case sa_send_covered:
8730 free_cpumask_var(d->send_covered); /* fall through */
8731 case sa_this_core_map:
8732 free_cpumask_var(d->this_core_map); /* fall through */
8733 case sa_this_sibling_map:
8734 free_cpumask_var(d->this_sibling_map); /* fall through */
8735 case sa_nodemask:
8736 free_cpumask_var(d->nodemask); /* fall through */
8737 case sa_sched_group_nodes:
8409#ifdef CONFIG_NUMA 8738#ifdef CONFIG_NUMA
8410 cpumask_var_t domainspan, covered, notcovered; 8739 kfree(d->sched_group_nodes); /* fall through */
8411 struct sched_group **sched_group_nodes = NULL; 8740 case sa_notcovered:
8412 int sd_allnodes = 0; 8741 free_cpumask_var(d->notcovered); /* fall through */
8413 8742 case sa_covered:
8414 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) 8743 free_cpumask_var(d->covered); /* fall through */
8415 goto out; 8744 case sa_domainspan:
8416 if (!alloc_cpumask_var(&covered, GFP_KERNEL)) 8745 free_cpumask_var(d->domainspan); /* fall through */
8417 goto free_domainspan; 8746#endif
8418 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) 8747 case sa_none:
8419 goto free_covered; 8748 break;
8420#endif 8749 }
8421 8750}
8422 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8423 goto free_notcovered;
8424 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8425 goto free_nodemask;
8426 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8427 goto free_this_sibling_map;
8428 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8429 goto free_this_core_map;
8430 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8431 goto free_send_covered;
8432 8751
8752static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8753 const struct cpumask *cpu_map)
8754{
8433#ifdef CONFIG_NUMA 8755#ifdef CONFIG_NUMA
8434 /* 8756 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8435 * Allocate the per-node list of sched groups 8757 return sa_none;
8436 */ 8758 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8437 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), 8759 return sa_domainspan;
8438 GFP_KERNEL); 8760 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8439 if (!sched_group_nodes) { 8761 return sa_covered;
8762 /* Allocate the per-node list of sched groups */
8763 d->sched_group_nodes = kcalloc(nr_node_ids,
8764 sizeof(struct sched_group *), GFP_KERNEL);
8765 if (!d->sched_group_nodes) {
8440 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8766 printk(KERN_WARNING "Can not alloc sched group node list\n");
8441 goto free_tmpmask; 8767 return sa_notcovered;
8442 } 8768 }
8443#endif 8769 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8444 8770#endif
8445 rd = alloc_rootdomain(); 8771 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8446 if (!rd) { 8772 return sa_sched_group_nodes;
8773 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8774 return sa_nodemask;
8775 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8776 return sa_this_sibling_map;
8777 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8778 return sa_this_core_map;
8779 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8780 return sa_send_covered;
8781 d->rd = alloc_rootdomain();
8782 if (!d->rd) {
8447 printk(KERN_WARNING "Cannot alloc root domain\n"); 8783 printk(KERN_WARNING "Cannot alloc root domain\n");
8448 goto free_sched_groups; 8784 return sa_tmpmask;
8449 } 8785 }
8786 return sa_rootdomain;
8787}
8450 8788
8789static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8790 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8791{
8792 struct sched_domain *sd = NULL;
8451#ifdef CONFIG_NUMA 8793#ifdef CONFIG_NUMA
8452 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8794 struct sched_domain *parent;
8453#endif
8454
8455 /*
8456 * Set up domains for cpus specified by the cpu_map.
8457 */
8458 for_each_cpu(i, cpu_map) {
8459 struct sched_domain *sd = NULL, *p;
8460 8795
8461 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); 8796 d->sd_allnodes = 0;
8462 8797 if (cpumask_weight(cpu_map) >
8463#ifdef CONFIG_NUMA 8798 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8464 if (cpumask_weight(cpu_map) > 8799 sd = &per_cpu(allnodes_domains, i).sd;
8465 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { 8800 SD_INIT(sd, ALLNODES);
8466 sd = &per_cpu(allnodes_domains, i).sd;
8467 SD_INIT(sd, ALLNODES);
8468 set_domain_attribute(sd, attr);
8469 cpumask_copy(sched_domain_span(sd), cpu_map);
8470 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8471 p = sd;
8472 sd_allnodes = 1;
8473 } else
8474 p = NULL;
8475
8476 sd = &per_cpu(node_domains, i).sd;
8477 SD_INIT(sd, NODE);
8478 set_domain_attribute(sd, attr); 8801 set_domain_attribute(sd, attr);
8479 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8802 cpumask_copy(sched_domain_span(sd), cpu_map);
8480 sd->parent = p; 8803 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8481 if (p) 8804 d->sd_allnodes = 1;
8482 p->child = sd; 8805 }
8483 cpumask_and(sched_domain_span(sd), 8806 parent = sd;
8484 sched_domain_span(sd), cpu_map); 8807
8808 sd = &per_cpu(node_domains, i).sd;
8809 SD_INIT(sd, NODE);
8810 set_domain_attribute(sd, attr);
8811 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8812 sd->parent = parent;
8813 if (parent)
8814 parent->child = sd;
8815 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8485#endif 8816#endif
8817 return sd;
8818}
8486 8819
8487 p = sd; 8820static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8488 sd = &per_cpu(phys_domains, i).sd; 8821 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8489 SD_INIT(sd, CPU); 8822 struct sched_domain *parent, int i)
8490 set_domain_attribute(sd, attr); 8823{
8491 cpumask_copy(sched_domain_span(sd), nodemask); 8824 struct sched_domain *sd;
8492 sd->parent = p; 8825 sd = &per_cpu(phys_domains, i).sd;
8493 if (p) 8826 SD_INIT(sd, CPU);
8494 p->child = sd; 8827 set_domain_attribute(sd, attr);
8495 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8828 cpumask_copy(sched_domain_span(sd), d->nodemask);
8829 sd->parent = parent;
8830 if (parent)
8831 parent->child = sd;
8832 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8833 return sd;
8834}
8496 8835
8836static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8837 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8838 struct sched_domain *parent, int i)
8839{
8840 struct sched_domain *sd = parent;
8497#ifdef CONFIG_SCHED_MC 8841#ifdef CONFIG_SCHED_MC
8498 p = sd; 8842 sd = &per_cpu(core_domains, i).sd;
8499 sd = &per_cpu(core_domains, i).sd; 8843 SD_INIT(sd, MC);
8500 SD_INIT(sd, MC); 8844 set_domain_attribute(sd, attr);
8501 set_domain_attribute(sd, attr); 8845 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8502 cpumask_and(sched_domain_span(sd), cpu_map, 8846 sd->parent = parent;
8503 cpu_coregroup_mask(i)); 8847 parent->child = sd;
8504 sd->parent = p; 8848 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8505 p->child = sd;
8506 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8507#endif 8849#endif
8850 return sd;
8851}
8508 8852
8853static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8854 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8855 struct sched_domain *parent, int i)
8856{
8857 struct sched_domain *sd = parent;
8509#ifdef CONFIG_SCHED_SMT 8858#ifdef CONFIG_SCHED_SMT
8510 p = sd; 8859 sd = &per_cpu(cpu_domains, i).sd;
8511 sd = &per_cpu(cpu_domains, i).sd; 8860 SD_INIT(sd, SIBLING);
8512 SD_INIT(sd, SIBLING); 8861 set_domain_attribute(sd, attr);
8513 set_domain_attribute(sd, attr); 8862 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8514 cpumask_and(sched_domain_span(sd), 8863 sd->parent = parent;
8515 topology_thread_cpumask(i), cpu_map); 8864 parent->child = sd;
8516 sd->parent = p; 8865 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8517 p->child = sd;
8518 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8519#endif 8866#endif
8520 } 8867 return sd;
8868}
8521 8869
8870static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8871 const struct cpumask *cpu_map, int cpu)
8872{
8873 switch (l) {
8522#ifdef CONFIG_SCHED_SMT 8874#ifdef CONFIG_SCHED_SMT
8523 /* Set up CPU (sibling) groups */ 8875 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8524 for_each_cpu(i, cpu_map) { 8876 cpumask_and(d->this_sibling_map, cpu_map,
8525 cpumask_and(this_sibling_map, 8877 topology_thread_cpumask(cpu));
8526 topology_thread_cpumask(i), cpu_map); 8878 if (cpu == cpumask_first(d->this_sibling_map))
8527 if (i != cpumask_first(this_sibling_map)) 8879 init_sched_build_groups(d->this_sibling_map, cpu_map,
8528 continue; 8880 &cpu_to_cpu_group,
8529 8881 d->send_covered, d->tmpmask);
8530 init_sched_build_groups(this_sibling_map, cpu_map, 8882 break;
8531 &cpu_to_cpu_group,
8532 send_covered, tmpmask);
8533 }
8534#endif 8883#endif
8535
8536#ifdef CONFIG_SCHED_MC 8884#ifdef CONFIG_SCHED_MC
8537 /* Set up multi-core groups */ 8885 case SD_LV_MC: /* set up multi-core groups */
8538 for_each_cpu(i, cpu_map) { 8886 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8539 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8887 if (cpu == cpumask_first(d->this_core_map))
8540 if (i != cpumask_first(this_core_map)) 8888 init_sched_build_groups(d->this_core_map, cpu_map,
8541 continue; 8889 &cpu_to_core_group,
8542 8890 d->send_covered, d->tmpmask);
8543 init_sched_build_groups(this_core_map, cpu_map, 8891 break;
8544 &cpu_to_core_group,
8545 send_covered, tmpmask);
8546 }
8547#endif 8892#endif
8548 8893 case SD_LV_CPU: /* set up physical groups */
8549 /* Set up physical groups */ 8894 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8550 for (i = 0; i < nr_node_ids; i++) { 8895 if (!cpumask_empty(d->nodemask))
8551 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8896 init_sched_build_groups(d->nodemask, cpu_map,
8552 if (cpumask_empty(nodemask)) 8897 &cpu_to_phys_group,
8553 continue; 8898 d->send_covered, d->tmpmask);
8554 8899 break;
8555 init_sched_build_groups(nodemask, cpu_map,
8556 &cpu_to_phys_group,
8557 send_covered, tmpmask);
8558 }
8559
8560#ifdef CONFIG_NUMA 8900#ifdef CONFIG_NUMA
8561 /* Set up node groups */ 8901 case SD_LV_ALLNODES:
8562 if (sd_allnodes) { 8902 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8563 init_sched_build_groups(cpu_map, cpu_map, 8903 d->send_covered, d->tmpmask);
8564 &cpu_to_allnodes_group, 8904 break;
8565 send_covered, tmpmask); 8905#endif
8906 default:
8907 break;
8566 } 8908 }
8909}
8567 8910
8568 for (i = 0; i < nr_node_ids; i++) { 8911/*
8569 /* Set up node groups */ 8912 * Build sched domains for a given set of cpus and attach the sched domains
8570 struct sched_group *sg, *prev; 8913 * to the individual cpus
8571 int j; 8914 */
8572 8915static int __build_sched_domains(const struct cpumask *cpu_map,
8573 cpumask_clear(covered); 8916 struct sched_domain_attr *attr)
8574 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8917{
8575 if (cpumask_empty(nodemask)) { 8918 enum s_alloc alloc_state = sa_none;
8576 sched_group_nodes[i] = NULL; 8919 struct s_data d;
8577 continue; 8920 struct sched_domain *sd;
8578 } 8921 int i;
8922#ifdef CONFIG_NUMA
8923 d.sd_allnodes = 0;
8924#endif
8579 8925
8580 sched_domain_node_span(i, domainspan); 8926 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8581 cpumask_and(domainspan, domainspan, cpu_map); 8927 if (alloc_state != sa_rootdomain)
8928 goto error;
8929 alloc_state = sa_sched_groups;
8582 8930
8583 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8931 /*
8584 GFP_KERNEL, i); 8932 * Set up domains for cpus specified by the cpu_map.
8585 if (!sg) { 8933 */
8586 printk(KERN_WARNING "Can not alloc domain group for " 8934 for_each_cpu(i, cpu_map) {
8587 "node %d\n", i); 8935 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8588 goto error; 8936 cpu_map);
8589 }
8590 sched_group_nodes[i] = sg;
8591 for_each_cpu(j, nodemask) {
8592 struct sched_domain *sd;
8593 8937
8594 sd = &per_cpu(node_domains, j).sd; 8938 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8595 sd->groups = sg; 8939 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8596 } 8940 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8597 sg->__cpu_power = 0; 8941 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8598 cpumask_copy(sched_group_cpus(sg), nodemask); 8942 }
8599 sg->next = sg;
8600 cpumask_or(covered, covered, nodemask);
8601 prev = sg;
8602 8943
8603 for (j = 0; j < nr_node_ids; j++) { 8944 for_each_cpu(i, cpu_map) {
8604 int n = (i + j) % nr_node_ids; 8945 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8946 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8947 }
8605 8948
8606 cpumask_complement(notcovered, covered); 8949 /* Set up physical groups */
8607 cpumask_and(tmpmask, notcovered, cpu_map); 8950 for (i = 0; i < nr_node_ids; i++)
8608 cpumask_and(tmpmask, tmpmask, domainspan); 8951 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8609 if (cpumask_empty(tmpmask))
8610 break;
8611 8952
8612 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8953#ifdef CONFIG_NUMA
8613 if (cpumask_empty(tmpmask)) 8954 /* Set up node groups */
8614 continue; 8955 if (d.sd_allnodes)
8956 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8615 8957
8616 sg = kmalloc_node(sizeof(struct sched_group) + 8958 for (i = 0; i < nr_node_ids; i++)
8617 cpumask_size(), 8959 if (build_numa_sched_groups(&d, cpu_map, i))
8618 GFP_KERNEL, i); 8960 goto error;
8619 if (!sg) {
8620 printk(KERN_WARNING
8621 "Can not alloc domain group for node %d\n", j);
8622 goto error;
8623 }
8624 sg->__cpu_power = 0;
8625 cpumask_copy(sched_group_cpus(sg), tmpmask);
8626 sg->next = prev->next;
8627 cpumask_or(covered, covered, tmpmask);
8628 prev->next = sg;
8629 prev = sg;
8630 }
8631 }
8632#endif 8961#endif
8633 8962
8634 /* Calculate CPU power for physical packages and nodes */ 8963 /* Calculate CPU power for physical packages and nodes */
8635#ifdef CONFIG_SCHED_SMT 8964#ifdef CONFIG_SCHED_SMT
8636 for_each_cpu(i, cpu_map) { 8965 for_each_cpu(i, cpu_map) {
8637 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8966 sd = &per_cpu(cpu_domains, i).sd;
8638
8639 init_sched_groups_power(i, sd); 8967 init_sched_groups_power(i, sd);
8640 } 8968 }
8641#endif 8969#endif
8642#ifdef CONFIG_SCHED_MC 8970#ifdef CONFIG_SCHED_MC
8643 for_each_cpu(i, cpu_map) { 8971 for_each_cpu(i, cpu_map) {
8644 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8972 sd = &per_cpu(core_domains, i).sd;
8645
8646 init_sched_groups_power(i, sd); 8973 init_sched_groups_power(i, sd);
8647 } 8974 }
8648#endif 8975#endif
8649 8976
8650 for_each_cpu(i, cpu_map) { 8977 for_each_cpu(i, cpu_map) {
8651 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8978 sd = &per_cpu(phys_domains, i).sd;
8652
8653 init_sched_groups_power(i, sd); 8979 init_sched_groups_power(i, sd);
8654 } 8980 }
8655 8981
8656#ifdef CONFIG_NUMA 8982#ifdef CONFIG_NUMA
8657 for (i = 0; i < nr_node_ids; i++) 8983 for (i = 0; i < nr_node_ids; i++)
8658 init_numa_sched_groups_power(sched_group_nodes[i]); 8984 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8659 8985
8660 if (sd_allnodes) { 8986 if (d.sd_allnodes) {
8661 struct sched_group *sg; 8987 struct sched_group *sg;
8662 8988
8663 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8989 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8664 tmpmask); 8990 d.tmpmask);
8665 init_numa_sched_groups_power(sg); 8991 init_numa_sched_groups_power(sg);
8666 } 8992 }
8667#endif 8993#endif
8668 8994
8669 /* Attach the domains */ 8995 /* Attach the domains */
8670 for_each_cpu(i, cpu_map) { 8996 for_each_cpu(i, cpu_map) {
8671 struct sched_domain *sd;
8672#ifdef CONFIG_SCHED_SMT 8997#ifdef CONFIG_SCHED_SMT
8673 sd = &per_cpu(cpu_domains, i).sd; 8998 sd = &per_cpu(cpu_domains, i).sd;
8674#elif defined(CONFIG_SCHED_MC) 8999#elif defined(CONFIG_SCHED_MC)
@@ -8676,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8676#else 9001#else
8677 sd = &per_cpu(phys_domains, i).sd; 9002 sd = &per_cpu(phys_domains, i).sd;
8678#endif 9003#endif
8679 cpu_attach_domain(sd, rd, i); 9004 cpu_attach_domain(sd, d.rd, i);
8680 } 9005 }
8681 9006
8682 err = 0; 9007 d.sched_group_nodes = NULL; /* don't free this we still need it */
8683 9008 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8684free_tmpmask: 9009 return 0;
8685 free_cpumask_var(tmpmask);
8686free_send_covered:
8687 free_cpumask_var(send_covered);
8688free_this_core_map:
8689 free_cpumask_var(this_core_map);
8690free_this_sibling_map:
8691 free_cpumask_var(this_sibling_map);
8692free_nodemask:
8693 free_cpumask_var(nodemask);
8694free_notcovered:
8695#ifdef CONFIG_NUMA
8696 free_cpumask_var(notcovered);
8697free_covered:
8698 free_cpumask_var(covered);
8699free_domainspan:
8700 free_cpumask_var(domainspan);
8701out:
8702#endif
8703 return err;
8704
8705free_sched_groups:
8706#ifdef CONFIG_NUMA
8707 kfree(sched_group_nodes);
8708#endif
8709 goto free_tmpmask;
8710 9010
8711#ifdef CONFIG_NUMA
8712error: 9011error:
8713 free_sched_groups(cpu_map, tmpmask); 9012 __free_domain_allocs(&d, alloc_state, cpu_map);
8714 free_rootdomain(rd); 9013 return -ENOMEM;
8715 goto free_tmpmask;
8716#endif
8717} 9014}
8718 9015
8719static int build_sched_domains(const struct cpumask *cpu_map) 9016static int build_sched_domains(const struct cpumask *cpu_map)
@@ -9321,11 +9618,11 @@ void __init sched_init(void)
9321 * system cpu resource, based on the weight assigned to root 9618 * system cpu resource, based on the weight assigned to root
9322 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9619 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9323 * by letting tasks of init_task_group sit in a separate cfs_rq 9620 * by letting tasks of init_task_group sit in a separate cfs_rq
9324 * (init_cfs_rq) and having one entity represent this group of 9621 * (init_tg_cfs_rq) and having one entity represent this group of
9325 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9622 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9326 */ 9623 */
9327 init_tg_cfs_entry(&init_task_group, 9624 init_tg_cfs_entry(&init_task_group,
9328 &per_cpu(init_cfs_rq, i), 9625 &per_cpu(init_tg_cfs_rq, i),
9329 &per_cpu(init_sched_entity, i), i, 1, 9626 &per_cpu(init_sched_entity, i), i, 1,
9330 root_task_group.se[i]); 9627 root_task_group.se[i]);
9331 9628
@@ -9351,6 +9648,7 @@ void __init sched_init(void)
9351#ifdef CONFIG_SMP 9648#ifdef CONFIG_SMP
9352 rq->sd = NULL; 9649 rq->sd = NULL;
9353 rq->rd = NULL; 9650 rq->rd = NULL;
9651 rq->post_schedule = 0;
9354 rq->active_balance = 0; 9652 rq->active_balance = 0;
9355 rq->next_balance = jiffies; 9653 rq->next_balance = jiffies;
9356 rq->push_cpu = 0; 9654 rq->push_cpu = 0;
@@ -9415,13 +9713,20 @@ void __init sched_init(void)
9415} 9713}
9416 9714
9417#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9715#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9418void __might_sleep(char *file, int line) 9716static inline int preempt_count_equals(int preempt_offset)
9717{
9718 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9719
9720 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9721}
9722
9723void __might_sleep(char *file, int line, int preempt_offset)
9419{ 9724{
9420#ifdef in_atomic 9725#ifdef in_atomic
9421 static unsigned long prev_jiffy; /* ratelimiting */ 9726 static unsigned long prev_jiffy; /* ratelimiting */
9422 9727
9423 if ((!in_atomic() && !irqs_disabled()) || 9728 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9424 system_state != SYSTEM_RUNNING || oops_in_progress) 9729 system_state != SYSTEM_RUNNING || oops_in_progress)
9425 return; 9730 return;
9426 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9731 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9427 return; 9732 return;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index d014efbf947a..0f052fc674d5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
127 127
128 /* 128 /*
129 * If the cpu was currently mapped to a different value, we 129 * If the cpu was currently mapped to a different value, we
130 * first need to unmap the old value 130 * need to map it to the new value then remove the old value.
131 * Note, we must add the new value first, otherwise we risk the
132 * cpu being cleared from pri_active, and this cpu could be
133 * missed for a push or pull.
131 */ 134 */
132 if (likely(oldpri != CPUPRI_INVALID)) {
133 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
134
135 spin_lock_irqsave(&vec->lock, flags);
136
137 vec->count--;
138 if (!vec->count)
139 clear_bit(oldpri, cp->pri_active);
140 cpumask_clear_cpu(cpu, vec->mask);
141
142 spin_unlock_irqrestore(&vec->lock, flags);
143 }
144
145 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
146 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
147 137
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
154 144
155 spin_unlock_irqrestore(&vec->lock, flags); 145 spin_unlock_irqrestore(&vec->lock, flags);
156 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149
150 spin_lock_irqsave(&vec->lock, flags);
151
152 vec->count--;
153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask);
156
157 spin_unlock_irqrestore(&vec->lock, flags);
158 }
157 159
158 *currpri = newpri; 160 *currpri = newpri;
159} 161}
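The cpupri_set() change swaps the order of the two updates: the cpu is inserted into the vector for its new priority before it is removed from the old one, so a concurrent push/pull scan can never observe it in neither vector. A toy single-threaded sketch of that invariant; the real code does this under per-vector spinlocks and the pri_active bits:

#include <stdio.h>

/* Two priority "vectors", one bit per cpu (compare cp->pri_to_cpu[]). */
static unsigned long vec[2];

static int cpu_visible(int cpu)
{
        return !!((vec[0] | vec[1]) & (1UL << cpu));
}

static void move_cpu(int cpu, int oldpri, int newpri)
{
        /* Patched order: publish the new mapping first... */
        vec[newpri] |= 1UL << cpu;
        /* A concurrent scan running here still finds the cpu somewhere. */
        printf("mid-update, cpu visible: %d\n", cpu_visible(cpu));
        /* ...then retire the old one. */
        vec[oldpri] &= ~(1UL << cpu);
}

int main(void)
{
        int cpu = 3;

        vec[0] |= 1UL << cpu;           /* cpu currently mapped at prio 0 */
        move_cpu(cpu, 0, 1);
        printf("after update, cpu visible: %d\n", cpu_visible(cpu));
        return 0;
}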
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b79946..5ddbd0891267 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.wait_max); 409 PN(se.wait_max);
410 PN(se.wait_sum); 410 PN(se.wait_sum);
411 P(se.wait_count); 411 P(se.wait_count);
412 PN(se.iowait_sum);
413 P(se.iowait_count);
412 P(sched_info.bkl_count); 414 P(sched_info.bkl_count);
413 P(se.nr_migrations); 415 P(se.nr_migrations);
414 P(se.nr_migrations_cold); 416 P(se.nr_migrations_cold);
@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
479 p->se.wait_max = 0; 481 p->se.wait_max = 0;
480 p->se.wait_sum = 0; 482 p->se.wait_sum = 0;
481 p->se.wait_count = 0; 483 p->se.wait_count = 0;
484 p->se.iowait_sum = 0;
485 p->se.iowait_count = 0;
482 p->se.sleep_max = 0; 486 p->se.sleep_max = 0;
483 p->se.sum_sleep_runtime = 0; 487 p->se.sum_sleep_runtime = 0;
484 p->se.block_max = 0; 488 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 652e8bdef9aa..aa7f84121016 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
24 24
25/* 25/*
26 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
28 * 28 *
29 * NOTE: this latency value is not the same as the concept of 29 * NOTE: this latency value is not the same as the concept of
30 * 'timeslice length' - timeslices in CFS are of variable length 30 * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
34 * (to see the precise effective timeslice length of your workload, 34 * (to see the precise effective timeslice length of your workload,
35 * run vmstat and monitor the context-switches (cs) field) 35 * run vmstat and monitor the context-switches (cs) field)
36 */ 36 */
37unsigned int sysctl_sched_latency = 20000000ULL; 37unsigned int sysctl_sched_latency = 5000000ULL;
38 38
39/* 39/*
40 * Minimal preemption granularity for CPU-bound tasks: 40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) 41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 42 */
43unsigned int sysctl_sched_min_granularity = 4000000ULL; 43unsigned int sysctl_sched_min_granularity = 1000000ULL;
44 44
45/* 45/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
48static unsigned int sched_nr_latency = 5; 48static unsigned int sched_nr_latency = 5;
49 49
50/* 50/*
51 * After fork, child runs first. (default) If set to 0 then 51 * After fork, child runs first. If set to 0 (default) then
52 * parent will (try to) run first. 52 * parent will (try to) run first.
53 */ 53 */
54const_debug unsigned int sysctl_sched_child_runs_first = 1; 54unsigned int sysctl_sched_child_runs_first __read_mostly;
55 55
56/* 56/*
57 * sys_sched_yield() compat mode 57 * sys_sched_yield() compat mode
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
79 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
80 */ 80 */
81 81
82static inline struct task_struct *task_of(struct sched_entity *se)
83{
84 return container_of(se, struct task_struct, se);
85}
86
87#ifdef CONFIG_FAIR_GROUP_SCHED 82#ifdef CONFIG_FAIR_GROUP_SCHED
88 83
89/* cpu runqueue to which this cfs_rq is attached */ 84/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
95/* An entity is a task if it doesn't "own" a runqueue */ 90/* An entity is a task if it doesn't "own" a runqueue */
96#define entity_is_task(se) (!se->my_q) 91#define entity_is_task(se) (!se->my_q)
97 92
93static inline struct task_struct *task_of(struct sched_entity *se)
94{
95#ifdef CONFIG_SCHED_DEBUG
96 WARN_ON_ONCE(!entity_is_task(se));
97#endif
98 return container_of(se, struct task_struct, se);
99}
100
98/* Walk up scheduling entities hierarchy */ 101/* Walk up scheduling entities hierarchy */
99#define for_each_sched_entity(se) \ 102#define for_each_sched_entity(se) \
100 for (; se; se = se->parent) 103 for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
186 } 189 }
187} 190}
188 191
189#else /* CONFIG_FAIR_GROUP_SCHED */ 192#else /* !CONFIG_FAIR_GROUP_SCHED */
193
194static inline struct task_struct *task_of(struct sched_entity *se)
195{
196 return container_of(se, struct task_struct, se);
197}
190 198
191static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 199static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
192{ 200{
@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
537 schedstat_set(se->wait_count, se->wait_count + 1); 545 schedstat_set(se->wait_count, se->wait_count + 1);
538 schedstat_set(se->wait_sum, se->wait_sum + 546 schedstat_set(se->wait_sum, se->wait_sum +
539 rq_of(cfs_rq)->clock - se->wait_start); 547 rq_of(cfs_rq)->clock - se->wait_start);
548#ifdef CONFIG_SCHEDSTATS
549 if (entity_is_task(se)) {
550 trace_sched_stat_wait(task_of(se),
551 rq_of(cfs_rq)->clock - se->wait_start);
552 }
553#endif
540 schedstat_set(se->wait_start, 0); 554 schedstat_set(se->wait_start, 0);
541} 555}
542 556
@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
628 se->sleep_start = 0; 642 se->sleep_start = 0;
629 se->sum_sleep_runtime += delta; 643 se->sum_sleep_runtime += delta;
630 644
631 if (tsk) 645 if (tsk) {
632 account_scheduler_latency(tsk, delta >> 10, 1); 646 account_scheduler_latency(tsk, delta >> 10, 1);
647 trace_sched_stat_sleep(tsk, delta);
648 }
633 } 649 }
634 if (se->block_start) { 650 if (se->block_start) {
635 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 651 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
644 se->sum_sleep_runtime += delta; 660 se->sum_sleep_runtime += delta;
645 661
646 if (tsk) { 662 if (tsk) {
663 if (tsk->in_iowait) {
664 se->iowait_sum += delta;
665 se->iowait_count++;
666 trace_sched_stat_iowait(tsk, delta);
667 }
668
647 /* 669 /*
648 * Blocking time is in units of nanosecs, so shift by 670 * Blocking time is in units of nanosecs, so shift by
649 * 20 to get a milliseconds-range estimation of the 671 * 20 to get a milliseconds-range estimation of the
@@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
705 727
706 vruntime -= thresh; 728 vruntime -= thresh;
707 } 729 }
708
709 /* ensure we never gain time by being placed backwards. */
710 vruntime = max_vruntime(se->vruntime, vruntime);
711 } 730 }
712 731
732 /* ensure we never gain time by being placed backwards. */
733 vruntime = max_vruntime(se->vruntime, vruntime);
734
713 se->vruntime = vruntime; 735 se->vruntime = vruntime;
714} 736}
715 737
@@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq)
1046 * search starts with cpus closest then further out as needed, 1068 * search starts with cpus closest then further out as needed,
1047 * so we always favor a closer, idle cpu. 1069 * so we always favor a closer, idle cpu.
1048 * Domains may include CPUs that are not usable for migration, 1070 * Domains may include CPUs that are not usable for migration,
1049 * hence we need to mask them out (cpu_active_mask) 1071 * hence we need to mask them out (rq->rd->online)
1050 * 1072 *
1051 * Returns the CPU we should wake onto. 1073 * Returns the CPU we should wake onto.
1052 */ 1074 */
1053#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1075#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1076
1077#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
1078
1054static int wake_idle(int cpu, struct task_struct *p) 1079static int wake_idle(int cpu, struct task_struct *p)
1055{ 1080{
1056 struct sched_domain *sd; 1081 struct sched_domain *sd;
1057 int i; 1082 int i;
1058 unsigned int chosen_wakeup_cpu; 1083 unsigned int chosen_wakeup_cpu;
1059 int this_cpu; 1084 int this_cpu;
1085 struct rq *task_rq = task_rq(p);
1060 1086
1061 /* 1087 /*
1062 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu 1088 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
@@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p)
1089 for_each_domain(cpu, sd) { 1115 for_each_domain(cpu, sd) {
1090 if ((sd->flags & SD_WAKE_IDLE) 1116 if ((sd->flags & SD_WAKE_IDLE)
1091 || ((sd->flags & SD_WAKE_IDLE_FAR) 1117 || ((sd->flags & SD_WAKE_IDLE_FAR)
1092 && !task_hot(p, task_rq(p)->clock, sd))) { 1118 && !task_hot(p, task_rq->clock, sd))) {
1093 for_each_cpu_and(i, sched_domain_span(sd), 1119 for_each_cpu_and(i, sched_domain_span(sd),
1094 &p->cpus_allowed) { 1120 &p->cpus_allowed) {
1095 if (cpu_active(i) && idle_cpu(i)) { 1121 if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
1096 if (i != task_cpu(p)) { 1122 if (i != task_cpu(p)) {
1097 schedstat_inc(p, 1123 schedstat_inc(p,
1098 se.nr_wakeups_idle); 1124 se.nr_wakeups_idle);
@@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1235 tg = task_group(p); 1261 tg = task_group(p);
1236 weight = p->se.load.weight; 1262 weight = p->se.load.weight;
1237 1263
1238 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1264 /*
1265 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1266 * due to the sync cause above having dropped tl to 0, we'll always have
1267 * an imbalance, but there's really nothing you can do about that, so
1268 * that's good too.
1269 *
1270 * Otherwise check if either cpus are near enough in load to allow this
1271 * task to be woken on this_cpu.
1272 */
1273 balanced = !tl ||
1274 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
1239 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1275 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1240 1276
1241 /* 1277 /*
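The wake_affine() hunk adds the !tl short-circuit described in the new comment and keeps the percentage comparison between this_cpu's load with the task added and prev_cpu's load. A simplified user-space sketch with group scheduling's effective_load() folded away and an illustrative imbalance percentage (the kernel derives it from the domain's imbalance_pct):

#include <stdio.h>

/*
 * Flat (no group scheduling) simplification of the check above:
 * wake the task on this_cpu if doing so keeps this_cpu's load within
 * "imbalance" percent of prev_cpu's load, or if this_cpu is idle.
 */
static int balanced(unsigned long tl, unsigned long load,
                    unsigned long weight, unsigned long imbalance)
{
        return !tl || 100 * (tl + weight) <= imbalance * load;
}

int main(void)
{
        unsigned long weight = 1024;    /* nice-0 task weight */
        unsigned long imbalance = 125;  /* illustrative imbalance percentage */

        /* idle this_cpu: always allowed, this is the new !tl shortcut */
        printf("%d\n", balanced(0, 2048, weight, imbalance));
        /* lightly loaded this_cpu vs busy prev_cpu: allowed */
        printf("%d\n", balanced(1024, 3072, weight, imbalance));
        /* this_cpu already busier than prev_cpu: refused */
        printf("%d\n", balanced(3072, 1024, weight, imbalance));
        return 0;
}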
@@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1278 this_rq = cpu_rq(this_cpu); 1314 this_rq = cpu_rq(this_cpu);
1279 new_cpu = prev_cpu; 1315 new_cpu = prev_cpu;
1280 1316
1281 if (prev_cpu == this_cpu)
1282 goto out;
1283 /* 1317 /*
1284 * 'this_sd' is the first domain that both 1318 * 'this_sd' is the first domain that both
1285 * this_cpu and prev_cpu are present in: 1319 * this_cpu and prev_cpu are present in:
@@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1721 sched_info_queued(p); 1755 sched_info_queued(p);
1722 1756
1723 update_curr(cfs_rq); 1757 update_curr(cfs_rq);
1758 if (curr)
1759 se->vruntime = curr->vruntime;
1724 place_entity(cfs_rq, se, 1); 1760 place_entity(cfs_rq, se, 1);
1725 1761
1726 /* 'curr' will be NULL if the child belongs to a different group */ 1762 /* 'curr' will be NULL if the child belongs to a different group */
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa7df9b..e2dc63a5815d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,4 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 2SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 3SCHED_FEAT(ADAPTIVE_GRAN, 1)
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 4SCHED_FEAT(WAKEUP_PREEMPT, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3918e01994e0..2eb4bd6a526c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_RT_GROUP_SCHED
7
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{ 11{
12#ifdef CONFIG_SCHED_DEBUG
13 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
14#endif
8 return container_of(rt_se, struct task_struct, rt); 15 return container_of(rt_se, struct task_struct, rt);
9} 16}
10 17
11#ifdef CONFIG_RT_GROUP_SCHED
12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
16{ 19{
17 return rt_rq->rq; 20 return rt_rq->rq;
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
26 29
27#define rt_entity_is_task(rt_se) (1) 30#define rt_entity_is_task(rt_se) (1)
28 31
32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
33{
34 return container_of(rt_se, struct task_struct, rt);
35}
36
29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
30{ 38{
31 return container_of(rt_rq, struct rq, rt); 39 return container_of(rt_rq, struct rq, rt);
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
128 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
129} 137}
130 138
139static inline int has_pushable_tasks(struct rq *rq)
140{
141 return !plist_head_empty(&rq->rt.pushable_tasks);
142}
143
131#else 144#else
132 145
133static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
602 curr->se.exec_start = rq->clock; 615 curr->se.exec_start = rq->clock;
603 cpuacct_charge(curr, delta_exec); 616 cpuacct_charge(curr, delta_exec);
604 617
618 sched_rt_avg_update(rq, delta_exec);
619
605 if (!rt_bandwidth_enabled()) 620 if (!rt_bandwidth_enabled())
606 return; 621 return;
607 622
@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
874 889
875 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
876 enqueue_pushable_task(rq, p); 891 enqueue_pushable_task(rq, p);
877
878 inc_cpu_load(rq, p->se.load.weight);
879} 892}
880 893
881static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
886 dequeue_rt_entity(rt_se); 899 dequeue_rt_entity(rt_se);
887 900
888 dequeue_pushable_task(rq, p); 901 dequeue_pushable_task(rq, p);
889
890 dec_cpu_load(rq, p->se.load.weight);
891} 902}
892 903
893/* 904/*
@@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1064 if (p) 1075 if (p)
1065 dequeue_pushable_task(rq, p); 1076 dequeue_pushable_task(rq, p);
1066 1077
1078#ifdef CONFIG_SMP
1079 /*
1080 * We detect this state here so that we can avoid taking the RQ
1081 * lock again later if there is no need to push
1082 */
1083 rq->post_schedule = has_pushable_tasks(rq);
1084#endif
1085
1067 return p; 1086 return p;
1068} 1087}
1069 1088
@@ -1162,13 +1181,6 @@ static int find_lowest_rq(struct task_struct *task)
1162 return -1; /* No targets found */ 1181 return -1; /* No targets found */
1163 1182
1164 /* 1183 /*
1165 * Only consider CPUs that are usable for migration.
1166 * I guess we might want to change cpupri_find() to ignore those
1167 * in the first place.
1168 */
1169 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1170
1171 /*
1172 * At this point we have built a mask of cpus representing the 1184 * At this point we have built a mask of cpus representing the
1173 * lowest priority tasks in the system. Now we want to elect 1185 * lowest priority tasks in the system. Now we want to elect
1174 * the best one based on our affinity and topology. 1186 * the best one based on our affinity and topology.
@@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1262 return lowest_rq; 1274 return lowest_rq;
1263} 1275}
1264 1276
1265static inline int has_pushable_tasks(struct rq *rq)
1266{
1267 return !plist_head_empty(&rq->rt.pushable_tasks);
1268}
1269
1270static struct task_struct *pick_next_pushable_task(struct rq *rq) 1277static struct task_struct *pick_next_pushable_task(struct rq *rq)
1271{ 1278{
1272 struct task_struct *p; 1279 struct task_struct *p;
@@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1466 pull_rt_task(rq); 1473 pull_rt_task(rq);
1467} 1474}
1468 1475
1469/*
1470 * assumes rq->lock is held
1471 */
1472static int needs_post_schedule_rt(struct rq *rq)
1473{
1474 return has_pushable_tasks(rq);
1475}
1476
1477static void post_schedule_rt(struct rq *rq) 1476static void post_schedule_rt(struct rq *rq)
1478{ 1477{
1479 /*
1480 * This is only called if needs_post_schedule_rt() indicates that
1481 * we need to push tasks away
1482 */
1483 spin_lock_irq(&rq->lock);
1484 push_rt_tasks(rq); 1478 push_rt_tasks(rq);
1485 spin_unlock_irq(&rq->lock);
1486} 1479}
1487 1480
1488/* 1481/*
@@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = {
1758 .rq_online = rq_online_rt, 1751 .rq_online = rq_online_rt,
1759 .rq_offline = rq_offline_rt, 1752 .rq_offline = rq_offline_rt,
1760 .pre_schedule = pre_schedule_rt, 1753 .pre_schedule = pre_schedule_rt,
1761 .needs_post_schedule = needs_post_schedule_rt,
1762 .post_schedule = post_schedule_rt, 1754 .post_schedule = post_schedule_rt,
1763 .task_wake_up = task_wake_up_rt, 1755 .task_wake_up = task_wake_up_rt,
1764 .switched_from = switched_from_rt, 1756 .switched_from = switched_from_rt,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 71d8dc7f9920..3125cff1c570 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -245,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
245#endif 245#endif
246 246
247static struct ctl_table kern_table[] = { 247static struct ctl_table kern_table[] = {
248 {
249 .ctl_name = CTL_UNNUMBERED,
250 .procname = "sched_child_runs_first",
251 .data = &sysctl_sched_child_runs_first,
252 .maxlen = sizeof(unsigned int),
253 .mode = 0644,
254 .proc_handler = &proc_dointvec,
255 },
248#ifdef CONFIG_SCHED_DEBUG 256#ifdef CONFIG_SCHED_DEBUG
249 { 257 {
250 .ctl_name = CTL_UNNUMBERED, 258 .ctl_name = CTL_UNNUMBERED,
@@ -299,14 +307,6 @@ static struct ctl_table kern_table[] = {
299 }, 307 },
300 { 308 {
301 .ctl_name = CTL_UNNUMBERED, 309 .ctl_name = CTL_UNNUMBERED,
302 .procname = "sched_child_runs_first",
303 .data = &sysctl_sched_child_runs_first,
304 .maxlen = sizeof(unsigned int),
305 .mode = 0644,
306 .proc_handler = &proc_dointvec,
307 },
308 {
309 .ctl_name = CTL_UNNUMBERED,
310 .procname = "sched_features", 310 .procname = "sched_features",
311 .data = &sysctl_sched_features, 311 .data = &sysctl_sched_features,
312 .maxlen = sizeof(unsigned int), 312 .maxlen = sizeof(unsigned int),
@@ -331,6 +331,14 @@ static struct ctl_table kern_table[] = {
331 }, 331 },
332 { 332 {
333 .ctl_name = CTL_UNNUMBERED, 333 .ctl_name = CTL_UNNUMBERED,
334 .procname = "sched_time_avg",
335 .data = &sysctl_sched_time_avg,
336 .maxlen = sizeof(unsigned int),
337 .mode = 0644,
338 .proc_handler = &proc_dointvec,
339 },
340 {
341 .ctl_name = CTL_UNNUMBERED,
334 .procname = "timer_migration", 342 .procname = "timer_migration",
335 .data = &sysctl_timer_migration, 343 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 344 .maxlen = sizeof(unsigned int),
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3c44b56b0da7..addfe2df93b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
317 if (cwq->wq->freezeable) 317 if (cwq->wq->freezeable)
318 set_freezable(); 318 set_freezable();
319 319
320 set_user_nice(current, -5);
321
322 for (;;) { 320 for (;;) {
323 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 321 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
324 if (!freezing(current) && 322 if (!freezing(current) &&