diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 16:23:18 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 16:23:18 -0400 |
commit | 774a694f8cd08115d130a290d73c6d8563f26b1b (patch) | |
tree | 2b5f834ac7a149278d2a7e44d7afe69f40ef1431 | |
parent | 4f0ac854167846bd55cd81dbc9a36e03708aa01c (diff) | |
parent | e1f8450854d69f0291882804406ea1bab3ca44b4 (diff) |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (64 commits)
sched: Fix sched::sched_stat_wait tracepoint field
sched: Disable NEW_FAIR_SLEEPERS for now
sched: Keep kthreads at default priority
sched: Re-tune the scheduler latency defaults to decrease worst-case latencies
sched: Turn off child_runs_first
sched: Ensure that a child can't gain time over it's parent after fork()
sched: enable SD_WAKE_IDLE
sched: Deal with low-load in wake_affine()
sched: Remove short cut from select_task_rq_fair()
sched: Turn on SD_BALANCE_NEWIDLE
sched: Clean up topology.h
sched: Fix dynamic power-balancing crash
sched: Remove reciprocal for cpu_power
sched: Try to deal with low capacity, fix update_sd_power_savings_stats()
sched: Try to deal with low capacity
sched: Scale down cpu_power due to RT tasks
sched: Implement dynamic cpu_power
sched: Add smt_gain
sched: Update the cpu_power sum during load-balance
sched: Add SD_PREFER_SIBLING
...
-rw-r--r-- | arch/x86/include/asm/topology.h | 47 | ||||
-rw-r--r-- | fs/dcache.c | 1 | ||||
-rw-r--r-- | fs/locks.c | 2 | ||||
-rw-r--r-- | include/linux/hardirq.h | 6 | ||||
-rw-r--r-- | include/linux/kernel.h | 5 | ||||
-rw-r--r-- | include/linux/sched.h | 94 | ||||
-rw-r--r-- | include/linux/topology.h | 168 | ||||
-rw-r--r-- | include/trace/events/sched.h | 95 | ||||
-rw-r--r-- | init/main.c | 2 | ||||
-rw-r--r-- | kernel/kthread.c | 4 | ||||
-rw-r--r-- | kernel/sched.c | 1099 | ||||
-rw-r--r-- | kernel/sched_cpupri.c | 30 | ||||
-rw-r--r-- | kernel/sched_debug.c | 4 | ||||
-rw-r--r-- | kernel/sched_fair.c | 84 | ||||
-rw-r--r-- | kernel/sched_features.h | 2 | ||||
-rw-r--r-- | kernel/sched_rt.c | 62 | ||||
-rw-r--r-- | kernel/sysctl.c | 24 | ||||
-rw-r--r-- | kernel/workqueue.c | 2 |
18 files changed, 1117 insertions, 614 deletions
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 066ef590d7e0..26d06e052a18 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h | |||
@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[]; | |||
129 | #endif | 129 | #endif |
130 | 130 | ||
131 | /* sched_domains SD_NODE_INIT for NUMA machines */ | 131 | /* sched_domains SD_NODE_INIT for NUMA machines */ |
132 | #define SD_NODE_INIT (struct sched_domain) { \ | 132 | #define SD_NODE_INIT (struct sched_domain) { \ |
133 | .min_interval = 8, \ | 133 | .min_interval = 8, \ |
134 | .max_interval = 32, \ | 134 | .max_interval = 32, \ |
135 | .busy_factor = 32, \ | 135 | .busy_factor = 32, \ |
136 | .imbalance_pct = 125, \ | 136 | .imbalance_pct = 125, \ |
137 | .cache_nice_tries = SD_CACHE_NICE_TRIES, \ | 137 | .cache_nice_tries = SD_CACHE_NICE_TRIES, \ |
138 | .busy_idx = 3, \ | 138 | .busy_idx = 3, \ |
139 | .idle_idx = SD_IDLE_IDX, \ | 139 | .idle_idx = SD_IDLE_IDX, \ |
140 | .newidle_idx = SD_NEWIDLE_IDX, \ | 140 | .newidle_idx = SD_NEWIDLE_IDX, \ |
141 | .wake_idx = 1, \ | 141 | .wake_idx = 1, \ |
142 | .forkexec_idx = SD_FORKEXEC_IDX, \ | 142 | .forkexec_idx = SD_FORKEXEC_IDX, \ |
143 | .flags = SD_LOAD_BALANCE \ | 143 | \ |
144 | | SD_BALANCE_EXEC \ | 144 | .flags = 1*SD_LOAD_BALANCE \ |
145 | | SD_BALANCE_FORK \ | 145 | | 1*SD_BALANCE_NEWIDLE \ |
146 | | SD_WAKE_AFFINE \ | 146 | | 1*SD_BALANCE_EXEC \ |
147 | | SD_WAKE_BALANCE \ | 147 | | 1*SD_BALANCE_FORK \ |
148 | | SD_SERIALIZE, \ | 148 | | 0*SD_WAKE_IDLE \ |
149 | .last_balance = jiffies, \ | 149 | | 1*SD_WAKE_AFFINE \ |
150 | .balance_interval = 1, \ | 150 | | 1*SD_WAKE_BALANCE \ |
151 | | 0*SD_SHARE_CPUPOWER \ | ||
152 | | 0*SD_POWERSAVINGS_BALANCE \ | ||
153 | | 0*SD_SHARE_PKG_RESOURCES \ | ||
154 | | 1*SD_SERIALIZE \ | ||
155 | | 1*SD_WAKE_IDLE_FAR \ | ||
156 | | 0*SD_PREFER_SIBLING \ | ||
157 | , \ | ||
158 | .last_balance = jiffies, \ | ||
159 | .balance_interval = 1, \ | ||
151 | } | 160 | } |
152 | 161 | ||
153 | #ifdef CONFIG_X86_64_ACPI_NUMA | 162 | #ifdef CONFIG_X86_64_ACPI_NUMA |
diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c3a6ba..a100fa35a48f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
34 | #include <linux/fs_struct.h> | 34 | #include <linux/fs_struct.h> |
35 | #include <linux/hardirq.h> | ||
35 | #include "internal.h" | 36 | #include "internal.h" |
36 | 37 | ||
37 | int sysctl_vfs_cache_pressure __read_mostly = 100; | 38 | int sysctl_vfs_cache_pressure __read_mostly = 100; |
diff --git a/fs/locks.c b/fs/locks.c index 52366e877d76..19ee18a6829b 100644 --- a/fs/locks.c +++ b/fs/locks.c | |||
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) | |||
768 | * give it the opportunity to lock the file. | 768 | * give it the opportunity to lock the file. |
769 | */ | 769 | */ |
770 | if (found) | 770 | if (found) |
771 | cond_resched_bkl(); | 771 | cond_resched(); |
772 | 772 | ||
773 | find_conflict: | 773 | find_conflict: |
774 | for_each_lock(inode, before) { | 774 | for_each_lock(inode, before) { |
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 330cb31bb496..6d527ee82b2b 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h | |||
@@ -64,6 +64,12 @@ | |||
64 | #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) | 64 | #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) |
65 | #define NMI_OFFSET (1UL << NMI_SHIFT) | 65 | #define NMI_OFFSET (1UL << NMI_SHIFT) |
66 | 66 | ||
67 | #ifndef PREEMPT_ACTIVE | ||
68 | #define PREEMPT_ACTIVE_BITS 1 | ||
69 | #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) | ||
70 | #define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) | ||
71 | #endif | ||
72 | |||
67 | #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) | 73 | #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) |
68 | #error PREEMPT_ACTIVE is too low! | 74 | #error PREEMPT_ACTIVE is too low! |
69 | #endif | 75 | #endif |
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6320a3e8def..2b5b1e0899a8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h | |||
@@ -125,7 +125,7 @@ extern int _cond_resched(void); | |||
125 | #endif | 125 | #endif |
126 | 126 | ||
127 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 127 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
128 | void __might_sleep(char *file, int line); | 128 | void __might_sleep(char *file, int line, int preempt_offset); |
129 | /** | 129 | /** |
130 | * might_sleep - annotation for functions that can sleep | 130 | * might_sleep - annotation for functions that can sleep |
131 | * | 131 | * |
@@ -137,8 +137,9 @@ extern int _cond_resched(void); | |||
137 | * supposed to. | 137 | * supposed to. |
138 | */ | 138 | */ |
139 | # define might_sleep() \ | 139 | # define might_sleep() \ |
140 | do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) | 140 | do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) |
141 | #else | 141 | #else |
142 | static inline void __might_sleep(char *file, int line, int preempt_offset) { } | ||
142 | # define might_sleep() do { might_resched(); } while (0) | 143 | # define might_sleep() do { might_resched(); } while (0) |
143 | #endif | 144 | #endif |
144 | 145 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index 379531c08975..f3d74bd04d18 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -38,6 +38,8 @@ | |||
38 | #define SCHED_BATCH 3 | 38 | #define SCHED_BATCH 3 |
39 | /* SCHED_ISO: reserved but not implemented yet */ | 39 | /* SCHED_ISO: reserved but not implemented yet */ |
40 | #define SCHED_IDLE 5 | 40 | #define SCHED_IDLE 5 |
41 | /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ | ||
42 | #define SCHED_RESET_ON_FORK 0x40000000 | ||
41 | 43 | ||
42 | #ifdef __KERNEL__ | 44 | #ifdef __KERNEL__ |
43 | 45 | ||
@@ -796,18 +798,19 @@ enum cpu_idle_type { | |||
796 | #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE | 798 | #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE |
797 | 799 | ||
798 | #ifdef CONFIG_SMP | 800 | #ifdef CONFIG_SMP |
799 | #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ | 801 | #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ |
800 | #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ | 802 | #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ |
801 | #define SD_BALANCE_EXEC 4 /* Balance on exec */ | 803 | #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ |
802 | #define SD_BALANCE_FORK 8 /* Balance on fork, clone */ | 804 | #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ |
803 | #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ | 805 | #define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */ |
804 | #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ | 806 | #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ |
805 | #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ | 807 | #define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */ |
806 | #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ | 808 | #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ |
807 | #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ | 809 | #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ |
808 | #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ | 810 | #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ |
809 | #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ | 811 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ |
810 | #define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ | 812 | #define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ |
813 | #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ | ||
811 | 814 | ||
812 | enum powersavings_balance_level { | 815 | enum powersavings_balance_level { |
813 | POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ | 816 | POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ |
@@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void) | |||
827 | if (sched_smt_power_savings) | 830 | if (sched_smt_power_savings) |
828 | return SD_POWERSAVINGS_BALANCE; | 831 | return SD_POWERSAVINGS_BALANCE; |
829 | 832 | ||
830 | return 0; | 833 | return SD_PREFER_SIBLING; |
831 | } | 834 | } |
832 | 835 | ||
833 | static inline int sd_balance_for_package_power(void) | 836 | static inline int sd_balance_for_package_power(void) |
@@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void) | |||
835 | if (sched_mc_power_savings | sched_smt_power_savings) | 838 | if (sched_mc_power_savings | sched_smt_power_savings) |
836 | return SD_POWERSAVINGS_BALANCE; | 839 | return SD_POWERSAVINGS_BALANCE; |
837 | 840 | ||
838 | return 0; | 841 | return SD_PREFER_SIBLING; |
839 | } | 842 | } |
840 | 843 | ||
841 | /* | 844 | /* |
@@ -857,15 +860,9 @@ struct sched_group { | |||
857 | 860 | ||
858 | /* | 861 | /* |
859 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | 862 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a |
860 | * single CPU. This is read only (except for setup, hotplug CPU). | 863 | * single CPU. |
861 | * Note : Never change cpu_power without recompute its reciprocal | ||
862 | */ | ||
863 | unsigned int __cpu_power; | ||
864 | /* | ||
865 | * reciprocal value of cpu_power to avoid expensive divides | ||
866 | * (see include/linux/reciprocal_div.h) | ||
867 | */ | 864 | */ |
868 | u32 reciprocal_cpu_power; | 865 | unsigned int cpu_power; |
869 | 866 | ||
870 | /* | 867 | /* |
871 | * The CPUs this group covers. | 868 | * The CPUs this group covers. |
@@ -918,6 +915,7 @@ struct sched_domain { | |||
918 | unsigned int newidle_idx; | 915 | unsigned int newidle_idx; |
919 | unsigned int wake_idx; | 916 | unsigned int wake_idx; |
920 | unsigned int forkexec_idx; | 917 | unsigned int forkexec_idx; |
918 | unsigned int smt_gain; | ||
921 | int flags; /* See SD_* */ | 919 | int flags; /* See SD_* */ |
922 | enum sched_domain_level level; | 920 | enum sched_domain_level level; |
923 | 921 | ||
@@ -1045,7 +1043,6 @@ struct sched_class { | |||
1045 | struct rq *busiest, struct sched_domain *sd, | 1043 | struct rq *busiest, struct sched_domain *sd, |
1046 | enum cpu_idle_type idle); | 1044 | enum cpu_idle_type idle); |
1047 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | 1045 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
1048 | int (*needs_post_schedule) (struct rq *this_rq); | ||
1049 | void (*post_schedule) (struct rq *this_rq); | 1046 | void (*post_schedule) (struct rq *this_rq); |
1050 | void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); | 1047 | void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); |
1051 | 1048 | ||
@@ -1110,6 +1107,8 @@ struct sched_entity { | |||
1110 | u64 wait_max; | 1107 | u64 wait_max; |
1111 | u64 wait_count; | 1108 | u64 wait_count; |
1112 | u64 wait_sum; | 1109 | u64 wait_sum; |
1110 | u64 iowait_count; | ||
1111 | u64 iowait_sum; | ||
1113 | 1112 | ||
1114 | u64 sleep_start; | 1113 | u64 sleep_start; |
1115 | u64 sleep_max; | 1114 | u64 sleep_max; |
@@ -1234,11 +1233,19 @@ struct task_struct { | |||
1234 | unsigned did_exec:1; | 1233 | unsigned did_exec:1; |
1235 | unsigned in_execve:1; /* Tell the LSMs that the process is doing an | 1234 | unsigned in_execve:1; /* Tell the LSMs that the process is doing an |
1236 | * execve */ | 1235 | * execve */ |
1236 | unsigned in_iowait:1; | ||
1237 | |||
1238 | |||
1239 | /* Revert to default priority/policy when forking */ | ||
1240 | unsigned sched_reset_on_fork:1; | ||
1241 | |||
1237 | pid_t pid; | 1242 | pid_t pid; |
1238 | pid_t tgid; | 1243 | pid_t tgid; |
1239 | 1244 | ||
1245 | #ifdef CONFIG_CC_STACKPROTECTOR | ||
1240 | /* Canary value for the -fstack-protector gcc feature */ | 1246 | /* Canary value for the -fstack-protector gcc feature */ |
1241 | unsigned long stack_canary; | 1247 | unsigned long stack_canary; |
1248 | #endif | ||
1242 | 1249 | ||
1243 | /* | 1250 | /* |
1244 | * pointers to (original) parent process, youngest child, younger sibling, | 1251 | * pointers to (original) parent process, youngest child, younger sibling, |
@@ -1840,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity; | |||
1840 | extern unsigned int sysctl_sched_wakeup_granularity; | 1847 | extern unsigned int sysctl_sched_wakeup_granularity; |
1841 | extern unsigned int sysctl_sched_shares_ratelimit; | 1848 | extern unsigned int sysctl_sched_shares_ratelimit; |
1842 | extern unsigned int sysctl_sched_shares_thresh; | 1849 | extern unsigned int sysctl_sched_shares_thresh; |
1843 | #ifdef CONFIG_SCHED_DEBUG | ||
1844 | extern unsigned int sysctl_sched_child_runs_first; | 1850 | extern unsigned int sysctl_sched_child_runs_first; |
1851 | #ifdef CONFIG_SCHED_DEBUG | ||
1845 | extern unsigned int sysctl_sched_features; | 1852 | extern unsigned int sysctl_sched_features; |
1846 | extern unsigned int sysctl_sched_migration_cost; | 1853 | extern unsigned int sysctl_sched_migration_cost; |
1847 | extern unsigned int sysctl_sched_nr_migrate; | 1854 | extern unsigned int sysctl_sched_nr_migrate; |
1855 | extern unsigned int sysctl_sched_time_avg; | ||
1848 | extern unsigned int sysctl_timer_migration; | 1856 | extern unsigned int sysctl_timer_migration; |
1849 | 1857 | ||
1850 | int sched_nr_latency_handler(struct ctl_table *table, int write, | 1858 | int sched_nr_latency_handler(struct ctl_table *table, int write, |
@@ -2308,23 +2316,31 @@ static inline int need_resched(void) | |||
2308 | * cond_resched_softirq() will enable bhs before scheduling. | 2316 | * cond_resched_softirq() will enable bhs before scheduling. |
2309 | */ | 2317 | */ |
2310 | extern int _cond_resched(void); | 2318 | extern int _cond_resched(void); |
2311 | #ifdef CONFIG_PREEMPT_BKL | 2319 | |
2312 | static inline int cond_resched(void) | 2320 | #define cond_resched() ({ \ |
2313 | { | 2321 | __might_sleep(__FILE__, __LINE__, 0); \ |
2314 | return 0; | 2322 | _cond_resched(); \ |
2315 | } | 2323 | }) |
2324 | |||
2325 | extern int __cond_resched_lock(spinlock_t *lock); | ||
2326 | |||
2327 | #ifdef CONFIG_PREEMPT | ||
2328 | #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET | ||
2316 | #else | 2329 | #else |
2317 | static inline int cond_resched(void) | 2330 | #define PREEMPT_LOCK_OFFSET 0 |
2318 | { | ||
2319 | return _cond_resched(); | ||
2320 | } | ||
2321 | #endif | 2331 | #endif |
2322 | extern int cond_resched_lock(spinlock_t * lock); | 2332 | |
2323 | extern int cond_resched_softirq(void); | 2333 | #define cond_resched_lock(lock) ({ \ |
2324 | static inline int cond_resched_bkl(void) | 2334 | __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ |
2325 | { | 2335 | __cond_resched_lock(lock); \ |
2326 | return _cond_resched(); | 2336 | }) |
2327 | } | 2337 | |
2338 | extern int __cond_resched_softirq(void); | ||
2339 | |||
2340 | #define cond_resched_softirq() ({ \ | ||
2341 | __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ | ||
2342 | __cond_resched_softirq(); \ | ||
2343 | }) | ||
2328 | 2344 | ||
2329 | /* | 2345 | /* |
2330 | * Does a critical section need to be broken due to another | 2346 | * Does a critical section need to be broken due to another |
diff --git a/include/linux/topology.h b/include/linux/topology.h index 7402c1a27c4f..85e8cf7d393c 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
@@ -85,20 +85,29 @@ int arch_update_cpu_topology(void); | |||
85 | #define ARCH_HAS_SCHED_WAKE_IDLE | 85 | #define ARCH_HAS_SCHED_WAKE_IDLE |
86 | /* Common values for SMT siblings */ | 86 | /* Common values for SMT siblings */ |
87 | #ifndef SD_SIBLING_INIT | 87 | #ifndef SD_SIBLING_INIT |
88 | #define SD_SIBLING_INIT (struct sched_domain) { \ | 88 | #define SD_SIBLING_INIT (struct sched_domain) { \ |
89 | .min_interval = 1, \ | 89 | .min_interval = 1, \ |
90 | .max_interval = 2, \ | 90 | .max_interval = 2, \ |
91 | .busy_factor = 64, \ | 91 | .busy_factor = 64, \ |
92 | .imbalance_pct = 110, \ | 92 | .imbalance_pct = 110, \ |
93 | .flags = SD_LOAD_BALANCE \ | 93 | \ |
94 | | SD_BALANCE_NEWIDLE \ | 94 | .flags = 1*SD_LOAD_BALANCE \ |
95 | | SD_BALANCE_FORK \ | 95 | | 1*SD_BALANCE_NEWIDLE \ |
96 | | SD_BALANCE_EXEC \ | 96 | | 1*SD_BALANCE_EXEC \ |
97 | | SD_WAKE_AFFINE \ | 97 | | 1*SD_BALANCE_FORK \ |
98 | | SD_WAKE_BALANCE \ | 98 | | 0*SD_WAKE_IDLE \ |
99 | | SD_SHARE_CPUPOWER, \ | 99 | | 1*SD_WAKE_AFFINE \ |
100 | .last_balance = jiffies, \ | 100 | | 1*SD_WAKE_BALANCE \ |
101 | .balance_interval = 1, \ | 101 | | 1*SD_SHARE_CPUPOWER \ |
102 | | 0*SD_POWERSAVINGS_BALANCE \ | ||
103 | | 0*SD_SHARE_PKG_RESOURCES \ | ||
104 | | 0*SD_SERIALIZE \ | ||
105 | | 0*SD_WAKE_IDLE_FAR \ | ||
106 | | 0*SD_PREFER_SIBLING \ | ||
107 | , \ | ||
108 | .last_balance = jiffies, \ | ||
109 | .balance_interval = 1, \ | ||
110 | .smt_gain = 1178, /* 15% */ \ | ||
102 | } | 111 | } |
103 | #endif | 112 | #endif |
104 | #endif /* CONFIG_SCHED_SMT */ | 113 | #endif /* CONFIG_SCHED_SMT */ |
@@ -106,69 +115,94 @@ int arch_update_cpu_topology(void); | |||
106 | #ifdef CONFIG_SCHED_MC | 115 | #ifdef CONFIG_SCHED_MC |
107 | /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ | 116 | /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ |
108 | #ifndef SD_MC_INIT | 117 | #ifndef SD_MC_INIT |
109 | #define SD_MC_INIT (struct sched_domain) { \ | 118 | #define SD_MC_INIT (struct sched_domain) { \ |
110 | .min_interval = 1, \ | 119 | .min_interval = 1, \ |
111 | .max_interval = 4, \ | 120 | .max_interval = 4, \ |
112 | .busy_factor = 64, \ | 121 | .busy_factor = 64, \ |
113 | .imbalance_pct = 125, \ | 122 | .imbalance_pct = 125, \ |
114 | .cache_nice_tries = 1, \ | 123 | .cache_nice_tries = 1, \ |
115 | .busy_idx = 2, \ | 124 | .busy_idx = 2, \ |
116 | .wake_idx = 1, \ | 125 | .wake_idx = 1, \ |
117 | .forkexec_idx = 1, \ | 126 | .forkexec_idx = 1, \ |
118 | .flags = SD_LOAD_BALANCE \ | 127 | \ |
119 | | SD_BALANCE_FORK \ | 128 | .flags = 1*SD_LOAD_BALANCE \ |
120 | | SD_BALANCE_EXEC \ | 129 | | 1*SD_BALANCE_NEWIDLE \ |
121 | | SD_WAKE_AFFINE \ | 130 | | 1*SD_BALANCE_EXEC \ |
122 | | SD_WAKE_BALANCE \ | 131 | | 1*SD_BALANCE_FORK \ |
123 | | SD_SHARE_PKG_RESOURCES\ | 132 | | 1*SD_WAKE_IDLE \ |
124 | | sd_balance_for_mc_power()\ | 133 | | 1*SD_WAKE_AFFINE \ |
125 | | sd_power_saving_flags(),\ | 134 | | 1*SD_WAKE_BALANCE \ |
126 | .last_balance = jiffies, \ | 135 | | 0*SD_SHARE_CPUPOWER \ |
127 | .balance_interval = 1, \ | 136 | | 1*SD_SHARE_PKG_RESOURCES \ |
137 | | 0*SD_SERIALIZE \ | ||
138 | | 0*SD_WAKE_IDLE_FAR \ | ||
139 | | sd_balance_for_mc_power() \ | ||
140 | | sd_power_saving_flags() \ | ||
141 | , \ | ||
142 | .last_balance = jiffies, \ | ||
143 | .balance_interval = 1, \ | ||
128 | } | 144 | } |
129 | #endif | 145 | #endif |
130 | #endif /* CONFIG_SCHED_MC */ | 146 | #endif /* CONFIG_SCHED_MC */ |
131 | 147 | ||
132 | /* Common values for CPUs */ | 148 | /* Common values for CPUs */ |
133 | #ifndef SD_CPU_INIT | 149 | #ifndef SD_CPU_INIT |
134 | #define SD_CPU_INIT (struct sched_domain) { \ | 150 | #define SD_CPU_INIT (struct sched_domain) { \ |
135 | .min_interval = 1, \ | 151 | .min_interval = 1, \ |
136 | .max_interval = 4, \ | 152 | .max_interval = 4, \ |
137 | .busy_factor = 64, \ | 153 | .busy_factor = 64, \ |
138 | .imbalance_pct = 125, \ | 154 | .imbalance_pct = 125, \ |
139 | .cache_nice_tries = 1, \ | 155 | .cache_nice_tries = 1, \ |
140 | .busy_idx = 2, \ | 156 | .busy_idx = 2, \ |
141 | .idle_idx = 1, \ | 157 | .idle_idx = 1, \ |
142 | .newidle_idx = 2, \ | 158 | .newidle_idx = 2, \ |
143 | .wake_idx = 1, \ | 159 | .wake_idx = 1, \ |
144 | .forkexec_idx = 1, \ | 160 | .forkexec_idx = 1, \ |
145 | .flags = SD_LOAD_BALANCE \ | 161 | \ |
146 | | SD_BALANCE_EXEC \ | 162 | .flags = 1*SD_LOAD_BALANCE \ |
147 | | SD_BALANCE_FORK \ | 163 | | 1*SD_BALANCE_NEWIDLE \ |
148 | | SD_WAKE_AFFINE \ | 164 | | 1*SD_BALANCE_EXEC \ |
149 | | SD_WAKE_BALANCE \ | 165 | | 1*SD_BALANCE_FORK \ |
150 | | sd_balance_for_package_power()\ | 166 | | 1*SD_WAKE_IDLE \ |
151 | | sd_power_saving_flags(),\ | 167 | | 0*SD_WAKE_AFFINE \ |
152 | .last_balance = jiffies, \ | 168 | | 1*SD_WAKE_BALANCE \ |
153 | .balance_interval = 1, \ | 169 | | 0*SD_SHARE_CPUPOWER \ |
170 | | 0*SD_SHARE_PKG_RESOURCES \ | ||
171 | | 0*SD_SERIALIZE \ | ||
172 | | 0*SD_WAKE_IDLE_FAR \ | ||
173 | | sd_balance_for_package_power() \ | ||
174 | | sd_power_saving_flags() \ | ||
175 | , \ | ||
176 | .last_balance = jiffies, \ | ||
177 | .balance_interval = 1, \ | ||
154 | } | 178 | } |
155 | #endif | 179 | #endif |
156 | 180 | ||
157 | /* sched_domains SD_ALLNODES_INIT for NUMA machines */ | 181 | /* sched_domains SD_ALLNODES_INIT for NUMA machines */ |
158 | #define SD_ALLNODES_INIT (struct sched_domain) { \ | 182 | #define SD_ALLNODES_INIT (struct sched_domain) { \ |
159 | .min_interval = 64, \ | 183 | .min_interval = 64, \ |
160 | .max_interval = 64*num_online_cpus(), \ | 184 | .max_interval = 64*num_online_cpus(), \ |
161 | .busy_factor = 128, \ | 185 | .busy_factor = 128, \ |
162 | .imbalance_pct = 133, \ | 186 | .imbalance_pct = 133, \ |
163 | .cache_nice_tries = 1, \ | 187 | .cache_nice_tries = 1, \ |
164 | .busy_idx = 3, \ | 188 | .busy_idx = 3, \ |
165 | .idle_idx = 3, \ | 189 | .idle_idx = 3, \ |
166 | .flags = SD_LOAD_BALANCE \ | 190 | .flags = 1*SD_LOAD_BALANCE \ |
167 | | SD_BALANCE_NEWIDLE \ | 191 | | 1*SD_BALANCE_NEWIDLE \ |
168 | | SD_WAKE_AFFINE \ | 192 | | 0*SD_BALANCE_EXEC \ |
169 | | SD_SERIALIZE, \ | 193 | | 0*SD_BALANCE_FORK \ |
170 | .last_balance = jiffies, \ | 194 | | 0*SD_WAKE_IDLE \ |
171 | .balance_interval = 64, \ | 195 | | 1*SD_WAKE_AFFINE \ |
196 | | 0*SD_WAKE_BALANCE \ | ||
197 | | 0*SD_SHARE_CPUPOWER \ | ||
198 | | 0*SD_POWERSAVINGS_BALANCE \ | ||
199 | | 0*SD_SHARE_PKG_RESOURCES \ | ||
200 | | 1*SD_SERIALIZE \ | ||
201 | | 1*SD_WAKE_IDLE_FAR \ | ||
202 | | 0*SD_PREFER_SIBLING \ | ||
203 | , \ | ||
204 | .last_balance = jiffies, \ | ||
205 | .balance_interval = 64, \ | ||
172 | } | 206 | } |
173 | 207 | ||
174 | #ifdef CONFIG_NUMA | 208 | #ifdef CONFIG_NUMA |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8949bb7eb082..a4c369ec328f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h | |||
@@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send, | |||
340 | __entry->sig, __entry->comm, __entry->pid) | 340 | __entry->sig, __entry->comm, __entry->pid) |
341 | ); | 341 | ); |
342 | 342 | ||
343 | /* | ||
344 | * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE | ||
345 | * adding sched_stat support to SCHED_FIFO/RR would be welcome. | ||
346 | */ | ||
347 | |||
348 | /* | ||
349 | * Tracepoint for accounting wait time (time the task is runnable | ||
350 | * but not actually running due to scheduler contention). | ||
351 | */ | ||
352 | TRACE_EVENT(sched_stat_wait, | ||
353 | |||
354 | TP_PROTO(struct task_struct *tsk, u64 delay), | ||
355 | |||
356 | TP_ARGS(tsk, delay), | ||
357 | |||
358 | TP_STRUCT__entry( | ||
359 | __array( char, comm, TASK_COMM_LEN ) | ||
360 | __field( pid_t, pid ) | ||
361 | __field( u64, delay ) | ||
362 | ), | ||
363 | |||
364 | TP_fast_assign( | ||
365 | memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); | ||
366 | __entry->pid = tsk->pid; | ||
367 | __entry->delay = delay; | ||
368 | ) | ||
369 | TP_perf_assign( | ||
370 | __perf_count(delay); | ||
371 | ), | ||
372 | |||
373 | TP_printk("task: %s:%d wait: %Lu [ns]", | ||
374 | __entry->comm, __entry->pid, | ||
375 | (unsigned long long)__entry->delay) | ||
376 | ); | ||
377 | |||
378 | /* | ||
379 | * Tracepoint for accounting sleep time (time the task is not runnable, | ||
380 | * including iowait, see below). | ||
381 | */ | ||
382 | TRACE_EVENT(sched_stat_sleep, | ||
383 | |||
384 | TP_PROTO(struct task_struct *tsk, u64 delay), | ||
385 | |||
386 | TP_ARGS(tsk, delay), | ||
387 | |||
388 | TP_STRUCT__entry( | ||
389 | __array( char, comm, TASK_COMM_LEN ) | ||
390 | __field( pid_t, pid ) | ||
391 | __field( u64, delay ) | ||
392 | ), | ||
393 | |||
394 | TP_fast_assign( | ||
395 | memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); | ||
396 | __entry->pid = tsk->pid; | ||
397 | __entry->delay = delay; | ||
398 | ) | ||
399 | TP_perf_assign( | ||
400 | __perf_count(delay); | ||
401 | ), | ||
402 | |||
403 | TP_printk("task: %s:%d sleep: %Lu [ns]", | ||
404 | __entry->comm, __entry->pid, | ||
405 | (unsigned long long)__entry->delay) | ||
406 | ); | ||
407 | |||
408 | /* | ||
409 | * Tracepoint for accounting iowait time (time the task is not runnable | ||
410 | * due to waiting on IO to complete). | ||
411 | */ | ||
412 | TRACE_EVENT(sched_stat_iowait, | ||
413 | |||
414 | TP_PROTO(struct task_struct *tsk, u64 delay), | ||
415 | |||
416 | TP_ARGS(tsk, delay), | ||
417 | |||
418 | TP_STRUCT__entry( | ||
419 | __array( char, comm, TASK_COMM_LEN ) | ||
420 | __field( pid_t, pid ) | ||
421 | __field( u64, delay ) | ||
422 | ), | ||
423 | |||
424 | TP_fast_assign( | ||
425 | memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); | ||
426 | __entry->pid = tsk->pid; | ||
427 | __entry->delay = delay; | ||
428 | ) | ||
429 | TP_perf_assign( | ||
430 | __perf_count(delay); | ||
431 | ), | ||
432 | |||
433 | TP_printk("task: %s:%d iowait: %Lu [ns]", | ||
434 | __entry->comm, __entry->pid, | ||
435 | (unsigned long long)__entry->delay) | ||
436 | ); | ||
437 | |||
343 | #endif /* _TRACE_SCHED_H */ | 438 | #endif /* _TRACE_SCHED_H */ |
344 | 439 | ||
345 | /* This part must be outside protection */ | 440 | /* This part must be outside protection */ |
diff --git a/init/main.c b/init/main.c index 525f6fb2bd22..b34fd8e5edef 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void) | |||
631 | softirq_init(); | 631 | softirq_init(); |
632 | timekeeping_init(); | 632 | timekeeping_init(); |
633 | time_init(); | 633 | time_init(); |
634 | sched_clock_init(); | ||
635 | profile_init(); | 634 | profile_init(); |
636 | if (!irqs_disabled()) | 635 | if (!irqs_disabled()) |
637 | printk(KERN_CRIT "start_kernel(): bug: interrupts were " | 636 | printk(KERN_CRIT "start_kernel(): bug: interrupts were " |
@@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void) | |||
682 | numa_policy_init(); | 681 | numa_policy_init(); |
683 | if (late_time_init) | 682 | if (late_time_init) |
684 | late_time_init(); | 683 | late_time_init(); |
684 | sched_clock_init(); | ||
685 | calibrate_delay(); | 685 | calibrate_delay(); |
686 | pidmap_init(); | 686 | pidmap_init(); |
687 | anon_vma_init(); | 687 | anon_vma_init(); |
diff --git a/kernel/kthread.c b/kernel/kthread.c index eb8751aa0418..5fe709982caa 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -16,8 +16,6 @@ | |||
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | #include <trace/events/sched.h> | 17 | #include <trace/events/sched.h> |
18 | 18 | ||
19 | #define KTHREAD_NICE_LEVEL (-5) | ||
20 | |||
21 | static DEFINE_SPINLOCK(kthread_create_lock); | 19 | static DEFINE_SPINLOCK(kthread_create_lock); |
22 | static LIST_HEAD(kthread_create_list); | 20 | static LIST_HEAD(kthread_create_list); |
23 | struct task_struct *kthreadd_task; | 21 | struct task_struct *kthreadd_task; |
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
145 | * The kernel thread should not inherit these properties. | 143 | * The kernel thread should not inherit these properties. |
146 | */ | 144 | */ |
147 | sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); | 145 | sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); |
148 | set_user_nice(create.result, KTHREAD_NICE_LEVEL); | ||
149 | set_cpus_allowed_ptr(create.result, cpu_all_mask); | 146 | set_cpus_allowed_ptr(create.result, cpu_all_mask); |
150 | } | 147 | } |
151 | return create.result; | 148 | return create.result; |
@@ -221,7 +218,6 @@ int kthreadd(void *unused) | |||
221 | /* Setup a clean context for our children to inherit. */ | 218 | /* Setup a clean context for our children to inherit. */ |
222 | set_task_comm(tsk, "kthreadd"); | 219 | set_task_comm(tsk, "kthreadd"); |
223 | ignore_signals(tsk); | 220 | ignore_signals(tsk); |
224 | set_user_nice(tsk, KTHREAD_NICE_LEVEL); | ||
225 | set_cpus_allowed_ptr(tsk, cpu_all_mask); | 221 | set_cpus_allowed_ptr(tsk, cpu_all_mask); |
226 | set_mems_allowed(node_possible_map); | 222 | set_mems_allowed(node_possible_map); |
227 | 223 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 4066241ae9f4..e27a53685ed9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -64,7 +64,6 @@ | |||
64 | #include <linux/tsacct_kern.h> | 64 | #include <linux/tsacct_kern.h> |
65 | #include <linux/kprobes.h> | 65 | #include <linux/kprobes.h> |
66 | #include <linux/delayacct.h> | 66 | #include <linux/delayacct.h> |
67 | #include <linux/reciprocal_div.h> | ||
68 | #include <linux/unistd.h> | 67 | #include <linux/unistd.h> |
69 | #include <linux/pagemap.h> | 68 | #include <linux/pagemap.h> |
70 | #include <linux/hrtimer.h> | 69 | #include <linux/hrtimer.h> |
@@ -120,30 +119,8 @@ | |||
120 | */ | 119 | */ |
121 | #define RUNTIME_INF ((u64)~0ULL) | 120 | #define RUNTIME_INF ((u64)~0ULL) |
122 | 121 | ||
123 | #ifdef CONFIG_SMP | ||
124 | |||
125 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | 122 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); |
126 | 123 | ||
127 | /* | ||
128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | ||
129 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | ||
130 | */ | ||
131 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | ||
132 | { | ||
133 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Each time a sched group cpu_power is changed, | ||
138 | * we must compute its reciprocal value | ||
139 | */ | ||
140 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | ||
141 | { | ||
142 | sg->__cpu_power += val; | ||
143 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | ||
144 | } | ||
145 | #endif | ||
146 | |||
147 | static inline int rt_policy(int policy) | 124 | static inline int rt_policy(int policy) |
148 | { | 125 | { |
149 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 126 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
@@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user) | |||
309 | 286 | ||
310 | /* | 287 | /* |
311 | * Root task group. | 288 | * Root task group. |
312 | * Every UID task group (including init_task_group aka UID-0) will | 289 | * Every UID task group (including init_task_group aka UID-0) will |
313 | * be a child to this group. | 290 | * be a child to this group. |
314 | */ | 291 | */ |
315 | struct task_group root_task_group; | 292 | struct task_group root_task_group; |
316 | 293 | ||
@@ -318,7 +295,7 @@ struct task_group root_task_group; | |||
318 | /* Default task group's sched entity on each cpu */ | 295 | /* Default task group's sched entity on each cpu */ |
319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 296 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
320 | /* Default task group's cfs_rq on each cpu */ | 297 | /* Default task group's cfs_rq on each cpu */ |
321 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 298 | static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; |
322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 299 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
323 | 300 | ||
324 | #ifdef CONFIG_RT_GROUP_SCHED | 301 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -616,6 +593,7 @@ struct rq { | |||
616 | 593 | ||
617 | unsigned char idle_at_tick; | 594 | unsigned char idle_at_tick; |
618 | /* For active balancing */ | 595 | /* For active balancing */ |
596 | int post_schedule; | ||
619 | int active_balance; | 597 | int active_balance; |
620 | int push_cpu; | 598 | int push_cpu; |
621 | /* cpu of this runqueue: */ | 599 | /* cpu of this runqueue: */ |
@@ -626,6 +604,9 @@ struct rq { | |||
626 | 604 | ||
627 | struct task_struct *migration_thread; | 605 | struct task_struct *migration_thread; |
628 | struct list_head migration_queue; | 606 | struct list_head migration_queue; |
607 | |||
608 | u64 rt_avg; | ||
609 | u64 age_stamp; | ||
629 | #endif | 610 | #endif |
630 | 611 | ||
631 | /* calc_load related fields */ | 612 | /* calc_load related fields */ |
@@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq) | |||
693 | #define this_rq() (&__get_cpu_var(runqueues)) | 674 | #define this_rq() (&__get_cpu_var(runqueues)) |
694 | #define task_rq(p) cpu_rq(task_cpu(p)) | 675 | #define task_rq(p) cpu_rq(task_cpu(p)) |
695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 676 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
677 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
696 | 678 | ||
697 | inline void update_rq_clock(struct rq *rq) | 679 | inline void update_rq_clock(struct rq *rq) |
698 | { | 680 | { |
@@ -861,6 +843,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; | |||
861 | unsigned int sysctl_sched_shares_thresh = 4; | 843 | unsigned int sysctl_sched_shares_thresh = 4; |
862 | 844 | ||
863 | /* | 845 | /* |
846 | * period over which we average the RT time consumption, measured | ||
847 | * in ms. | ||
848 | * | ||
849 | * default: 1s | ||
850 | */ | ||
851 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | ||
852 | |||
853 | /* | ||
864 | * period over which we measure -rt task cpu usage in us. | 854 | * period over which we measure -rt task cpu usage in us. |
865 | * default: 1s | 855 | * default: 1s |
866 | */ | 856 | */ |
@@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu) | |||
1278 | } | 1268 | } |
1279 | #endif /* CONFIG_NO_HZ */ | 1269 | #endif /* CONFIG_NO_HZ */ |
1280 | 1270 | ||
1271 | static u64 sched_avg_period(void) | ||
1272 | { | ||
1273 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
1274 | } | ||
1275 | |||
1276 | static void sched_avg_update(struct rq *rq) | ||
1277 | { | ||
1278 | s64 period = sched_avg_period(); | ||
1279 | |||
1280 | while ((s64)(rq->clock - rq->age_stamp) > period) { | ||
1281 | rq->age_stamp += period; | ||
1282 | rq->rt_avg /= 2; | ||
1283 | } | ||
1284 | } | ||
1285 | |||
1286 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1287 | { | ||
1288 | rq->rt_avg += rt_delta; | ||
1289 | sched_avg_update(rq); | ||
1290 | } | ||
1291 | |||
1281 | #else /* !CONFIG_SMP */ | 1292 | #else /* !CONFIG_SMP */ |
1282 | static void resched_task(struct task_struct *p) | 1293 | static void resched_task(struct task_struct *p) |
1283 | { | 1294 | { |
1284 | assert_spin_locked(&task_rq(p)->lock); | 1295 | assert_spin_locked(&task_rq(p)->lock); |
1285 | set_tsk_need_resched(p); | 1296 | set_tsk_need_resched(p); |
1286 | } | 1297 | } |
1298 | |||
1299 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1300 | { | ||
1301 | } | ||
1287 | #endif /* CONFIG_SMP */ | 1302 | #endif /* CONFIG_SMP */ |
1288 | 1303 | ||
1289 | #if BITS_PER_LONG == 32 | 1304 | #if BITS_PER_LONG == 32 |
@@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1513 | 1528 | ||
1514 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1529 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1515 | 1530 | ||
1531 | struct update_shares_data { | ||
1532 | unsigned long rq_weight[NR_CPUS]; | ||
1533 | }; | ||
1534 | |||
1535 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
1536 | |||
1516 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1537 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1517 | 1538 | ||
1518 | /* | 1539 | /* |
1519 | * Calculate and set the cpu's group shares. | 1540 | * Calculate and set the cpu's group shares. |
1520 | */ | 1541 | */ |
1521 | static void | 1542 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
1522 | update_group_shares_cpu(struct task_group *tg, int cpu, | 1543 | unsigned long sd_shares, |
1523 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1544 | unsigned long sd_rq_weight, |
1545 | struct update_shares_data *usd) | ||
1524 | { | 1546 | { |
1525 | unsigned long shares; | 1547 | unsigned long shares, rq_weight; |
1526 | unsigned long rq_weight; | 1548 | int boost = 0; |
1527 | 1549 | ||
1528 | if (!tg->se[cpu]) | 1550 | rq_weight = usd->rq_weight[cpu]; |
1529 | return; | 1551 | if (!rq_weight) { |
1530 | 1552 | boost = 1; | |
1531 | rq_weight = tg->cfs_rq[cpu]->rq_weight; | 1553 | rq_weight = NICE_0_LOAD; |
1554 | } | ||
1532 | 1555 | ||
1533 | /* | 1556 | /* |
1534 | * \Sum shares * rq_weight | 1557 | * \Sum_j shares_j * rq_weight_i |
1535 | * shares = ----------------------- | 1558 | * shares_i = ----------------------------- |
1536 | * \Sum rq_weight | 1559 | * \Sum_j rq_weight_j |
1537 | * | ||
1538 | */ | 1560 | */ |
1539 | shares = (sd_shares * rq_weight) / sd_rq_weight; | 1561 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
1540 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | 1562 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); |
@@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1545 | unsigned long flags; | 1567 | unsigned long flags; |
1546 | 1568 | ||
1547 | spin_lock_irqsave(&rq->lock, flags); | 1569 | spin_lock_irqsave(&rq->lock, flags); |
1548 | tg->cfs_rq[cpu]->shares = shares; | 1570 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
1549 | 1571 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | |
1550 | __set_se_shares(tg->se[cpu], shares); | 1572 | __set_se_shares(tg->se[cpu], shares); |
1551 | spin_unlock_irqrestore(&rq->lock, flags); | 1573 | spin_unlock_irqrestore(&rq->lock, flags); |
1552 | } | 1574 | } |
@@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1559 | */ | 1581 | */ |
1560 | static int tg_shares_up(struct task_group *tg, void *data) | 1582 | static int tg_shares_up(struct task_group *tg, void *data) |
1561 | { | 1583 | { |
1562 | unsigned long weight, rq_weight = 0; | 1584 | unsigned long weight, rq_weight = 0, shares = 0; |
1563 | unsigned long shares = 0; | 1585 | struct update_shares_data *usd; |
1564 | struct sched_domain *sd = data; | 1586 | struct sched_domain *sd = data; |
1587 | unsigned long flags; | ||
1565 | int i; | 1588 | int i; |
1566 | 1589 | ||
1590 | if (!tg->se[0]) | ||
1591 | return 0; | ||
1592 | |||
1593 | local_irq_save(flags); | ||
1594 | usd = &__get_cpu_var(update_shares_data); | ||
1595 | |||
1567 | for_each_cpu(i, sched_domain_span(sd)) { | 1596 | for_each_cpu(i, sched_domain_span(sd)) { |
1597 | weight = tg->cfs_rq[i]->load.weight; | ||
1598 | usd->rq_weight[i] = weight; | ||
1599 | |||
1568 | /* | 1600 | /* |
1569 | * If there are currently no tasks on the cpu pretend there | 1601 | * If there are currently no tasks on the cpu pretend there |
1570 | * is one of average load so that when a new task gets to | 1602 | * is one of average load so that when a new task gets to |
1571 | * run here it will not get delayed by group starvation. | 1603 | * run here it will not get delayed by group starvation. |
1572 | */ | 1604 | */ |
1573 | weight = tg->cfs_rq[i]->load.weight; | ||
1574 | if (!weight) | 1605 | if (!weight) |
1575 | weight = NICE_0_LOAD; | 1606 | weight = NICE_0_LOAD; |
1576 | 1607 | ||
1577 | tg->cfs_rq[i]->rq_weight = weight; | ||
1578 | rq_weight += weight; | 1608 | rq_weight += weight; |
1579 | shares += tg->cfs_rq[i]->shares; | 1609 | shares += tg->cfs_rq[i]->shares; |
1580 | } | 1610 | } |
@@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1586 | shares = tg->shares; | 1616 | shares = tg->shares; |
1587 | 1617 | ||
1588 | for_each_cpu(i, sched_domain_span(sd)) | 1618 | for_each_cpu(i, sched_domain_span(sd)) |
1589 | update_group_shares_cpu(tg, i, shares, rq_weight); | 1619 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); |
1620 | |||
1621 | local_irq_restore(flags); | ||
1590 | 1622 | ||
1591 | return 0; | 1623 | return 0; |
1592 | } | 1624 | } |
@@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1616 | 1648 | ||
1617 | static void update_shares(struct sched_domain *sd) | 1649 | static void update_shares(struct sched_domain *sd) |
1618 | { | 1650 | { |
1619 | u64 now = cpu_clock(raw_smp_processor_id()); | 1651 | s64 elapsed; |
1620 | s64 elapsed = now - sd->last_update; | 1652 | u64 now; |
1653 | |||
1654 | if (root_task_group_empty()) | ||
1655 | return; | ||
1656 | |||
1657 | now = cpu_clock(raw_smp_processor_id()); | ||
1658 | elapsed = now - sd->last_update; | ||
1621 | 1659 | ||
1622 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1660 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
1623 | sd->last_update = now; | 1661 | sd->last_update = now; |
@@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd) | |||
1627 | 1665 | ||
1628 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | 1666 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
1629 | { | 1667 | { |
1668 | if (root_task_group_empty()) | ||
1669 | return; | ||
1670 | |||
1630 | spin_unlock(&rq->lock); | 1671 | spin_unlock(&rq->lock); |
1631 | update_shares(sd); | 1672 | update_shares(sd); |
1632 | spin_lock(&rq->lock); | 1673 | spin_lock(&rq->lock); |
@@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1634 | 1675 | ||
1635 | static void update_h_load(long cpu) | 1676 | static void update_h_load(long cpu) |
1636 | { | 1677 | { |
1678 | if (root_task_group_empty()) | ||
1679 | return; | ||
1680 | |||
1637 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1681 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1638 | } | 1682 | } |
1639 | 1683 | ||
@@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
2268 | } | 2312 | } |
2269 | 2313 | ||
2270 | /* Adjust by relative CPU power of the group */ | 2314 | /* Adjust by relative CPU power of the group */ |
2271 | avg_load = sg_div_cpu_power(group, | 2315 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
2272 | avg_load * SCHED_LOAD_SCALE); | ||
2273 | 2316 | ||
2274 | if (local_group) { | 2317 | if (local_group) { |
2275 | this_load = avg_load; | 2318 | this_load = avg_load; |
@@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2637 | set_task_cpu(p, cpu); | 2680 | set_task_cpu(p, cpu); |
2638 | 2681 | ||
2639 | /* | 2682 | /* |
2640 | * Make sure we do not leak PI boosting priority to the child: | 2683 | * Make sure we do not leak PI boosting priority to the child. |
2641 | */ | 2684 | */ |
2642 | p->prio = current->normal_prio; | 2685 | p->prio = current->normal_prio; |
2686 | |||
2687 | /* | ||
2688 | * Revert to default priority/policy on fork if requested. | ||
2689 | */ | ||
2690 | if (unlikely(p->sched_reset_on_fork)) { | ||
2691 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) | ||
2692 | p->policy = SCHED_NORMAL; | ||
2693 | |||
2694 | if (p->normal_prio < DEFAULT_PRIO) | ||
2695 | p->prio = DEFAULT_PRIO; | ||
2696 | |||
2697 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2698 | p->static_prio = NICE_TO_PRIO(0); | ||
2699 | set_load_weight(p); | ||
2700 | } | ||
2701 | |||
2702 | /* | ||
2703 | * We don't need the reset flag anymore after the fork. It has | ||
2704 | * fulfilled its duty: | ||
2705 | */ | ||
2706 | p->sched_reset_on_fork = 0; | ||
2707 | } | ||
2708 | |||
2643 | if (!rt_prio(p->prio)) | 2709 | if (!rt_prio(p->prio)) |
2644 | p->sched_class = &fair_sched_class; | 2710 | p->sched_class = &fair_sched_class; |
2645 | 2711 | ||
@@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2796 | { | 2862 | { |
2797 | struct mm_struct *mm = rq->prev_mm; | 2863 | struct mm_struct *mm = rq->prev_mm; |
2798 | long prev_state; | 2864 | long prev_state; |
2799 | #ifdef CONFIG_SMP | ||
2800 | int post_schedule = 0; | ||
2801 | |||
2802 | if (current->sched_class->needs_post_schedule) | ||
2803 | post_schedule = current->sched_class->needs_post_schedule(rq); | ||
2804 | #endif | ||
2805 | 2865 | ||
2806 | rq->prev_mm = NULL; | 2866 | rq->prev_mm = NULL; |
2807 | 2867 | ||
@@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2820 | finish_arch_switch(prev); | 2880 | finish_arch_switch(prev); |
2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | 2881 | perf_counter_task_sched_in(current, cpu_of(rq)); |
2822 | finish_lock_switch(rq, prev); | 2882 | finish_lock_switch(rq, prev); |
2823 | #ifdef CONFIG_SMP | ||
2824 | if (post_schedule) | ||
2825 | current->sched_class->post_schedule(rq); | ||
2826 | #endif | ||
2827 | 2883 | ||
2828 | fire_sched_in_preempt_notifiers(current); | 2884 | fire_sched_in_preempt_notifiers(current); |
2829 | if (mm) | 2885 | if (mm) |
@@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2838 | } | 2894 | } |
2839 | } | 2895 | } |
2840 | 2896 | ||
2897 | #ifdef CONFIG_SMP | ||
2898 | |||
2899 | /* assumes rq->lock is held */ | ||
2900 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
2901 | { | ||
2902 | if (prev->sched_class->pre_schedule) | ||
2903 | prev->sched_class->pre_schedule(rq, prev); | ||
2904 | } | ||
2905 | |||
2906 | /* rq->lock is NOT held, but preemption is disabled */ | ||
2907 | static inline void post_schedule(struct rq *rq) | ||
2908 | { | ||
2909 | if (rq->post_schedule) { | ||
2910 | unsigned long flags; | ||
2911 | |||
2912 | spin_lock_irqsave(&rq->lock, flags); | ||
2913 | if (rq->curr->sched_class->post_schedule) | ||
2914 | rq->curr->sched_class->post_schedule(rq); | ||
2915 | spin_unlock_irqrestore(&rq->lock, flags); | ||
2916 | |||
2917 | rq->post_schedule = 0; | ||
2918 | } | ||
2919 | } | ||
2920 | |||
2921 | #else | ||
2922 | |||
2923 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
2924 | { | ||
2925 | } | ||
2926 | |||
2927 | static inline void post_schedule(struct rq *rq) | ||
2928 | { | ||
2929 | } | ||
2930 | |||
2931 | #endif | ||
2932 | |||
2841 | /** | 2933 | /** |
2842 | * schedule_tail - first thing a freshly forked thread must call. | 2934 | * schedule_tail - first thing a freshly forked thread must call. |
2843 | * @prev: the thread we just switched away from. | 2935 | * @prev: the thread we just switched away from. |
@@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
2848 | struct rq *rq = this_rq(); | 2940 | struct rq *rq = this_rq(); |
2849 | 2941 | ||
2850 | finish_task_switch(rq, prev); | 2942 | finish_task_switch(rq, prev); |
2943 | |||
2944 | /* | ||
2945 | * FIXME: do we need to worry about rq being invalidated by the | ||
2946 | * task_switch? | ||
2947 | */ | ||
2948 | post_schedule(rq); | ||
2949 | |||
2851 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2950 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
2852 | /* In this case, finish_task_switch does not reenable preemption */ | 2951 | /* In this case, finish_task_switch does not reenable preemption */ |
2853 | preempt_enable(); | 2952 | preempt_enable(); |
@@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3379 | { | 3478 | { |
3380 | const struct sched_class *class; | 3479 | const struct sched_class *class; |
3381 | 3480 | ||
3382 | for (class = sched_class_highest; class; class = class->next) | 3481 | for_each_class(class) { |
3383 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 3482 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
3384 | return 1; | 3483 | return 1; |
3484 | } | ||
3385 | 3485 | ||
3386 | return 0; | 3486 | return 0; |
3387 | } | 3487 | } |
@@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, | |||
3544 | * capacity but still has some space to pick up some load | 3644 | * capacity but still has some space to pick up some load |
3545 | * from other group and save more power | 3645 | * from other group and save more power |
3546 | */ | 3646 | */ |
3547 | if (sgs->sum_nr_running > sgs->group_capacity - 1) | 3647 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) |
3548 | return; | 3648 | return; |
3549 | 3649 | ||
3550 | if (sgs->sum_nr_running > sds->leader_nr_running || | 3650 | if (sgs->sum_nr_running > sds->leader_nr_running || |
@@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3611 | } | 3711 | } |
3612 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 3712 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
3613 | 3713 | ||
3714 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3715 | { | ||
3716 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3717 | unsigned long smt_gain = sd->smt_gain; | ||
3718 | |||
3719 | smt_gain /= weight; | ||
3720 | |||
3721 | return smt_gain; | ||
3722 | } | ||
3723 | |||
3724 | unsigned long scale_rt_power(int cpu) | ||
3725 | { | ||
3726 | struct rq *rq = cpu_rq(cpu); | ||
3727 | u64 total, available; | ||
3728 | |||
3729 | sched_avg_update(rq); | ||
3730 | |||
3731 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
3732 | available = total - rq->rt_avg; | ||
3733 | |||
3734 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
3735 | total = SCHED_LOAD_SCALE; | ||
3736 | |||
3737 | total >>= SCHED_LOAD_SHIFT; | ||
3738 | |||
3739 | return div_u64(available, total); | ||
3740 | } | ||
3741 | |||
3742 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
3743 | { | ||
3744 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3745 | unsigned long power = SCHED_LOAD_SCALE; | ||
3746 | struct sched_group *sdg = sd->groups; | ||
3747 | |||
3748 | /* here we could scale based on cpufreq */ | ||
3749 | |||
3750 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
3751 | power *= arch_scale_smt_power(sd, cpu); | ||
3752 | power >>= SCHED_LOAD_SHIFT; | ||
3753 | } | ||
3754 | |||
3755 | power *= scale_rt_power(cpu); | ||
3756 | power >>= SCHED_LOAD_SHIFT; | ||
3757 | |||
3758 | if (!power) | ||
3759 | power = 1; | ||
3760 | |||
3761 | sdg->cpu_power = power; | ||
3762 | } | ||
3763 | |||
3764 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
3765 | { | ||
3766 | struct sched_domain *child = sd->child; | ||
3767 | struct sched_group *group, *sdg = sd->groups; | ||
3768 | unsigned long power; | ||
3769 | |||
3770 | if (!child) { | ||
3771 | update_cpu_power(sd, cpu); | ||
3772 | return; | ||
3773 | } | ||
3774 | |||
3775 | power = 0; | ||
3776 | |||
3777 | group = child->groups; | ||
3778 | do { | ||
3779 | power += group->cpu_power; | ||
3780 | group = group->next; | ||
3781 | } while (group != child->groups); | ||
3782 | |||
3783 | sdg->cpu_power = power; | ||
3784 | } | ||
3614 | 3785 | ||
3615 | /** | 3786 | /** |
3616 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3787 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
@@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3624 | * @balance: Should we balance. | 3795 | * @balance: Should we balance. |
3625 | * @sgs: variable to hold the statistics for this group. | 3796 | * @sgs: variable to hold the statistics for this group. |
3626 | */ | 3797 | */ |
3627 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | 3798 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
3799 | struct sched_group *group, int this_cpu, | ||
3628 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 3800 | enum cpu_idle_type idle, int load_idx, int *sd_idle, |
3629 | int local_group, const struct cpumask *cpus, | 3801 | int local_group, const struct cpumask *cpus, |
3630 | int *balance, struct sg_lb_stats *sgs) | 3802 | int *balance, struct sg_lb_stats *sgs) |
@@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3635 | unsigned long sum_avg_load_per_task; | 3807 | unsigned long sum_avg_load_per_task; |
3636 | unsigned long avg_load_per_task; | 3808 | unsigned long avg_load_per_task; |
3637 | 3809 | ||
3638 | if (local_group) | 3810 | if (local_group) { |
3639 | balance_cpu = group_first_cpu(group); | 3811 | balance_cpu = group_first_cpu(group); |
3812 | if (balance_cpu == this_cpu) | ||
3813 | update_group_power(sd, this_cpu); | ||
3814 | } | ||
3640 | 3815 | ||
3641 | /* Tally up the load of all CPUs in the group */ | 3816 | /* Tally up the load of all CPUs in the group */ |
3642 | sum_avg_load_per_task = avg_load_per_task = 0; | 3817 | sum_avg_load_per_task = avg_load_per_task = 0; |
@@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3685 | } | 3860 | } |
3686 | 3861 | ||
3687 | /* Adjust by relative CPU power of the group */ | 3862 | /* Adjust by relative CPU power of the group */ |
3688 | sgs->avg_load = sg_div_cpu_power(group, | 3863 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
3689 | sgs->group_load * SCHED_LOAD_SCALE); | ||
3690 | 3864 | ||
3691 | 3865 | ||
3692 | /* | 3866 | /* |
@@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3698 | * normalized nr_running number somewhere that negates | 3872 | * normalized nr_running number somewhere that negates |
3699 | * the hierarchy? | 3873 | * the hierarchy? |
3700 | */ | 3874 | */ |
3701 | avg_load_per_task = sg_div_cpu_power(group, | 3875 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / |
3702 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3876 | group->cpu_power; |
3703 | 3877 | ||
3704 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3878 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) |
3705 | sgs->group_imb = 1; | 3879 | sgs->group_imb = 1; |
3706 | 3880 | ||
3707 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3881 | sgs->group_capacity = |
3708 | 3882 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | |
3709 | } | 3883 | } |
3710 | 3884 | ||
3711 | /** | 3885 | /** |
@@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3723 | const struct cpumask *cpus, int *balance, | 3897 | const struct cpumask *cpus, int *balance, |
3724 | struct sd_lb_stats *sds) | 3898 | struct sd_lb_stats *sds) |
3725 | { | 3899 | { |
3900 | struct sched_domain *child = sd->child; | ||
3726 | struct sched_group *group = sd->groups; | 3901 | struct sched_group *group = sd->groups; |
3727 | struct sg_lb_stats sgs; | 3902 | struct sg_lb_stats sgs; |
3728 | int load_idx; | 3903 | int load_idx, prefer_sibling = 0; |
3904 | |||
3905 | if (child && child->flags & SD_PREFER_SIBLING) | ||
3906 | prefer_sibling = 1; | ||
3729 | 3907 | ||
3730 | init_sd_power_savings_stats(sd, sds, idle); | 3908 | init_sd_power_savings_stats(sd, sds, idle); |
3731 | load_idx = get_sd_load_idx(sd, idle); | 3909 | load_idx = get_sd_load_idx(sd, idle); |
@@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3736 | local_group = cpumask_test_cpu(this_cpu, | 3914 | local_group = cpumask_test_cpu(this_cpu, |
3737 | sched_group_cpus(group)); | 3915 | sched_group_cpus(group)); |
3738 | memset(&sgs, 0, sizeof(sgs)); | 3916 | memset(&sgs, 0, sizeof(sgs)); |
3739 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | 3917 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, |
3740 | local_group, cpus, balance, &sgs); | 3918 | local_group, cpus, balance, &sgs); |
3741 | 3919 | ||
3742 | if (local_group && balance && !(*balance)) | 3920 | if (local_group && balance && !(*balance)) |
3743 | return; | 3921 | return; |
3744 | 3922 | ||
3745 | sds->total_load += sgs.group_load; | 3923 | sds->total_load += sgs.group_load; |
3746 | sds->total_pwr += group->__cpu_power; | 3924 | sds->total_pwr += group->cpu_power; |
3925 | |||
3926 | /* | ||
3927 | * In case the child domain prefers tasks go to siblings | ||
3928 | * first, lower the group capacity to one so that we'll try | ||
3929 | * and move all the excess tasks away. | ||
3930 | */ | ||
3931 | if (prefer_sibling) | ||
3932 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
3747 | 3933 | ||
3748 | if (local_group) { | 3934 | if (local_group) { |
3749 | sds->this_load = sgs.avg_load; | 3935 | sds->this_load = sgs.avg_load; |
@@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3763 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 3949 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
3764 | group = group->next; | 3950 | group = group->next; |
3765 | } while (group != sd->groups); | 3951 | } while (group != sd->groups); |
3766 | |||
3767 | } | 3952 | } |
3768 | 3953 | ||
3769 | /** | 3954 | /** |
@@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
3801 | * moving them. | 3986 | * moving them. |
3802 | */ | 3987 | */ |
3803 | 3988 | ||
3804 | pwr_now += sds->busiest->__cpu_power * | 3989 | pwr_now += sds->busiest->cpu_power * |
3805 | min(sds->busiest_load_per_task, sds->max_load); | 3990 | min(sds->busiest_load_per_task, sds->max_load); |
3806 | pwr_now += sds->this->__cpu_power * | 3991 | pwr_now += sds->this->cpu_power * |
3807 | min(sds->this_load_per_task, sds->this_load); | 3992 | min(sds->this_load_per_task, sds->this_load); |
3808 | pwr_now /= SCHED_LOAD_SCALE; | 3993 | pwr_now /= SCHED_LOAD_SCALE; |
3809 | 3994 | ||
3810 | /* Amount of load we'd subtract */ | 3995 | /* Amount of load we'd subtract */ |
3811 | tmp = sg_div_cpu_power(sds->busiest, | 3996 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3812 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3997 | sds->busiest->cpu_power; |
3813 | if (sds->max_load > tmp) | 3998 | if (sds->max_load > tmp) |
3814 | pwr_move += sds->busiest->__cpu_power * | 3999 | pwr_move += sds->busiest->cpu_power * |
3815 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 4000 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
3816 | 4001 | ||
3817 | /* Amount of load we'd add */ | 4002 | /* Amount of load we'd add */ |
3818 | if (sds->max_load * sds->busiest->__cpu_power < | 4003 | if (sds->max_load * sds->busiest->cpu_power < |
3819 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 4004 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) |
3820 | tmp = sg_div_cpu_power(sds->this, | 4005 | tmp = (sds->max_load * sds->busiest->cpu_power) / |
3821 | sds->max_load * sds->busiest->__cpu_power); | 4006 | sds->this->cpu_power; |
3822 | else | 4007 | else |
3823 | tmp = sg_div_cpu_power(sds->this, | 4008 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3824 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 4009 | sds->this->cpu_power; |
3825 | pwr_move += sds->this->__cpu_power * | 4010 | pwr_move += sds->this->cpu_power * |
3826 | min(sds->this_load_per_task, sds->this_load + tmp); | 4011 | min(sds->this_load_per_task, sds->this_load + tmp); |
3827 | pwr_move /= SCHED_LOAD_SCALE; | 4012 | pwr_move /= SCHED_LOAD_SCALE; |
3828 | 4013 | ||
@@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3857 | sds->max_load - sds->busiest_load_per_task); | 4042 | sds->max_load - sds->busiest_load_per_task); |
3858 | 4043 | ||
3859 | /* How much load to actually move to equalise the imbalance */ | 4044 | /* How much load to actually move to equalise the imbalance */ |
3860 | *imbalance = min(max_pull * sds->busiest->__cpu_power, | 4045 | *imbalance = min(max_pull * sds->busiest->cpu_power, |
3861 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) | 4046 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) |
3862 | / SCHED_LOAD_SCALE; | 4047 | / SCHED_LOAD_SCALE; |
3863 | 4048 | ||
3864 | /* | 4049 | /* |
@@ -3976,6 +4161,26 @@ ret: | |||
3976 | return NULL; | 4161 | return NULL; |
3977 | } | 4162 | } |
3978 | 4163 | ||
4164 | static struct sched_group *group_of(int cpu) | ||
4165 | { | ||
4166 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
4167 | |||
4168 | if (!sd) | ||
4169 | return NULL; | ||
4170 | |||
4171 | return sd->groups; | ||
4172 | } | ||
4173 | |||
4174 | static unsigned long power_of(int cpu) | ||
4175 | { | ||
4176 | struct sched_group *group = group_of(cpu); | ||
4177 | |||
4178 | if (!group) | ||
4179 | return SCHED_LOAD_SCALE; | ||
4180 | |||
4181 | return group->cpu_power; | ||
4182 | } | ||
4183 | |||
3979 | /* | 4184 | /* |
3980 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4185 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
3981 | */ | 4186 | */ |
@@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
3988 | int i; | 4193 | int i; |
3989 | 4194 | ||
3990 | for_each_cpu(i, sched_group_cpus(group)) { | 4195 | for_each_cpu(i, sched_group_cpus(group)) { |
4196 | unsigned long power = power_of(i); | ||
4197 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
3991 | unsigned long wl; | 4198 | unsigned long wl; |
3992 | 4199 | ||
3993 | if (!cpumask_test_cpu(i, cpus)) | 4200 | if (!cpumask_test_cpu(i, cpus)) |
3994 | continue; | 4201 | continue; |
3995 | 4202 | ||
3996 | rq = cpu_rq(i); | 4203 | rq = cpu_rq(i); |
3997 | wl = weighted_cpuload(i); | 4204 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; |
4205 | wl /= power; | ||
3998 | 4206 | ||
3999 | if (rq->nr_running == 1 && wl > imbalance) | 4207 | if (capacity && rq->nr_running == 1 && wl > imbalance) |
4000 | continue; | 4208 | continue; |
4001 | 4209 | ||
4002 | if (wl > max_load) { | 4210 | if (wl > max_load) { |
@@ -5349,10 +5557,7 @@ need_resched_nonpreemptible: | |||
5349 | switch_count = &prev->nvcsw; | 5557 | switch_count = &prev->nvcsw; |
5350 | } | 5558 | } |
5351 | 5559 | ||
5352 | #ifdef CONFIG_SMP | 5560 | pre_schedule(rq, prev); |
5353 | if (prev->sched_class->pre_schedule) | ||
5354 | prev->sched_class->pre_schedule(rq, prev); | ||
5355 | #endif | ||
5356 | 5561 | ||
5357 | if (unlikely(!rq->nr_running)) | 5562 | if (unlikely(!rq->nr_running)) |
5358 | idle_balance(cpu, rq); | 5563 | idle_balance(cpu, rq); |
@@ -5378,6 +5583,8 @@ need_resched_nonpreemptible: | |||
5378 | } else | 5583 | } else |
5379 | spin_unlock_irq(&rq->lock); | 5584 | spin_unlock_irq(&rq->lock); |
5380 | 5585 | ||
5586 | post_schedule(rq); | ||
5587 | |||
5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5588 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
5382 | goto need_resched_nonpreemptible; | 5589 | goto need_resched_nonpreemptible; |
5383 | 5590 | ||
@@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
6123 | unsigned long flags; | 6330 | unsigned long flags; |
6124 | const struct sched_class *prev_class = p->sched_class; | 6331 | const struct sched_class *prev_class = p->sched_class; |
6125 | struct rq *rq; | 6332 | struct rq *rq; |
6333 | int reset_on_fork; | ||
6126 | 6334 | ||
6127 | /* may grab non-irq protected spin_locks */ | 6335 | /* may grab non-irq protected spin_locks */ |
6128 | BUG_ON(in_interrupt()); | 6336 | BUG_ON(in_interrupt()); |
6129 | recheck: | 6337 | recheck: |
6130 | /* double check policy once rq lock held */ | 6338 | /* double check policy once rq lock held */ |
6131 | if (policy < 0) | 6339 | if (policy < 0) { |
6340 | reset_on_fork = p->sched_reset_on_fork; | ||
6132 | policy = oldpolicy = p->policy; | 6341 | policy = oldpolicy = p->policy; |
6133 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 6342 | } else { |
6134 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 6343 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
6135 | policy != SCHED_IDLE) | 6344 | policy &= ~SCHED_RESET_ON_FORK; |
6136 | return -EINVAL; | 6345 | |
6346 | if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
6347 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
6348 | policy != SCHED_IDLE) | ||
6349 | return -EINVAL; | ||
6350 | } | ||
6351 | |||
6137 | /* | 6352 | /* |
6138 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 6353 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
6139 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 6354 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
@@ -6177,6 +6392,10 @@ recheck: | |||
6177 | /* can't change other user's priorities */ | 6392 | /* can't change other user's priorities */ |
6178 | if (!check_same_owner(p)) | 6393 | if (!check_same_owner(p)) |
6179 | return -EPERM; | 6394 | return -EPERM; |
6395 | |||
6396 | /* Normal users shall not reset the sched_reset_on_fork flag */ | ||
6397 | if (p->sched_reset_on_fork && !reset_on_fork) | ||
6398 | return -EPERM; | ||
6180 | } | 6399 | } |
6181 | 6400 | ||
6182 | if (user) { | 6401 | if (user) { |
@@ -6220,6 +6439,8 @@ recheck: | |||
6220 | if (running) | 6439 | if (running) |
6221 | p->sched_class->put_prev_task(rq, p); | 6440 | p->sched_class->put_prev_task(rq, p); |
6222 | 6441 | ||
6442 | p->sched_reset_on_fork = reset_on_fork; | ||
6443 | |||
6223 | oldprio = p->prio; | 6444 | oldprio = p->prio; |
6224 | __setscheduler(rq, p, policy, param->sched_priority); | 6445 | __setscheduler(rq, p, policy, param->sched_priority); |
6225 | 6446 | ||
@@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
6336 | if (p) { | 6557 | if (p) { |
6337 | retval = security_task_getscheduler(p); | 6558 | retval = security_task_getscheduler(p); |
6338 | if (!retval) | 6559 | if (!retval) |
6339 | retval = p->policy; | 6560 | retval = p->policy |
6561 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | ||
6340 | } | 6562 | } |
6341 | read_unlock(&tasklist_lock); | 6563 | read_unlock(&tasklist_lock); |
6342 | return retval; | 6564 | return retval; |
6343 | } | 6565 | } |
6344 | 6566 | ||
6345 | /** | 6567 | /** |
6346 | * sys_sched_getscheduler - get the RT priority of a thread | 6568 | * sys_sched_getparam - get the RT priority of a thread |
6347 | * @pid: the pid in question. | 6569 | * @pid: the pid in question. |
6348 | * @param: structure containing the RT priority. | 6570 | * @param: structure containing the RT priority. |
6349 | */ | 6571 | */ |
@@ -6571,19 +6793,9 @@ static inline int should_resched(void) | |||
6571 | 6793 | ||
6572 | static void __cond_resched(void) | 6794 | static void __cond_resched(void) |
6573 | { | 6795 | { |
6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6796 | add_preempt_count(PREEMPT_ACTIVE); |
6575 | __might_sleep(__FILE__, __LINE__); | 6797 | schedule(); |
6576 | #endif | 6798 | sub_preempt_count(PREEMPT_ACTIVE); |
6577 | /* | ||
6578 | * The BKS might be reacquired before we have dropped | ||
6579 | * PREEMPT_ACTIVE, which could trigger a second | ||
6580 | * cond_resched() call. | ||
6581 | */ | ||
6582 | do { | ||
6583 | add_preempt_count(PREEMPT_ACTIVE); | ||
6584 | schedule(); | ||
6585 | sub_preempt_count(PREEMPT_ACTIVE); | ||
6586 | } while (need_resched()); | ||
6587 | } | 6799 | } |
6588 | 6800 | ||
6589 | int __sched _cond_resched(void) | 6801 | int __sched _cond_resched(void) |
@@ -6597,14 +6809,14 @@ int __sched _cond_resched(void) | |||
6597 | EXPORT_SYMBOL(_cond_resched); | 6809 | EXPORT_SYMBOL(_cond_resched); |
6598 | 6810 | ||
6599 | /* | 6811 | /* |
6600 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 6812 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
6601 | * call schedule, and on return reacquire the lock. | 6813 | * call schedule, and on return reacquire the lock. |
6602 | * | 6814 | * |
6603 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 6815 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
6604 | * operations here to prevent schedule() from being called twice (once via | 6816 | * operations here to prevent schedule() from being called twice (once via |
6605 | * spin_unlock(), once by hand). | 6817 | * spin_unlock(), once by hand). |
6606 | */ | 6818 | */ |
6607 | int cond_resched_lock(spinlock_t *lock) | 6819 | int __cond_resched_lock(spinlock_t *lock) |
6608 | { | 6820 | { |
6609 | int resched = should_resched(); | 6821 | int resched = should_resched(); |
6610 | int ret = 0; | 6822 | int ret = 0; |
@@ -6622,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock) | |||
6622 | } | 6834 | } |
6623 | return ret; | 6835 | return ret; |
6624 | } | 6836 | } |
6625 | EXPORT_SYMBOL(cond_resched_lock); | 6837 | EXPORT_SYMBOL(__cond_resched_lock); |
6626 | 6838 | ||
6627 | int __sched cond_resched_softirq(void) | 6839 | int __sched __cond_resched_softirq(void) |
6628 | { | 6840 | { |
6629 | BUG_ON(!in_softirq()); | 6841 | BUG_ON(!in_softirq()); |
6630 | 6842 | ||
@@ -6636,7 +6848,7 @@ int __sched cond_resched_softirq(void) | |||
6636 | } | 6848 | } |
6637 | return 0; | 6849 | return 0; |
6638 | } | 6850 | } |
6639 | EXPORT_SYMBOL(cond_resched_softirq); | 6851 | EXPORT_SYMBOL(__cond_resched_softirq); |
6640 | 6852 | ||
6641 | /** | 6853 | /** |
6642 | * yield - yield the current processor to other threads. | 6854 | * yield - yield the current processor to other threads. |
@@ -6660,11 +6872,13 @@ EXPORT_SYMBOL(yield); | |||
6660 | */ | 6872 | */ |
6661 | void __sched io_schedule(void) | 6873 | void __sched io_schedule(void) |
6662 | { | 6874 | { |
6663 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6875 | struct rq *rq = raw_rq(); |
6664 | 6876 | ||
6665 | delayacct_blkio_start(); | 6877 | delayacct_blkio_start(); |
6666 | atomic_inc(&rq->nr_iowait); | 6878 | atomic_inc(&rq->nr_iowait); |
6879 | current->in_iowait = 1; | ||
6667 | schedule(); | 6880 | schedule(); |
6881 | current->in_iowait = 0; | ||
6668 | atomic_dec(&rq->nr_iowait); | 6882 | atomic_dec(&rq->nr_iowait); |
6669 | delayacct_blkio_end(); | 6883 | delayacct_blkio_end(); |
6670 | } | 6884 | } |
@@ -6672,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule); | |||
6672 | 6886 | ||
6673 | long __sched io_schedule_timeout(long timeout) | 6887 | long __sched io_schedule_timeout(long timeout) |
6674 | { | 6888 | { |
6675 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6889 | struct rq *rq = raw_rq(); |
6676 | long ret; | 6890 | long ret; |
6677 | 6891 | ||
6678 | delayacct_blkio_start(); | 6892 | delayacct_blkio_start(); |
6679 | atomic_inc(&rq->nr_iowait); | 6893 | atomic_inc(&rq->nr_iowait); |
6894 | current->in_iowait = 1; | ||
6680 | ret = schedule_timeout(timeout); | 6895 | ret = schedule_timeout(timeout); |
6896 | current->in_iowait = 0; | ||
6681 | atomic_dec(&rq->nr_iowait); | 6897 | atomic_dec(&rq->nr_iowait); |
6682 | delayacct_blkio_end(); | 6898 | delayacct_blkio_end(); |
6683 | return ret; | 6899 | return ret; |
@@ -6994,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
6994 | 7210 | ||
6995 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7211 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { |
6996 | /* Need help from migration thread: drop lock and wait. */ | 7212 | /* Need help from migration thread: drop lock and wait. */ |
7213 | struct task_struct *mt = rq->migration_thread; | ||
7214 | |||
7215 | get_task_struct(mt); | ||
6997 | task_rq_unlock(rq, &flags); | 7216 | task_rq_unlock(rq, &flags); |
6998 | wake_up_process(rq->migration_thread); | 7217 | wake_up_process(rq->migration_thread); |
7218 | put_task_struct(mt); | ||
6999 | wait_for_completion(&req.done); | 7219 | wait_for_completion(&req.done); |
7000 | tlb_migrate_finish(p->mm); | 7220 | tlb_migrate_finish(p->mm); |
7001 | return 0; | 7221 | return 0; |
@@ -7642,7 +7862,7 @@ static int __init migration_init(void) | |||
7642 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 7862 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
7643 | register_cpu_notifier(&migration_notifier); | 7863 | register_cpu_notifier(&migration_notifier); |
7644 | 7864 | ||
7645 | return err; | 7865 | return 0; |
7646 | } | 7866 | } |
7647 | early_initcall(migration_init); | 7867 | early_initcall(migration_init); |
7648 | #endif | 7868 | #endif |
@@ -7689,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7689 | break; | 7909 | break; |
7690 | } | 7910 | } |
7691 | 7911 | ||
7692 | if (!group->__cpu_power) { | 7912 | if (!group->cpu_power) { |
7693 | printk(KERN_CONT "\n"); | 7913 | printk(KERN_CONT "\n"); |
7694 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 7914 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
7695 | "set\n"); | 7915 | "set\n"); |
@@ -7713,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7713 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 7933 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
7714 | 7934 | ||
7715 | printk(KERN_CONT " %s", str); | 7935 | printk(KERN_CONT " %s", str); |
7716 | if (group->__cpu_power != SCHED_LOAD_SCALE) { | 7936 | if (group->cpu_power != SCHED_LOAD_SCALE) { |
7717 | printk(KERN_CONT " (__cpu_power = %d)", | 7937 | printk(KERN_CONT " (cpu_power = %d)", |
7718 | group->__cpu_power); | 7938 | group->cpu_power); |
7719 | } | 7939 | } |
7720 | 7940 | ||
7721 | group = group->next; | 7941 | group = group->next; |
@@ -7858,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
7858 | rq->rd = rd; | 8078 | rq->rd = rd; |
7859 | 8079 | ||
7860 | cpumask_set_cpu(rq->cpu, rd->span); | 8080 | cpumask_set_cpu(rq->cpu, rd->span); |
7861 | if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) | 8081 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
7862 | set_rq_online(rq); | 8082 | set_rq_online(rq); |
7863 | 8083 | ||
7864 | spin_unlock_irqrestore(&rq->lock, flags); | 8084 | spin_unlock_irqrestore(&rq->lock, flags); |
@@ -8000,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
8000 | continue; | 8220 | continue; |
8001 | 8221 | ||
8002 | cpumask_clear(sched_group_cpus(sg)); | 8222 | cpumask_clear(sched_group_cpus(sg)); |
8003 | sg->__cpu_power = 0; | 8223 | sg->cpu_power = 0; |
8004 | 8224 | ||
8005 | for_each_cpu(j, span) { | 8225 | for_each_cpu(j, span) { |
8006 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | 8226 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
@@ -8108,6 +8328,39 @@ struct static_sched_domain { | |||
8108 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 8328 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); |
8109 | }; | 8329 | }; |
8110 | 8330 | ||
8331 | struct s_data { | ||
8332 | #ifdef CONFIG_NUMA | ||
8333 | int sd_allnodes; | ||
8334 | cpumask_var_t domainspan; | ||
8335 | cpumask_var_t covered; | ||
8336 | cpumask_var_t notcovered; | ||
8337 | #endif | ||
8338 | cpumask_var_t nodemask; | ||
8339 | cpumask_var_t this_sibling_map; | ||
8340 | cpumask_var_t this_core_map; | ||
8341 | cpumask_var_t send_covered; | ||
8342 | cpumask_var_t tmpmask; | ||
8343 | struct sched_group **sched_group_nodes; | ||
8344 | struct root_domain *rd; | ||
8345 | }; | ||
8346 | |||
8347 | enum s_alloc { | ||
8348 | sa_sched_groups = 0, | ||
8349 | sa_rootdomain, | ||
8350 | sa_tmpmask, | ||
8351 | sa_send_covered, | ||
8352 | sa_this_core_map, | ||
8353 | sa_this_sibling_map, | ||
8354 | sa_nodemask, | ||
8355 | sa_sched_group_nodes, | ||
8356 | #ifdef CONFIG_NUMA | ||
8357 | sa_notcovered, | ||
8358 | sa_covered, | ||
8359 | sa_domainspan, | ||
8360 | #endif | ||
8361 | sa_none, | ||
8362 | }; | ||
8363 | |||
8111 | /* | 8364 | /* |
8112 | * SMT sched-domains: | 8365 | * SMT sched-domains: |
8113 | */ | 8366 | */ |
@@ -8225,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
8225 | continue; | 8478 | continue; |
8226 | } | 8479 | } |
8227 | 8480 | ||
8228 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 8481 | sg->cpu_power += sd->groups->cpu_power; |
8229 | } | 8482 | } |
8230 | sg = sg->next; | 8483 | sg = sg->next; |
8231 | } while (sg != group_head); | 8484 | } while (sg != group_head); |
8232 | } | 8485 | } |
8486 | |||
8487 | static int build_numa_sched_groups(struct s_data *d, | ||
8488 | const struct cpumask *cpu_map, int num) | ||
8489 | { | ||
8490 | struct sched_domain *sd; | ||
8491 | struct sched_group *sg, *prev; | ||
8492 | int n, j; | ||
8493 | |||
8494 | cpumask_clear(d->covered); | ||
8495 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
8496 | if (cpumask_empty(d->nodemask)) { | ||
8497 | d->sched_group_nodes[num] = NULL; | ||
8498 | goto out; | ||
8499 | } | ||
8500 | |||
8501 | sched_domain_node_span(num, d->domainspan); | ||
8502 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
8503 | |||
8504 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8505 | GFP_KERNEL, num); | ||
8506 | if (!sg) { | ||
8507 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
8508 | num); | ||
8509 | return -ENOMEM; | ||
8510 | } | ||
8511 | d->sched_group_nodes[num] = sg; | ||
8512 | |||
8513 | for_each_cpu(j, d->nodemask) { | ||
8514 | sd = &per_cpu(node_domains, j).sd; | ||
8515 | sd->groups = sg; | ||
8516 | } | ||
8517 | |||
8518 | sg->cpu_power = 0; | ||
8519 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
8520 | sg->next = sg; | ||
8521 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
8522 | |||
8523 | prev = sg; | ||
8524 | for (j = 0; j < nr_node_ids; j++) { | ||
8525 | n = (num + j) % nr_node_ids; | ||
8526 | cpumask_complement(d->notcovered, d->covered); | ||
8527 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
8528 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
8529 | if (cpumask_empty(d->tmpmask)) | ||
8530 | break; | ||
8531 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
8532 | if (cpumask_empty(d->tmpmask)) | ||
8533 | continue; | ||
8534 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8535 | GFP_KERNEL, num); | ||
8536 | if (!sg) { | ||
8537 | printk(KERN_WARNING | ||
8538 | "Can not alloc domain group for node %d\n", j); | ||
8539 | return -ENOMEM; | ||
8540 | } | ||
8541 | sg->cpu_power = 0; | ||
8542 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
8543 | sg->next = prev->next; | ||
8544 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
8545 | prev->next = sg; | ||
8546 | prev = sg; | ||
8547 | } | ||
8548 | out: | ||
8549 | return 0; | ||
8550 | } | ||
8233 | #endif /* CONFIG_NUMA */ | 8551 | #endif /* CONFIG_NUMA */ |
8234 | 8552 | ||
8235 | #ifdef CONFIG_NUMA | 8553 | #ifdef CONFIG_NUMA |
@@ -8283,15 +8601,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
8283 | * there are asymmetries in the topology. If there are asymmetries, group | 8601 | * there are asymmetries in the topology. If there are asymmetries, group |
8284 | * having more cpu_power will pickup more load compared to the group having | 8602 | * having more cpu_power will pickup more load compared to the group having |
8285 | * less cpu_power. | 8603 | * less cpu_power. |
8286 | * | ||
8287 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | ||
8288 | * the maximum number of tasks a group can handle in the presence of other idle | ||
8289 | * or lightly loaded groups in the same sched domain. | ||
8290 | */ | 8604 | */ |
8291 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 8605 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
8292 | { | 8606 | { |
8293 | struct sched_domain *child; | 8607 | struct sched_domain *child; |
8294 | struct sched_group *group; | 8608 | struct sched_group *group; |
8609 | long power; | ||
8610 | int weight; | ||
8295 | 8611 | ||
8296 | WARN_ON(!sd || !sd->groups); | 8612 | WARN_ON(!sd || !sd->groups); |
8297 | 8613 | ||
@@ -8300,28 +8616,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
8300 | 8616 | ||
8301 | child = sd->child; | 8617 | child = sd->child; |
8302 | 8618 | ||
8303 | sd->groups->__cpu_power = 0; | 8619 | sd->groups->cpu_power = 0; |
8304 | 8620 | ||
8305 | /* | 8621 | if (!child) { |
8306 | * For perf policy, if the groups in child domain share resources | 8622 | power = SCHED_LOAD_SCALE; |
8307 | * (for example cores sharing some portions of the cache hierarchy | 8623 | weight = cpumask_weight(sched_domain_span(sd)); |
8308 | * or SMT), then set this domain groups cpu_power such that each group | 8624 | /* |
8309 | * can handle only one task, when there are other idle groups in the | 8625 | * SMT siblings share the power of a single core. |
8310 | * same sched domain. | 8626 | * Usually multiple threads get a better yield out of |
8311 | */ | 8627 | * that one core than a single thread would have, |
8312 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 8628 | * reflect that in sd->smt_gain. |
8313 | (child->flags & | 8629 | */ |
8314 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 8630 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
8315 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 8631 | power *= sd->smt_gain; |
8632 | power /= weight; | ||
8633 | power >>= SCHED_LOAD_SHIFT; | ||
8634 | } | ||
8635 | sd->groups->cpu_power += power; | ||
8316 | return; | 8636 | return; |
8317 | } | 8637 | } |
8318 | 8638 | ||
8319 | /* | 8639 | /* |
8320 | * add cpu_power of each child group to this groups cpu_power | 8640 | * Add cpu_power of each child group to this groups cpu_power. |
8321 | */ | 8641 | */ |
8322 | group = child->groups; | 8642 | group = child->groups; |
8323 | do { | 8643 | do { |
8324 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 8644 | sd->groups->cpu_power += group->cpu_power; |
8325 | group = group->next; | 8645 | group = group->next; |
8326 | } while (group != child->groups); | 8646 | } while (group != child->groups); |
8327 | } | 8647 | } |
@@ -8395,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
8395 | } | 8715 | } |
8396 | } | 8716 | } |
8397 | 8717 | ||
8398 | /* | 8718 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
8399 | * Build sched domains for a given set of cpus and attach the sched domains | 8719 | const struct cpumask *cpu_map) |
8400 | * to the individual cpus | 8720 | { |
8401 | */ | 8721 | switch (what) { |
8402 | static int __build_sched_domains(const struct cpumask *cpu_map, | 8722 | case sa_sched_groups: |
8403 | struct sched_domain_attr *attr) | 8723 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ |
8404 | { | 8724 | d->sched_group_nodes = NULL; |
8405 | int i, err = -ENOMEM; | 8725 | case sa_rootdomain: |
8406 | struct root_domain *rd; | 8726 | free_rootdomain(d->rd); /* fall through */ |
8407 | cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, | 8727 | case sa_tmpmask: |
8408 | tmpmask; | 8728 | free_cpumask_var(d->tmpmask); /* fall through */ |
8729 | case sa_send_covered: | ||
8730 | free_cpumask_var(d->send_covered); /* fall through */ | ||
8731 | case sa_this_core_map: | ||
8732 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
8733 | case sa_this_sibling_map: | ||
8734 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
8735 | case sa_nodemask: | ||
8736 | free_cpumask_var(d->nodemask); /* fall through */ | ||
8737 | case sa_sched_group_nodes: | ||
8409 | #ifdef CONFIG_NUMA | 8738 | #ifdef CONFIG_NUMA |
8410 | cpumask_var_t domainspan, covered, notcovered; | 8739 | kfree(d->sched_group_nodes); /* fall through */ |
8411 | struct sched_group **sched_group_nodes = NULL; | 8740 | case sa_notcovered: |
8412 | int sd_allnodes = 0; | 8741 | free_cpumask_var(d->notcovered); /* fall through */ |
8413 | 8742 | case sa_covered: | |
8414 | if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) | 8743 | free_cpumask_var(d->covered); /* fall through */ |
8415 | goto out; | 8744 | case sa_domainspan: |
8416 | if (!alloc_cpumask_var(&covered, GFP_KERNEL)) | 8745 | free_cpumask_var(d->domainspan); /* fall through */ |
8417 | goto free_domainspan; | 8746 | #endif |
8418 | if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) | 8747 | case sa_none: |
8419 | goto free_covered; | 8748 | break; |
8420 | #endif | 8749 | } |
8421 | 8750 | } | |
8422 | if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) | ||
8423 | goto free_notcovered; | ||
8424 | if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) | ||
8425 | goto free_nodemask; | ||
8426 | if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) | ||
8427 | goto free_this_sibling_map; | ||
8428 | if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) | ||
8429 | goto free_this_core_map; | ||
8430 | if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
8431 | goto free_send_covered; | ||
8432 | 8751 | ||
8752 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | ||
8753 | const struct cpumask *cpu_map) | ||
8754 | { | ||
8433 | #ifdef CONFIG_NUMA | 8755 | #ifdef CONFIG_NUMA |
8434 | /* | 8756 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) |
8435 | * Allocate the per-node list of sched groups | 8757 | return sa_none; |
8436 | */ | 8758 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) |
8437 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), | 8759 | return sa_domainspan; |
8438 | GFP_KERNEL); | 8760 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) |
8439 | if (!sched_group_nodes) { | 8761 | return sa_covered; |
8762 | /* Allocate the per-node list of sched groups */ | ||
8763 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
8764 | sizeof(struct sched_group *), GFP_KERNEL); | ||
8765 | if (!d->sched_group_nodes) { | ||
8440 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 8766 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
8441 | goto free_tmpmask; | 8767 | return sa_notcovered; |
8442 | } | 8768 | } |
8443 | #endif | 8769 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; |
8444 | 8770 | #endif | |
8445 | rd = alloc_rootdomain(); | 8771 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
8446 | if (!rd) { | 8772 | return sa_sched_group_nodes; |
8773 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
8774 | return sa_nodemask; | ||
8775 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
8776 | return sa_this_sibling_map; | ||
8777 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
8778 | return sa_this_core_map; | ||
8779 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
8780 | return sa_send_covered; | ||
8781 | d->rd = alloc_rootdomain(); | ||
8782 | if (!d->rd) { | ||
8447 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 8783 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
8448 | goto free_sched_groups; | 8784 | return sa_tmpmask; |
8449 | } | 8785 | } |
8786 | return sa_rootdomain; | ||
8787 | } | ||
8450 | 8788 | ||
8789 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | ||
8790 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | ||
8791 | { | ||
8792 | struct sched_domain *sd = NULL; | ||
8451 | #ifdef CONFIG_NUMA | 8793 | #ifdef CONFIG_NUMA |
8452 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; | 8794 | struct sched_domain *parent; |
8453 | #endif | ||
8454 | |||
8455 | /* | ||
8456 | * Set up domains for cpus specified by the cpu_map. | ||
8457 | */ | ||
8458 | for_each_cpu(i, cpu_map) { | ||
8459 | struct sched_domain *sd = NULL, *p; | ||
8460 | 8795 | ||
8461 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); | 8796 | d->sd_allnodes = 0; |
8462 | 8797 | if (cpumask_weight(cpu_map) > | |
8463 | #ifdef CONFIG_NUMA | 8798 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { |
8464 | if (cpumask_weight(cpu_map) > | 8799 | sd = &per_cpu(allnodes_domains, i).sd; |
8465 | SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { | 8800 | SD_INIT(sd, ALLNODES); |
8466 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8467 | SD_INIT(sd, ALLNODES); | ||
8468 | set_domain_attribute(sd, attr); | ||
8469 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
8470 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
8471 | p = sd; | ||
8472 | sd_allnodes = 1; | ||
8473 | } else | ||
8474 | p = NULL; | ||
8475 | |||
8476 | sd = &per_cpu(node_domains, i).sd; | ||
8477 | SD_INIT(sd, NODE); | ||
8478 | set_domain_attribute(sd, attr); | 8801 | set_domain_attribute(sd, attr); |
8479 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | 8802 | cpumask_copy(sched_domain_span(sd), cpu_map); |
8480 | sd->parent = p; | 8803 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); |
8481 | if (p) | 8804 | d->sd_allnodes = 1; |
8482 | p->child = sd; | 8805 | } |
8483 | cpumask_and(sched_domain_span(sd), | 8806 | parent = sd; |
8484 | sched_domain_span(sd), cpu_map); | 8807 | |
8808 | sd = &per_cpu(node_domains, i).sd; | ||
8809 | SD_INIT(sd, NODE); | ||
8810 | set_domain_attribute(sd, attr); | ||
8811 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
8812 | sd->parent = parent; | ||
8813 | if (parent) | ||
8814 | parent->child = sd; | ||
8815 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
8485 | #endif | 8816 | #endif |
8817 | return sd; | ||
8818 | } | ||
8486 | 8819 | ||
8487 | p = sd; | 8820 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, |
8488 | sd = &per_cpu(phys_domains, i).sd; | 8821 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
8489 | SD_INIT(sd, CPU); | 8822 | struct sched_domain *parent, int i) |
8490 | set_domain_attribute(sd, attr); | 8823 | { |
8491 | cpumask_copy(sched_domain_span(sd), nodemask); | 8824 | struct sched_domain *sd; |
8492 | sd->parent = p; | 8825 | sd = &per_cpu(phys_domains, i).sd; |
8493 | if (p) | 8826 | SD_INIT(sd, CPU); |
8494 | p->child = sd; | 8827 | set_domain_attribute(sd, attr); |
8495 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); | 8828 | cpumask_copy(sched_domain_span(sd), d->nodemask); |
8829 | sd->parent = parent; | ||
8830 | if (parent) | ||
8831 | parent->child = sd; | ||
8832 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
8833 | return sd; | ||
8834 | } | ||
8496 | 8835 | ||
8836 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | ||
8837 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8838 | struct sched_domain *parent, int i) | ||
8839 | { | ||
8840 | struct sched_domain *sd = parent; | ||
8497 | #ifdef CONFIG_SCHED_MC | 8841 | #ifdef CONFIG_SCHED_MC |
8498 | p = sd; | 8842 | sd = &per_cpu(core_domains, i).sd; |
8499 | sd = &per_cpu(core_domains, i).sd; | 8843 | SD_INIT(sd, MC); |
8500 | SD_INIT(sd, MC); | 8844 | set_domain_attribute(sd, attr); |
8501 | set_domain_attribute(sd, attr); | 8845 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); |
8502 | cpumask_and(sched_domain_span(sd), cpu_map, | 8846 | sd->parent = parent; |
8503 | cpu_coregroup_mask(i)); | 8847 | parent->child = sd; |
8504 | sd->parent = p; | 8848 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); |
8505 | p->child = sd; | ||
8506 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); | ||
8507 | #endif | 8849 | #endif |
8850 | return sd; | ||
8851 | } | ||
8508 | 8852 | ||
8853 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
8854 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8855 | struct sched_domain *parent, int i) | ||
8856 | { | ||
8857 | struct sched_domain *sd = parent; | ||
8509 | #ifdef CONFIG_SCHED_SMT | 8858 | #ifdef CONFIG_SCHED_SMT |
8510 | p = sd; | 8859 | sd = &per_cpu(cpu_domains, i).sd; |
8511 | sd = &per_cpu(cpu_domains, i).sd; | 8860 | SD_INIT(sd, SIBLING); |
8512 | SD_INIT(sd, SIBLING); | 8861 | set_domain_attribute(sd, attr); |
8513 | set_domain_attribute(sd, attr); | 8862 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); |
8514 | cpumask_and(sched_domain_span(sd), | 8863 | sd->parent = parent; |
8515 | topology_thread_cpumask(i), cpu_map); | 8864 | parent->child = sd; |
8516 | sd->parent = p; | 8865 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); |
8517 | p->child = sd; | ||
8518 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); | ||
8519 | #endif | 8866 | #endif |
8520 | } | 8867 | return sd; |
8868 | } | ||
8521 | 8869 | ||
8870 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | ||
8871 | const struct cpumask *cpu_map, int cpu) | ||
8872 | { | ||
8873 | switch (l) { | ||
8522 | #ifdef CONFIG_SCHED_SMT | 8874 | #ifdef CONFIG_SCHED_SMT |
8523 | /* Set up CPU (sibling) groups */ | 8875 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ |
8524 | for_each_cpu(i, cpu_map) { | 8876 | cpumask_and(d->this_sibling_map, cpu_map, |
8525 | cpumask_and(this_sibling_map, | 8877 | topology_thread_cpumask(cpu)); |
8526 | topology_thread_cpumask(i), cpu_map); | 8878 | if (cpu == cpumask_first(d->this_sibling_map)) |
8527 | if (i != cpumask_first(this_sibling_map)) | 8879 | init_sched_build_groups(d->this_sibling_map, cpu_map, |
8528 | continue; | 8880 | &cpu_to_cpu_group, |
8529 | 8881 | d->send_covered, d->tmpmask); | |
8530 | init_sched_build_groups(this_sibling_map, cpu_map, | 8882 | break; |
8531 | &cpu_to_cpu_group, | ||
8532 | send_covered, tmpmask); | ||
8533 | } | ||
8534 | #endif | 8883 | #endif |
8535 | |||
8536 | #ifdef CONFIG_SCHED_MC | 8884 | #ifdef CONFIG_SCHED_MC |
8537 | /* Set up multi-core groups */ | 8885 | case SD_LV_MC: /* set up multi-core groups */ |
8538 | for_each_cpu(i, cpu_map) { | 8886 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); |
8539 | cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); | 8887 | if (cpu == cpumask_first(d->this_core_map)) |
8540 | if (i != cpumask_first(this_core_map)) | 8888 | init_sched_build_groups(d->this_core_map, cpu_map, |
8541 | continue; | 8889 | &cpu_to_core_group, |
8542 | 8890 | d->send_covered, d->tmpmask); | |
8543 | init_sched_build_groups(this_core_map, cpu_map, | 8891 | break; |
8544 | &cpu_to_core_group, | ||
8545 | send_covered, tmpmask); | ||
8546 | } | ||
8547 | #endif | 8892 | #endif |
8548 | 8893 | case SD_LV_CPU: /* set up physical groups */ | |
8549 | /* Set up physical groups */ | 8894 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
8550 | for (i = 0; i < nr_node_ids; i++) { | 8895 | if (!cpumask_empty(d->nodemask)) |
8551 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8896 | init_sched_build_groups(d->nodemask, cpu_map, |
8552 | if (cpumask_empty(nodemask)) | 8897 | &cpu_to_phys_group, |
8553 | continue; | 8898 | d->send_covered, d->tmpmask); |
8554 | 8899 | break; | |
8555 | init_sched_build_groups(nodemask, cpu_map, | ||
8556 | &cpu_to_phys_group, | ||
8557 | send_covered, tmpmask); | ||
8558 | } | ||
8559 | |||
8560 | #ifdef CONFIG_NUMA | 8900 | #ifdef CONFIG_NUMA |
8561 | /* Set up node groups */ | 8901 | case SD_LV_ALLNODES: |
8562 | if (sd_allnodes) { | 8902 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
8563 | init_sched_build_groups(cpu_map, cpu_map, | 8903 | d->send_covered, d->tmpmask); |
8564 | &cpu_to_allnodes_group, | 8904 | break; |
8565 | send_covered, tmpmask); | 8905 | #endif |
8906 | default: | ||
8907 | break; | ||
8566 | } | 8908 | } |
8909 | } | ||
8567 | 8910 | ||
8568 | for (i = 0; i < nr_node_ids; i++) { | 8911 | /* |
8569 | /* Set up node groups */ | 8912 | * Build sched domains for a given set of cpus and attach the sched domains |
8570 | struct sched_group *sg, *prev; | 8913 | * to the individual cpus |
8571 | int j; | 8914 | */ |
8572 | 8915 | static int __build_sched_domains(const struct cpumask *cpu_map, | |
8573 | cpumask_clear(covered); | 8916 | struct sched_domain_attr *attr) |
8574 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8917 | { |
8575 | if (cpumask_empty(nodemask)) { | 8918 | enum s_alloc alloc_state = sa_none; |
8576 | sched_group_nodes[i] = NULL; | 8919 | struct s_data d; |
8577 | continue; | 8920 | struct sched_domain *sd; |
8578 | } | 8921 | int i; |
8922 | #ifdef CONFIG_NUMA | ||
8923 | d.sd_allnodes = 0; | ||
8924 | #endif | ||
8579 | 8925 | ||
8580 | sched_domain_node_span(i, domainspan); | 8926 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
8581 | cpumask_and(domainspan, domainspan, cpu_map); | 8927 | if (alloc_state != sa_rootdomain) |
8928 | goto error; | ||
8929 | alloc_state = sa_sched_groups; | ||
8582 | 8930 | ||
8583 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 8931 | /* |
8584 | GFP_KERNEL, i); | 8932 | * Set up domains for cpus specified by the cpu_map. |
8585 | if (!sg) { | 8933 | */ |
8586 | printk(KERN_WARNING "Can not alloc domain group for " | 8934 | for_each_cpu(i, cpu_map) { |
8587 | "node %d\n", i); | 8935 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), |
8588 | goto error; | 8936 | cpu_map); |
8589 | } | ||
8590 | sched_group_nodes[i] = sg; | ||
8591 | for_each_cpu(j, nodemask) { | ||
8592 | struct sched_domain *sd; | ||
8593 | 8937 | ||
8594 | sd = &per_cpu(node_domains, j).sd; | 8938 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
8595 | sd->groups = sg; | 8939 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
8596 | } | 8940 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
8597 | sg->__cpu_power = 0; | 8941 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
8598 | cpumask_copy(sched_group_cpus(sg), nodemask); | 8942 | } |
8599 | sg->next = sg; | ||
8600 | cpumask_or(covered, covered, nodemask); | ||
8601 | prev = sg; | ||
8602 | 8943 | ||
8603 | for (j = 0; j < nr_node_ids; j++) { | 8944 | for_each_cpu(i, cpu_map) { |
8604 | int n = (i + j) % nr_node_ids; | 8945 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
8946 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
8947 | } | ||
8605 | 8948 | ||
8606 | cpumask_complement(notcovered, covered); | 8949 | /* Set up physical groups */ |
8607 | cpumask_and(tmpmask, notcovered, cpu_map); | 8950 | for (i = 0; i < nr_node_ids; i++) |
8608 | cpumask_and(tmpmask, tmpmask, domainspan); | 8951 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
8609 | if (cpumask_empty(tmpmask)) | ||
8610 | break; | ||
8611 | 8952 | ||
8612 | cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); | 8953 | #ifdef CONFIG_NUMA |
8613 | if (cpumask_empty(tmpmask)) | 8954 | /* Set up node groups */ |
8614 | continue; | 8955 | if (d.sd_allnodes) |
8956 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
8615 | 8957 | ||
8616 | sg = kmalloc_node(sizeof(struct sched_group) + | 8958 | for (i = 0; i < nr_node_ids; i++) |
8617 | cpumask_size(), | 8959 | if (build_numa_sched_groups(&d, cpu_map, i)) |
8618 | GFP_KERNEL, i); | 8960 | goto error; |
8619 | if (!sg) { | ||
8620 | printk(KERN_WARNING | ||
8621 | "Can not alloc domain group for node %d\n", j); | ||
8622 | goto error; | ||
8623 | } | ||
8624 | sg->__cpu_power = 0; | ||
8625 | cpumask_copy(sched_group_cpus(sg), tmpmask); | ||
8626 | sg->next = prev->next; | ||
8627 | cpumask_or(covered, covered, tmpmask); | ||
8628 | prev->next = sg; | ||
8629 | prev = sg; | ||
8630 | } | ||
8631 | } | ||
8632 | #endif | 8961 | #endif |
8633 | 8962 | ||
8634 | /* Calculate CPU power for physical packages and nodes */ | 8963 | /* Calculate CPU power for physical packages and nodes */ |
8635 | #ifdef CONFIG_SCHED_SMT | 8964 | #ifdef CONFIG_SCHED_SMT |
8636 | for_each_cpu(i, cpu_map) { | 8965 | for_each_cpu(i, cpu_map) { |
8637 | struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; | 8966 | sd = &per_cpu(cpu_domains, i).sd; |
8638 | |||
8639 | init_sched_groups_power(i, sd); | 8967 | init_sched_groups_power(i, sd); |
8640 | } | 8968 | } |
8641 | #endif | 8969 | #endif |
8642 | #ifdef CONFIG_SCHED_MC | 8970 | #ifdef CONFIG_SCHED_MC |
8643 | for_each_cpu(i, cpu_map) { | 8971 | for_each_cpu(i, cpu_map) { |
8644 | struct sched_domain *sd = &per_cpu(core_domains, i).sd; | 8972 | sd = &per_cpu(core_domains, i).sd; |
8645 | |||
8646 | init_sched_groups_power(i, sd); | 8973 | init_sched_groups_power(i, sd); |
8647 | } | 8974 | } |
8648 | #endif | 8975 | #endif |
8649 | 8976 | ||
8650 | for_each_cpu(i, cpu_map) { | 8977 | for_each_cpu(i, cpu_map) { |
8651 | struct sched_domain *sd = &per_cpu(phys_domains, i).sd; | 8978 | sd = &per_cpu(phys_domains, i).sd; |
8652 | |||
8653 | init_sched_groups_power(i, sd); | 8979 | init_sched_groups_power(i, sd); |
8654 | } | 8980 | } |
8655 | 8981 | ||
8656 | #ifdef CONFIG_NUMA | 8982 | #ifdef CONFIG_NUMA |
8657 | for (i = 0; i < nr_node_ids; i++) | 8983 | for (i = 0; i < nr_node_ids; i++) |
8658 | init_numa_sched_groups_power(sched_group_nodes[i]); | 8984 | init_numa_sched_groups_power(d.sched_group_nodes[i]); |
8659 | 8985 | ||
8660 | if (sd_allnodes) { | 8986 | if (d.sd_allnodes) { |
8661 | struct sched_group *sg; | 8987 | struct sched_group *sg; |
8662 | 8988 | ||
8663 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 8989 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, |
8664 | tmpmask); | 8990 | d.tmpmask); |
8665 | init_numa_sched_groups_power(sg); | 8991 | init_numa_sched_groups_power(sg); |
8666 | } | 8992 | } |
8667 | #endif | 8993 | #endif |
8668 | 8994 | ||
8669 | /* Attach the domains */ | 8995 | /* Attach the domains */ |
8670 | for_each_cpu(i, cpu_map) { | 8996 | for_each_cpu(i, cpu_map) { |
8671 | struct sched_domain *sd; | ||
8672 | #ifdef CONFIG_SCHED_SMT | 8997 | #ifdef CONFIG_SCHED_SMT |
8673 | sd = &per_cpu(cpu_domains, i).sd; | 8998 | sd = &per_cpu(cpu_domains, i).sd; |
8674 | #elif defined(CONFIG_SCHED_MC) | 8999 | #elif defined(CONFIG_SCHED_MC) |
@@ -8676,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
8676 | #else | 9001 | #else |
8677 | sd = &per_cpu(phys_domains, i).sd; | 9002 | sd = &per_cpu(phys_domains, i).sd; |
8678 | #endif | 9003 | #endif |
8679 | cpu_attach_domain(sd, rd, i); | 9004 | cpu_attach_domain(sd, d.rd, i); |
8680 | } | 9005 | } |
8681 | 9006 | ||
8682 | err = 0; | 9007 | d.sched_group_nodes = NULL; /* don't free this we still need it */ |
8683 | 9008 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | |
8684 | free_tmpmask: | 9009 | return 0; |
8685 | free_cpumask_var(tmpmask); | ||
8686 | free_send_covered: | ||
8687 | free_cpumask_var(send_covered); | ||
8688 | free_this_core_map: | ||
8689 | free_cpumask_var(this_core_map); | ||
8690 | free_this_sibling_map: | ||
8691 | free_cpumask_var(this_sibling_map); | ||
8692 | free_nodemask: | ||
8693 | free_cpumask_var(nodemask); | ||
8694 | free_notcovered: | ||
8695 | #ifdef CONFIG_NUMA | ||
8696 | free_cpumask_var(notcovered); | ||
8697 | free_covered: | ||
8698 | free_cpumask_var(covered); | ||
8699 | free_domainspan: | ||
8700 | free_cpumask_var(domainspan); | ||
8701 | out: | ||
8702 | #endif | ||
8703 | return err; | ||
8704 | |||
8705 | free_sched_groups: | ||
8706 | #ifdef CONFIG_NUMA | ||
8707 | kfree(sched_group_nodes); | ||
8708 | #endif | ||
8709 | goto free_tmpmask; | ||
8710 | 9010 | ||
8711 | #ifdef CONFIG_NUMA | ||
8712 | error: | 9011 | error: |
8713 | free_sched_groups(cpu_map, tmpmask); | 9012 | __free_domain_allocs(&d, alloc_state, cpu_map); |
8714 | free_rootdomain(rd); | 9013 | return -ENOMEM; |
8715 | goto free_tmpmask; | ||
8716 | #endif | ||
8717 | } | 9014 | } |
8718 | 9015 | ||
8719 | static int build_sched_domains(const struct cpumask *cpu_map) | 9016 | static int build_sched_domains(const struct cpumask *cpu_map) |
@@ -9321,11 +9618,11 @@ void __init sched_init(void) | |||
9321 | * system cpu resource, based on the weight assigned to root | 9618 | * system cpu resource, based on the weight assigned to root |
9322 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | 9619 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished |
9323 | * by letting tasks of init_task_group sit in a separate cfs_rq | 9620 | * by letting tasks of init_task_group sit in a separate cfs_rq |
9324 | * (init_cfs_rq) and having one entity represent this group of | 9621 | * (init_tg_cfs_rq) and having one entity represent this group of |
9325 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | 9622 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). |
9326 | */ | 9623 | */ |
9327 | init_tg_cfs_entry(&init_task_group, | 9624 | init_tg_cfs_entry(&init_task_group, |
9328 | &per_cpu(init_cfs_rq, i), | 9625 | &per_cpu(init_tg_cfs_rq, i), |
9329 | &per_cpu(init_sched_entity, i), i, 1, | 9626 | &per_cpu(init_sched_entity, i), i, 1, |
9330 | root_task_group.se[i]); | 9627 | root_task_group.se[i]); |
9331 | 9628 | ||
@@ -9351,6 +9648,7 @@ void __init sched_init(void) | |||
9351 | #ifdef CONFIG_SMP | 9648 | #ifdef CONFIG_SMP |
9352 | rq->sd = NULL; | 9649 | rq->sd = NULL; |
9353 | rq->rd = NULL; | 9650 | rq->rd = NULL; |
9651 | rq->post_schedule = 0; | ||
9354 | rq->active_balance = 0; | 9652 | rq->active_balance = 0; |
9355 | rq->next_balance = jiffies; | 9653 | rq->next_balance = jiffies; |
9356 | rq->push_cpu = 0; | 9654 | rq->push_cpu = 0; |
@@ -9415,13 +9713,20 @@ void __init sched_init(void) | |||
9415 | } | 9713 | } |
9416 | 9714 | ||
9417 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 9715 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
9418 | void __might_sleep(char *file, int line) | 9716 | static inline int preempt_count_equals(int preempt_offset) |
9717 | { | ||
9718 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | ||
9719 | |||
9720 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | ||
9721 | } | ||
9722 | |||
9723 | void __might_sleep(char *file, int line, int preempt_offset) | ||
9419 | { | 9724 | { |
9420 | #ifdef in_atomic | 9725 | #ifdef in_atomic |
9421 | static unsigned long prev_jiffy; /* ratelimiting */ | 9726 | static unsigned long prev_jiffy; /* ratelimiting */ |
9422 | 9727 | ||
9423 | if ((!in_atomic() && !irqs_disabled()) || | 9728 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
9424 | system_state != SYSTEM_RUNNING || oops_in_progress) | 9729 | system_state != SYSTEM_RUNNING || oops_in_progress) |
9425 | return; | 9730 | return; |
9426 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 9731 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
9427 | return; | 9732 | return; |
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index d014efbf947a..0f052fc674d5 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
127 | 127 | ||
128 | /* | 128 | /* |
129 | * If the cpu was currently mapped to a different value, we | 129 | * If the cpu was currently mapped to a different value, we |
130 | * first need to unmap the old value | 130 | * need to map it to the new value then remove the old value. |
131 | * Note, we must add the new value first, otherwise we risk the | ||
132 | * cpu being cleared from pri_active, and this cpu could be | ||
133 | * missed for a push or pull. | ||
131 | */ | 134 | */ |
132 | if (likely(oldpri != CPUPRI_INVALID)) { | ||
133 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | ||
134 | |||
135 | spin_lock_irqsave(&vec->lock, flags); | ||
136 | |||
137 | vec->count--; | ||
138 | if (!vec->count) | ||
139 | clear_bit(oldpri, cp->pri_active); | ||
140 | cpumask_clear_cpu(cpu, vec->mask); | ||
141 | |||
142 | spin_unlock_irqrestore(&vec->lock, flags); | ||
143 | } | ||
144 | |||
145 | if (likely(newpri != CPUPRI_INVALID)) { | 135 | if (likely(newpri != CPUPRI_INVALID)) { |
146 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; | 136 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; |
147 | 137 | ||
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
154 | 144 | ||
155 | spin_unlock_irqrestore(&vec->lock, flags); | 145 | spin_unlock_irqrestore(&vec->lock, flags); |
156 | } | 146 | } |
147 | if (likely(oldpri != CPUPRI_INVALID)) { | ||
148 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | ||
149 | |||
150 | spin_lock_irqsave(&vec->lock, flags); | ||
151 | |||
152 | vec->count--; | ||
153 | if (!vec->count) | ||
154 | clear_bit(oldpri, cp->pri_active); | ||
155 | cpumask_clear_cpu(cpu, vec->mask); | ||
156 | |||
157 | spin_unlock_irqrestore(&vec->lock, flags); | ||
158 | } | ||
157 | 159 | ||
158 | *currpri = newpri; | 160 | *currpri = newpri; |
159 | } | 161 | } |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 70c7e0b79946..5ddbd0891267 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
409 | PN(se.wait_max); | 409 | PN(se.wait_max); |
410 | PN(se.wait_sum); | 410 | PN(se.wait_sum); |
411 | P(se.wait_count); | 411 | P(se.wait_count); |
412 | PN(se.iowait_sum); | ||
413 | P(se.iowait_count); | ||
412 | P(sched_info.bkl_count); | 414 | P(sched_info.bkl_count); |
413 | P(se.nr_migrations); | 415 | P(se.nr_migrations); |
414 | P(se.nr_migrations_cold); | 416 | P(se.nr_migrations_cold); |
@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p) | |||
479 | p->se.wait_max = 0; | 481 | p->se.wait_max = 0; |
480 | p->se.wait_sum = 0; | 482 | p->se.wait_sum = 0; |
481 | p->se.wait_count = 0; | 483 | p->se.wait_count = 0; |
484 | p->se.iowait_sum = 0; | ||
485 | p->se.iowait_count = 0; | ||
482 | p->se.sleep_max = 0; | 486 | p->se.sleep_max = 0; |
483 | p->se.sum_sleep_runtime = 0; | 487 | p->se.sum_sleep_runtime = 0; |
484 | p->se.block_max = 0; | 488 | p->se.block_max = 0; |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 652e8bdef9aa..aa7f84121016 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -24,7 +24,7 @@ | |||
24 | 24 | ||
25 | /* | 25 | /* |
26 | * Targeted preemption latency for CPU-bound tasks: | 26 | * Targeted preemption latency for CPU-bound tasks: |
27 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) | 27 | * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) |
28 | * | 28 | * |
29 | * NOTE: this latency value is not the same as the concept of | 29 | * NOTE: this latency value is not the same as the concept of |
30 | * 'timeslice length' - timeslices in CFS are of variable length | 30 | * 'timeslice length' - timeslices in CFS are of variable length |
@@ -34,13 +34,13 @@ | |||
34 | * (to see the precise effective timeslice length of your workload, | 34 | * (to see the precise effective timeslice length of your workload, |
35 | * run vmstat and monitor the context-switches (cs) field) | 35 | * run vmstat and monitor the context-switches (cs) field) |
36 | */ | 36 | */ |
37 | unsigned int sysctl_sched_latency = 20000000ULL; | 37 | unsigned int sysctl_sched_latency = 5000000ULL; |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * Minimal preemption granularity for CPU-bound tasks: | 40 | * Minimal preemption granularity for CPU-bound tasks: |
41 | * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) | 41 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
42 | */ | 42 | */ |
43 | unsigned int sysctl_sched_min_granularity = 4000000ULL; | 43 | unsigned int sysctl_sched_min_granularity = 1000000ULL; |
44 | 44 | ||
45 | /* | 45 | /* |
46 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity | 46 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity |
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL; | |||
48 | static unsigned int sched_nr_latency = 5; | 48 | static unsigned int sched_nr_latency = 5; |
49 | 49 | ||
50 | /* | 50 | /* |
51 | * After fork, child runs first. (default) If set to 0 then | 51 | * After fork, child runs first. If set to 0 (default) then |
52 | * parent will (try to) run first. | 52 | * parent will (try to) run first. |
53 | */ | 53 | */ |
54 | const_debug unsigned int sysctl_sched_child_runs_first = 1; | 54 | unsigned int sysctl_sched_child_runs_first __read_mostly; |
55 | 55 | ||
56 | /* | 56 | /* |
57 | * sys_sched_yield() compat mode | 57 | * sys_sched_yield() compat mode |
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
63 | 63 | ||
64 | /* | 64 | /* |
65 | * SCHED_OTHER wake-up granularity. | 65 | * SCHED_OTHER wake-up granularity. |
66 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) | 66 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
67 | * | 67 | * |
68 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
69 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
70 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
71 | */ | 71 | */ |
72 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; | 72 | unsigned int sysctl_sched_wakeup_granularity = 1000000UL; |
73 | 73 | ||
74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
75 | 75 | ||
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class; | |||
79 | * CFS operations on generic schedulable entities: | 79 | * CFS operations on generic schedulable entities: |
80 | */ | 80 | */ |
81 | 81 | ||
82 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
83 | { | ||
84 | return container_of(se, struct task_struct, se); | ||
85 | } | ||
86 | |||
87 | #ifdef CONFIG_FAIR_GROUP_SCHED | 82 | #ifdef CONFIG_FAIR_GROUP_SCHED |
88 | 83 | ||
89 | /* cpu runqueue to which this cfs_rq is attached */ | 84 | /* cpu runqueue to which this cfs_rq is attached */ |
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
95 | /* An entity is a task if it doesn't "own" a runqueue */ | 90 | /* An entity is a task if it doesn't "own" a runqueue */ |
96 | #define entity_is_task(se) (!se->my_q) | 91 | #define entity_is_task(se) (!se->my_q) |
97 | 92 | ||
93 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
94 | { | ||
95 | #ifdef CONFIG_SCHED_DEBUG | ||
96 | WARN_ON_ONCE(!entity_is_task(se)); | ||
97 | #endif | ||
98 | return container_of(se, struct task_struct, se); | ||
99 | } | ||
100 | |||
98 | /* Walk up scheduling entities hierarchy */ | 101 | /* Walk up scheduling entities hierarchy */ |
99 | #define for_each_sched_entity(se) \ | 102 | #define for_each_sched_entity(se) \ |
100 | for (; se; se = se->parent) | 103 | for (; se; se = se->parent) |
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
186 | } | 189 | } |
187 | } | 190 | } |
188 | 191 | ||
189 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 192 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
193 | |||
194 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
195 | { | ||
196 | return container_of(se, struct task_struct, se); | ||
197 | } | ||
190 | 198 | ||
191 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 199 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
192 | { | 200 | { |
@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
537 | schedstat_set(se->wait_count, se->wait_count + 1); | 545 | schedstat_set(se->wait_count, se->wait_count + 1); |
538 | schedstat_set(se->wait_sum, se->wait_sum + | 546 | schedstat_set(se->wait_sum, se->wait_sum + |
539 | rq_of(cfs_rq)->clock - se->wait_start); | 547 | rq_of(cfs_rq)->clock - se->wait_start); |
548 | #ifdef CONFIG_SCHEDSTATS | ||
549 | if (entity_is_task(se)) { | ||
550 | trace_sched_stat_wait(task_of(se), | ||
551 | rq_of(cfs_rq)->clock - se->wait_start); | ||
552 | } | ||
553 | #endif | ||
540 | schedstat_set(se->wait_start, 0); | 554 | schedstat_set(se->wait_start, 0); |
541 | } | 555 | } |
542 | 556 | ||
@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
628 | se->sleep_start = 0; | 642 | se->sleep_start = 0; |
629 | se->sum_sleep_runtime += delta; | 643 | se->sum_sleep_runtime += delta; |
630 | 644 | ||
631 | if (tsk) | 645 | if (tsk) { |
632 | account_scheduler_latency(tsk, delta >> 10, 1); | 646 | account_scheduler_latency(tsk, delta >> 10, 1); |
647 | trace_sched_stat_sleep(tsk, delta); | ||
648 | } | ||
633 | } | 649 | } |
634 | if (se->block_start) { | 650 | if (se->block_start) { |
635 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; | 651 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; |
@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
644 | se->sum_sleep_runtime += delta; | 660 | se->sum_sleep_runtime += delta; |
645 | 661 | ||
646 | if (tsk) { | 662 | if (tsk) { |
663 | if (tsk->in_iowait) { | ||
664 | se->iowait_sum += delta; | ||
665 | se->iowait_count++; | ||
666 | trace_sched_stat_iowait(tsk, delta); | ||
667 | } | ||
668 | |||
647 | /* | 669 | /* |
648 | * Blocking time is in units of nanosecs, so shift by | 670 | * Blocking time is in units of nanosecs, so shift by |
649 | * 20 to get a milliseconds-range estimation of the | 671 | * 20 to get a milliseconds-range estimation of the |
@@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
705 | 727 | ||
706 | vruntime -= thresh; | 728 | vruntime -= thresh; |
707 | } | 729 | } |
708 | |||
709 | /* ensure we never gain time by being placed backwards. */ | ||
710 | vruntime = max_vruntime(se->vruntime, vruntime); | ||
711 | } | 730 | } |
712 | 731 | ||
732 | /* ensure we never gain time by being placed backwards. */ | ||
733 | vruntime = max_vruntime(se->vruntime, vruntime); | ||
734 | |||
713 | se->vruntime = vruntime; | 735 | se->vruntime = vruntime; |
714 | } | 736 | } |
715 | 737 | ||
@@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq) | |||
1046 | * search starts with cpus closest then further out as needed, | 1068 | * search starts with cpus closest then further out as needed, |
1047 | * so we always favor a closer, idle cpu. | 1069 | * so we always favor a closer, idle cpu. |
1048 | * Domains may include CPUs that are not usable for migration, | 1070 | * Domains may include CPUs that are not usable for migration, |
1049 | * hence we need to mask them out (cpu_active_mask) | 1071 | * hence we need to mask them out (rq->rd->online) |
1050 | * | 1072 | * |
1051 | * Returns the CPU we should wake onto. | 1073 | * Returns the CPU we should wake onto. |
1052 | */ | 1074 | */ |
1053 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | 1075 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) |
1076 | |||
1077 | #define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) | ||
1078 | |||
1054 | static int wake_idle(int cpu, struct task_struct *p) | 1079 | static int wake_idle(int cpu, struct task_struct *p) |
1055 | { | 1080 | { |
1056 | struct sched_domain *sd; | 1081 | struct sched_domain *sd; |
1057 | int i; | 1082 | int i; |
1058 | unsigned int chosen_wakeup_cpu; | 1083 | unsigned int chosen_wakeup_cpu; |
1059 | int this_cpu; | 1084 | int this_cpu; |
1085 | struct rq *task_rq = task_rq(p); | ||
1060 | 1086 | ||
1061 | /* | 1087 | /* |
1062 | * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu | 1088 | * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu |
@@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
1089 | for_each_domain(cpu, sd) { | 1115 | for_each_domain(cpu, sd) { |
1090 | if ((sd->flags & SD_WAKE_IDLE) | 1116 | if ((sd->flags & SD_WAKE_IDLE) |
1091 | || ((sd->flags & SD_WAKE_IDLE_FAR) | 1117 | || ((sd->flags & SD_WAKE_IDLE_FAR) |
1092 | && !task_hot(p, task_rq(p)->clock, sd))) { | 1118 | && !task_hot(p, task_rq->clock, sd))) { |
1093 | for_each_cpu_and(i, sched_domain_span(sd), | 1119 | for_each_cpu_and(i, sched_domain_span(sd), |
1094 | &p->cpus_allowed) { | 1120 | &p->cpus_allowed) { |
1095 | if (cpu_active(i) && idle_cpu(i)) { | 1121 | if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { |
1096 | if (i != task_cpu(p)) { | 1122 | if (i != task_cpu(p)) { |
1097 | schedstat_inc(p, | 1123 | schedstat_inc(p, |
1098 | se.nr_wakeups_idle); | 1124 | se.nr_wakeups_idle); |
@@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, | |||
1235 | tg = task_group(p); | 1261 | tg = task_group(p); |
1236 | weight = p->se.load.weight; | 1262 | weight = p->se.load.weight; |
1237 | 1263 | ||
1238 | balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= | 1264 | /* |
1265 | * In low-load situations, where prev_cpu is idle and this_cpu is idle | ||
1266 | * due to the sync cause above having dropped tl to 0, we'll always have | ||
1267 | * an imbalance, but there's really nothing you can do about that, so | ||
1268 | * that's good too. | ||
1269 | * | ||
1270 | * Otherwise check if either cpus are near enough in load to allow this | ||
1271 | * task to be woken on this_cpu. | ||
1272 | */ | ||
1273 | balanced = !tl || | ||
1274 | 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= | ||
1239 | imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); | 1275 | imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); |
1240 | 1276 | ||
1241 | /* | 1277 | /* |
@@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync) | |||
1278 | this_rq = cpu_rq(this_cpu); | 1314 | this_rq = cpu_rq(this_cpu); |
1279 | new_cpu = prev_cpu; | 1315 | new_cpu = prev_cpu; |
1280 | 1316 | ||
1281 | if (prev_cpu == this_cpu) | ||
1282 | goto out; | ||
1283 | /* | 1317 | /* |
1284 | * 'this_sd' is the first domain that both | 1318 | * 'this_sd' is the first domain that both |
1285 | * this_cpu and prev_cpu are present in: | 1319 | * this_cpu and prev_cpu are present in: |
@@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1721 | sched_info_queued(p); | 1755 | sched_info_queued(p); |
1722 | 1756 | ||
1723 | update_curr(cfs_rq); | 1757 | update_curr(cfs_rq); |
1758 | if (curr) | ||
1759 | se->vruntime = curr->vruntime; | ||
1724 | place_entity(cfs_rq, se, 1); | 1760 | place_entity(cfs_rq, se, 1); |
1725 | 1761 | ||
1726 | /* 'curr' will be NULL if the child belongs to a different group */ | 1762 | /* 'curr' will be NULL if the child belongs to a different group */ |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 4569bfa7df9b..e2dc63a5815d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -1,4 +1,4 @@ | |||
1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) | 1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) |
2 | SCHED_FEAT(NORMALIZED_SLEEPER, 0) | 2 | SCHED_FEAT(NORMALIZED_SLEEPER, 0) |
3 | SCHED_FEAT(ADAPTIVE_GRAN, 1) | 3 | SCHED_FEAT(ADAPTIVE_GRAN, 1) |
4 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | 4 | SCHED_FEAT(WAKEUP_PREEMPT, 1) |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3918e01994e0..2eb4bd6a526c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -3,15 +3,18 @@ | |||
3 | * policies) | 3 | * policies) |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #ifdef CONFIG_RT_GROUP_SCHED | ||
7 | |||
8 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) | ||
9 | |||
6 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) | 10 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) |
7 | { | 11 | { |
12 | #ifdef CONFIG_SCHED_DEBUG | ||
13 | WARN_ON_ONCE(!rt_entity_is_task(rt_se)); | ||
14 | #endif | ||
8 | return container_of(rt_se, struct task_struct, rt); | 15 | return container_of(rt_se, struct task_struct, rt); |
9 | } | 16 | } |
10 | 17 | ||
11 | #ifdef CONFIG_RT_GROUP_SCHED | ||
12 | |||
13 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) | ||
14 | |||
15 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | 18 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) |
16 | { | 19 | { |
17 | return rt_rq->rq; | 20 | return rt_rq->rq; |
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
26 | 29 | ||
27 | #define rt_entity_is_task(rt_se) (1) | 30 | #define rt_entity_is_task(rt_se) (1) |
28 | 31 | ||
32 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) | ||
33 | { | ||
34 | return container_of(rt_se, struct task_struct, rt); | ||
35 | } | ||
36 | |||
29 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | 37 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) |
30 | { | 38 | { |
31 | return container_of(rt_rq, struct rq, rt); | 39 | return container_of(rt_rq, struct rq, rt); |
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) | |||
128 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 136 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
129 | } | 137 | } |
130 | 138 | ||
139 | static inline int has_pushable_tasks(struct rq *rq) | ||
140 | { | ||
141 | return !plist_head_empty(&rq->rt.pushable_tasks); | ||
142 | } | ||
143 | |||
131 | #else | 144 | #else |
132 | 145 | ||
133 | static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) | 146 | static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) |
@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq) | |||
602 | curr->se.exec_start = rq->clock; | 615 | curr->se.exec_start = rq->clock; |
603 | cpuacct_charge(curr, delta_exec); | 616 | cpuacct_charge(curr, delta_exec); |
604 | 617 | ||
618 | sched_rt_avg_update(rq, delta_exec); | ||
619 | |||
605 | if (!rt_bandwidth_enabled()) | 620 | if (!rt_bandwidth_enabled()) |
606 | return; | 621 | return; |
607 | 622 | ||
@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | |||
874 | 889 | ||
875 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 890 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
876 | enqueue_pushable_task(rq, p); | 891 | enqueue_pushable_task(rq, p); |
877 | |||
878 | inc_cpu_load(rq, p->se.load.weight); | ||
879 | } | 892 | } |
880 | 893 | ||
881 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 894 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | |||
886 | dequeue_rt_entity(rt_se); | 899 | dequeue_rt_entity(rt_se); |
887 | 900 | ||
888 | dequeue_pushable_task(rq, p); | 901 | dequeue_pushable_task(rq, p); |
889 | |||
890 | dec_cpu_load(rq, p->se.load.weight); | ||
891 | } | 902 | } |
892 | 903 | ||
893 | /* | 904 | /* |
@@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) | |||
1064 | if (p) | 1075 | if (p) |
1065 | dequeue_pushable_task(rq, p); | 1076 | dequeue_pushable_task(rq, p); |
1066 | 1077 | ||
1078 | #ifdef CONFIG_SMP | ||
1079 | /* | ||
1080 | * We detect this state here so that we can avoid taking the RQ | ||
1081 | * lock again later if there is no need to push | ||
1082 | */ | ||
1083 | rq->post_schedule = has_pushable_tasks(rq); | ||
1084 | #endif | ||
1085 | |||
1067 | return p; | 1086 | return p; |
1068 | } | 1087 | } |
1069 | 1088 | ||
@@ -1162,13 +1181,6 @@ static int find_lowest_rq(struct task_struct *task) | |||
1162 | return -1; /* No targets found */ | 1181 | return -1; /* No targets found */ |
1163 | 1182 | ||
1164 | /* | 1183 | /* |
1165 | * Only consider CPUs that are usable for migration. | ||
1166 | * I guess we might want to change cpupri_find() to ignore those | ||
1167 | * in the first place. | ||
1168 | */ | ||
1169 | cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); | ||
1170 | |||
1171 | /* | ||
1172 | * At this point we have built a mask of cpus representing the | 1184 | * At this point we have built a mask of cpus representing the |
1173 | * lowest priority tasks in the system. Now we want to elect | 1185 | * lowest priority tasks in the system. Now we want to elect |
1174 | * the best one based on our affinity and topology. | 1186 | * the best one based on our affinity and topology. |
@@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1262 | return lowest_rq; | 1274 | return lowest_rq; |
1263 | } | 1275 | } |
1264 | 1276 | ||
1265 | static inline int has_pushable_tasks(struct rq *rq) | ||
1266 | { | ||
1267 | return !plist_head_empty(&rq->rt.pushable_tasks); | ||
1268 | } | ||
1269 | |||
1270 | static struct task_struct *pick_next_pushable_task(struct rq *rq) | 1277 | static struct task_struct *pick_next_pushable_task(struct rq *rq) |
1271 | { | 1278 | { |
1272 | struct task_struct *p; | 1279 | struct task_struct *p; |
@@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | |||
1466 | pull_rt_task(rq); | 1473 | pull_rt_task(rq); |
1467 | } | 1474 | } |
1468 | 1475 | ||
1469 | /* | ||
1470 | * assumes rq->lock is held | ||
1471 | */ | ||
1472 | static int needs_post_schedule_rt(struct rq *rq) | ||
1473 | { | ||
1474 | return has_pushable_tasks(rq); | ||
1475 | } | ||
1476 | |||
1477 | static void post_schedule_rt(struct rq *rq) | 1476 | static void post_schedule_rt(struct rq *rq) |
1478 | { | 1477 | { |
1479 | /* | ||
1480 | * This is only called if needs_post_schedule_rt() indicates that | ||
1481 | * we need to push tasks away | ||
1482 | */ | ||
1483 | spin_lock_irq(&rq->lock); | ||
1484 | push_rt_tasks(rq); | 1478 | push_rt_tasks(rq); |
1485 | spin_unlock_irq(&rq->lock); | ||
1486 | } | 1479 | } |
1487 | 1480 | ||
1488 | /* | 1481 | /* |
@@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = { | |||
1758 | .rq_online = rq_online_rt, | 1751 | .rq_online = rq_online_rt, |
1759 | .rq_offline = rq_offline_rt, | 1752 | .rq_offline = rq_offline_rt, |
1760 | .pre_schedule = pre_schedule_rt, | 1753 | .pre_schedule = pre_schedule_rt, |
1761 | .needs_post_schedule = needs_post_schedule_rt, | ||
1762 | .post_schedule = post_schedule_rt, | 1754 | .post_schedule = post_schedule_rt, |
1763 | .task_wake_up = task_wake_up_rt, | 1755 | .task_wake_up = task_wake_up_rt, |
1764 | .switched_from = switched_from_rt, | 1756 | .switched_from = switched_from_rt, |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 71d8dc7f9920..3125cff1c570 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -245,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | |||
245 | #endif | 245 | #endif |
246 | 246 | ||
247 | static struct ctl_table kern_table[] = { | 247 | static struct ctl_table kern_table[] = { |
248 | { | ||
249 | .ctl_name = CTL_UNNUMBERED, | ||
250 | .procname = "sched_child_runs_first", | ||
251 | .data = &sysctl_sched_child_runs_first, | ||
252 | .maxlen = sizeof(unsigned int), | ||
253 | .mode = 0644, | ||
254 | .proc_handler = &proc_dointvec, | ||
255 | }, | ||
248 | #ifdef CONFIG_SCHED_DEBUG | 256 | #ifdef CONFIG_SCHED_DEBUG |
249 | { | 257 | { |
250 | .ctl_name = CTL_UNNUMBERED, | 258 | .ctl_name = CTL_UNNUMBERED, |
@@ -299,14 +307,6 @@ static struct ctl_table kern_table[] = { | |||
299 | }, | 307 | }, |
300 | { | 308 | { |
301 | .ctl_name = CTL_UNNUMBERED, | 309 | .ctl_name = CTL_UNNUMBERED, |
302 | .procname = "sched_child_runs_first", | ||
303 | .data = &sysctl_sched_child_runs_first, | ||
304 | .maxlen = sizeof(unsigned int), | ||
305 | .mode = 0644, | ||
306 | .proc_handler = &proc_dointvec, | ||
307 | }, | ||
308 | { | ||
309 | .ctl_name = CTL_UNNUMBERED, | ||
310 | .procname = "sched_features", | 310 | .procname = "sched_features", |
311 | .data = &sysctl_sched_features, | 311 | .data = &sysctl_sched_features, |
312 | .maxlen = sizeof(unsigned int), | 312 | .maxlen = sizeof(unsigned int), |
@@ -331,6 +331,14 @@ static struct ctl_table kern_table[] = { | |||
331 | }, | 331 | }, |
332 | { | 332 | { |
333 | .ctl_name = CTL_UNNUMBERED, | 333 | .ctl_name = CTL_UNNUMBERED, |
334 | .procname = "sched_time_avg", | ||
335 | .data = &sysctl_sched_time_avg, | ||
336 | .maxlen = sizeof(unsigned int), | ||
337 | .mode = 0644, | ||
338 | .proc_handler = &proc_dointvec, | ||
339 | }, | ||
340 | { | ||
341 | .ctl_name = CTL_UNNUMBERED, | ||
334 | .procname = "timer_migration", | 342 | .procname = "timer_migration", |
335 | .data = &sysctl_timer_migration, | 343 | .data = &sysctl_timer_migration, |
336 | .maxlen = sizeof(unsigned int), | 344 | .maxlen = sizeof(unsigned int), |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3c44b56b0da7..addfe2df93b1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq) | |||
317 | if (cwq->wq->freezeable) | 317 | if (cwq->wq->freezeable) |
318 | set_freezable(); | 318 | set_freezable(); |
319 | 319 | ||
320 | set_user_nice(current, -5); | ||
321 | |||
322 | for (;;) { | 320 | for (;;) { |
323 | prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); | 321 | prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); |
324 | if (!freezing(current) && | 322 | if (!freezing(current) && |