| author    | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 16:23:18 -0400 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 16:23:18 -0400 |
| commit    | 774a694f8cd08115d130a290d73c6d8563f26b1b |
| tree      | 2b5f834ac7a149278d2a7e44d7afe69f40ef1431 |
| parent    | 4f0ac854167846bd55cd81dbc9a36e03708aa01c |
| parent    | e1f8450854d69f0291882804406ea1bab3ca44b4 |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (64 commits)
sched: Fix sched::sched_stat_wait tracepoint field
sched: Disable NEW_FAIR_SLEEPERS for now
sched: Keep kthreads at default priority
sched: Re-tune the scheduler latency defaults to decrease worst-case latencies
sched: Turn off child_runs_first
sched: Ensure that a child can't gain time over it's parent after fork()
sched: enable SD_WAKE_IDLE
sched: Deal with low-load in wake_affine()
sched: Remove short cut from select_task_rq_fair()
sched: Turn on SD_BALANCE_NEWIDLE
sched: Clean up topology.h
sched: Fix dynamic power-balancing crash
sched: Remove reciprocal for cpu_power
sched: Try to deal with low capacity, fix update_sd_power_savings_stats()
sched: Try to deal with low capacity
sched: Scale down cpu_power due to RT tasks
sched: Implement dynamic cpu_power
sched: Add smt_gain
sched: Update the cpu_power sum during load-balance
sched: Add SD_PREFER_SIBLING
...
| -rw-r--r-- | arch/x86/include/asm/topology.h | 47 |
| -rw-r--r-- | fs/dcache.c | 1 |
| -rw-r--r-- | fs/locks.c | 2 |
| -rw-r--r-- | include/linux/hardirq.h | 6 |
| -rw-r--r-- | include/linux/kernel.h | 5 |
| -rw-r--r-- | include/linux/sched.h | 94 |
| -rw-r--r-- | include/linux/topology.h | 168 |
| -rw-r--r-- | include/trace/events/sched.h | 95 |
| -rw-r--r-- | init/main.c | 2 |
| -rw-r--r-- | kernel/kthread.c | 4 |
| -rw-r--r-- | kernel/sched.c | 1099 |
| -rw-r--r-- | kernel/sched_cpupri.c | 30 |
| -rw-r--r-- | kernel/sched_debug.c | 4 |
| -rw-r--r-- | kernel/sched_fair.c | 84 |
| -rw-r--r-- | kernel/sched_features.h | 2 |
| -rw-r--r-- | kernel/sched_rt.c | 62 |
| -rw-r--r-- | kernel/sysctl.c | 24 |
| -rw-r--r-- | kernel/workqueue.c | 2 |
18 files changed, 1117 insertions(+), 614 deletions(-)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 066ef590d7e0..26d06e052a18 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
| @@ -129,25 +129,34 @@ extern unsigned long node_remap_size[]; | |||
| 129 | #endif | 129 | #endif |
| 130 | 130 | ||
| 131 | /* sched_domains SD_NODE_INIT for NUMA machines */ | 131 | /* sched_domains SD_NODE_INIT for NUMA machines */ |
| 132 | #define SD_NODE_INIT (struct sched_domain) { \ | 132 | #define SD_NODE_INIT (struct sched_domain) { \ |
| 133 | .min_interval = 8, \ | 133 | .min_interval = 8, \ |
| 134 | .max_interval = 32, \ | 134 | .max_interval = 32, \ |
| 135 | .busy_factor = 32, \ | 135 | .busy_factor = 32, \ |
| 136 | .imbalance_pct = 125, \ | 136 | .imbalance_pct = 125, \ |
| 137 | .cache_nice_tries = SD_CACHE_NICE_TRIES, \ | 137 | .cache_nice_tries = SD_CACHE_NICE_TRIES, \ |
| 138 | .busy_idx = 3, \ | 138 | .busy_idx = 3, \ |
| 139 | .idle_idx = SD_IDLE_IDX, \ | 139 | .idle_idx = SD_IDLE_IDX, \ |
| 140 | .newidle_idx = SD_NEWIDLE_IDX, \ | 140 | .newidle_idx = SD_NEWIDLE_IDX, \ |
| 141 | .wake_idx = 1, \ | 141 | .wake_idx = 1, \ |
| 142 | .forkexec_idx = SD_FORKEXEC_IDX, \ | 142 | .forkexec_idx = SD_FORKEXEC_IDX, \ |
| 143 | .flags = SD_LOAD_BALANCE \ | 143 | \ |
| 144 | | SD_BALANCE_EXEC \ | 144 | .flags = 1*SD_LOAD_BALANCE \ |
| 145 | | SD_BALANCE_FORK \ | 145 | | 1*SD_BALANCE_NEWIDLE \ |
| 146 | | SD_WAKE_AFFINE \ | 146 | | 1*SD_BALANCE_EXEC \ |
| 147 | | SD_WAKE_BALANCE \ | 147 | | 1*SD_BALANCE_FORK \ |
| 148 | | SD_SERIALIZE, \ | 148 | | 0*SD_WAKE_IDLE \ |
| 149 | .last_balance = jiffies, \ | 149 | | 1*SD_WAKE_AFFINE \ |
| 150 | .balance_interval = 1, \ | 150 | | 1*SD_WAKE_BALANCE \ |
| 151 | | 0*SD_SHARE_CPUPOWER \ | ||
| 152 | | 0*SD_POWERSAVINGS_BALANCE \ | ||
| 153 | | 0*SD_SHARE_PKG_RESOURCES \ | ||
| 154 | | 1*SD_SERIALIZE \ | ||
| 155 | | 1*SD_WAKE_IDLE_FAR \ | ||
| 156 | | 0*SD_PREFER_SIBLING \ | ||
| 157 | , \ | ||
| 158 | .last_balance = jiffies, \ | ||
| 159 | .balance_interval = 1, \ | ||
| 151 | } | 160 | } |
| 152 | 161 | ||
| 153 | #ifdef CONFIG_X86_64_ACPI_NUMA | 162 | #ifdef CONFIG_X86_64_ACPI_NUMA |
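The rewritten initializer above multiplies every SD_* flag by 0 or 1 instead of simply omitting the disabled ones, so the full flag set stays visible in the table and toggling a flag is a one-character edit. A minimal stand-alone sketch of that idiom, reusing the hex flag values from the include/linux/sched.h hunk further down (the program itself is illustrative, not kernel code):

```c
#include <stdio.h>

/* Flag values as renumbered in the include/linux/sched.h hunk below. */
#define SD_LOAD_BALANCE     0x0001
#define SD_BALANCE_NEWIDLE  0x0002
#define SD_BALANCE_EXEC     0x0004
#define SD_WAKE_AFFINE      0x0020
#define SD_SERIALIZE        0x0400

int main(void)
{
    /* 1*FLAG keeps the flag, 0*FLAG documents that it is deliberately off. */
    unsigned int flags = 1*SD_LOAD_BALANCE
                       | 1*SD_BALANCE_NEWIDLE
                       | 0*SD_BALANCE_EXEC
                       | 1*SD_WAKE_AFFINE
                       | 1*SD_SERIALIZE;

    printf("flags = %#06x\n", flags);              /* 0x0423 */
    printf("exec balancing on? %s\n",
           (flags & SD_BALANCE_EXEC) ? "yes" : "no");
    return 0;
}
```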
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
| 33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
| 34 | #include <linux/fs_struct.h> | 34 | #include <linux/fs_struct.h> |
| 35 | #include <linux/hardirq.h> | ||
| 35 | #include "internal.h" | 36 | #include "internal.h" |
| 36 | 37 | ||
| 37 | int sysctl_vfs_cache_pressure __read_mostly = 100; | 38 | int sysctl_vfs_cache_pressure __read_mostly = 100; |
diff --git a/fs/locks.c b/fs/locks.c
index 52366e877d76..19ee18a6829b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
| @@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) | |||
| 768 | * give it the opportunity to lock the file. | 768 | * give it the opportunity to lock the file. |
| 769 | */ | 769 | */ |
| 770 | if (found) | 770 | if (found) |
| 771 | cond_resched_bkl(); | 771 | cond_resched(); |
| 772 | 772 | ||
| 773 | find_conflict: | 773 | find_conflict: |
| 774 | for_each_lock(inode, before) { | 774 | for_each_lock(inode, before) { |
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 330cb31bb496..6d527ee82b2b 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
| @@ -64,6 +64,12 @@ | |||
| 64 | #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) | 64 | #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) |
| 65 | #define NMI_OFFSET (1UL << NMI_SHIFT) | 65 | #define NMI_OFFSET (1UL << NMI_SHIFT) |
| 66 | 66 | ||
| 67 | #ifndef PREEMPT_ACTIVE | ||
| 68 | #define PREEMPT_ACTIVE_BITS 1 | ||
| 69 | #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) | ||
| 70 | #define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) | ||
| 71 | #endif | ||
| 72 | |||
| 67 | #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) | 73 | #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) |
| 68 | #error PREEMPT_ACTIVE is too low! | 74 | #error PREEMPT_ACTIVE is too low! |
| 69 | #endif | 75 | #endif |
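The new fallback stacks a one-bit PREEMPT_ACTIVE field directly above the NMI bits of the preempt count. A small sketch of the shift arithmetic; the field widths below are assumptions chosen for illustration, not values copied from this kernel's headers:

```c
#include <stdio.h>

/* Assumed preempt_count field widths, for illustration only. */
#define PREEMPT_BITS   8
#define SOFTIRQ_BITS   8
#define HARDIRQ_BITS  10
#define NMI_BITS       1

#define PREEMPT_SHIFT  0
#define SOFTIRQ_SHIFT  (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT  (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT      (HARDIRQ_SHIFT + HARDIRQ_BITS)

#define __IRQ_MASK(x)  ((1UL << (x)) - 1)

/* The fallback added by the hunk above. */
#define PREEMPT_ACTIVE_BITS   1
#define PREEMPT_ACTIVE_SHIFT  (NMI_SHIFT + NMI_BITS)
#define PREEMPT_ACTIVE        (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)

int main(void)
{
    printf("NMI_SHIFT            = %d\n", NMI_SHIFT);            /* 26 */
    printf("PREEMPT_ACTIVE_SHIFT = %d\n", PREEMPT_ACTIVE_SHIFT); /* 27 */
    printf("PREEMPT_ACTIVE       = %#010lx\n", PREEMPT_ACTIVE);  /* 0x08000000 */
    return 0;
}
```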
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6320a3e8def..2b5b1e0899a8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
| @@ -125,7 +125,7 @@ extern int _cond_resched(void); | |||
| 125 | #endif | 125 | #endif |
| 126 | 126 | ||
| 127 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 127 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| 128 | void __might_sleep(char *file, int line); | 128 | void __might_sleep(char *file, int line, int preempt_offset); |
| 129 | /** | 129 | /** |
| 130 | * might_sleep - annotation for functions that can sleep | 130 | * might_sleep - annotation for functions that can sleep |
| 131 | * | 131 | * |
| @@ -137,8 +137,9 @@ extern int _cond_resched(void); | |||
| 137 | * supposed to. | 137 | * supposed to. |
| 138 | */ | 138 | */ |
| 139 | # define might_sleep() \ | 139 | # define might_sleep() \ |
| 140 | do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) | 140 | do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) |
| 141 | #else | 141 | #else |
| 142 | static inline void __might_sleep(char *file, int line, int preempt_offset) { } | ||
| 142 | # define might_sleep() do { might_resched(); } while (0) | 143 | # define might_sleep() do { might_resched(); } while (0) |
| 143 | #endif | 144 | #endif |
| 144 | 145 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 379531c08975..f3d74bd04d18 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
| @@ -38,6 +38,8 @@ | |||
| 38 | #define SCHED_BATCH 3 | 38 | #define SCHED_BATCH 3 |
| 39 | /* SCHED_ISO: reserved but not implemented yet */ | 39 | /* SCHED_ISO: reserved but not implemented yet */ |
| 40 | #define SCHED_IDLE 5 | 40 | #define SCHED_IDLE 5 |
| 41 | /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ | ||
| 42 | #define SCHED_RESET_ON_FORK 0x40000000 | ||
| 41 | 43 | ||
| 42 | #ifdef __KERNEL__ | 44 | #ifdef __KERNEL__ |
| 43 | 45 | ||
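SCHED_RESET_ON_FORK is meant to be ORed into the policy passed to sched_setscheduler(), so a real-time parent can guarantee its children start over as SCHED_NORMAL with default priority. A minimal user-space sketch (assumes a libc that passes the policy word through to the kernel unchanged, and needs the privilege to set SCHED_FIFO):

```c
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK 0x40000000   /* value from the hunk above */
#endif

int main(void)
{
    struct sched_param sp = { .sched_priority = 10 };
    pid_t pid;

    /* FIFO for this process; children revert to SCHED_NORMAL. */
    if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) == -1) {
        perror("sched_setscheduler");
        return 1;
    }

    pid = fork();
    if (pid == 0) {
        /* The scheduler cleared policy and priority at fork time. */
        printf("child policy:  %d (0 == SCHED_NORMAL)\n",
               sched_getscheduler(0));
        _exit(0);
    }
    /* Newer kernels report the flag back, so mask it for printing. */
    printf("parent policy: %d (1 == SCHED_FIFO)\n",
           sched_getscheduler(0) & ~SCHED_RESET_ON_FORK);
    wait(NULL);
    return 0;
}
```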
| @@ -796,18 +798,19 @@ enum cpu_idle_type { | |||
| 796 | #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE | 798 | #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE |
| 797 | 799 | ||
| 798 | #ifdef CONFIG_SMP | 800 | #ifdef CONFIG_SMP |
| 799 | #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ | 801 | #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ |
| 800 | #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ | 802 | #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ |
| 801 | #define SD_BALANCE_EXEC 4 /* Balance on exec */ | 803 | #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ |
| 802 | #define SD_BALANCE_FORK 8 /* Balance on fork, clone */ | 804 | #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ |
| 803 | #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ | 805 | #define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */ |
| 804 | #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ | 806 | #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ |
| 805 | #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ | 807 | #define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */ |
| 806 | #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ | 808 | #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ |
| 807 | #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ | 809 | #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ |
| 808 | #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ | 810 | #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ |
| 809 | #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ | 811 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ |
| 810 | #define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ | 812 | #define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ |
| 813 | #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ | ||
| 811 | 814 | ||
| 812 | enum powersavings_balance_level { | 815 | enum powersavings_balance_level { |
| 813 | POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ | 816 | POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ |
| @@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void) | |||
| 827 | if (sched_smt_power_savings) | 830 | if (sched_smt_power_savings) |
| 828 | return SD_POWERSAVINGS_BALANCE; | 831 | return SD_POWERSAVINGS_BALANCE; |
| 829 | 832 | ||
| 830 | return 0; | 833 | return SD_PREFER_SIBLING; |
| 831 | } | 834 | } |
| 832 | 835 | ||
| 833 | static inline int sd_balance_for_package_power(void) | 836 | static inline int sd_balance_for_package_power(void) |
| @@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void) | |||
| 835 | if (sched_mc_power_savings | sched_smt_power_savings) | 838 | if (sched_mc_power_savings | sched_smt_power_savings) |
| 836 | return SD_POWERSAVINGS_BALANCE; | 839 | return SD_POWERSAVINGS_BALANCE; |
| 837 | 840 | ||
| 838 | return 0; | 841 | return SD_PREFER_SIBLING; |
| 839 | } | 842 | } |
| 840 | 843 | ||
| 841 | /* | 844 | /* |
| @@ -857,15 +860,9 @@ struct sched_group { | |||
| 857 | 860 | ||
| 858 | /* | 861 | /* |
| 859 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | 862 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a |
| 860 | * single CPU. This is read only (except for setup, hotplug CPU). | 863 | * single CPU. |
| 861 | * Note : Never change cpu_power without recompute its reciprocal | ||
| 862 | */ | ||
| 863 | unsigned int __cpu_power; | ||
| 864 | /* | ||
| 865 | * reciprocal value of cpu_power to avoid expensive divides | ||
| 866 | * (see include/linux/reciprocal_div.h) | ||
| 867 | */ | 864 | */ |
| 868 | u32 reciprocal_cpu_power; | 865 | unsigned int cpu_power; |
| 869 | 866 | ||
| 870 | /* | 867 | /* |
| 871 | * The CPUs this group covers. | 868 | * The CPUs this group covers. |
| @@ -918,6 +915,7 @@ struct sched_domain { | |||
| 918 | unsigned int newidle_idx; | 915 | unsigned int newidle_idx; |
| 919 | unsigned int wake_idx; | 916 | unsigned int wake_idx; |
| 920 | unsigned int forkexec_idx; | 917 | unsigned int forkexec_idx; |
| 918 | unsigned int smt_gain; | ||
| 921 | int flags; /* See SD_* */ | 919 | int flags; /* See SD_* */ |
| 922 | enum sched_domain_level level; | 920 | enum sched_domain_level level; |
| 923 | 921 | ||
| @@ -1045,7 +1043,6 @@ struct sched_class { | |||
| 1045 | struct rq *busiest, struct sched_domain *sd, | 1043 | struct rq *busiest, struct sched_domain *sd, |
| 1046 | enum cpu_idle_type idle); | 1044 | enum cpu_idle_type idle); |
| 1047 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | 1045 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
| 1048 | int (*needs_post_schedule) (struct rq *this_rq); | ||
| 1049 | void (*post_schedule) (struct rq *this_rq); | 1046 | void (*post_schedule) (struct rq *this_rq); |
| 1050 | void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); | 1047 | void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); |
| 1051 | 1048 | ||
| @@ -1110,6 +1107,8 @@ struct sched_entity { | |||
| 1110 | u64 wait_max; | 1107 | u64 wait_max; |
| 1111 | u64 wait_count; | 1108 | u64 wait_count; |
| 1112 | u64 wait_sum; | 1109 | u64 wait_sum; |
| 1110 | u64 iowait_count; | ||
| 1111 | u64 iowait_sum; | ||
| 1113 | 1112 | ||
| 1114 | u64 sleep_start; | 1113 | u64 sleep_start; |
| 1115 | u64 sleep_max; | 1114 | u64 sleep_max; |
| @@ -1234,11 +1233,19 @@ struct task_struct { | |||
| 1234 | unsigned did_exec:1; | 1233 | unsigned did_exec:1; |
| 1235 | unsigned in_execve:1; /* Tell the LSMs that the process is doing an | 1234 | unsigned in_execve:1; /* Tell the LSMs that the process is doing an |
| 1236 | * execve */ | 1235 | * execve */ |
| 1236 | unsigned in_iowait:1; | ||
| 1237 | |||
| 1238 | |||
| 1239 | /* Revert to default priority/policy when forking */ | ||
| 1240 | unsigned sched_reset_on_fork:1; | ||
| 1241 | |||
| 1237 | pid_t pid; | 1242 | pid_t pid; |
| 1238 | pid_t tgid; | 1243 | pid_t tgid; |
| 1239 | 1244 | ||
| 1245 | #ifdef CONFIG_CC_STACKPROTECTOR | ||
| 1240 | /* Canary value for the -fstack-protector gcc feature */ | 1246 | /* Canary value for the -fstack-protector gcc feature */ |
| 1241 | unsigned long stack_canary; | 1247 | unsigned long stack_canary; |
| 1248 | #endif | ||
| 1242 | 1249 | ||
| 1243 | /* | 1250 | /* |
| 1244 | * pointers to (original) parent process, youngest child, younger sibling, | 1251 | * pointers to (original) parent process, youngest child, younger sibling, |
| @@ -1840,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity; | |||
| 1840 | extern unsigned int sysctl_sched_wakeup_granularity; | 1847 | extern unsigned int sysctl_sched_wakeup_granularity; |
| 1841 | extern unsigned int sysctl_sched_shares_ratelimit; | 1848 | extern unsigned int sysctl_sched_shares_ratelimit; |
| 1842 | extern unsigned int sysctl_sched_shares_thresh; | 1849 | extern unsigned int sysctl_sched_shares_thresh; |
| 1843 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1844 | extern unsigned int sysctl_sched_child_runs_first; | 1850 | extern unsigned int sysctl_sched_child_runs_first; |
| 1851 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1845 | extern unsigned int sysctl_sched_features; | 1852 | extern unsigned int sysctl_sched_features; |
| 1846 | extern unsigned int sysctl_sched_migration_cost; | 1853 | extern unsigned int sysctl_sched_migration_cost; |
| 1847 | extern unsigned int sysctl_sched_nr_migrate; | 1854 | extern unsigned int sysctl_sched_nr_migrate; |
| 1855 | extern unsigned int sysctl_sched_time_avg; | ||
| 1848 | extern unsigned int sysctl_timer_migration; | 1856 | extern unsigned int sysctl_timer_migration; |
| 1849 | 1857 | ||
| 1850 | int sched_nr_latency_handler(struct ctl_table *table, int write, | 1858 | int sched_nr_latency_handler(struct ctl_table *table, int write, |
| @@ -2308,23 +2316,31 @@ static inline int need_resched(void) | |||
| 2308 | * cond_resched_softirq() will enable bhs before scheduling. | 2316 | * cond_resched_softirq() will enable bhs before scheduling. |
| 2309 | */ | 2317 | */ |
| 2310 | extern int _cond_resched(void); | 2318 | extern int _cond_resched(void); |
| 2311 | #ifdef CONFIG_PREEMPT_BKL | 2319 | |
| 2312 | static inline int cond_resched(void) | 2320 | #define cond_resched() ({ \ |
| 2313 | { | 2321 | __might_sleep(__FILE__, __LINE__, 0); \ |
| 2314 | return 0; | 2322 | _cond_resched(); \ |
| 2315 | } | 2323 | }) |
| 2324 | |||
| 2325 | extern int __cond_resched_lock(spinlock_t *lock); | ||
| 2326 | |||
| 2327 | #ifdef CONFIG_PREEMPT | ||
| 2328 | #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET | ||
| 2316 | #else | 2329 | #else |
| 2317 | static inline int cond_resched(void) | 2330 | #define PREEMPT_LOCK_OFFSET 0 |
| 2318 | { | ||
| 2319 | return _cond_resched(); | ||
| 2320 | } | ||
| 2321 | #endif | 2331 | #endif |
| 2322 | extern int cond_resched_lock(spinlock_t * lock); | 2332 | |
| 2323 | extern int cond_resched_softirq(void); | 2333 | #define cond_resched_lock(lock) ({ \ |
| 2324 | static inline int cond_resched_bkl(void) | 2334 | __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ |
| 2325 | { | 2335 | __cond_resched_lock(lock); \ |
| 2326 | return _cond_resched(); | 2336 | }) |
| 2327 | } | 2337 | |
| 2338 | extern int __cond_resched_softirq(void); | ||
| 2339 | |||
| 2340 | #define cond_resched_softirq() ({ \ | ||
| 2341 | __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ | ||
| 2342 | __cond_resched_softirq(); \ | ||
| 2343 | }) | ||
| 2328 | 2344 | ||
| 2329 | /* | 2345 | /* |
| 2330 | * Does a critical section need to be broken due to another | 2346 | * Does a critical section need to be broken due to another |
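The rework above reduces cond_resched(), cond_resched_lock() and cond_resched_softirq() to thin wrappers that differ only in the preempt-count offset handed to __might_sleep(): zero, one lock's worth, or a softirq's worth, so the debug check tolerates exactly the context the caller is allowed to hold. A simplified user-space sketch of that idea; the names and counts are illustrative, not the kernel's:

```c
#include <stdio.h>

/* Toy stand-ins for the kernel's preempt accounting; values illustrative. */
static unsigned int preempt_count;
#define PREEMPT_LOCK_OFFSET 1   /* one spinlock held */

/* Simplified __might_sleep(): complain unless the held state matches
 * exactly what the caller declared via the offset. */
static void might_sleep_offset(const char *file, int line, unsigned int offset)
{
    if (preempt_count != offset)
        fprintf(stderr, "%s:%d: sleeping with preempt_count=%u, expected %u\n",
                file, line, preempt_count, offset);
}

#define cond_resched()      might_sleep_offset(__FILE__, __LINE__, 0)
#define cond_resched_lock() might_sleep_offset(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET)

int main(void)
{
    cond_resched();        /* fine: nothing held */

    preempt_count += 1;    /* "take a spinlock" */
    cond_resched_lock();   /* fine: exactly one lock is expected */
    cond_resched();        /* warns: a lock is held */
    preempt_count -= 1;
    return 0;
}
```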
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7402c1a27c4f..85e8cf7d393c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
| @@ -85,20 +85,29 @@ int arch_update_cpu_topology(void); | |||
| 85 | #define ARCH_HAS_SCHED_WAKE_IDLE | 85 | #define ARCH_HAS_SCHED_WAKE_IDLE |
| 86 | /* Common values for SMT siblings */ | 86 | /* Common values for SMT siblings */ |
| 87 | #ifndef SD_SIBLING_INIT | 87 | #ifndef SD_SIBLING_INIT |
| 88 | #define SD_SIBLING_INIT (struct sched_domain) { \ | 88 | #define SD_SIBLING_INIT (struct sched_domain) { \ |
| 89 | .min_interval = 1, \ | 89 | .min_interval = 1, \ |
| 90 | .max_interval = 2, \ | 90 | .max_interval = 2, \ |
| 91 | .busy_factor = 64, \ | 91 | .busy_factor = 64, \ |
| 92 | .imbalance_pct = 110, \ | 92 | .imbalance_pct = 110, \ |
| 93 | .flags = SD_LOAD_BALANCE \ | 93 | \ |
| 94 | | SD_BALANCE_NEWIDLE \ | 94 | .flags = 1*SD_LOAD_BALANCE \ |
| 95 | | SD_BALANCE_FORK \ | 95 | | 1*SD_BALANCE_NEWIDLE \ |
| 96 | | SD_BALANCE_EXEC \ | 96 | | 1*SD_BALANCE_EXEC \ |
| 97 | | SD_WAKE_AFFINE \ | 97 | | 1*SD_BALANCE_FORK \ |
| 98 | | SD_WAKE_BALANCE \ | 98 | | 0*SD_WAKE_IDLE \ |
| 99 | | SD_SHARE_CPUPOWER, \ | 99 | | 1*SD_WAKE_AFFINE \ |
| 100 | .last_balance = jiffies, \ | 100 | | 1*SD_WAKE_BALANCE \ |
| 101 | .balance_interval = 1, \ | 101 | | 1*SD_SHARE_CPUPOWER \ |
| 102 | | 0*SD_POWERSAVINGS_BALANCE \ | ||
| 103 | | 0*SD_SHARE_PKG_RESOURCES \ | ||
| 104 | | 0*SD_SERIALIZE \ | ||
| 105 | | 0*SD_WAKE_IDLE_FAR \ | ||
| 106 | | 0*SD_PREFER_SIBLING \ | ||
| 107 | , \ | ||
| 108 | .last_balance = jiffies, \ | ||
| 109 | .balance_interval = 1, \ | ||
| 110 | .smt_gain = 1178, /* 15% */ \ | ||
| 102 | } | 111 | } |
| 103 | #endif | 112 | #endif |
| 104 | #endif /* CONFIG_SCHED_SMT */ | 113 | #endif /* CONFIG_SCHED_SMT */ |
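The new .smt_gain = 1178 is expressed against SCHED_LOAD_SCALE (1024): 1178 / 1024 is roughly 1.15, i.e. a second hardware thread is assumed to add about 15% capacity, not 100%. arch_scale_smt_power() in the kernel/sched.c part of this merge divides the gain by the number of siblings, so each of two threads ends up at 589, a bit over half a CPU. A quick check of the arithmetic:

```c
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL   /* 1.0 in the scheduler's fixed point */

int main(void)
{
    unsigned long smt_gain = 1178;   /* from SD_SIBLING_INIT above */
    unsigned long weight   = 2;      /* two SMT siblings in the domain */

    printf("gain per core    : %.2fx\n", (double)smt_gain / SCHED_LOAD_SCALE);
    printf("power per thread : %lu (%.2f of a CPU)\n",
           smt_gain / weight,
           (double)(smt_gain / weight) / SCHED_LOAD_SCALE);
    return 0;
}
```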
| @@ -106,69 +115,94 @@ int arch_update_cpu_topology(void); | |||
| 106 | #ifdef CONFIG_SCHED_MC | 115 | #ifdef CONFIG_SCHED_MC |
| 107 | /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ | 116 | /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ |
| 108 | #ifndef SD_MC_INIT | 117 | #ifndef SD_MC_INIT |
| 109 | #define SD_MC_INIT (struct sched_domain) { \ | 118 | #define SD_MC_INIT (struct sched_domain) { \ |
| 110 | .min_interval = 1, \ | 119 | .min_interval = 1, \ |
| 111 | .max_interval = 4, \ | 120 | .max_interval = 4, \ |
| 112 | .busy_factor = 64, \ | 121 | .busy_factor = 64, \ |
| 113 | .imbalance_pct = 125, \ | 122 | .imbalance_pct = 125, \ |
| 114 | .cache_nice_tries = 1, \ | 123 | .cache_nice_tries = 1, \ |
| 115 | .busy_idx = 2, \ | 124 | .busy_idx = 2, \ |
| 116 | .wake_idx = 1, \ | 125 | .wake_idx = 1, \ |
| 117 | .forkexec_idx = 1, \ | 126 | .forkexec_idx = 1, \ |
| 118 | .flags = SD_LOAD_BALANCE \ | 127 | \ |
| 119 | | SD_BALANCE_FORK \ | 128 | .flags = 1*SD_LOAD_BALANCE \ |
| 120 | | SD_BALANCE_EXEC \ | 129 | | 1*SD_BALANCE_NEWIDLE \ |
| 121 | | SD_WAKE_AFFINE \ | 130 | | 1*SD_BALANCE_EXEC \ |
| 122 | | SD_WAKE_BALANCE \ | 131 | | 1*SD_BALANCE_FORK \ |
| 123 | | SD_SHARE_PKG_RESOURCES\ | 132 | | 1*SD_WAKE_IDLE \ |
| 124 | | sd_balance_for_mc_power()\ | 133 | | 1*SD_WAKE_AFFINE \ |
| 125 | | sd_power_saving_flags(),\ | 134 | | 1*SD_WAKE_BALANCE \ |
| 126 | .last_balance = jiffies, \ | 135 | | 0*SD_SHARE_CPUPOWER \ |
| 127 | .balance_interval = 1, \ | 136 | | 1*SD_SHARE_PKG_RESOURCES \ |
| 137 | | 0*SD_SERIALIZE \ | ||
| 138 | | 0*SD_WAKE_IDLE_FAR \ | ||
| 139 | | sd_balance_for_mc_power() \ | ||
| 140 | | sd_power_saving_flags() \ | ||
| 141 | , \ | ||
| 142 | .last_balance = jiffies, \ | ||
| 143 | .balance_interval = 1, \ | ||
| 128 | } | 144 | } |
| 129 | #endif | 145 | #endif |
| 130 | #endif /* CONFIG_SCHED_MC */ | 146 | #endif /* CONFIG_SCHED_MC */ |
| 131 | 147 | ||
| 132 | /* Common values for CPUs */ | 148 | /* Common values for CPUs */ |
| 133 | #ifndef SD_CPU_INIT | 149 | #ifndef SD_CPU_INIT |
| 134 | #define SD_CPU_INIT (struct sched_domain) { \ | 150 | #define SD_CPU_INIT (struct sched_domain) { \ |
| 135 | .min_interval = 1, \ | 151 | .min_interval = 1, \ |
| 136 | .max_interval = 4, \ | 152 | .max_interval = 4, \ |
| 137 | .busy_factor = 64, \ | 153 | .busy_factor = 64, \ |
| 138 | .imbalance_pct = 125, \ | 154 | .imbalance_pct = 125, \ |
| 139 | .cache_nice_tries = 1, \ | 155 | .cache_nice_tries = 1, \ |
| 140 | .busy_idx = 2, \ | 156 | .busy_idx = 2, \ |
| 141 | .idle_idx = 1, \ | 157 | .idle_idx = 1, \ |
| 142 | .newidle_idx = 2, \ | 158 | .newidle_idx = 2, \ |
| 143 | .wake_idx = 1, \ | 159 | .wake_idx = 1, \ |
| 144 | .forkexec_idx = 1, \ | 160 | .forkexec_idx = 1, \ |
| 145 | .flags = SD_LOAD_BALANCE \ | 161 | \ |
| 146 | | SD_BALANCE_EXEC \ | 162 | .flags = 1*SD_LOAD_BALANCE \ |
| 147 | | SD_BALANCE_FORK \ | 163 | | 1*SD_BALANCE_NEWIDLE \ |
| 148 | | SD_WAKE_AFFINE \ | 164 | | 1*SD_BALANCE_EXEC \ |
| 149 | | SD_WAKE_BALANCE \ | 165 | | 1*SD_BALANCE_FORK \ |
| 150 | | sd_balance_for_package_power()\ | 166 | | 1*SD_WAKE_IDLE \ |
| 151 | | sd_power_saving_flags(),\ | 167 | | 0*SD_WAKE_AFFINE \ |
| 152 | .last_balance = jiffies, \ | 168 | | 1*SD_WAKE_BALANCE \ |
| 153 | .balance_interval = 1, \ | 169 | | 0*SD_SHARE_CPUPOWER \ |
| 170 | | 0*SD_SHARE_PKG_RESOURCES \ | ||
| 171 | | 0*SD_SERIALIZE \ | ||
| 172 | | 0*SD_WAKE_IDLE_FAR \ | ||
| 173 | | sd_balance_for_package_power() \ | ||
| 174 | | sd_power_saving_flags() \ | ||
| 175 | , \ | ||
| 176 | .last_balance = jiffies, \ | ||
| 177 | .balance_interval = 1, \ | ||
| 154 | } | 178 | } |
| 155 | #endif | 179 | #endif |
| 156 | 180 | ||
| 157 | /* sched_domains SD_ALLNODES_INIT for NUMA machines */ | 181 | /* sched_domains SD_ALLNODES_INIT for NUMA machines */ |
| 158 | #define SD_ALLNODES_INIT (struct sched_domain) { \ | 182 | #define SD_ALLNODES_INIT (struct sched_domain) { \ |
| 159 | .min_interval = 64, \ | 183 | .min_interval = 64, \ |
| 160 | .max_interval = 64*num_online_cpus(), \ | 184 | .max_interval = 64*num_online_cpus(), \ |
| 161 | .busy_factor = 128, \ | 185 | .busy_factor = 128, \ |
| 162 | .imbalance_pct = 133, \ | 186 | .imbalance_pct = 133, \ |
| 163 | .cache_nice_tries = 1, \ | 187 | .cache_nice_tries = 1, \ |
| 164 | .busy_idx = 3, \ | 188 | .busy_idx = 3, \ |
| 165 | .idle_idx = 3, \ | 189 | .idle_idx = 3, \ |
| 166 | .flags = SD_LOAD_BALANCE \ | 190 | .flags = 1*SD_LOAD_BALANCE \ |
| 167 | | SD_BALANCE_NEWIDLE \ | 191 | | 1*SD_BALANCE_NEWIDLE \ |
| 168 | | SD_WAKE_AFFINE \ | 192 | | 0*SD_BALANCE_EXEC \ |
| 169 | | SD_SERIALIZE, \ | 193 | | 0*SD_BALANCE_FORK \ |
| 170 | .last_balance = jiffies, \ | 194 | | 0*SD_WAKE_IDLE \ |
| 171 | .balance_interval = 64, \ | 195 | | 1*SD_WAKE_AFFINE \ |
| 196 | | 0*SD_WAKE_BALANCE \ | ||
| 197 | | 0*SD_SHARE_CPUPOWER \ | ||
| 198 | | 0*SD_POWERSAVINGS_BALANCE \ | ||
| 199 | | 0*SD_SHARE_PKG_RESOURCES \ | ||
| 200 | | 1*SD_SERIALIZE \ | ||
| 201 | | 1*SD_WAKE_IDLE_FAR \ | ||
| 202 | | 0*SD_PREFER_SIBLING \ | ||
| 203 | , \ | ||
| 204 | .last_balance = jiffies, \ | ||
| 205 | .balance_interval = 64, \ | ||
| 172 | } | 206 | } |
| 173 | 207 | ||
| 174 | #ifdef CONFIG_NUMA | 208 | #ifdef CONFIG_NUMA |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 8949bb7eb082..a4c369ec328f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
| @@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send, | |||
| 340 | __entry->sig, __entry->comm, __entry->pid) | 340 | __entry->sig, __entry->comm, __entry->pid) |
| 341 | ); | 341 | ); |
| 342 | 342 | ||
| 343 | /* | ||
| 344 | * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE | ||
| 345 | * adding sched_stat support to SCHED_FIFO/RR would be welcome. | ||
| 346 | */ | ||
| 347 | |||
| 348 | /* | ||
| 349 | * Tracepoint for accounting wait time (time the task is runnable | ||
| 350 | * but not actually running due to scheduler contention). | ||
| 351 | */ | ||
| 352 | TRACE_EVENT(sched_stat_wait, | ||
| 353 | |||
| 354 | TP_PROTO(struct task_struct *tsk, u64 delay), | ||
| 355 | |||
| 356 | TP_ARGS(tsk, delay), | ||
| 357 | |||
| 358 | TP_STRUCT__entry( | ||
| 359 | __array( char, comm, TASK_COMM_LEN ) | ||
| 360 | __field( pid_t, pid ) | ||
| 361 | __field( u64, delay ) | ||
| 362 | ), | ||
| 363 | |||
| 364 | TP_fast_assign( | ||
| 365 | memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); | ||
| 366 | __entry->pid = tsk->pid; | ||
| 367 | __entry->delay = delay; | ||
| 368 | ) | ||
| 369 | TP_perf_assign( | ||
| 370 | __perf_count(delay); | ||
| 371 | ), | ||
| 372 | |||
| 373 | TP_printk("task: %s:%d wait: %Lu [ns]", | ||
| 374 | __entry->comm, __entry->pid, | ||
| 375 | (unsigned long long)__entry->delay) | ||
| 376 | ); | ||
| 377 | |||
| 378 | /* | ||
| 379 | * Tracepoint for accounting sleep time (time the task is not runnable, | ||
| 380 | * including iowait, see below). | ||
| 381 | */ | ||
| 382 | TRACE_EVENT(sched_stat_sleep, | ||
| 383 | |||
| 384 | TP_PROTO(struct task_struct *tsk, u64 delay), | ||
| 385 | |||
| 386 | TP_ARGS(tsk, delay), | ||
| 387 | |||
| 388 | TP_STRUCT__entry( | ||
| 389 | __array( char, comm, TASK_COMM_LEN ) | ||
| 390 | __field( pid_t, pid ) | ||
| 391 | __field( u64, delay ) | ||
| 392 | ), | ||
| 393 | |||
| 394 | TP_fast_assign( | ||
| 395 | memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); | ||
| 396 | __entry->pid = tsk->pid; | ||
| 397 | __entry->delay = delay; | ||
| 398 | ) | ||
| 399 | TP_perf_assign( | ||
| 400 | __perf_count(delay); | ||
| 401 | ), | ||
| 402 | |||
| 403 | TP_printk("task: %s:%d sleep: %Lu [ns]", | ||
| 404 | __entry->comm, __entry->pid, | ||
| 405 | (unsigned long long)__entry->delay) | ||
| 406 | ); | ||
| 407 | |||
| 408 | /* | ||
| 409 | * Tracepoint for accounting iowait time (time the task is not runnable | ||
| 410 | * due to waiting on IO to complete). | ||
| 411 | */ | ||
| 412 | TRACE_EVENT(sched_stat_iowait, | ||
| 413 | |||
| 414 | TP_PROTO(struct task_struct *tsk, u64 delay), | ||
| 415 | |||
| 416 | TP_ARGS(tsk, delay), | ||
| 417 | |||
| 418 | TP_STRUCT__entry( | ||
| 419 | __array( char, comm, TASK_COMM_LEN ) | ||
| 420 | __field( pid_t, pid ) | ||
| 421 | __field( u64, delay ) | ||
| 422 | ), | ||
| 423 | |||
| 424 | TP_fast_assign( | ||
| 425 | memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); | ||
| 426 | __entry->pid = tsk->pid; | ||
| 427 | __entry->delay = delay; | ||
| 428 | ) | ||
| 429 | TP_perf_assign( | ||
| 430 | __perf_count(delay); | ||
| 431 | ), | ||
| 432 | |||
| 433 | TP_printk("task: %s:%d iowait: %Lu [ns]", | ||
| 434 | __entry->comm, __entry->pid, | ||
| 435 | (unsigned long long)__entry->delay) | ||
| 436 | ); | ||
| 437 | |||
| 343 | #endif /* _TRACE_SCHED_H */ | 438 | #endif /* _TRACE_SCHED_H */ |
| 344 | 439 | ||
| 345 | /* This part must be outside protection */ | 440 | /* This part must be outside protection */ |
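Kernel code can hook the new sched_stat_* tracepoints directly. The sketch below assumes the probe API of this kernel generation, where register_trace_<name>() takes a probe with exactly the TP_PROTO signature and no private-data cookie; it shows the shape of such a module, it is not a tested implementation:

```c
/*
 * Sketch of a module that probes sched_stat_iowait (assumes this kernel
 * generation's register_trace_*() API without a private data argument).
 */
#include <linux/module.h>
#include <linux/sched.h>
#include <trace/events/sched.h>

static void probe_iowait(struct task_struct *tsk, u64 delay)
{
    /* Runs from scheduler context: keep it cheap and never sleep. */
    if (delay > 10000000ULL)    /* 10 ms in ns */
        printk(KERN_INFO "%s/%d waited %llu ns on IO\n",
               tsk->comm, tsk->pid, (unsigned long long)delay);
}

static int __init iowait_probe_init(void)
{
    return register_trace_sched_stat_iowait(probe_iowait);
}

static void __exit iowait_probe_exit(void)
{
    unregister_trace_sched_stat_iowait(probe_iowait);
    tracepoint_synchronize_unregister();
}

module_init(iowait_probe_init);
module_exit(iowait_probe_exit);
MODULE_LICENSE("GPL");
```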
diff --git a/init/main.c b/init/main.c
index 525f6fb2bd22..b34fd8e5edef 100644
--- a/init/main.c
+++ b/init/main.c
| @@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void) | |||
| 631 | softirq_init(); | 631 | softirq_init(); |
| 632 | timekeeping_init(); | 632 | timekeeping_init(); |
| 633 | time_init(); | 633 | time_init(); |
| 634 | sched_clock_init(); | ||
| 635 | profile_init(); | 634 | profile_init(); |
| 636 | if (!irqs_disabled()) | 635 | if (!irqs_disabled()) |
| 637 | printk(KERN_CRIT "start_kernel(): bug: interrupts were " | 636 | printk(KERN_CRIT "start_kernel(): bug: interrupts were " |
| @@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void) | |||
| 682 | numa_policy_init(); | 681 | numa_policy_init(); |
| 683 | if (late_time_init) | 682 | if (late_time_init) |
| 684 | late_time_init(); | 683 | late_time_init(); |
| 684 | sched_clock_init(); | ||
| 685 | calibrate_delay(); | 685 | calibrate_delay(); |
| 686 | pidmap_init(); | 686 | pidmap_init(); |
| 687 | anon_vma_init(); | 687 | anon_vma_init(); |
diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751aa0418..5fe709982caa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
| @@ -16,8 +16,6 @@ | |||
| 16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
| 17 | #include <trace/events/sched.h> | 17 | #include <trace/events/sched.h> |
| 18 | 18 | ||
| 19 | #define KTHREAD_NICE_LEVEL (-5) | ||
| 20 | |||
| 21 | static DEFINE_SPINLOCK(kthread_create_lock); | 19 | static DEFINE_SPINLOCK(kthread_create_lock); |
| 22 | static LIST_HEAD(kthread_create_list); | 20 | static LIST_HEAD(kthread_create_list); |
| 23 | struct task_struct *kthreadd_task; | 21 | struct task_struct *kthreadd_task; |
| @@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
| 145 | * The kernel thread should not inherit these properties. | 143 | * The kernel thread should not inherit these properties. |
| 146 | */ | 144 | */ |
| 147 | sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); | 145 | sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); |
| 148 | set_user_nice(create.result, KTHREAD_NICE_LEVEL); | ||
| 149 | set_cpus_allowed_ptr(create.result, cpu_all_mask); | 146 | set_cpus_allowed_ptr(create.result, cpu_all_mask); |
| 150 | } | 147 | } |
| 151 | return create.result; | 148 | return create.result; |
| @@ -221,7 +218,6 @@ int kthreadd(void *unused) | |||
| 221 | /* Setup a clean context for our children to inherit. */ | 218 | /* Setup a clean context for our children to inherit. */ |
| 222 | set_task_comm(tsk, "kthreadd"); | 219 | set_task_comm(tsk, "kthreadd"); |
| 223 | ignore_signals(tsk); | 220 | ignore_signals(tsk); |
| 224 | set_user_nice(tsk, KTHREAD_NICE_LEVEL); | ||
| 225 | set_cpus_allowed_ptr(tsk, cpu_all_mask); | 221 | set_cpus_allowed_ptr(tsk, cpu_all_mask); |
| 226 | set_mems_allowed(node_possible_map); | 222 | set_mems_allowed(node_possible_map); |
| 227 | 223 | ||
diff --git a/kernel/sched.c b/kernel/sched.c
index 4066241ae9f4..e27a53685ed9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -64,7 +64,6 @@ | |||
| 64 | #include <linux/tsacct_kern.h> | 64 | #include <linux/tsacct_kern.h> |
| 65 | #include <linux/kprobes.h> | 65 | #include <linux/kprobes.h> |
| 66 | #include <linux/delayacct.h> | 66 | #include <linux/delayacct.h> |
| 67 | #include <linux/reciprocal_div.h> | ||
| 68 | #include <linux/unistd.h> | 67 | #include <linux/unistd.h> |
| 69 | #include <linux/pagemap.h> | 68 | #include <linux/pagemap.h> |
| 70 | #include <linux/hrtimer.h> | 69 | #include <linux/hrtimer.h> |
| @@ -120,30 +119,8 @@ | |||
| 120 | */ | 119 | */ |
| 121 | #define RUNTIME_INF ((u64)~0ULL) | 120 | #define RUNTIME_INF ((u64)~0ULL) |
| 122 | 121 | ||
| 123 | #ifdef CONFIG_SMP | ||
| 124 | |||
| 125 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | 122 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); |
| 126 | 123 | ||
| 127 | /* | ||
| 128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | ||
| 129 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | ||
| 130 | */ | ||
| 131 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | ||
| 132 | { | ||
| 133 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | ||
| 134 | } | ||
| 135 | |||
| 136 | /* | ||
| 137 | * Each time a sched group cpu_power is changed, | ||
| 138 | * we must compute its reciprocal value | ||
| 139 | */ | ||
| 140 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | ||
| 141 | { | ||
| 142 | sg->__cpu_power += val; | ||
| 143 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | ||
| 144 | } | ||
| 145 | #endif | ||
| 146 | |||
| 147 | static inline int rt_policy(int policy) | 124 | static inline int rt_policy(int policy) |
| 148 | { | 125 | { |
| 149 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 126 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
| @@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user) | |||
| 309 | 286 | ||
| 310 | /* | 287 | /* |
| 311 | * Root task group. | 288 | * Root task group. |
| 312 | * Every UID task group (including init_task_group aka UID-0) will | 289 | * Every UID task group (including init_task_group aka UID-0) will |
| 313 | * be a child to this group. | 290 | * be a child to this group. |
| 314 | */ | 291 | */ |
| 315 | struct task_group root_task_group; | 292 | struct task_group root_task_group; |
| 316 | 293 | ||
| @@ -318,7 +295,7 @@ struct task_group root_task_group; | |||
| 318 | /* Default task group's sched entity on each cpu */ | 295 | /* Default task group's sched entity on each cpu */ |
| 319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 296 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
| 320 | /* Default task group's cfs_rq on each cpu */ | 297 | /* Default task group's cfs_rq on each cpu */ |
| 321 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 298 | static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; |
| 322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 299 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 323 | 300 | ||
| 324 | #ifdef CONFIG_RT_GROUP_SCHED | 301 | #ifdef CONFIG_RT_GROUP_SCHED |
| @@ -616,6 +593,7 @@ struct rq { | |||
| 616 | 593 | ||
| 617 | unsigned char idle_at_tick; | 594 | unsigned char idle_at_tick; |
| 618 | /* For active balancing */ | 595 | /* For active balancing */ |
| 596 | int post_schedule; | ||
| 619 | int active_balance; | 597 | int active_balance; |
| 620 | int push_cpu; | 598 | int push_cpu; |
| 621 | /* cpu of this runqueue: */ | 599 | /* cpu of this runqueue: */ |
| @@ -626,6 +604,9 @@ struct rq { | |||
| 626 | 604 | ||
| 627 | struct task_struct *migration_thread; | 605 | struct task_struct *migration_thread; |
| 628 | struct list_head migration_queue; | 606 | struct list_head migration_queue; |
| 607 | |||
| 608 | u64 rt_avg; | ||
| 609 | u64 age_stamp; | ||
| 629 | #endif | 610 | #endif |
| 630 | 611 | ||
| 631 | /* calc_load related fields */ | 612 | /* calc_load related fields */ |
| @@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq) | |||
| 693 | #define this_rq() (&__get_cpu_var(runqueues)) | 674 | #define this_rq() (&__get_cpu_var(runqueues)) |
| 694 | #define task_rq(p) cpu_rq(task_cpu(p)) | 675 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 676 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 677 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
| 696 | 678 | ||
| 697 | inline void update_rq_clock(struct rq *rq) | 679 | inline void update_rq_clock(struct rq *rq) |
| 698 | { | 680 | { |
| @@ -861,6 +843,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; | |||
| 861 | unsigned int sysctl_sched_shares_thresh = 4; | 843 | unsigned int sysctl_sched_shares_thresh = 4; |
| 862 | 844 | ||
| 863 | /* | 845 | /* |
| 846 | * period over which we average the RT time consumption, measured | ||
| 847 | * in ms. | ||
| 848 | * | ||
| 849 | * default: 1s | ||
| 850 | */ | ||
| 851 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | ||
| 852 | |||
| 853 | /* | ||
| 864 | * period over which we measure -rt task cpu usage in us. | 854 | * period over which we measure -rt task cpu usage in us. |
| 865 | * default: 1s | 855 | * default: 1s |
| 866 | */ | 856 | */ |
| @@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu) | |||
| 1278 | } | 1268 | } |
| 1279 | #endif /* CONFIG_NO_HZ */ | 1269 | #endif /* CONFIG_NO_HZ */ |
| 1280 | 1270 | ||
| 1271 | static u64 sched_avg_period(void) | ||
| 1272 | { | ||
| 1273 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
| 1274 | } | ||
| 1275 | |||
| 1276 | static void sched_avg_update(struct rq *rq) | ||
| 1277 | { | ||
| 1278 | s64 period = sched_avg_period(); | ||
| 1279 | |||
| 1280 | while ((s64)(rq->clock - rq->age_stamp) > period) { | ||
| 1281 | rq->age_stamp += period; | ||
| 1282 | rq->rt_avg /= 2; | ||
| 1283 | } | ||
| 1284 | } | ||
| 1285 | |||
| 1286 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
| 1287 | { | ||
| 1288 | rq->rt_avg += rt_delta; | ||
| 1289 | sched_avg_update(rq); | ||
| 1290 | } | ||
| 1291 | |||
| 1281 | #else /* !CONFIG_SMP */ | 1292 | #else /* !CONFIG_SMP */ |
| 1282 | static void resched_task(struct task_struct *p) | 1293 | static void resched_task(struct task_struct *p) |
| 1283 | { | 1294 | { |
| 1284 | assert_spin_locked(&task_rq(p)->lock); | 1295 | assert_spin_locked(&task_rq(p)->lock); |
| 1285 | set_tsk_need_resched(p); | 1296 | set_tsk_need_resched(p); |
| 1286 | } | 1297 | } |
| 1298 | |||
| 1299 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
| 1300 | { | ||
| 1301 | } | ||
| 1287 | #endif /* CONFIG_SMP */ | 1302 | #endif /* CONFIG_SMP */ |
| 1288 | 1303 | ||
| 1289 | #if BITS_PER_LONG == 32 | 1304 | #if BITS_PER_LONG == 32 |
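rq->rt_avg accumulates the time the CPU spent on RT tasks and sched_avg_update() halves it once for every elapsed half of sysctl_sched_time_avg (500 ms with the 1 s default above), giving a cheap exponentially decaying average with no per-tick work. A user-space sketch of the same decay loop, with invented clock values:

```c
#include <stdio.h>
#include <stdint.h>

#define MSEC_PER_SEC   1000ULL
#define NSEC_PER_MSEC  1000000ULL

static const uint64_t sysctl_sched_time_avg = MSEC_PER_SEC;   /* 1 s, as in the hunk */

struct rq { uint64_t clock, age_stamp, rt_avg; };

static uint64_t sched_avg_period(void)
{
    return sysctl_sched_time_avg * NSEC_PER_MSEC / 2;   /* 0.5 s in ns */
}

/* Same shape as the kernel's sched_avg_update(): halve rt_avg for every
 * half-period that elapsed since age_stamp. */
static void sched_avg_update(struct rq *rq)
{
    int64_t period = (int64_t)sched_avg_period();

    while ((int64_t)(rq->clock - rq->age_stamp) > period) {
        rq->age_stamp += period;
        rq->rt_avg /= 2;
    }
}

int main(void)
{
    struct rq rq = { .clock = 0, .age_stamp = 0, .rt_avg = 0 };

    rq.rt_avg += 400 * NSEC_PER_MSEC;        /* 400 ms of RT execution observed */
    rq.clock   = 2 * sched_avg_period() + 1; /* just over one second later */
    sched_avg_update(&rq);

    printf("rt_avg after decay: %llu ms\n",  /* 100 ms: halved twice */
           (unsigned long long)(rq.rt_avg / NSEC_PER_MSEC));
    return 0;
}
```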
| @@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 1513 | 1528 | ||
| 1514 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1529 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1515 | 1530 | ||
| 1531 | struct update_shares_data { | ||
| 1532 | unsigned long rq_weight[NR_CPUS]; | ||
| 1533 | }; | ||
| 1534 | |||
| 1535 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
| 1536 | |||
| 1516 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1537 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
| 1517 | 1538 | ||
| 1518 | /* | 1539 | /* |
| 1519 | * Calculate and set the cpu's group shares. | 1540 | * Calculate and set the cpu's group shares. |
| 1520 | */ | 1541 | */ |
| 1521 | static void | 1542 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
| 1522 | update_group_shares_cpu(struct task_group *tg, int cpu, | 1543 | unsigned long sd_shares, |
| 1523 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1544 | unsigned long sd_rq_weight, |
| 1545 | struct update_shares_data *usd) | ||
| 1524 | { | 1546 | { |
| 1525 | unsigned long shares; | 1547 | unsigned long shares, rq_weight; |
| 1526 | unsigned long rq_weight; | 1548 | int boost = 0; |
| 1527 | 1549 | ||
| 1528 | if (!tg->se[cpu]) | 1550 | rq_weight = usd->rq_weight[cpu]; |
| 1529 | return; | 1551 | if (!rq_weight) { |
| 1530 | 1552 | boost = 1; | |
| 1531 | rq_weight = tg->cfs_rq[cpu]->rq_weight; | 1553 | rq_weight = NICE_0_LOAD; |
| 1554 | } | ||
| 1532 | 1555 | ||
| 1533 | /* | 1556 | /* |
| 1534 | * \Sum shares * rq_weight | 1557 | * \Sum_j shares_j * rq_weight_i |
| 1535 | * shares = ----------------------- | 1558 | * shares_i = ----------------------------- |
| 1536 | * \Sum rq_weight | 1559 | * \Sum_j rq_weight_j |
| 1537 | * | ||
| 1538 | */ | 1560 | */ |
| 1539 | shares = (sd_shares * rq_weight) / sd_rq_weight; | 1561 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
| 1540 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | 1562 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); |
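The clarified comment gives the per-CPU share as shares_i = sd_shares * rq_weight_i / \Sum_j rq_weight_j, and a CPU that currently carries none of the group's load is treated as if it carried NICE_0_LOAD so a task waking there is not starved of shares. A small worked example with invented weights:

```c
#include <stdio.h>

int main(void)
{
    /* A task group with 1024 shares spread over three CPUs whose
     * runqueues carry different amounts of the group's load
     * (weights invented for illustration). */
    unsigned long sd_shares = 1024;
    unsigned long rq_weight[] = { 3072, 1024, 0 };
    unsigned long nice_0_load = 1024;     /* stand-in weight for an idle CPU */
    unsigned long sum = 0, i;

    for (i = 0; i < 3; i++)
        sum += rq_weight[i] ? rq_weight[i] : nice_0_load;

    for (i = 0; i < 3; i++) {
        unsigned long w = rq_weight[i] ? rq_weight[i] : nice_0_load;
        /* shares_i = sd_shares * rq_weight_i / sum(rq_weight) */
        printf("cpu%lu: weight %4lu -> shares %lu\n",
               i, w, sd_shares * w / sum);
    }
    return 0;
}
```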
| @@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1545 | unsigned long flags; | 1567 | unsigned long flags; |
| 1546 | 1568 | ||
| 1547 | spin_lock_irqsave(&rq->lock, flags); | 1569 | spin_lock_irqsave(&rq->lock, flags); |
| 1548 | tg->cfs_rq[cpu]->shares = shares; | 1570 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
| 1549 | 1571 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | |
| 1550 | __set_se_shares(tg->se[cpu], shares); | 1572 | __set_se_shares(tg->se[cpu], shares); |
| 1551 | spin_unlock_irqrestore(&rq->lock, flags); | 1573 | spin_unlock_irqrestore(&rq->lock, flags); |
| 1552 | } | 1574 | } |
| @@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1559 | */ | 1581 | */ |
| 1560 | static int tg_shares_up(struct task_group *tg, void *data) | 1582 | static int tg_shares_up(struct task_group *tg, void *data) |
| 1561 | { | 1583 | { |
| 1562 | unsigned long weight, rq_weight = 0; | 1584 | unsigned long weight, rq_weight = 0, shares = 0; |
| 1563 | unsigned long shares = 0; | 1585 | struct update_shares_data *usd; |
| 1564 | struct sched_domain *sd = data; | 1586 | struct sched_domain *sd = data; |
| 1587 | unsigned long flags; | ||
| 1565 | int i; | 1588 | int i; |
| 1566 | 1589 | ||
| 1590 | if (!tg->se[0]) | ||
| 1591 | return 0; | ||
| 1592 | |||
| 1593 | local_irq_save(flags); | ||
| 1594 | usd = &__get_cpu_var(update_shares_data); | ||
| 1595 | |||
| 1567 | for_each_cpu(i, sched_domain_span(sd)) { | 1596 | for_each_cpu(i, sched_domain_span(sd)) { |
| 1597 | weight = tg->cfs_rq[i]->load.weight; | ||
| 1598 | usd->rq_weight[i] = weight; | ||
| 1599 | |||
| 1568 | /* | 1600 | /* |
| 1569 | * If there are currently no tasks on the cpu pretend there | 1601 | * If there are currently no tasks on the cpu pretend there |
| 1570 | * is one of average load so that when a new task gets to | 1602 | * is one of average load so that when a new task gets to |
| 1571 | * run here it will not get delayed by group starvation. | 1603 | * run here it will not get delayed by group starvation. |
| 1572 | */ | 1604 | */ |
| 1573 | weight = tg->cfs_rq[i]->load.weight; | ||
| 1574 | if (!weight) | 1605 | if (!weight) |
| 1575 | weight = NICE_0_LOAD; | 1606 | weight = NICE_0_LOAD; |
| 1576 | 1607 | ||
| 1577 | tg->cfs_rq[i]->rq_weight = weight; | ||
| 1578 | rq_weight += weight; | 1608 | rq_weight += weight; |
| 1579 | shares += tg->cfs_rq[i]->shares; | 1609 | shares += tg->cfs_rq[i]->shares; |
| 1580 | } | 1610 | } |
| @@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
| 1586 | shares = tg->shares; | 1616 | shares = tg->shares; |
| 1587 | 1617 | ||
| 1588 | for_each_cpu(i, sched_domain_span(sd)) | 1618 | for_each_cpu(i, sched_domain_span(sd)) |
| 1589 | update_group_shares_cpu(tg, i, shares, rq_weight); | 1619 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); |
| 1620 | |||
| 1621 | local_irq_restore(flags); | ||
| 1590 | 1622 | ||
| 1591 | return 0; | 1623 | return 0; |
| 1592 | } | 1624 | } |
| @@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
| 1616 | 1648 | ||
| 1617 | static void update_shares(struct sched_domain *sd) | 1649 | static void update_shares(struct sched_domain *sd) |
| 1618 | { | 1650 | { |
| 1619 | u64 now = cpu_clock(raw_smp_processor_id()); | 1651 | s64 elapsed; |
| 1620 | s64 elapsed = now - sd->last_update; | 1652 | u64 now; |
| 1653 | |||
| 1654 | if (root_task_group_empty()) | ||
| 1655 | return; | ||
| 1656 | |||
| 1657 | now = cpu_clock(raw_smp_processor_id()); | ||
| 1658 | elapsed = now - sd->last_update; | ||
| 1621 | 1659 | ||
| 1622 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1660 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
| 1623 | sd->last_update = now; | 1661 | sd->last_update = now; |
| @@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd) | |||
| 1627 | 1665 | ||
| 1628 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | 1666 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
| 1629 | { | 1667 | { |
| 1668 | if (root_task_group_empty()) | ||
| 1669 | return; | ||
| 1670 | |||
| 1630 | spin_unlock(&rq->lock); | 1671 | spin_unlock(&rq->lock); |
| 1631 | update_shares(sd); | 1672 | update_shares(sd); |
| 1632 | spin_lock(&rq->lock); | 1673 | spin_lock(&rq->lock); |
| @@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
| 1634 | 1675 | ||
| 1635 | static void update_h_load(long cpu) | 1676 | static void update_h_load(long cpu) |
| 1636 | { | 1677 | { |
| 1678 | if (root_task_group_empty()) | ||
| 1679 | return; | ||
| 1680 | |||
| 1637 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1681 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
| 1638 | } | 1682 | } |
| 1639 | 1683 | ||
| @@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 2268 | } | 2312 | } |
| 2269 | 2313 | ||
| 2270 | /* Adjust by relative CPU power of the group */ | 2314 | /* Adjust by relative CPU power of the group */ |
| 2271 | avg_load = sg_div_cpu_power(group, | 2315 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
| 2272 | avg_load * SCHED_LOAD_SCALE); | ||
| 2273 | 2316 | ||
| 2274 | if (local_group) { | 2317 | if (local_group) { |
| 2275 | this_load = avg_load; | 2318 | this_load = avg_load; |
| @@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 2637 | set_task_cpu(p, cpu); | 2680 | set_task_cpu(p, cpu); |
| 2638 | 2681 | ||
| 2639 | /* | 2682 | /* |
| 2640 | * Make sure we do not leak PI boosting priority to the child: | 2683 | * Make sure we do not leak PI boosting priority to the child. |
| 2641 | */ | 2684 | */ |
| 2642 | p->prio = current->normal_prio; | 2685 | p->prio = current->normal_prio; |
| 2686 | |||
| 2687 | /* | ||
| 2688 | * Revert to default priority/policy on fork if requested. | ||
| 2689 | */ | ||
| 2690 | if (unlikely(p->sched_reset_on_fork)) { | ||
| 2691 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) | ||
| 2692 | p->policy = SCHED_NORMAL; | ||
| 2693 | |||
| 2694 | if (p->normal_prio < DEFAULT_PRIO) | ||
| 2695 | p->prio = DEFAULT_PRIO; | ||
| 2696 | |||
| 2697 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
| 2698 | p->static_prio = NICE_TO_PRIO(0); | ||
| 2699 | set_load_weight(p); | ||
| 2700 | } | ||
| 2701 | |||
| 2702 | /* | ||
| 2703 | * We don't need the reset flag anymore after the fork. It has | ||
| 2704 | * fulfilled its duty: | ||
| 2705 | */ | ||
| 2706 | p->sched_reset_on_fork = 0; | ||
| 2707 | } | ||
| 2708 | |||
| 2643 | if (!rt_prio(p->prio)) | 2709 | if (!rt_prio(p->prio)) |
| 2644 | p->sched_class = &fair_sched_class; | 2710 | p->sched_class = &fair_sched_class; |
| 2645 | 2711 | ||
| @@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2796 | { | 2862 | { |
| 2797 | struct mm_struct *mm = rq->prev_mm; | 2863 | struct mm_struct *mm = rq->prev_mm; |
| 2798 | long prev_state; | 2864 | long prev_state; |
| 2799 | #ifdef CONFIG_SMP | ||
| 2800 | int post_schedule = 0; | ||
| 2801 | |||
| 2802 | if (current->sched_class->needs_post_schedule) | ||
| 2803 | post_schedule = current->sched_class->needs_post_schedule(rq); | ||
| 2804 | #endif | ||
| 2805 | 2865 | ||
| 2806 | rq->prev_mm = NULL; | 2866 | rq->prev_mm = NULL; |
| 2807 | 2867 | ||
| @@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2820 | finish_arch_switch(prev); | 2880 | finish_arch_switch(prev); |
| 2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | 2881 | perf_counter_task_sched_in(current, cpu_of(rq)); |
| 2822 | finish_lock_switch(rq, prev); | 2882 | finish_lock_switch(rq, prev); |
| 2823 | #ifdef CONFIG_SMP | ||
| 2824 | if (post_schedule) | ||
| 2825 | current->sched_class->post_schedule(rq); | ||
| 2826 | #endif | ||
| 2827 | 2883 | ||
| 2828 | fire_sched_in_preempt_notifiers(current); | 2884 | fire_sched_in_preempt_notifiers(current); |
| 2829 | if (mm) | 2885 | if (mm) |
| @@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2838 | } | 2894 | } |
| 2839 | } | 2895 | } |
| 2840 | 2896 | ||
| 2897 | #ifdef CONFIG_SMP | ||
| 2898 | |||
| 2899 | /* assumes rq->lock is held */ | ||
| 2900 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
| 2901 | { | ||
| 2902 | if (prev->sched_class->pre_schedule) | ||
| 2903 | prev->sched_class->pre_schedule(rq, prev); | ||
| 2904 | } | ||
| 2905 | |||
| 2906 | /* rq->lock is NOT held, but preemption is disabled */ | ||
| 2907 | static inline void post_schedule(struct rq *rq) | ||
| 2908 | { | ||
| 2909 | if (rq->post_schedule) { | ||
| 2910 | unsigned long flags; | ||
| 2911 | |||
| 2912 | spin_lock_irqsave(&rq->lock, flags); | ||
| 2913 | if (rq->curr->sched_class->post_schedule) | ||
| 2914 | rq->curr->sched_class->post_schedule(rq); | ||
| 2915 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 2916 | |||
| 2917 | rq->post_schedule = 0; | ||
| 2918 | } | ||
| 2919 | } | ||
| 2920 | |||
| 2921 | #else | ||
| 2922 | |||
| 2923 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
| 2924 | { | ||
| 2925 | } | ||
| 2926 | |||
| 2927 | static inline void post_schedule(struct rq *rq) | ||
| 2928 | { | ||
| 2929 | } | ||
| 2930 | |||
| 2931 | #endif | ||
| 2932 | |||
| 2841 | /** | 2933 | /** |
| 2842 | * schedule_tail - first thing a freshly forked thread must call. | 2934 | * schedule_tail - first thing a freshly forked thread must call. |
| 2843 | * @prev: the thread we just switched away from. | 2935 | * @prev: the thread we just switched away from. |
| @@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
| 2848 | struct rq *rq = this_rq(); | 2940 | struct rq *rq = this_rq(); |
| 2849 | 2941 | ||
| 2850 | finish_task_switch(rq, prev); | 2942 | finish_task_switch(rq, prev); |
| 2943 | |||
| 2944 | /* | ||
| 2945 | * FIXME: do we need to worry about rq being invalidated by the | ||
| 2946 | * task_switch? | ||
| 2947 | */ | ||
| 2948 | post_schedule(rq); | ||
| 2949 | |||
| 2851 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2950 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
| 2852 | /* In this case, finish_task_switch does not reenable preemption */ | 2951 | /* In this case, finish_task_switch does not reenable preemption */ |
| 2853 | preempt_enable(); | 2952 | preempt_enable(); |
| @@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 3379 | { | 3478 | { |
| 3380 | const struct sched_class *class; | 3479 | const struct sched_class *class; |
| 3381 | 3480 | ||
| 3382 | for (class = sched_class_highest; class; class = class->next) | 3481 | for_each_class(class) { |
| 3383 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 3482 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
| 3384 | return 1; | 3483 | return 1; |
| 3484 | } | ||
| 3385 | 3485 | ||
| 3386 | return 0; | 3486 | return 0; |
| 3387 | } | 3487 | } |
| @@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, | |||
| 3544 | * capacity but still has some space to pick up some load | 3644 | * capacity but still has some space to pick up some load |
| 3545 | * from other group and save more power | 3645 | * from other group and save more power |
| 3546 | */ | 3646 | */ |
| 3547 | if (sgs->sum_nr_running > sgs->group_capacity - 1) | 3647 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) |
| 3548 | return; | 3648 | return; |
| 3549 | 3649 | ||
| 3550 | if (sgs->sum_nr_running > sds->leader_nr_running || | 3650 | if (sgs->sum_nr_running > sds->leader_nr_running || |
| @@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
| 3611 | } | 3711 | } |
| 3612 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 3712 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
| 3613 | 3713 | ||
| 3714 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
| 3715 | { | ||
| 3716 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 3717 | unsigned long smt_gain = sd->smt_gain; | ||
| 3718 | |||
| 3719 | smt_gain /= weight; | ||
| 3720 | |||
| 3721 | return smt_gain; | ||
| 3722 | } | ||
| 3723 | |||
| 3724 | unsigned long scale_rt_power(int cpu) | ||
| 3725 | { | ||
| 3726 | struct rq *rq = cpu_rq(cpu); | ||
| 3727 | u64 total, available; | ||
| 3728 | |||
| 3729 | sched_avg_update(rq); | ||
| 3730 | |||
| 3731 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
| 3732 | available = total - rq->rt_avg; | ||
| 3733 | |||
| 3734 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
| 3735 | total = SCHED_LOAD_SCALE; | ||
| 3736 | |||
| 3737 | total >>= SCHED_LOAD_SHIFT; | ||
| 3738 | |||
| 3739 | return div_u64(available, total); | ||
| 3740 | } | ||
| 3741 | |||
| 3742 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
| 3743 | { | ||
| 3744 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 3745 | unsigned long power = SCHED_LOAD_SCALE; | ||
| 3746 | struct sched_group *sdg = sd->groups; | ||
| 3747 | |||
| 3748 | /* here we could scale based on cpufreq */ | ||
| 3749 | |||
| 3750 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
| 3751 | power *= arch_scale_smt_power(sd, cpu); | ||
| 3752 | power >>= SCHED_LOAD_SHIFT; | ||
| 3753 | } | ||
| 3754 | |||
| 3755 | power *= scale_rt_power(cpu); | ||
| 3756 | power >>= SCHED_LOAD_SHIFT; | ||
| 3757 | |||
| 3758 | if (!power) | ||
| 3759 | power = 1; | ||
| 3760 | |||
| 3761 | sdg->cpu_power = power; | ||
| 3762 | } | ||
| 3763 | |||
| 3764 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
| 3765 | { | ||
| 3766 | struct sched_domain *child = sd->child; | ||
| 3767 | struct sched_group *group, *sdg = sd->groups; | ||
| 3768 | unsigned long power; | ||
| 3769 | |||
| 3770 | if (!child) { | ||
| 3771 | update_cpu_power(sd, cpu); | ||
| 3772 | return; | ||
| 3773 | } | ||
| 3774 | |||
| 3775 | power = 0; | ||
| 3776 | |||
| 3777 | group = child->groups; | ||
| 3778 | do { | ||
| 3779 | power += group->cpu_power; | ||
| 3780 | group = group->next; | ||
| 3781 | } while (group != child->groups); | ||
| 3782 | |||
| 3783 | sdg->cpu_power = power; | ||
| 3784 | } | ||
| 3614 | 3785 | ||
| 3615 | /** | 3786 | /** |
| 3616 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3787 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
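Worked numbers for the update_cpu_power() path above, for one thread of a two-thread SMT core that spent about a quarter of the averaging window on RT tasks; all inputs are invented for illustration:

```c
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)   /* 1024 */

int main(void)
{
    unsigned long power = SCHED_LOAD_SCALE;

    /* SMT scaling: smt_gain / number of siblings, as in
     * arch_scale_smt_power(). */
    unsigned long smt_gain = 1178, weight = 2;
    power *= smt_gain / weight;          /* 1024 * 589 */
    power >>= SCHED_LOAD_SHIFT;          /* -> 589 */

    /* RT scaling: scale_rt_power() returns available/total in the same
     * fixed point; assume 25% of the window went to RT tasks. */
    unsigned long rt_scale = (3 * SCHED_LOAD_SCALE) / 4;   /* 768 */
    power *= rt_scale;
    power >>= SCHED_LOAD_SHIFT;          /* -> 441 */

    printf("cpu_power = %lu (%.2f of SCHED_LOAD_SCALE)\n",
           power, (double)power / SCHED_LOAD_SCALE);
    return 0;
}
```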
| @@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
| 3624 | * @balance: Should we balance. | 3795 | * @balance: Should we balance. |
| 3625 | * @sgs: variable to hold the statistics for this group. | 3796 | * @sgs: variable to hold the statistics for this group. |
| 3626 | */ | 3797 | */ |
| 3627 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | 3798 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
| 3799 | struct sched_group *group, int this_cpu, | ||
| 3628 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 3800 | enum cpu_idle_type idle, int load_idx, int *sd_idle, |
| 3629 | int local_group, const struct cpumask *cpus, | 3801 | int local_group, const struct cpumask *cpus, |
| 3630 | int *balance, struct sg_lb_stats *sgs) | 3802 | int *balance, struct sg_lb_stats *sgs) |
| @@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
| 3635 | unsigned long sum_avg_load_per_task; | 3807 | unsigned long sum_avg_load_per_task; |
| 3636 | unsigned long avg_load_per_task; | 3808 | unsigned long avg_load_per_task; |
| 3637 | 3809 | ||
| 3638 | if (local_group) | 3810 | if (local_group) { |
| 3639 | balance_cpu = group_first_cpu(group); | 3811 | balance_cpu = group_first_cpu(group); |
| 3812 | if (balance_cpu == this_cpu) | ||
| 3813 | update_group_power(sd, this_cpu); | ||
| 3814 | } | ||
| 3640 | 3815 | ||
| 3641 | /* Tally up the load of all CPUs in the group */ | 3816 | /* Tally up the load of all CPUs in the group */ |
| 3642 | sum_avg_load_per_task = avg_load_per_task = 0; | 3817 | sum_avg_load_per_task = avg_load_per_task = 0; |
| @@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
| 3685 | } | 3860 | } |
| 3686 | 3861 | ||
| 3687 | /* Adjust by relative CPU power of the group */ | 3862 | /* Adjust by relative CPU power of the group */ |
| 3688 | sgs->avg_load = sg_div_cpu_power(group, | 3863 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
| 3689 | sgs->group_load * SCHED_LOAD_SCALE); | ||
| 3690 | 3864 | ||
| 3691 | 3865 | ||
| 3692 | /* | 3866 | /* |
| @@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
| 3698 | * normalized nr_running number somewhere that negates | 3872 | * normalized nr_running number somewhere that negates |
| 3699 | * the hierarchy? | 3873 | * the hierarchy? |
| 3700 | */ | 3874 | */ |
| 3701 | avg_load_per_task = sg_div_cpu_power(group, | 3875 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / |
| 3702 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3876 | group->cpu_power; |
| 3703 | 3877 | ||
| 3704 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3878 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) |
| 3705 | sgs->group_imb = 1; | 3879 | sgs->group_imb = 1; |
| 3706 | 3880 | ||
| 3707 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3881 | sgs->group_capacity = |
| 3708 | 3882 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | |
| 3709 | } | 3883 | } |
| 3710 | 3884 | ||
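One subtle change in the hunk above: group capacity used to be a truncating divide of __cpu_power by SCHED_LOAD_SCALE, but now that cpu_power is rarely an exact multiple of SCHED_LOAD_SCALE the division rounds to nearest, so a group at, say, 600 out of 1024 still counts as capacity 1 instead of dropping to 0. A tiny sketch of the difference; the macro here is a simplified stand-in for the kernel's DIV_ROUND_CLOSEST.

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL
    #define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

    int main(void)
    {
        unsigned long power = 600;  /* e.g. an SMT sibling under RT pressure */

        printf("truncated capacity: %lu\n", power / SCHED_LOAD_SCALE);                /* 0 */
        printf("rounded capacity:   %lu\n", DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE)); /* 1 */
        return 0;
    }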
| 3711 | /** | 3885 | /** |
| @@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3723 | const struct cpumask *cpus, int *balance, | 3897 | const struct cpumask *cpus, int *balance, |
| 3724 | struct sd_lb_stats *sds) | 3898 | struct sd_lb_stats *sds) |
| 3725 | { | 3899 | { |
| 3900 | struct sched_domain *child = sd->child; | ||
| 3726 | struct sched_group *group = sd->groups; | 3901 | struct sched_group *group = sd->groups; |
| 3727 | struct sg_lb_stats sgs; | 3902 | struct sg_lb_stats sgs; |
| 3728 | int load_idx; | 3903 | int load_idx, prefer_sibling = 0; |
| 3904 | |||
| 3905 | if (child && child->flags & SD_PREFER_SIBLING) | ||
| 3906 | prefer_sibling = 1; | ||
| 3729 | 3907 | ||
| 3730 | init_sd_power_savings_stats(sd, sds, idle); | 3908 | init_sd_power_savings_stats(sd, sds, idle); |
| 3731 | load_idx = get_sd_load_idx(sd, idle); | 3909 | load_idx = get_sd_load_idx(sd, idle); |
| @@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3736 | local_group = cpumask_test_cpu(this_cpu, | 3914 | local_group = cpumask_test_cpu(this_cpu, |
| 3737 | sched_group_cpus(group)); | 3915 | sched_group_cpus(group)); |
| 3738 | memset(&sgs, 0, sizeof(sgs)); | 3916 | memset(&sgs, 0, sizeof(sgs)); |
| 3739 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | 3917 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, |
| 3740 | local_group, cpus, balance, &sgs); | 3918 | local_group, cpus, balance, &sgs); |
| 3741 | 3919 | ||
| 3742 | if (local_group && balance && !(*balance)) | 3920 | if (local_group && balance && !(*balance)) |
| 3743 | return; | 3921 | return; |
| 3744 | 3922 | ||
| 3745 | sds->total_load += sgs.group_load; | 3923 | sds->total_load += sgs.group_load; |
| 3746 | sds->total_pwr += group->__cpu_power; | 3924 | sds->total_pwr += group->cpu_power; |
| 3925 | |||
| 3926 | /* | ||
| 3927 | * In case the child domain prefers tasks go to siblings | ||
| 3928 | * first, lower the group capacity to one so that we'll try | ||
| 3929 | * and move all the excess tasks away. | ||
| 3930 | */ | ||
| 3931 | if (prefer_sibling) | ||
| 3932 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
| 3747 | 3933 | ||
| 3748 | if (local_group) { | 3934 | if (local_group) { |
| 3749 | sds->this_load = sgs.avg_load; | 3935 | sds->this_load = sgs.avg_load; |
| @@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3763 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 3949 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
| 3764 | group = group->next; | 3950 | group = group->next; |
| 3765 | } while (group != sd->groups); | 3951 | } while (group != sd->groups); |
| 3766 | |||
| 3767 | } | 3952 | } |
| 3768 | 3953 | ||
| 3769 | /** | 3954 | /** |
| @@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
| 3801 | * moving them. | 3986 | * moving them. |
| 3802 | */ | 3987 | */ |
| 3803 | 3988 | ||
| 3804 | pwr_now += sds->busiest->__cpu_power * | 3989 | pwr_now += sds->busiest->cpu_power * |
| 3805 | min(sds->busiest_load_per_task, sds->max_load); | 3990 | min(sds->busiest_load_per_task, sds->max_load); |
| 3806 | pwr_now += sds->this->__cpu_power * | 3991 | pwr_now += sds->this->cpu_power * |
| 3807 | min(sds->this_load_per_task, sds->this_load); | 3992 | min(sds->this_load_per_task, sds->this_load); |
| 3808 | pwr_now /= SCHED_LOAD_SCALE; | 3993 | pwr_now /= SCHED_LOAD_SCALE; |
| 3809 | 3994 | ||
| 3810 | /* Amount of load we'd subtract */ | 3995 | /* Amount of load we'd subtract */ |
| 3811 | tmp = sg_div_cpu_power(sds->busiest, | 3996 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
| 3812 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3997 | sds->busiest->cpu_power; |
| 3813 | if (sds->max_load > tmp) | 3998 | if (sds->max_load > tmp) |
| 3814 | pwr_move += sds->busiest->__cpu_power * | 3999 | pwr_move += sds->busiest->cpu_power * |
| 3815 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 4000 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
| 3816 | 4001 | ||
| 3817 | /* Amount of load we'd add */ | 4002 | /* Amount of load we'd add */ |
| 3818 | if (sds->max_load * sds->busiest->__cpu_power < | 4003 | if (sds->max_load * sds->busiest->cpu_power < |
| 3819 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 4004 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) |
| 3820 | tmp = sg_div_cpu_power(sds->this, | 4005 | tmp = (sds->max_load * sds->busiest->cpu_power) / |
| 3821 | sds->max_load * sds->busiest->__cpu_power); | 4006 | sds->this->cpu_power; |
| 3822 | else | 4007 | else |
| 3823 | tmp = sg_div_cpu_power(sds->this, | 4008 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
| 3824 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 4009 | sds->this->cpu_power; |
| 3825 | pwr_move += sds->this->__cpu_power * | 4010 | pwr_move += sds->this->cpu_power * |
| 3826 | min(sds->this_load_per_task, sds->this_load + tmp); | 4011 | min(sds->this_load_per_task, sds->this_load + tmp); |
| 3827 | pwr_move /= SCHED_LOAD_SCALE; | 4012 | pwr_move /= SCHED_LOAD_SCALE; |
| 3828 | 4013 | ||
| @@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
| 3857 | sds->max_load - sds->busiest_load_per_task); | 4042 | sds->max_load - sds->busiest_load_per_task); |
| 3858 | 4043 | ||
| 3859 | /* How much load to actually move to equalise the imbalance */ | 4044 | /* How much load to actually move to equalise the imbalance */ |
| 3860 | *imbalance = min(max_pull * sds->busiest->__cpu_power, | 4045 | *imbalance = min(max_pull * sds->busiest->cpu_power, |
| 3861 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) | 4046 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) |
| 3862 | / SCHED_LOAD_SCALE; | 4047 | / SCHED_LOAD_SCALE; |
| 3863 | 4048 | ||
| 3864 | /* | 4049 | /* |
| @@ -3976,6 +4161,26 @@ ret: | |||
| 3976 | return NULL; | 4161 | return NULL; |
| 3977 | } | 4162 | } |
| 3978 | 4163 | ||
| 4164 | static struct sched_group *group_of(int cpu) | ||
| 4165 | { | ||
| 4166 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
| 4167 | |||
| 4168 | if (!sd) | ||
| 4169 | return NULL; | ||
| 4170 | |||
| 4171 | return sd->groups; | ||
| 4172 | } | ||
| 4173 | |||
| 4174 | static unsigned long power_of(int cpu) | ||
| 4175 | { | ||
| 4176 | struct sched_group *group = group_of(cpu); | ||
| 4177 | |||
| 4178 | if (!group) | ||
| 4179 | return SCHED_LOAD_SCALE; | ||
| 4180 | |||
| 4181 | return group->cpu_power; | ||
| 4182 | } | ||
| 4183 | |||
| 3979 | /* | 4184 | /* |
| 3980 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4185 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
| 3981 | */ | 4186 | */ |
| @@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
| 3988 | int i; | 4193 | int i; |
| 3989 | 4194 | ||
| 3990 | for_each_cpu(i, sched_group_cpus(group)) { | 4195 | for_each_cpu(i, sched_group_cpus(group)) { |
| 4196 | unsigned long power = power_of(i); | ||
| 4197 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
| 3991 | unsigned long wl; | 4198 | unsigned long wl; |
| 3992 | 4199 | ||
| 3993 | if (!cpumask_test_cpu(i, cpus)) | 4200 | if (!cpumask_test_cpu(i, cpus)) |
| 3994 | continue; | 4201 | continue; |
| 3995 | 4202 | ||
| 3996 | rq = cpu_rq(i); | 4203 | rq = cpu_rq(i); |
| 3997 | wl = weighted_cpuload(i); | 4204 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; |
| 4205 | wl /= power; | ||
| 3998 | 4206 | ||
| 3999 | if (rq->nr_running == 1 && wl > imbalance) | 4207 | if (capacity && rq->nr_running == 1 && wl > imbalance) |
| 4000 | continue; | 4208 | continue; |
| 4001 | 4209 | ||
| 4002 | if (wl > max_load) { | 4210 | if (wl > max_load) { |
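With the change above, find_busiest_queue() compares load per unit of cpu_power rather than raw weighted load, so a CPU whose power has been scaled down by RT work or SMT sharing looks proportionally busier; the capacity check also allows a zero-capacity CPU to be picked even when it runs only a single task. Roughly, with hypothetical numbers:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL

    int main(void)
    {
        unsigned long raw_load = 2048;              /* same weighted load on both */
        unsigned long power[2] = { 1024, 441 };     /* full core vs. squeezed sibling */

        for (int i = 0; i < 2; i++) {
            unsigned long wl = raw_load * SCHED_LOAD_SCALE / power[i];
            printf("cpu%d: scaled load %lu\n", i, wl);
        }
        /* The weaker CPU reports the larger scaled load, so all else being
         * equal it is the one selected as the busiest runqueue. */
        return 0;
    }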
| @@ -5349,10 +5557,7 @@ need_resched_nonpreemptible: | |||
| 5349 | switch_count = &prev->nvcsw; | 5557 | switch_count = &prev->nvcsw; |
| 5350 | } | 5558 | } |
| 5351 | 5559 | ||
| 5352 | #ifdef CONFIG_SMP | 5560 | pre_schedule(rq, prev); |
| 5353 | if (prev->sched_class->pre_schedule) | ||
| 5354 | prev->sched_class->pre_schedule(rq, prev); | ||
| 5355 | #endif | ||
| 5356 | 5561 | ||
| 5357 | if (unlikely(!rq->nr_running)) | 5562 | if (unlikely(!rq->nr_running)) |
| 5358 | idle_balance(cpu, rq); | 5563 | idle_balance(cpu, rq); |
| @@ -5378,6 +5583,8 @@ need_resched_nonpreemptible: | |||
| 5378 | } else | 5583 | } else |
| 5379 | spin_unlock_irq(&rq->lock); | 5584 | spin_unlock_irq(&rq->lock); |
| 5380 | 5585 | ||
| 5586 | post_schedule(rq); | ||
| 5587 | |||
| 5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5588 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
| 5382 | goto need_resched_nonpreemptible; | 5589 | goto need_resched_nonpreemptible; |
| 5383 | 5590 | ||
| @@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
| 6123 | unsigned long flags; | 6330 | unsigned long flags; |
| 6124 | const struct sched_class *prev_class = p->sched_class; | 6331 | const struct sched_class *prev_class = p->sched_class; |
| 6125 | struct rq *rq; | 6332 | struct rq *rq; |
| 6333 | int reset_on_fork; | ||
| 6126 | 6334 | ||
| 6127 | /* may grab non-irq protected spin_locks */ | 6335 | /* may grab non-irq protected spin_locks */ |
| 6128 | BUG_ON(in_interrupt()); | 6336 | BUG_ON(in_interrupt()); |
| 6129 | recheck: | 6337 | recheck: |
| 6130 | /* double check policy once rq lock held */ | 6338 | /* double check policy once rq lock held */ |
| 6131 | if (policy < 0) | 6339 | if (policy < 0) { |
| 6340 | reset_on_fork = p->sched_reset_on_fork; | ||
| 6132 | policy = oldpolicy = p->policy; | 6341 | policy = oldpolicy = p->policy; |
| 6133 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 6342 | } else { |
| 6134 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 6343 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
| 6135 | policy != SCHED_IDLE) | 6344 | policy &= ~SCHED_RESET_ON_FORK; |
| 6136 | return -EINVAL; | 6345 | |
| 6346 | if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
| 6347 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
| 6348 | policy != SCHED_IDLE) | ||
| 6349 | return -EINVAL; | ||
| 6350 | } | ||
| 6351 | |||
| 6137 | /* | 6352 | /* |
| 6138 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 6353 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
| 6139 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 6354 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
| @@ -6177,6 +6392,10 @@ recheck: | |||
| 6177 | /* can't change other user's priorities */ | 6392 | /* can't change other user's priorities */ |
| 6178 | if (!check_same_owner(p)) | 6393 | if (!check_same_owner(p)) |
| 6179 | return -EPERM; | 6394 | return -EPERM; |
| 6395 | |||
| 6396 | /* Normal users shall not reset the sched_reset_on_fork flag */ | ||
| 6397 | if (p->sched_reset_on_fork && !reset_on_fork) | ||
| 6398 | return -EPERM; | ||
| 6180 | } | 6399 | } |
| 6181 | 6400 | ||
| 6182 | if (user) { | 6401 | if (user) { |
| @@ -6220,6 +6439,8 @@ recheck: | |||
| 6220 | if (running) | 6439 | if (running) |
| 6221 | p->sched_class->put_prev_task(rq, p); | 6440 | p->sched_class->put_prev_task(rq, p); |
| 6222 | 6441 | ||
| 6442 | p->sched_reset_on_fork = reset_on_fork; | ||
| 6443 | |||
| 6223 | oldprio = p->prio; | 6444 | oldprio = p->prio; |
| 6224 | __setscheduler(rq, p, policy, param->sched_priority); | 6445 | __setscheduler(rq, p, policy, param->sched_priority); |
| 6225 | 6446 | ||
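The reset_on_fork plumbing above is driven from user space by OR-ing a flag into the policy argument: a task scheduled with SCHED_FIFO | SCHED_RESET_ON_FORK keeps its RT policy, but its children are pushed back to default scheduling on fork, and sched_getscheduler() reports the flag back (hence the masked return value a few hunks below). A hedged sketch of the call follows — the constant may be missing from older libc headers, so it is defined by hand, and the SCHED_FIFO part needs the usual privileges.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    #ifndef SCHED_RESET_ON_FORK
    #define SCHED_RESET_ON_FORK 0x40000000  /* value used by this kernel series */
    #endif

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 10 };

        /* RT for this task only; children fall back to SCHED_NORMAL. */
        if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
            perror("sched_setscheduler");

        int policy = sched_getscheduler(0);
        printf("policy %d, reset_on_fork %s\n",
               policy & ~SCHED_RESET_ON_FORK,
               (policy & SCHED_RESET_ON_FORK) ? "yes" : "no");
        return 0;
    }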
| @@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
| 6336 | if (p) { | 6557 | if (p) { |
| 6337 | retval = security_task_getscheduler(p); | 6558 | retval = security_task_getscheduler(p); |
| 6338 | if (!retval) | 6559 | if (!retval) |
| 6339 | retval = p->policy; | 6560 | retval = p->policy |
| 6561 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | ||
| 6340 | } | 6562 | } |
| 6341 | read_unlock(&tasklist_lock); | 6563 | read_unlock(&tasklist_lock); |
| 6342 | return retval; | 6564 | return retval; |
| 6343 | } | 6565 | } |
| 6344 | 6566 | ||
| 6345 | /** | 6567 | /** |
| 6346 | * sys_sched_getscheduler - get the RT priority of a thread | 6568 | * sys_sched_getparam - get the RT priority of a thread |
| 6347 | * @pid: the pid in question. | 6569 | * @pid: the pid in question. |
| 6348 | * @param: structure containing the RT priority. | 6570 | * @param: structure containing the RT priority. |
| 6349 | */ | 6571 | */ |
| @@ -6571,19 +6793,9 @@ static inline int should_resched(void) | |||
| 6571 | 6793 | ||
| 6572 | static void __cond_resched(void) | 6794 | static void __cond_resched(void) |
| 6573 | { | 6795 | { |
| 6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6796 | add_preempt_count(PREEMPT_ACTIVE); |
| 6575 | __might_sleep(__FILE__, __LINE__); | 6797 | schedule(); |
| 6576 | #endif | 6798 | sub_preempt_count(PREEMPT_ACTIVE); |
| 6577 | /* | ||
| 6578 | * The BKS might be reacquired before we have dropped | ||
| 6579 | * PREEMPT_ACTIVE, which could trigger a second | ||
| 6580 | * cond_resched() call. | ||
| 6581 | */ | ||
| 6582 | do { | ||
| 6583 | add_preempt_count(PREEMPT_ACTIVE); | ||
| 6584 | schedule(); | ||
| 6585 | sub_preempt_count(PREEMPT_ACTIVE); | ||
| 6586 | } while (need_resched()); | ||
| 6587 | } | 6799 | } |
| 6588 | 6800 | ||
| 6589 | int __sched _cond_resched(void) | 6801 | int __sched _cond_resched(void) |
| @@ -6597,14 +6809,14 @@ int __sched _cond_resched(void) | |||
| 6597 | EXPORT_SYMBOL(_cond_resched); | 6809 | EXPORT_SYMBOL(_cond_resched); |
| 6598 | 6810 | ||
| 6599 | /* | 6811 | /* |
| 6600 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 6812 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
| 6601 | * call schedule, and on return reacquire the lock. | 6813 | * call schedule, and on return reacquire the lock. |
| 6602 | * | 6814 | * |
| 6603 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 6815 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
| 6604 | * operations here to prevent schedule() from being called twice (once via | 6816 | * operations here to prevent schedule() from being called twice (once via |
| 6605 | * spin_unlock(), once by hand). | 6817 | * spin_unlock(), once by hand). |
| 6606 | */ | 6818 | */ |
| 6607 | int cond_resched_lock(spinlock_t *lock) | 6819 | int __cond_resched_lock(spinlock_t *lock) |
| 6608 | { | 6820 | { |
| 6609 | int resched = should_resched(); | 6821 | int resched = should_resched(); |
| 6610 | int ret = 0; | 6822 | int ret = 0; |
| @@ -6622,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock) | |||
| 6622 | } | 6834 | } |
| 6623 | return ret; | 6835 | return ret; |
| 6624 | } | 6836 | } |
| 6625 | EXPORT_SYMBOL(cond_resched_lock); | 6837 | EXPORT_SYMBOL(__cond_resched_lock); |
| 6626 | 6838 | ||
| 6627 | int __sched cond_resched_softirq(void) | 6839 | int __sched __cond_resched_softirq(void) |
| 6628 | { | 6840 | { |
| 6629 | BUG_ON(!in_softirq()); | 6841 | BUG_ON(!in_softirq()); |
| 6630 | 6842 | ||
| @@ -6636,7 +6848,7 @@ int __sched cond_resched_softirq(void) | |||
| 6636 | } | 6848 | } |
| 6637 | return 0; | 6849 | return 0; |
| 6638 | } | 6850 | } |
| 6639 | EXPORT_SYMBOL(cond_resched_softirq); | 6851 | EXPORT_SYMBOL(__cond_resched_softirq); |
| 6640 | 6852 | ||
| 6641 | /** | 6853 | /** |
| 6642 | * yield - yield the current processor to other threads. | 6854 | * yield - yield the current processor to other threads. |
| @@ -6660,11 +6872,13 @@ EXPORT_SYMBOL(yield); | |||
| 6660 | */ | 6872 | */ |
| 6661 | void __sched io_schedule(void) | 6873 | void __sched io_schedule(void) |
| 6662 | { | 6874 | { |
| 6663 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6875 | struct rq *rq = raw_rq(); |
| 6664 | 6876 | ||
| 6665 | delayacct_blkio_start(); | 6877 | delayacct_blkio_start(); |
| 6666 | atomic_inc(&rq->nr_iowait); | 6878 | atomic_inc(&rq->nr_iowait); |
| 6879 | current->in_iowait = 1; | ||
| 6667 | schedule(); | 6880 | schedule(); |
| 6881 | current->in_iowait = 0; | ||
| 6668 | atomic_dec(&rq->nr_iowait); | 6882 | atomic_dec(&rq->nr_iowait); |
| 6669 | delayacct_blkio_end(); | 6883 | delayacct_blkio_end(); |
| 6670 | } | 6884 | } |
| @@ -6672,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule); | |||
| 6672 | 6886 | ||
| 6673 | long __sched io_schedule_timeout(long timeout) | 6887 | long __sched io_schedule_timeout(long timeout) |
| 6674 | { | 6888 | { |
| 6675 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6889 | struct rq *rq = raw_rq(); |
| 6676 | long ret; | 6890 | long ret; |
| 6677 | 6891 | ||
| 6678 | delayacct_blkio_start(); | 6892 | delayacct_blkio_start(); |
| 6679 | atomic_inc(&rq->nr_iowait); | 6893 | atomic_inc(&rq->nr_iowait); |
| 6894 | current->in_iowait = 1; | ||
| 6680 | ret = schedule_timeout(timeout); | 6895 | ret = schedule_timeout(timeout); |
| 6896 | current->in_iowait = 0; | ||
| 6681 | atomic_dec(&rq->nr_iowait); | 6897 | atomic_dec(&rq->nr_iowait); |
| 6682 | delayacct_blkio_end(); | 6898 | delayacct_blkio_end(); |
| 6683 | return ret; | 6899 | return ret; |
| @@ -6994,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 6994 | 7210 | ||
| 6995 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7211 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { |
| 6996 | /* Need help from migration thread: drop lock and wait. */ | 7212 | /* Need help from migration thread: drop lock and wait. */ |
| 7213 | struct task_struct *mt = rq->migration_thread; | ||
| 7214 | |||
| 7215 | get_task_struct(mt); | ||
| 6997 | task_rq_unlock(rq, &flags); | 7216 | task_rq_unlock(rq, &flags); |
| 6998 | wake_up_process(rq->migration_thread); | 7217 | wake_up_process(rq->migration_thread); |
| 7218 | put_task_struct(mt); | ||
| 6999 | wait_for_completion(&req.done); | 7219 | wait_for_completion(&req.done); |
| 7000 | tlb_migrate_finish(p->mm); | 7220 | tlb_migrate_finish(p->mm); |
| 7001 | return 0; | 7221 | return 0; |
| @@ -7642,7 +7862,7 @@ static int __init migration_init(void) | |||
| 7642 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 7862 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
| 7643 | register_cpu_notifier(&migration_notifier); | 7863 | register_cpu_notifier(&migration_notifier); |
| 7644 | 7864 | ||
| 7645 | return err; | 7865 | return 0; |
| 7646 | } | 7866 | } |
| 7647 | early_initcall(migration_init); | 7867 | early_initcall(migration_init); |
| 7648 | #endif | 7868 | #endif |
| @@ -7689,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 7689 | break; | 7909 | break; |
| 7690 | } | 7910 | } |
| 7691 | 7911 | ||
| 7692 | if (!group->__cpu_power) { | 7912 | if (!group->cpu_power) { |
| 7693 | printk(KERN_CONT "\n"); | 7913 | printk(KERN_CONT "\n"); |
| 7694 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 7914 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 7695 | "set\n"); | 7915 | "set\n"); |
| @@ -7713,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 7713 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 7933 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
| 7714 | 7934 | ||
| 7715 | printk(KERN_CONT " %s", str); | 7935 | printk(KERN_CONT " %s", str); |
| 7716 | if (group->__cpu_power != SCHED_LOAD_SCALE) { | 7936 | if (group->cpu_power != SCHED_LOAD_SCALE) { |
| 7717 | printk(KERN_CONT " (__cpu_power = %d)", | 7937 | printk(KERN_CONT " (cpu_power = %d)", |
| 7718 | group->__cpu_power); | 7938 | group->cpu_power); |
| 7719 | } | 7939 | } |
| 7720 | 7940 | ||
| 7721 | group = group->next; | 7941 | group = group->next; |
| @@ -7858,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
| 7858 | rq->rd = rd; | 8078 | rq->rd = rd; |
| 7859 | 8079 | ||
| 7860 | cpumask_set_cpu(rq->cpu, rd->span); | 8080 | cpumask_set_cpu(rq->cpu, rd->span); |
| 7861 | if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) | 8081 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
| 7862 | set_rq_online(rq); | 8082 | set_rq_online(rq); |
| 7863 | 8083 | ||
| 7864 | spin_unlock_irqrestore(&rq->lock, flags); | 8084 | spin_unlock_irqrestore(&rq->lock, flags); |
| @@ -8000,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
| 8000 | continue; | 8220 | continue; |
| 8001 | 8221 | ||
| 8002 | cpumask_clear(sched_group_cpus(sg)); | 8222 | cpumask_clear(sched_group_cpus(sg)); |
| 8003 | sg->__cpu_power = 0; | 8223 | sg->cpu_power = 0; |
| 8004 | 8224 | ||
| 8005 | for_each_cpu(j, span) { | 8225 | for_each_cpu(j, span) { |
| 8006 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | 8226 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
| @@ -8108,6 +8328,39 @@ struct static_sched_domain { | |||
| 8108 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 8328 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); |
| 8109 | }; | 8329 | }; |
| 8110 | 8330 | ||
| 8331 | struct s_data { | ||
| 8332 | #ifdef CONFIG_NUMA | ||
| 8333 | int sd_allnodes; | ||
| 8334 | cpumask_var_t domainspan; | ||
| 8335 | cpumask_var_t covered; | ||
| 8336 | cpumask_var_t notcovered; | ||
| 8337 | #endif | ||
| 8338 | cpumask_var_t nodemask; | ||
| 8339 | cpumask_var_t this_sibling_map; | ||
| 8340 | cpumask_var_t this_core_map; | ||
| 8341 | cpumask_var_t send_covered; | ||
| 8342 | cpumask_var_t tmpmask; | ||
| 8343 | struct sched_group **sched_group_nodes; | ||
| 8344 | struct root_domain *rd; | ||
| 8345 | }; | ||
| 8346 | |||
| 8347 | enum s_alloc { | ||
| 8348 | sa_sched_groups = 0, | ||
| 8349 | sa_rootdomain, | ||
| 8350 | sa_tmpmask, | ||
| 8351 | sa_send_covered, | ||
| 8352 | sa_this_core_map, | ||
| 8353 | sa_this_sibling_map, | ||
| 8354 | sa_nodemask, | ||
| 8355 | sa_sched_group_nodes, | ||
| 8356 | #ifdef CONFIG_NUMA | ||
| 8357 | sa_notcovered, | ||
| 8358 | sa_covered, | ||
| 8359 | sa_domainspan, | ||
| 8360 | #endif | ||
| 8361 | sa_none, | ||
| 8362 | }; | ||
| 8363 | |||
| 8111 | /* | 8364 | /* |
| 8112 | * SMT sched-domains: | 8365 | * SMT sched-domains: |
| 8113 | */ | 8366 | */ |
| @@ -8225,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
| 8225 | continue; | 8478 | continue; |
| 8226 | } | 8479 | } |
| 8227 | 8480 | ||
| 8228 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 8481 | sg->cpu_power += sd->groups->cpu_power; |
| 8229 | } | 8482 | } |
| 8230 | sg = sg->next; | 8483 | sg = sg->next; |
| 8231 | } while (sg != group_head); | 8484 | } while (sg != group_head); |
| 8232 | } | 8485 | } |
| 8486 | |||
| 8487 | static int build_numa_sched_groups(struct s_data *d, | ||
| 8488 | const struct cpumask *cpu_map, int num) | ||
| 8489 | { | ||
| 8490 | struct sched_domain *sd; | ||
| 8491 | struct sched_group *sg, *prev; | ||
| 8492 | int n, j; | ||
| 8493 | |||
| 8494 | cpumask_clear(d->covered); | ||
| 8495 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
| 8496 | if (cpumask_empty(d->nodemask)) { | ||
| 8497 | d->sched_group_nodes[num] = NULL; | ||
| 8498 | goto out; | ||
| 8499 | } | ||
| 8500 | |||
| 8501 | sched_domain_node_span(num, d->domainspan); | ||
| 8502 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
| 8503 | |||
| 8504 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
| 8505 | GFP_KERNEL, num); | ||
| 8506 | if (!sg) { | ||
| 8507 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
| 8508 | num); | ||
| 8509 | return -ENOMEM; | ||
| 8510 | } | ||
| 8511 | d->sched_group_nodes[num] = sg; | ||
| 8512 | |||
| 8513 | for_each_cpu(j, d->nodemask) { | ||
| 8514 | sd = &per_cpu(node_domains, j).sd; | ||
| 8515 | sd->groups = sg; | ||
| 8516 | } | ||
| 8517 | |||
| 8518 | sg->cpu_power = 0; | ||
| 8519 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
| 8520 | sg->next = sg; | ||
| 8521 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
| 8522 | |||
| 8523 | prev = sg; | ||
| 8524 | for (j = 0; j < nr_node_ids; j++) { | ||
| 8525 | n = (num + j) % nr_node_ids; | ||
| 8526 | cpumask_complement(d->notcovered, d->covered); | ||
| 8527 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
| 8528 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
| 8529 | if (cpumask_empty(d->tmpmask)) | ||
| 8530 | break; | ||
| 8531 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
| 8532 | if (cpumask_empty(d->tmpmask)) | ||
| 8533 | continue; | ||
| 8534 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
| 8535 | GFP_KERNEL, num); | ||
| 8536 | if (!sg) { | ||
| 8537 | printk(KERN_WARNING | ||
| 8538 | "Can not alloc domain group for node %d\n", j); | ||
| 8539 | return -ENOMEM; | ||
| 8540 | } | ||
| 8541 | sg->cpu_power = 0; | ||
| 8542 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
| 8543 | sg->next = prev->next; | ||
| 8544 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
| 8545 | prev->next = sg; | ||
| 8546 | prev = sg; | ||
| 8547 | } | ||
| 8548 | out: | ||
| 8549 | return 0; | ||
| 8550 | } | ||
| 8233 | #endif /* CONFIG_NUMA */ | 8551 | #endif /* CONFIG_NUMA */ |
| 8234 | 8552 | ||
| 8235 | #ifdef CONFIG_NUMA | 8553 | #ifdef CONFIG_NUMA |
| @@ -8283,15 +8601,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
| 8283 | * there are asymmetries in the topology. If there are asymmetries, group | 8601 | * there are asymmetries in the topology. If there are asymmetries, group |
| 8284 | * having more cpu_power will pickup more load compared to the group having | 8602 | * having more cpu_power will pickup more load compared to the group having |
| 8285 | * less cpu_power. | 8603 | * less cpu_power. |
| 8286 | * | ||
| 8287 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | ||
| 8288 | * the maximum number of tasks a group can handle in the presence of other idle | ||
| 8289 | * or lightly loaded groups in the same sched domain. | ||
| 8290 | */ | 8604 | */ |
| 8291 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 8605 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
| 8292 | { | 8606 | { |
| 8293 | struct sched_domain *child; | 8607 | struct sched_domain *child; |
| 8294 | struct sched_group *group; | 8608 | struct sched_group *group; |
| 8609 | long power; | ||
| 8610 | int weight; | ||
| 8295 | 8611 | ||
| 8296 | WARN_ON(!sd || !sd->groups); | 8612 | WARN_ON(!sd || !sd->groups); |
| 8297 | 8613 | ||
| @@ -8300,28 +8616,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 8300 | 8616 | ||
| 8301 | child = sd->child; | 8617 | child = sd->child; |
| 8302 | 8618 | ||
| 8303 | sd->groups->__cpu_power = 0; | 8619 | sd->groups->cpu_power = 0; |
| 8304 | 8620 | ||
| 8305 | /* | 8621 | if (!child) { |
| 8306 | * For perf policy, if the groups in child domain share resources | 8622 | power = SCHED_LOAD_SCALE; |
| 8307 | * (for example cores sharing some portions of the cache hierarchy | 8623 | weight = cpumask_weight(sched_domain_span(sd)); |
| 8308 | * or SMT), then set this domain groups cpu_power such that each group | 8624 | /* |
| 8309 | * can handle only one task, when there are other idle groups in the | 8625 | * SMT siblings share the power of a single core. |
| 8310 | * same sched domain. | 8626 | * Usually multiple threads get a better yield out of |
| 8311 | */ | 8627 | * that one core than a single thread would have, |
| 8312 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 8628 | * reflect that in sd->smt_gain. |
| 8313 | (child->flags & | 8629 | */ |
| 8314 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 8630 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
| 8315 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 8631 | power *= sd->smt_gain; |
| 8632 | power /= weight; | ||
| 8633 | power >>= SCHED_LOAD_SHIFT; | ||
| 8634 | } | ||
| 8635 | sd->groups->cpu_power += power; | ||
| 8316 | return; | 8636 | return; |
| 8317 | } | 8637 | } |
| 8318 | 8638 | ||
| 8319 | /* | 8639 | /* |
| 8320 | * add cpu_power of each child group to this groups cpu_power | 8640 | * Add cpu_power of each child group to this groups cpu_power. |
| 8321 | */ | 8641 | */ |
| 8322 | group = child->groups; | 8642 | group = child->groups; |
| 8323 | do { | 8643 | do { |
| 8324 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 8644 | sd->groups->cpu_power += group->cpu_power; |
| 8325 | group = group->next; | 8645 | group = group->next; |
| 8326 | } while (group != child->groups); | 8646 | } while (group != child->groups); |
| 8327 | } | 8647 | } |
| @@ -8395,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
| 8395 | } | 8715 | } |
| 8396 | } | 8716 | } |
| 8397 | 8717 | ||
| 8398 | /* | 8718 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
| 8399 | * Build sched domains for a given set of cpus and attach the sched domains | 8719 | const struct cpumask *cpu_map) |
| 8400 | * to the individual cpus | 8720 | { |
| 8401 | */ | 8721 | switch (what) { |
| 8402 | static int __build_sched_domains(const struct cpumask *cpu_map, | 8722 | case sa_sched_groups: |
| 8403 | struct sched_domain_attr *attr) | 8723 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ |
| 8404 | { | 8724 | d->sched_group_nodes = NULL; |
| 8405 | int i, err = -ENOMEM; | 8725 | case sa_rootdomain: |
| 8406 | struct root_domain *rd; | 8726 | free_rootdomain(d->rd); /* fall through */ |
| 8407 | cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, | 8727 | case sa_tmpmask: |
| 8408 | tmpmask; | 8728 | free_cpumask_var(d->tmpmask); /* fall through */ |
| 8729 | case sa_send_covered: | ||
| 8730 | free_cpumask_var(d->send_covered); /* fall through */ | ||
| 8731 | case sa_this_core_map: | ||
| 8732 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
| 8733 | case sa_this_sibling_map: | ||
| 8734 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
| 8735 | case sa_nodemask: | ||
| 8736 | free_cpumask_var(d->nodemask); /* fall through */ | ||
| 8737 | case sa_sched_group_nodes: | ||
| 8409 | #ifdef CONFIG_NUMA | 8738 | #ifdef CONFIG_NUMA |
| 8410 | cpumask_var_t domainspan, covered, notcovered; | 8739 | kfree(d->sched_group_nodes); /* fall through */ |
| 8411 | struct sched_group **sched_group_nodes = NULL; | 8740 | case sa_notcovered: |
| 8412 | int sd_allnodes = 0; | 8741 | free_cpumask_var(d->notcovered); /* fall through */ |
| 8413 | 8742 | case sa_covered: | |
| 8414 | if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) | 8743 | free_cpumask_var(d->covered); /* fall through */ |
| 8415 | goto out; | 8744 | case sa_domainspan: |
| 8416 | if (!alloc_cpumask_var(&covered, GFP_KERNEL)) | 8745 | free_cpumask_var(d->domainspan); /* fall through */ |
| 8417 | goto free_domainspan; | 8746 | #endif |
| 8418 | if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) | 8747 | case sa_none: |
| 8419 | goto free_covered; | 8748 | break; |
| 8420 | #endif | 8749 | } |
| 8421 | 8750 | } | |
| 8422 | if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) | ||
| 8423 | goto free_notcovered; | ||
| 8424 | if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) | ||
| 8425 | goto free_nodemask; | ||
| 8426 | if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) | ||
| 8427 | goto free_this_sibling_map; | ||
| 8428 | if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) | ||
| 8429 | goto free_this_core_map; | ||
| 8430 | if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
| 8431 | goto free_send_covered; | ||
| 8432 | 8751 | ||
| 8752 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | ||
| 8753 | const struct cpumask *cpu_map) | ||
| 8754 | { | ||
| 8433 | #ifdef CONFIG_NUMA | 8755 | #ifdef CONFIG_NUMA |
| 8434 | /* | 8756 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) |
| 8435 | * Allocate the per-node list of sched groups | 8757 | return sa_none; |
| 8436 | */ | 8758 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) |
| 8437 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), | 8759 | return sa_domainspan; |
| 8438 | GFP_KERNEL); | 8760 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) |
| 8439 | if (!sched_group_nodes) { | 8761 | return sa_covered; |
| 8762 | /* Allocate the per-node list of sched groups */ | ||
| 8763 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
| 8764 | sizeof(struct sched_group *), GFP_KERNEL); | ||
| 8765 | if (!d->sched_group_nodes) { | ||
| 8440 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 8766 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| 8441 | goto free_tmpmask; | 8767 | return sa_notcovered; |
| 8442 | } | 8768 | } |
| 8443 | #endif | 8769 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; |
| 8444 | 8770 | #endif | |
| 8445 | rd = alloc_rootdomain(); | 8771 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
| 8446 | if (!rd) { | 8772 | return sa_sched_group_nodes; |
| 8773 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
| 8774 | return sa_nodemask; | ||
| 8775 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
| 8776 | return sa_this_sibling_map; | ||
| 8777 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
| 8778 | return sa_this_core_map; | ||
| 8779 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
| 8780 | return sa_send_covered; | ||
| 8781 | d->rd = alloc_rootdomain(); | ||
| 8782 | if (!d->rd) { | ||
| 8447 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 8783 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
| 8448 | goto free_sched_groups; | 8784 | return sa_tmpmask; |
| 8449 | } | 8785 | } |
| 8786 | return sa_rootdomain; | ||
| 8787 | } | ||
| 8450 | 8788 | ||
| 8789 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | ||
| 8790 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | ||
| 8791 | { | ||
| 8792 | struct sched_domain *sd = NULL; | ||
| 8451 | #ifdef CONFIG_NUMA | 8793 | #ifdef CONFIG_NUMA |
| 8452 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; | 8794 | struct sched_domain *parent; |
| 8453 | #endif | ||
| 8454 | |||
| 8455 | /* | ||
| 8456 | * Set up domains for cpus specified by the cpu_map. | ||
| 8457 | */ | ||
| 8458 | for_each_cpu(i, cpu_map) { | ||
| 8459 | struct sched_domain *sd = NULL, *p; | ||
| 8460 | 8795 | ||
| 8461 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); | 8796 | d->sd_allnodes = 0; |
| 8462 | 8797 | if (cpumask_weight(cpu_map) > | |
| 8463 | #ifdef CONFIG_NUMA | 8798 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { |
| 8464 | if (cpumask_weight(cpu_map) > | 8799 | sd = &per_cpu(allnodes_domains, i).sd; |
| 8465 | SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { | 8800 | SD_INIT(sd, ALLNODES); |
| 8466 | sd = &per_cpu(allnodes_domains, i).sd; | ||
| 8467 | SD_INIT(sd, ALLNODES); | ||
| 8468 | set_domain_attribute(sd, attr); | ||
| 8469 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
| 8470 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 8471 | p = sd; | ||
| 8472 | sd_allnodes = 1; | ||
| 8473 | } else | ||
| 8474 | p = NULL; | ||
| 8475 | |||
| 8476 | sd = &per_cpu(node_domains, i).sd; | ||
| 8477 | SD_INIT(sd, NODE); | ||
| 8478 | set_domain_attribute(sd, attr); | 8801 | set_domain_attribute(sd, attr); |
| 8479 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | 8802 | cpumask_copy(sched_domain_span(sd), cpu_map); |
| 8480 | sd->parent = p; | 8803 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); |
| 8481 | if (p) | 8804 | d->sd_allnodes = 1; |
| 8482 | p->child = sd; | 8805 | } |
| 8483 | cpumask_and(sched_domain_span(sd), | 8806 | parent = sd; |
| 8484 | sched_domain_span(sd), cpu_map); | 8807 | |
| 8808 | sd = &per_cpu(node_domains, i).sd; | ||
| 8809 | SD_INIT(sd, NODE); | ||
| 8810 | set_domain_attribute(sd, attr); | ||
| 8811 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
| 8812 | sd->parent = parent; | ||
| 8813 | if (parent) | ||
| 8814 | parent->child = sd; | ||
| 8815 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
| 8485 | #endif | 8816 | #endif |
| 8817 | return sd; | ||
| 8818 | } | ||
| 8486 | 8819 | ||
| 8487 | p = sd; | 8820 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, |
| 8488 | sd = &per_cpu(phys_domains, i).sd; | 8821 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
| 8489 | SD_INIT(sd, CPU); | 8822 | struct sched_domain *parent, int i) |
| 8490 | set_domain_attribute(sd, attr); | 8823 | { |
| 8491 | cpumask_copy(sched_domain_span(sd), nodemask); | 8824 | struct sched_domain *sd; |
| 8492 | sd->parent = p; | 8825 | sd = &per_cpu(phys_domains, i).sd; |
| 8493 | if (p) | 8826 | SD_INIT(sd, CPU); |
| 8494 | p->child = sd; | 8827 | set_domain_attribute(sd, attr); |
| 8495 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); | 8828 | cpumask_copy(sched_domain_span(sd), d->nodemask); |
| 8829 | sd->parent = parent; | ||
| 8830 | if (parent) | ||
| 8831 | parent->child = sd; | ||
| 8832 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 8833 | return sd; | ||
| 8834 | } | ||
| 8496 | 8835 | ||
| 8836 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | ||
| 8837 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
| 8838 | struct sched_domain *parent, int i) | ||
| 8839 | { | ||
| 8840 | struct sched_domain *sd = parent; | ||
| 8497 | #ifdef CONFIG_SCHED_MC | 8841 | #ifdef CONFIG_SCHED_MC |
| 8498 | p = sd; | 8842 | sd = &per_cpu(core_domains, i).sd; |
| 8499 | sd = &per_cpu(core_domains, i).sd; | 8843 | SD_INIT(sd, MC); |
| 8500 | SD_INIT(sd, MC); | 8844 | set_domain_attribute(sd, attr); |
| 8501 | set_domain_attribute(sd, attr); | 8845 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); |
| 8502 | cpumask_and(sched_domain_span(sd), cpu_map, | 8846 | sd->parent = parent; |
| 8503 | cpu_coregroup_mask(i)); | 8847 | parent->child = sd; |
| 8504 | sd->parent = p; | 8848 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); |
| 8505 | p->child = sd; | ||
| 8506 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 8507 | #endif | 8849 | #endif |
| 8850 | return sd; | ||
| 8851 | } | ||
| 8508 | 8852 | ||
| 8853 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
| 8854 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
| 8855 | struct sched_domain *parent, int i) | ||
| 8856 | { | ||
| 8857 | struct sched_domain *sd = parent; | ||
| 8509 | #ifdef CONFIG_SCHED_SMT | 8858 | #ifdef CONFIG_SCHED_SMT |
| 8510 | p = sd; | 8859 | sd = &per_cpu(cpu_domains, i).sd; |
| 8511 | sd = &per_cpu(cpu_domains, i).sd; | 8860 | SD_INIT(sd, SIBLING); |
| 8512 | SD_INIT(sd, SIBLING); | 8861 | set_domain_attribute(sd, attr); |
| 8513 | set_domain_attribute(sd, attr); | 8862 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); |
| 8514 | cpumask_and(sched_domain_span(sd), | 8863 | sd->parent = parent; |
| 8515 | topology_thread_cpumask(i), cpu_map); | 8864 | parent->child = sd; |
| 8516 | sd->parent = p; | 8865 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); |
| 8517 | p->child = sd; | ||
| 8518 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 8519 | #endif | 8866 | #endif |
| 8520 | } | 8867 | return sd; |
| 8868 | } | ||
| 8521 | 8869 | ||
| 8870 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | ||
| 8871 | const struct cpumask *cpu_map, int cpu) | ||
| 8872 | { | ||
| 8873 | switch (l) { | ||
| 8522 | #ifdef CONFIG_SCHED_SMT | 8874 | #ifdef CONFIG_SCHED_SMT |
| 8523 | /* Set up CPU (sibling) groups */ | 8875 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ |
| 8524 | for_each_cpu(i, cpu_map) { | 8876 | cpumask_and(d->this_sibling_map, cpu_map, |
| 8525 | cpumask_and(this_sibling_map, | 8877 | topology_thread_cpumask(cpu)); |
| 8526 | topology_thread_cpumask(i), cpu_map); | 8878 | if (cpu == cpumask_first(d->this_sibling_map)) |
| 8527 | if (i != cpumask_first(this_sibling_map)) | 8879 | init_sched_build_groups(d->this_sibling_map, cpu_map, |
| 8528 | continue; | 8880 | &cpu_to_cpu_group, |
| 8529 | 8881 | d->send_covered, d->tmpmask); | |
| 8530 | init_sched_build_groups(this_sibling_map, cpu_map, | 8882 | break; |
| 8531 | &cpu_to_cpu_group, | ||
| 8532 | send_covered, tmpmask); | ||
| 8533 | } | ||
| 8534 | #endif | 8883 | #endif |
| 8535 | |||
| 8536 | #ifdef CONFIG_SCHED_MC | 8884 | #ifdef CONFIG_SCHED_MC |
| 8537 | /* Set up multi-core groups */ | 8885 | case SD_LV_MC: /* set up multi-core groups */ |
| 8538 | for_each_cpu(i, cpu_map) { | 8886 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); |
| 8539 | cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); | 8887 | if (cpu == cpumask_first(d->this_core_map)) |
| 8540 | if (i != cpumask_first(this_core_map)) | 8888 | init_sched_build_groups(d->this_core_map, cpu_map, |
| 8541 | continue; | 8889 | &cpu_to_core_group, |
| 8542 | 8890 | d->send_covered, d->tmpmask); | |
| 8543 | init_sched_build_groups(this_core_map, cpu_map, | 8891 | break; |
| 8544 | &cpu_to_core_group, | ||
| 8545 | send_covered, tmpmask); | ||
| 8546 | } | ||
| 8547 | #endif | 8892 | #endif |
| 8548 | 8893 | case SD_LV_CPU: /* set up physical groups */ | |
| 8549 | /* Set up physical groups */ | 8894 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
| 8550 | for (i = 0; i < nr_node_ids; i++) { | 8895 | if (!cpumask_empty(d->nodemask)) |
| 8551 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8896 | init_sched_build_groups(d->nodemask, cpu_map, |
| 8552 | if (cpumask_empty(nodemask)) | 8897 | &cpu_to_phys_group, |
| 8553 | continue; | 8898 | d->send_covered, d->tmpmask); |
| 8554 | 8899 | break; | |
| 8555 | init_sched_build_groups(nodemask, cpu_map, | ||
| 8556 | &cpu_to_phys_group, | ||
| 8557 | send_covered, tmpmask); | ||
| 8558 | } | ||
| 8559 | |||
| 8560 | #ifdef CONFIG_NUMA | 8900 | #ifdef CONFIG_NUMA |
| 8561 | /* Set up node groups */ | 8901 | case SD_LV_ALLNODES: |
| 8562 | if (sd_allnodes) { | 8902 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
| 8563 | init_sched_build_groups(cpu_map, cpu_map, | 8903 | d->send_covered, d->tmpmask); |
| 8564 | &cpu_to_allnodes_group, | 8904 | break; |
| 8565 | send_covered, tmpmask); | 8905 | #endif |
| 8906 | default: | ||
| 8907 | break; | ||
| 8566 | } | 8908 | } |
| 8909 | } | ||
| 8567 | 8910 | ||
| 8568 | for (i = 0; i < nr_node_ids; i++) { | 8911 | /* |
| 8569 | /* Set up node groups */ | 8912 | * Build sched domains for a given set of cpus and attach the sched domains |
| 8570 | struct sched_group *sg, *prev; | 8913 | * to the individual cpus |
| 8571 | int j; | 8914 | */ |
| 8572 | 8915 | static int __build_sched_domains(const struct cpumask *cpu_map, | |
| 8573 | cpumask_clear(covered); | 8916 | struct sched_domain_attr *attr) |
| 8574 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8917 | { |
| 8575 | if (cpumask_empty(nodemask)) { | 8918 | enum s_alloc alloc_state = sa_none; |
| 8576 | sched_group_nodes[i] = NULL; | 8919 | struct s_data d; |
| 8577 | continue; | 8920 | struct sched_domain *sd; |
| 8578 | } | 8921 | int i; |
| 8922 | #ifdef CONFIG_NUMA | ||
| 8923 | d.sd_allnodes = 0; | ||
| 8924 | #endif | ||
| 8579 | 8925 | ||
| 8580 | sched_domain_node_span(i, domainspan); | 8926 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
| 8581 | cpumask_and(domainspan, domainspan, cpu_map); | 8927 | if (alloc_state != sa_rootdomain) |
| 8928 | goto error; | ||
| 8929 | alloc_state = sa_sched_groups; | ||
| 8582 | 8930 | ||
| 8583 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 8931 | /* |
| 8584 | GFP_KERNEL, i); | 8932 | * Set up domains for cpus specified by the cpu_map. |
| 8585 | if (!sg) { | 8933 | */ |
| 8586 | printk(KERN_WARNING "Can not alloc domain group for " | 8934 | for_each_cpu(i, cpu_map) { |
| 8587 | "node %d\n", i); | 8935 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), |
| 8588 | goto error; | 8936 | cpu_map); |
| 8589 | } | ||
| 8590 | sched_group_nodes[i] = sg; | ||
| 8591 | for_each_cpu(j, nodemask) { | ||
| 8592 | struct sched_domain *sd; | ||
| 8593 | 8937 | ||
| 8594 | sd = &per_cpu(node_domains, j).sd; | 8938 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
| 8595 | sd->groups = sg; | 8939 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
| 8596 | } | 8940 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
| 8597 | sg->__cpu_power = 0; | 8941 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
| 8598 | cpumask_copy(sched_group_cpus(sg), nodemask); | 8942 | } |
| 8599 | sg->next = sg; | ||
| 8600 | cpumask_or(covered, covered, nodemask); | ||
| 8601 | prev = sg; | ||
| 8602 | 8943 | ||
| 8603 | for (j = 0; j < nr_node_ids; j++) { | 8944 | for_each_cpu(i, cpu_map) { |
| 8604 | int n = (i + j) % nr_node_ids; | 8945 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
| 8946 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
| 8947 | } | ||
| 8605 | 8948 | ||
| 8606 | cpumask_complement(notcovered, covered); | 8949 | /* Set up physical groups */ |
| 8607 | cpumask_and(tmpmask, notcovered, cpu_map); | 8950 | for (i = 0; i < nr_node_ids; i++) |
| 8608 | cpumask_and(tmpmask, tmpmask, domainspan); | 8951 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
| 8609 | if (cpumask_empty(tmpmask)) | ||
| 8610 | break; | ||
| 8611 | 8952 | ||
| 8612 | cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); | 8953 | #ifdef CONFIG_NUMA |
| 8613 | if (cpumask_empty(tmpmask)) | 8954 | /* Set up node groups */ |
| 8614 | continue; | 8955 | if (d.sd_allnodes) |
| 8956 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
| 8615 | 8957 | ||
| 8616 | sg = kmalloc_node(sizeof(struct sched_group) + | 8958 | for (i = 0; i < nr_node_ids; i++) |
| 8617 | cpumask_size(), | 8959 | if (build_numa_sched_groups(&d, cpu_map, i)) |
| 8618 | GFP_KERNEL, i); | 8960 | goto error; |
| 8619 | if (!sg) { | ||
| 8620 | printk(KERN_WARNING | ||
| 8621 | "Can not alloc domain group for node %d\n", j); | ||
| 8622 | goto error; | ||
| 8623 | } | ||
| 8624 | sg->__cpu_power = 0; | ||
| 8625 | cpumask_copy(sched_group_cpus(sg), tmpmask); | ||
| 8626 | sg->next = prev->next; | ||
| 8627 | cpumask_or(covered, covered, tmpmask); | ||
| 8628 | prev->next = sg; | ||
| 8629 | prev = sg; | ||
| 8630 | } | ||
| 8631 | } | ||
| 8632 | #endif | 8961 | #endif |
| 8633 | 8962 | ||
| 8634 | /* Calculate CPU power for physical packages and nodes */ | 8963 | /* Calculate CPU power for physical packages and nodes */ |
| 8635 | #ifdef CONFIG_SCHED_SMT | 8964 | #ifdef CONFIG_SCHED_SMT |
| 8636 | for_each_cpu(i, cpu_map) { | 8965 | for_each_cpu(i, cpu_map) { |
| 8637 | struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; | 8966 | sd = &per_cpu(cpu_domains, i).sd; |
| 8638 | |||
| 8639 | init_sched_groups_power(i, sd); | 8967 | init_sched_groups_power(i, sd); |
| 8640 | } | 8968 | } |
| 8641 | #endif | 8969 | #endif |
| 8642 | #ifdef CONFIG_SCHED_MC | 8970 | #ifdef CONFIG_SCHED_MC |
| 8643 | for_each_cpu(i, cpu_map) { | 8971 | for_each_cpu(i, cpu_map) { |
| 8644 | struct sched_domain *sd = &per_cpu(core_domains, i).sd; | 8972 | sd = &per_cpu(core_domains, i).sd; |
| 8645 | |||
| 8646 | init_sched_groups_power(i, sd); | 8973 | init_sched_groups_power(i, sd); |
| 8647 | } | 8974 | } |
| 8648 | #endif | 8975 | #endif |
| 8649 | 8976 | ||
| 8650 | for_each_cpu(i, cpu_map) { | 8977 | for_each_cpu(i, cpu_map) { |
| 8651 | struct sched_domain *sd = &per_cpu(phys_domains, i).sd; | 8978 | sd = &per_cpu(phys_domains, i).sd; |
| 8652 | |||
| 8653 | init_sched_groups_power(i, sd); | 8979 | init_sched_groups_power(i, sd); |
| 8654 | } | 8980 | } |
| 8655 | 8981 | ||
| 8656 | #ifdef CONFIG_NUMA | 8982 | #ifdef CONFIG_NUMA |
| 8657 | for (i = 0; i < nr_node_ids; i++) | 8983 | for (i = 0; i < nr_node_ids; i++) |
| 8658 | init_numa_sched_groups_power(sched_group_nodes[i]); | 8984 | init_numa_sched_groups_power(d.sched_group_nodes[i]); |
| 8659 | 8985 | ||
| 8660 | if (sd_allnodes) { | 8986 | if (d.sd_allnodes) { |
| 8661 | struct sched_group *sg; | 8987 | struct sched_group *sg; |
| 8662 | 8988 | ||
| 8663 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 8989 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, |
| 8664 | tmpmask); | 8990 | d.tmpmask); |
| 8665 | init_numa_sched_groups_power(sg); | 8991 | init_numa_sched_groups_power(sg); |
| 8666 | } | 8992 | } |
| 8667 | #endif | 8993 | #endif |
| 8668 | 8994 | ||
| 8669 | /* Attach the domains */ | 8995 | /* Attach the domains */ |
| 8670 | for_each_cpu(i, cpu_map) { | 8996 | for_each_cpu(i, cpu_map) { |
| 8671 | struct sched_domain *sd; | ||
| 8672 | #ifdef CONFIG_SCHED_SMT | 8997 | #ifdef CONFIG_SCHED_SMT |
| 8673 | sd = &per_cpu(cpu_domains, i).sd; | 8998 | sd = &per_cpu(cpu_domains, i).sd; |
| 8674 | #elif defined(CONFIG_SCHED_MC) | 8999 | #elif defined(CONFIG_SCHED_MC) |
| @@ -8676,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
| 8676 | #else | 9001 | #else |
| 8677 | sd = &per_cpu(phys_domains, i).sd; | 9002 | sd = &per_cpu(phys_domains, i).sd; |
| 8678 | #endif | 9003 | #endif |
| 8679 | cpu_attach_domain(sd, rd, i); | 9004 | cpu_attach_domain(sd, d.rd, i); |
| 8680 | } | 9005 | } |
| 8681 | 9006 | ||
| 8682 | err = 0; | 9007 | d.sched_group_nodes = NULL; /* don't free this we still need it */ |
| 8683 | 9008 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | |
| 8684 | free_tmpmask: | 9009 | return 0; |
| 8685 | free_cpumask_var(tmpmask); | ||
| 8686 | free_send_covered: | ||
| 8687 | free_cpumask_var(send_covered); | ||
| 8688 | free_this_core_map: | ||
| 8689 | free_cpumask_var(this_core_map); | ||
| 8690 | free_this_sibling_map: | ||
| 8691 | free_cpumask_var(this_sibling_map); | ||
| 8692 | free_nodemask: | ||
| 8693 | free_cpumask_var(nodemask); | ||
| 8694 | free_notcovered: | ||
| 8695 | #ifdef CONFIG_NUMA | ||
| 8696 | free_cpumask_var(notcovered); | ||
| 8697 | free_covered: | ||
| 8698 | free_cpumask_var(covered); | ||
| 8699 | free_domainspan: | ||
| 8700 | free_cpumask_var(domainspan); | ||
| 8701 | out: | ||
| 8702 | #endif | ||
| 8703 | return err; | ||
| 8704 | |||
| 8705 | free_sched_groups: | ||
| 8706 | #ifdef CONFIG_NUMA | ||
| 8707 | kfree(sched_group_nodes); | ||
| 8708 | #endif | ||
| 8709 | goto free_tmpmask; | ||
| 8710 | 9010 | ||
| 8711 | #ifdef CONFIG_NUMA | ||
| 8712 | error: | 9011 | error: |
| 8713 | free_sched_groups(cpu_map, tmpmask); | 9012 | __free_domain_allocs(&d, alloc_state, cpu_map); |
| 8714 | free_rootdomain(rd); | 9013 | return -ENOMEM; |
| 8715 | goto free_tmpmask; | ||
| 8716 | #endif | ||
| 8717 | } | 9014 | } |
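The domain-building rewrite above trades the old ladder of goto labels for a small state machine: __visit_domain_allocation_hell() reports how far allocation got as an enum s_alloc value, and __free_domain_allocs() unwinds from that point by falling through a switch, so the error path in __build_sched_domains() collapses to a single call. The same idiom in isolation, with invented names and plain malloc standing in for the cpumask and root-domain allocators:

    #include <stdio.h>
    #include <stdlib.h>

    enum s_alloc { sa_all = 0, sa_second, sa_first, sa_none };

    struct s_data { void *first, *second, *third; };

    static void free_allocs(struct s_data *d, enum s_alloc what)
    {
        switch (what) {
        case sa_all:
            free(d->third);     /* fall through */
        case sa_second:
            free(d->second);    /* fall through */
        case sa_first:
            free(d->first);     /* fall through */
        case sa_none:
            break;
        }
    }

    static enum s_alloc alloc_all(struct s_data *d)
    {
        if (!(d->first = malloc(16)))
            return sa_none;     /* nothing to undo */
        if (!(d->second = malloc(16)))
            return sa_first;    /* undo only the first allocation */
        if (!(d->third = malloc(16)))
            return sa_second;
        return sa_all;
    }

    int main(void)
    {
        struct s_data d = { 0 };
        enum s_alloc state = alloc_all(&d);

        if (state != sa_all) {          /* partial failure: unwind what exists */
            free_allocs(&d, state);
            return 1;
        }
        /* ... use the fully built structures ... */
        free_allocs(&d, sa_all);
        return 0;
    }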
| 8718 | 9015 | ||
| 8719 | static int build_sched_domains(const struct cpumask *cpu_map) | 9016 | static int build_sched_domains(const struct cpumask *cpu_map) |
| @@ -9321,11 +9618,11 @@ void __init sched_init(void) | |||
| 9321 | * system cpu resource, based on the weight assigned to root | 9618 | * system cpu resource, based on the weight assigned to root |
| 9322 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | 9619 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished |
| 9323 | * by letting tasks of init_task_group sit in a separate cfs_rq | 9620 | * by letting tasks of init_task_group sit in a separate cfs_rq |
| 9324 | * (init_cfs_rq) and having one entity represent this group of | 9621 | * (init_tg_cfs_rq) and having one entity represent this group of |
| 9325 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | 9622 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). |
| 9326 | */ | 9623 | */ |
| 9327 | init_tg_cfs_entry(&init_task_group, | 9624 | init_tg_cfs_entry(&init_task_group, |
| 9328 | &per_cpu(init_cfs_rq, i), | 9625 | &per_cpu(init_tg_cfs_rq, i), |
| 9329 | &per_cpu(init_sched_entity, i), i, 1, | 9626 | &per_cpu(init_sched_entity, i), i, 1, |
| 9330 | root_task_group.se[i]); | 9627 | root_task_group.se[i]); |
| 9331 | 9628 | ||
| @@ -9351,6 +9648,7 @@ void __init sched_init(void) | |||
| 9351 | #ifdef CONFIG_SMP | 9648 | #ifdef CONFIG_SMP |
| 9352 | rq->sd = NULL; | 9649 | rq->sd = NULL; |
| 9353 | rq->rd = NULL; | 9650 | rq->rd = NULL; |
| 9651 | rq->post_schedule = 0; | ||
| 9354 | rq->active_balance = 0; | 9652 | rq->active_balance = 0; |
| 9355 | rq->next_balance = jiffies; | 9653 | rq->next_balance = jiffies; |
| 9356 | rq->push_cpu = 0; | 9654 | rq->push_cpu = 0; |
| @@ -9415,13 +9713,20 @@ void __init sched_init(void) | |||
| 9415 | } | 9713 | } |
| 9416 | 9714 | ||
| 9417 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 9715 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| 9418 | void __might_sleep(char *file, int line) | 9716 | static inline int preempt_count_equals(int preempt_offset) |
| 9717 | { | ||
| 9718 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | ||
| 9719 | |||
| 9720 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | ||
| 9721 | } | ||
| 9722 | |||
| 9723 | void __might_sleep(char *file, int line, int preempt_offset) | ||
| 9419 | { | 9724 | { |
| 9420 | #ifdef in_atomic | 9725 | #ifdef in_atomic |
| 9421 | static unsigned long prev_jiffy; /* ratelimiting */ | 9726 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 9422 | 9727 | ||
| 9423 | if ((!in_atomic() && !irqs_disabled()) || | 9728 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
| 9424 | system_state != SYSTEM_RUNNING || oops_in_progress) | 9729 | system_state != SYSTEM_RUNNING || oops_in_progress) |
| 9425 | return; | 9730 | return; |
| 9426 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 9731 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
| 9427 | return; | 9732 | return; |
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index d014efbf947a..0f052fc674d5 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
| @@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
| 127 | 127 | ||
| 128 | /* | 128 | /* |
| 129 | * If the cpu was currently mapped to a different value, we | 129 | * If the cpu was currently mapped to a different value, we |
| 130 | * first need to unmap the old value | 130 | * need to map it to the new value then remove the old value. |
| 131 | * Note, we must add the new value first, otherwise we risk the | ||
| 132 | * cpu being cleared from pri_active, and this cpu could be | ||
| 133 | * missed for a push or pull. | ||
| 131 | */ | 134 | */ |
| 132 | if (likely(oldpri != CPUPRI_INVALID)) { | ||
| 133 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | ||
| 134 | |||
| 135 | spin_lock_irqsave(&vec->lock, flags); | ||
| 136 | |||
| 137 | vec->count--; | ||
| 138 | if (!vec->count) | ||
| 139 | clear_bit(oldpri, cp->pri_active); | ||
| 140 | cpumask_clear_cpu(cpu, vec->mask); | ||
| 141 | |||
| 142 | spin_unlock_irqrestore(&vec->lock, flags); | ||
| 143 | } | ||
| 144 | |||
| 145 | if (likely(newpri != CPUPRI_INVALID)) { | 135 | if (likely(newpri != CPUPRI_INVALID)) { |
| 146 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; | 136 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; |
| 147 | 137 | ||
| @@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
| 154 | 144 | ||
| 155 | spin_unlock_irqrestore(&vec->lock, flags); | 145 | spin_unlock_irqrestore(&vec->lock, flags); |
| 156 | } | 146 | } |
| 147 | if (likely(oldpri != CPUPRI_INVALID)) { | ||
| 148 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | ||
| 149 | |||
| 150 | spin_lock_irqsave(&vec->lock, flags); | ||
| 151 | |||
| 152 | vec->count--; | ||
| 153 | if (!vec->count) | ||
| 154 | clear_bit(oldpri, cp->pri_active); | ||
| 155 | cpumask_clear_cpu(cpu, vec->mask); | ||
| 156 | |||
| 157 | spin_unlock_irqrestore(&vec->lock, flags); | ||
| 158 | } | ||
| 157 | 159 | ||
| 158 | *currpri = newpri; | 160 | *currpri = newpri; |
| 159 | } | 161 | } |
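The cpupri_set() reordering above adds the CPU to the new priority vector before removing it from the old one, so a concurrent push/pull scan never sees the CPU in neither vector. A toy model of that ordering, with simplified names and no locking:

```c
/* Toy model of the add-before-remove ordering in cpupri_set() above.
 * vec_mask[] stands in for pri_to_cpu[].mask; locking and counts omitted. */
#include <stdio.h>

#define NR_PRIO 4
static unsigned long vec_mask[NR_PRIO];

static int cpu_visible(int cpu)
{
	for (int pri = 0; pri < NR_PRIO; pri++)
		if (vec_mask[pri] & (1UL << cpu))
			return 1;
	return 0;
}

int main(void)
{
	int cpu = 0, oldpri = 1, newpri = 2;

	vec_mask[oldpri] |= 1UL << cpu;

	/* Add to the new vector first: the cpu stays visible at every step.
	 * Removing first would open a window where cpu_visible() returns 0
	 * and a concurrent scan could miss this cpu for a push or pull. */
	vec_mask[newpri] |= 1UL << cpu;
	printf("after add:    visible=%d\n", cpu_visible(cpu));
	vec_mask[oldpri] &= ~(1UL << cpu);
	printf("after remove: visible=%d\n", cpu_visible(cpu));
	return 0;
}
```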
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 70c7e0b79946..5ddbd0891267 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 409 | PN(se.wait_max); | 409 | PN(se.wait_max); |
| 410 | PN(se.wait_sum); | 410 | PN(se.wait_sum); |
| 411 | P(se.wait_count); | 411 | P(se.wait_count); |
| 412 | PN(se.iowait_sum); | ||
| 413 | P(se.iowait_count); | ||
| 412 | P(sched_info.bkl_count); | 414 | P(sched_info.bkl_count); |
| 413 | P(se.nr_migrations); | 415 | P(se.nr_migrations); |
| 414 | P(se.nr_migrations_cold); | 416 | P(se.nr_migrations_cold); |
| @@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p) | |||
| 479 | p->se.wait_max = 0; | 481 | p->se.wait_max = 0; |
| 480 | p->se.wait_sum = 0; | 482 | p->se.wait_sum = 0; |
| 481 | p->se.wait_count = 0; | 483 | p->se.wait_count = 0; |
| 484 | p->se.iowait_sum = 0; | ||
| 485 | p->se.iowait_count = 0; | ||
| 482 | p->se.sleep_max = 0; | 486 | p->se.sleep_max = 0; |
| 483 | p->se.sum_sleep_runtime = 0; | 487 | p->se.sum_sleep_runtime = 0; |
| 484 | p->se.block_max = 0; | 488 | p->se.block_max = 0; |
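The new iowait_sum and iowait_count schedstats surface through the same per-task debug interface as the other se.* fields. A small reader for those lines, assuming /proc/<pid>/sched is available (CONFIG_SCHED_DEBUG) and the field names match this series:

```c
/* Print the iowait schedstat lines for the current task.
 * /proc/<pid>/sched only exists with CONFIG_SCHED_DEBUG, and field
 * names may differ on other kernel versions. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sched", "r");

	if (!f) {
		perror("fopen /proc/self/sched");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "iowait"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
```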
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 652e8bdef9aa..aa7f84121016 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -24,7 +24,7 @@ | |||
| 24 | 24 | ||
| 25 | /* | 25 | /* |
| 26 | * Targeted preemption latency for CPU-bound tasks: | 26 | * Targeted preemption latency for CPU-bound tasks: |
| 27 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) | 27 | * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) |
| 28 | * | 28 | * |
| 29 | * NOTE: this latency value is not the same as the concept of | 29 | * NOTE: this latency value is not the same as the concept of |
| 30 | * 'timeslice length' - timeslices in CFS are of variable length | 30 | * 'timeslice length' - timeslices in CFS are of variable length |
| @@ -34,13 +34,13 @@ | |||
| 34 | * (to see the precise effective timeslice length of your workload, | 34 | * (to see the precise effective timeslice length of your workload, |
| 35 | * run vmstat and monitor the context-switches (cs) field) | 35 | * run vmstat and monitor the context-switches (cs) field) |
| 36 | */ | 36 | */ |
| 37 | unsigned int sysctl_sched_latency = 20000000ULL; | 37 | unsigned int sysctl_sched_latency = 5000000ULL; |
| 38 | 38 | ||
| 39 | /* | 39 | /* |
| 40 | * Minimal preemption granularity for CPU-bound tasks: | 40 | * Minimal preemption granularity for CPU-bound tasks: |
| 41 | * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) | 41 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
| 42 | */ | 42 | */ |
| 43 | unsigned int sysctl_sched_min_granularity = 4000000ULL; | 43 | unsigned int sysctl_sched_min_granularity = 1000000ULL; |
| 44 | 44 | ||
| 45 | /* | 45 | /* |
| 46 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity | 46 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity |
| @@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL; | |||
| 48 | static unsigned int sched_nr_latency = 5; | 48 | static unsigned int sched_nr_latency = 5; |
| 49 | 49 | ||
| 50 | /* | 50 | /* |
| 51 | * After fork, child runs first. (default) If set to 0 then | 51 | * After fork, child runs first. If set to 0 (default) then |
| 52 | * parent will (try to) run first. | 52 | * parent will (try to) run first. |
| 53 | */ | 53 | */ |
| 54 | const_debug unsigned int sysctl_sched_child_runs_first = 1; | 54 | unsigned int sysctl_sched_child_runs_first __read_mostly; |
| 55 | 55 | ||
| 56 | /* | 56 | /* |
| 57 | * sys_sched_yield() compat mode | 57 | * sys_sched_yield() compat mode |
| @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * SCHED_OTHER wake-up granularity. | 65 | * SCHED_OTHER wake-up granularity. |
| 66 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) | 66 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
| 67 | * | 67 | * |
| 68 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
| 69 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
| 70 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
| 71 | */ | 71 | */ |
| 72 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; | 72 | unsigned int sysctl_sched_wakeup_granularity = 1000000UL; |
| 73 | 73 | ||
| 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
| 75 | 75 | ||
| @@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class; | |||
| 79 | * CFS operations on generic schedulable entities: | 79 | * CFS operations on generic schedulable entities: |
| 80 | */ | 80 | */ |
| 81 | 81 | ||
| 82 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
| 83 | { | ||
| 84 | return container_of(se, struct task_struct, se); | ||
| 85 | } | ||
| 86 | |||
| 87 | #ifdef CONFIG_FAIR_GROUP_SCHED | 82 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 88 | 83 | ||
| 89 | /* cpu runqueue to which this cfs_rq is attached */ | 84 | /* cpu runqueue to which this cfs_rq is attached */ |
| @@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
| 95 | /* An entity is a task if it doesn't "own" a runqueue */ | 90 | /* An entity is a task if it doesn't "own" a runqueue */ |
| 96 | #define entity_is_task(se) (!se->my_q) | 91 | #define entity_is_task(se) (!se->my_q) |
| 97 | 92 | ||
| 93 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
| 94 | { | ||
| 95 | #ifdef CONFIG_SCHED_DEBUG | ||
| 96 | WARN_ON_ONCE(!entity_is_task(se)); | ||
| 97 | #endif | ||
| 98 | return container_of(se, struct task_struct, se); | ||
| 99 | } | ||
| 100 | |||
| 98 | /* Walk up scheduling entities hierarchy */ | 101 | /* Walk up scheduling entities hierarchy */ |
| 99 | #define for_each_sched_entity(se) \ | 102 | #define for_each_sched_entity(se) \ |
| 100 | for (; se; se = se->parent) | 103 | for (; se; se = se->parent) |
| @@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
| 186 | } | 189 | } |
| 187 | } | 190 | } |
| 188 | 191 | ||
| 189 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 192 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
| 193 | |||
| 194 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
| 195 | { | ||
| 196 | return container_of(se, struct task_struct, se); | ||
| 197 | } | ||
| 190 | 198 | ||
| 191 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 199 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
| 192 | { | 200 | { |
| @@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 537 | schedstat_set(se->wait_count, se->wait_count + 1); | 545 | schedstat_set(se->wait_count, se->wait_count + 1); |
| 538 | schedstat_set(se->wait_sum, se->wait_sum + | 546 | schedstat_set(se->wait_sum, se->wait_sum + |
| 539 | rq_of(cfs_rq)->clock - se->wait_start); | 547 | rq_of(cfs_rq)->clock - se->wait_start); |
| 548 | #ifdef CONFIG_SCHEDSTATS | ||
| 549 | if (entity_is_task(se)) { | ||
| 550 | trace_sched_stat_wait(task_of(se), | ||
| 551 | rq_of(cfs_rq)->clock - se->wait_start); | ||
| 552 | } | ||
| 553 | #endif | ||
| 540 | schedstat_set(se->wait_start, 0); | 554 | schedstat_set(se->wait_start, 0); |
| 541 | } | 555 | } |
| 542 | 556 | ||
| @@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 628 | se->sleep_start = 0; | 642 | se->sleep_start = 0; |
| 629 | se->sum_sleep_runtime += delta; | 643 | se->sum_sleep_runtime += delta; |
| 630 | 644 | ||
| 631 | if (tsk) | 645 | if (tsk) { |
| 632 | account_scheduler_latency(tsk, delta >> 10, 1); | 646 | account_scheduler_latency(tsk, delta >> 10, 1); |
| 647 | trace_sched_stat_sleep(tsk, delta); | ||
| 648 | } | ||
| 633 | } | 649 | } |
| 634 | if (se->block_start) { | 650 | if (se->block_start) { |
| 635 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; | 651 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; |
| @@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 644 | se->sum_sleep_runtime += delta; | 660 | se->sum_sleep_runtime += delta; |
| 645 | 661 | ||
| 646 | if (tsk) { | 662 | if (tsk) { |
| 663 | if (tsk->in_iowait) { | ||
| 664 | se->iowait_sum += delta; | ||
| 665 | se->iowait_count++; | ||
| 666 | trace_sched_stat_iowait(tsk, delta); | ||
| 667 | } | ||
| 668 | |||
| 647 | /* | 669 | /* |
| 648 | * Blocking time is in units of nanosecs, so shift by | 670 | * Blocking time is in units of nanosecs, so shift by |
| 649 | * 20 to get a milliseconds-range estimation of the | 671 | * 20 to get a milliseconds-range estimation of the |
| @@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 705 | 727 | ||
| 706 | vruntime -= thresh; | 728 | vruntime -= thresh; |
| 707 | } | 729 | } |
| 708 | |||
| 709 | /* ensure we never gain time by being placed backwards. */ | ||
| 710 | vruntime = max_vruntime(se->vruntime, vruntime); | ||
| 711 | } | 730 | } |
| 712 | 731 | ||
| 732 | /* ensure we never gain time by being placed backwards. */ | ||
| 733 | vruntime = max_vruntime(se->vruntime, vruntime); | ||
| 734 | |||
| 713 | se->vruntime = vruntime; | 735 | se->vruntime = vruntime; |
| 714 | } | 736 | } |
| 715 | 737 | ||
| @@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq) | |||
| 1046 | * search starts with cpus closest then further out as needed, | 1068 | * search starts with cpus closest then further out as needed, |
| 1047 | * so we always favor a closer, idle cpu. | 1069 | * so we always favor a closer, idle cpu. |
| 1048 | * Domains may include CPUs that are not usable for migration, | 1070 | * Domains may include CPUs that are not usable for migration, |
| 1049 | * hence we need to mask them out (cpu_active_mask) | 1071 | * hence we need to mask them out (rq->rd->online) |
| 1050 | * | 1072 | * |
| 1051 | * Returns the CPU we should wake onto. | 1073 | * Returns the CPU we should wake onto. |
| 1052 | */ | 1074 | */ |
| 1053 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | 1075 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) |
| 1076 | |||
| 1077 | #define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) | ||
| 1078 | |||
| 1054 | static int wake_idle(int cpu, struct task_struct *p) | 1079 | static int wake_idle(int cpu, struct task_struct *p) |
| 1055 | { | 1080 | { |
| 1056 | struct sched_domain *sd; | 1081 | struct sched_domain *sd; |
| 1057 | int i; | 1082 | int i; |
| 1058 | unsigned int chosen_wakeup_cpu; | 1083 | unsigned int chosen_wakeup_cpu; |
| 1059 | int this_cpu; | 1084 | int this_cpu; |
| 1085 | struct rq *task_rq = task_rq(p); | ||
| 1060 | 1086 | ||
| 1061 | /* | 1087 | /* |
| 1062 | * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu | 1088 | * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu |
| @@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
| 1089 | for_each_domain(cpu, sd) { | 1115 | for_each_domain(cpu, sd) { |
| 1090 | if ((sd->flags & SD_WAKE_IDLE) | 1116 | if ((sd->flags & SD_WAKE_IDLE) |
| 1091 | || ((sd->flags & SD_WAKE_IDLE_FAR) | 1117 | || ((sd->flags & SD_WAKE_IDLE_FAR) |
| 1092 | && !task_hot(p, task_rq(p)->clock, sd))) { | 1118 | && !task_hot(p, task_rq->clock, sd))) { |
| 1093 | for_each_cpu_and(i, sched_domain_span(sd), | 1119 | for_each_cpu_and(i, sched_domain_span(sd), |
| 1094 | &p->cpus_allowed) { | 1120 | &p->cpus_allowed) { |
| 1095 | if (cpu_active(i) && idle_cpu(i)) { | 1121 | if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { |
| 1096 | if (i != task_cpu(p)) { | 1122 | if (i != task_cpu(p)) { |
| 1097 | schedstat_inc(p, | 1123 | schedstat_inc(p, |
| 1098 | se.nr_wakeups_idle); | 1124 | se.nr_wakeups_idle); |
| @@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, | |||
| 1235 | tg = task_group(p); | 1261 | tg = task_group(p); |
| 1236 | weight = p->se.load.weight; | 1262 | weight = p->se.load.weight; |
| 1237 | 1263 | ||
| 1238 | balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= | 1264 | /* |
| 1265 | * In low-load situations, where prev_cpu is idle and this_cpu is idle | ||
| 1266 | * due to the sync cause above having dropped tl to 0, we'll always have | ||
| 1267 | * an imbalance, but there's really nothing you can do about that, so | ||
| 1268 | * that's good too. | ||
| 1269 | * | ||
| 1270 | * Otherwise check if either cpus are near enough in load to allow this | ||
| 1271 | * task to be woken on this_cpu. | ||
| 1272 | */ | ||
| 1273 | balanced = !tl || | ||
| 1274 | 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= | ||
| 1239 | imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); | 1275 | imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); |
| 1240 | 1276 | ||
| 1241 | /* | 1277 | /* |
| @@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync) | |||
| 1278 | this_rq = cpu_rq(this_cpu); | 1314 | this_rq = cpu_rq(this_cpu); |
| 1279 | new_cpu = prev_cpu; | 1315 | new_cpu = prev_cpu; |
| 1280 | 1316 | ||
| 1281 | if (prev_cpu == this_cpu) | ||
| 1282 | goto out; | ||
| 1283 | /* | 1317 | /* |
| 1284 | * 'this_sd' is the first domain that both | 1318 | * 'this_sd' is the first domain that both |
| 1285 | * this_cpu and prev_cpu are present in: | 1319 | * this_cpu and prev_cpu are present in: |
| @@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
| 1721 | sched_info_queued(p); | 1755 | sched_info_queued(p); |
| 1722 | 1756 | ||
| 1723 | update_curr(cfs_rq); | 1757 | update_curr(cfs_rq); |
| 1758 | if (curr) | ||
| 1759 | se->vruntime = curr->vruntime; | ||
| 1724 | place_entity(cfs_rq, se, 1); | 1760 | place_entity(cfs_rq, se, 1); |
| 1725 | 1761 | ||
| 1726 | /* 'curr' will be NULL if the child belongs to a different group */ | 1762 | /* 'curr' will be NULL if the child belongs to a different group */ |
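Several of the sched_fair.c defaults above are re-tuned (latency 20ms to 5ms, minimum granularity 4ms to 1ms, wakeup granularity 5ms to 1ms); per the comments they are still scaled by (1 + ilog(ncpus)). A rough calculator for the effective defaults under that documented formula; the kernel's own scaling code may clamp or round differently.

```c
/* Effective CFS tunable defaults under the "(1 + ilog(ncpus))" scaling
 * described in the comments above, using the retuned base values. */
#include <stdio.h>

static unsigned int ilog2_u(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned long long base_latency  = 5000000ULL;	/* 5 ms */
	unsigned long long base_min_gran = 1000000ULL;	/* 1 ms */
	unsigned long long base_wakeup   = 1000000ULL;	/* 1 ms */

	for (unsigned int ncpus = 1; ncpus <= 16; ncpus *= 2) {
		unsigned int factor = 1 + ilog2_u(ncpus);

		printf("%2u cpus: latency %llu ns, min_gran %llu ns, wakeup %llu ns\n",
		       ncpus, base_latency * factor,
		       base_min_gran * factor, base_wakeup * factor);
	}
	return 0;
}
```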
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 4569bfa7df9b..e2dc63a5815d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) | 1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) |
| 2 | SCHED_FEAT(NORMALIZED_SLEEPER, 0) | 2 | SCHED_FEAT(NORMALIZED_SLEEPER, 0) |
| 3 | SCHED_FEAT(ADAPTIVE_GRAN, 1) | 3 | SCHED_FEAT(ADAPTIVE_GRAN, 1) |
| 4 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | 4 | SCHED_FEAT(WAKEUP_PREEMPT, 1) |
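NEW_FAIR_SLEEPERS now defaults to off. If the debugfs sched_features interface is available on this kernel (the path and the NO_ prefix convention are assumptions here, and CONFIG_SCHED_DEBUG plus a mounted debugfs are required), the feature can be flipped back at runtime:

```c
/* Sketch of re-enabling a scheduler feature via debugfs; the path and
 * NO_ prefix convention are assumptions about this kernel series. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/sched_features", "w");

	if (!f) {
		perror("open sched_features");
		return 1;
	}
	/* Writing "NO_NEW_FAIR_SLEEPERS" would turn it back off. */
	fputs("NEW_FAIR_SLEEPERS", f);
	fclose(f);
	return 0;
}
```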
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3918e01994e0..2eb4bd6a526c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -3,15 +3,18 @@ | |||
| 3 | * policies) | 3 | * policies) |
| 4 | */ | 4 | */ |
| 5 | 5 | ||
| 6 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 7 | |||
| 8 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) | ||
| 9 | |||
| 6 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) | 10 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) |
| 7 | { | 11 | { |
| 12 | #ifdef CONFIG_SCHED_DEBUG | ||
| 13 | WARN_ON_ONCE(!rt_entity_is_task(rt_se)); | ||
| 14 | #endif | ||
| 8 | return container_of(rt_se, struct task_struct, rt); | 15 | return container_of(rt_se, struct task_struct, rt); |
| 9 | } | 16 | } |
| 10 | 17 | ||
| 11 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 12 | |||
| 13 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) | ||
| 14 | |||
| 15 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | 18 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) |
| 16 | { | 19 | { |
| 17 | return rt_rq->rq; | 20 | return rt_rq->rq; |
| @@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
| 26 | 29 | ||
| 27 | #define rt_entity_is_task(rt_se) (1) | 30 | #define rt_entity_is_task(rt_se) (1) |
| 28 | 31 | ||
| 32 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) | ||
| 33 | { | ||
| 34 | return container_of(rt_se, struct task_struct, rt); | ||
| 35 | } | ||
| 36 | |||
| 29 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | 37 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) |
| 30 | { | 38 | { |
| 31 | return container_of(rt_rq, struct rq, rt); | 39 | return container_of(rt_rq, struct rq, rt); |
| @@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) | |||
| 128 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 136 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
| 129 | } | 137 | } |
| 130 | 138 | ||
| 139 | static inline int has_pushable_tasks(struct rq *rq) | ||
| 140 | { | ||
| 141 | return !plist_head_empty(&rq->rt.pushable_tasks); | ||
| 142 | } | ||
| 143 | |||
| 131 | #else | 144 | #else |
| 132 | 145 | ||
| 133 | static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) | 146 | static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) |
| @@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq) | |||
| 602 | curr->se.exec_start = rq->clock; | 615 | curr->se.exec_start = rq->clock; |
| 603 | cpuacct_charge(curr, delta_exec); | 616 | cpuacct_charge(curr, delta_exec); |
| 604 | 617 | ||
| 618 | sched_rt_avg_update(rq, delta_exec); | ||
| 619 | |||
| 605 | if (!rt_bandwidth_enabled()) | 620 | if (!rt_bandwidth_enabled()) |
| 606 | return; | 621 | return; |
| 607 | 622 | ||
| @@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 874 | 889 | ||
| 875 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 890 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
| 876 | enqueue_pushable_task(rq, p); | 891 | enqueue_pushable_task(rq, p); |
| 877 | |||
| 878 | inc_cpu_load(rq, p->se.load.weight); | ||
| 879 | } | 892 | } |
| 880 | 893 | ||
| 881 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 894 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
| @@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | |||
| 886 | dequeue_rt_entity(rt_se); | 899 | dequeue_rt_entity(rt_se); |
| 887 | 900 | ||
| 888 | dequeue_pushable_task(rq, p); | 901 | dequeue_pushable_task(rq, p); |
| 889 | |||
| 890 | dec_cpu_load(rq, p->se.load.weight); | ||
| 891 | } | 902 | } |
| 892 | 903 | ||
| 893 | /* | 904 | /* |
| @@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) | |||
| 1064 | if (p) | 1075 | if (p) |
| 1065 | dequeue_pushable_task(rq, p); | 1076 | dequeue_pushable_task(rq, p); |
| 1066 | 1077 | ||
| 1078 | #ifdef CONFIG_SMP | ||
| 1079 | /* | ||
| 1080 | * We detect this state here so that we can avoid taking the RQ | ||
| 1081 | * lock again later if there is no need to push | ||
| 1082 | */ | ||
| 1083 | rq->post_schedule = has_pushable_tasks(rq); | ||
| 1084 | #endif | ||
| 1085 | |||
| 1067 | return p; | 1086 | return p; |
| 1068 | } | 1087 | } |
| 1069 | 1088 | ||
| @@ -1162,13 +1181,6 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 1162 | return -1; /* No targets found */ | 1181 | return -1; /* No targets found */ |
| 1163 | 1182 | ||
| 1164 | /* | 1183 | /* |
| 1165 | * Only consider CPUs that are usable for migration. | ||
| 1166 | * I guess we might want to change cpupri_find() to ignore those | ||
| 1167 | * in the first place. | ||
| 1168 | */ | ||
| 1169 | cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); | ||
| 1170 | |||
| 1171 | /* | ||
| 1172 | * At this point we have built a mask of cpus representing the | 1184 | * At this point we have built a mask of cpus representing the |
| 1173 | * lowest priority tasks in the system. Now we want to elect | 1185 | * lowest priority tasks in the system. Now we want to elect |
| 1174 | * the best one based on our affinity and topology. | 1186 | * the best one based on our affinity and topology. |
| @@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1262 | return lowest_rq; | 1274 | return lowest_rq; |
| 1263 | } | 1275 | } |
| 1264 | 1276 | ||
| 1265 | static inline int has_pushable_tasks(struct rq *rq) | ||
| 1266 | { | ||
| 1267 | return !plist_head_empty(&rq->rt.pushable_tasks); | ||
| 1268 | } | ||
| 1269 | |||
| 1270 | static struct task_struct *pick_next_pushable_task(struct rq *rq) | 1277 | static struct task_struct *pick_next_pushable_task(struct rq *rq) |
| 1271 | { | 1278 | { |
| 1272 | struct task_struct *p; | 1279 | struct task_struct *p; |
| @@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | |||
| 1466 | pull_rt_task(rq); | 1473 | pull_rt_task(rq); |
| 1467 | } | 1474 | } |
| 1468 | 1475 | ||
| 1469 | /* | ||
| 1470 | * assumes rq->lock is held | ||
| 1471 | */ | ||
| 1472 | static int needs_post_schedule_rt(struct rq *rq) | ||
| 1473 | { | ||
| 1474 | return has_pushable_tasks(rq); | ||
| 1475 | } | ||
| 1476 | |||
| 1477 | static void post_schedule_rt(struct rq *rq) | 1476 | static void post_schedule_rt(struct rq *rq) |
| 1478 | { | 1477 | { |
| 1479 | /* | ||
| 1480 | * This is only called if needs_post_schedule_rt() indicates that | ||
| 1481 | * we need to push tasks away | ||
| 1482 | */ | ||
| 1483 | spin_lock_irq(&rq->lock); | ||
| 1484 | push_rt_tasks(rq); | 1478 | push_rt_tasks(rq); |
| 1485 | spin_unlock_irq(&rq->lock); | ||
| 1486 | } | 1479 | } |
| 1487 | 1480 | ||
| 1488 | /* | 1481 | /* |
| @@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = { | |||
| 1758 | .rq_online = rq_online_rt, | 1751 | .rq_online = rq_online_rt, |
| 1759 | .rq_offline = rq_offline_rt, | 1752 | .rq_offline = rq_offline_rt, |
| 1760 | .pre_schedule = pre_schedule_rt, | 1753 | .pre_schedule = pre_schedule_rt, |
| 1761 | .needs_post_schedule = needs_post_schedule_rt, | ||
| 1762 | .post_schedule = post_schedule_rt, | 1754 | .post_schedule = post_schedule_rt, |
| 1763 | .task_wake_up = task_wake_up_rt, | 1755 | .task_wake_up = task_wake_up_rt, |
| 1764 | .switched_from = switched_from_rt, | 1756 | .switched_from = switched_from_rt, |
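pick_next_task_rt() above records rq->post_schedule while the runqueue lock is already held, replacing the separate needs_post_schedule_rt() callback and the extra lock round-trip that post_schedule_rt() used to take. A toy illustration of that "decide under the lock, act later" pattern, with simplified names and a userspace mutex standing in for rq->lock:

```c
/* Toy sketch: cache whether post-schedule push work is needed while the
 * queue is already locked, so the lock is not retaken just to find out. */
#include <stdio.h>
#include <pthread.h>

struct toy_rq {
	pthread_mutex_t lock;
	int nr_pushable;	/* stands in for the pushable_tasks plist */
	int post_schedule;	/* stands in for rq->post_schedule */
};

static void pick_next(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	/* ... pick the next task ... */
	rq->post_schedule = rq->nr_pushable > 0;	/* decided under the lock */
	pthread_mutex_unlock(&rq->lock);
}

static void finish_schedule(struct toy_rq *rq)
{
	if (!rq->post_schedule)		/* cheap check, no lock taken */
		return;
	pthread_mutex_lock(&rq->lock);
	printf("pushing %d pushable tasks\n", rq->nr_pushable);
	rq->nr_pushable = 0;
	pthread_mutex_unlock(&rq->lock);
	rq->post_schedule = 0;
}

int main(void)
{
	struct toy_rq rq = { PTHREAD_MUTEX_INITIALIZER, 2, 0 };

	pick_next(&rq);
	finish_schedule(&rq);
	return 0;
}
```

Build with -pthread; the point is only the control flow, not the locking details of the real scheduler.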
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 71d8dc7f9920..3125cff1c570 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -245,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | |||
| 245 | #endif | 245 | #endif |
| 246 | 246 | ||
| 247 | static struct ctl_table kern_table[] = { | 247 | static struct ctl_table kern_table[] = { |
| 248 | { | ||
| 249 | .ctl_name = CTL_UNNUMBERED, | ||
| 250 | .procname = "sched_child_runs_first", | ||
| 251 | .data = &sysctl_sched_child_runs_first, | ||
| 252 | .maxlen = sizeof(unsigned int), | ||
| 253 | .mode = 0644, | ||
| 254 | .proc_handler = &proc_dointvec, | ||
| 255 | }, | ||
| 248 | #ifdef CONFIG_SCHED_DEBUG | 256 | #ifdef CONFIG_SCHED_DEBUG |
| 249 | { | 257 | { |
| 250 | .ctl_name = CTL_UNNUMBERED, | 258 | .ctl_name = CTL_UNNUMBERED, |
| @@ -299,14 +307,6 @@ static struct ctl_table kern_table[] = { | |||
| 299 | }, | 307 | }, |
| 300 | { | 308 | { |
| 301 | .ctl_name = CTL_UNNUMBERED, | 309 | .ctl_name = CTL_UNNUMBERED, |
| 302 | .procname = "sched_child_runs_first", | ||
| 303 | .data = &sysctl_sched_child_runs_first, | ||
| 304 | .maxlen = sizeof(unsigned int), | ||
| 305 | .mode = 0644, | ||
| 306 | .proc_handler = &proc_dointvec, | ||
| 307 | }, | ||
| 308 | { | ||
| 309 | .ctl_name = CTL_UNNUMBERED, | ||
| 310 | .procname = "sched_features", | 310 | .procname = "sched_features", |
| 311 | .data = &sysctl_sched_features, | 311 | .data = &sysctl_sched_features, |
| 312 | .maxlen = sizeof(unsigned int), | 312 | .maxlen = sizeof(unsigned int), |
| @@ -331,6 +331,14 @@ static struct ctl_table kern_table[] = { | |||
| 331 | }, | 331 | }, |
| 332 | { | 332 | { |
| 333 | .ctl_name = CTL_UNNUMBERED, | 333 | .ctl_name = CTL_UNNUMBERED, |
| 334 | .procname = "sched_time_avg", | ||
| 335 | .data = &sysctl_sched_time_avg, | ||
| 336 | .maxlen = sizeof(unsigned int), | ||
| 337 | .mode = 0644, | ||
| 338 | .proc_handler = &proc_dointvec, | ||
| 339 | }, | ||
| 340 | { | ||
| 341 | .ctl_name = CTL_UNNUMBERED, | ||
| 334 | .procname = "timer_migration", | 342 | .procname = "timer_migration", |
| 335 | .data = &sysctl_timer_migration, | 343 | .data = &sysctl_timer_migration, |
| 336 | .maxlen = sizeof(unsigned int), | 344 | .maxlen = sizeof(unsigned int), |
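The sysctl table gains sched_time_avg and moves sched_child_runs_first outside the CONFIG_SCHED_DEBUG block, matching the knob losing its const_debug qualifier in sched_fair.c. A quick check that both are exposed, assuming the usual /proc/sys/kernel paths derived from the procname fields above:

```c
/* Read the two sysctls added above; the files only exist on kernels
 * carrying this change. */
#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	fclose(f);
}

int main(void)
{
	show("/proc/sys/kernel/sched_child_runs_first");
	show("/proc/sys/kernel/sched_time_avg");
	return 0;
}
```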
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3c44b56b0da7..addfe2df93b1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -317,8 +317,6 @@ static int worker_thread(void *__cwq) | |||
| 317 | if (cwq->wq->freezeable) | 317 | if (cwq->wq->freezeable) |
| 318 | set_freezable(); | 318 | set_freezable(); |
| 319 | 319 | ||
| 320 | set_user_nice(current, -5); | ||
| 321 | |||
| 322 | for (;;) { | 320 | for (;;) { |
| 323 | prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); | 321 | prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); |
| 324 | if (!freezing(current) && | 322 | if (!freezing(current) && |
