author    Linus Torvalds <torvalds@linux-foundation.org>  2008-07-14 16:54:49 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2008-07-14 16:54:49 -0400
commit    17489c058e8c63ab5ebdc67ab52ca70d1bc270b1 (patch)
tree      d0e0a3503967b11f6e86ebfaddf103a82fbea38f
parent    a3da5bf84a97d48cfaf66c6842470fc403da5121 (diff)
parent    873a6ed6288b6c2c0d2cc84d3b2bf2fab9ba0181 (diff)

Merge branch 'sched/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (76 commits)
  sched_clock: and multiplier for TSC to gtod drift
  sched_clock: record TSC after gtod
  sched_clock: only update deltas with local reads.
  sched_clock: fix calculation of other CPU
  sched_clock: stop maximum check on NO HZ
  sched_clock: widen the max and min time
  sched_clock: record from last tick
  sched: fix accounting in task delay accounting & migration
  sched: add avg-overlap support to RT tasks
  sched: terminate newidle balancing once at least one task has moved over
  sched: fix warning
  sched: build fix
  sched: sched_clock_cpu() based cpu_clock(), lockdep fix
  sched: export cpu_clock
  sched: make sched_{rt,fair}.c ifdefs more readable
  sched: bias effective_load() error towards failing wake_affine().
  sched: incremental effective_load()
  sched: correct wakeup weight calculations
  sched: fix mult overflow
  sched: update shares on wakeup
  ...
-rw-r--r--  Documentation/scheduler/sched-domains.txt     7
-rw-r--r--  Documentation/scheduler/sched-rt-group.txt    4
-rw-r--r--  include/linux/sched.h                        59
-rw-r--r--  kernel/Makefile                               5
-rw-r--r--  kernel/cpu.c                                 24
-rw-r--r--  kernel/cpuset.c                              14
-rw-r--r--  kernel/kthread.c                              1
-rw-r--r--  kernel/sched.c                              723
-rw-r--r--  kernel/sched_clock.c                        137
-rw-r--r--  kernel/sched_cpupri.c                       174
-rw-r--r--  kernel/sched_cpupri.h                        36
-rw-r--r--  kernel/sched_debug.c                         64
-rw-r--r--  kernel/sched_fair.c                         413
-rw-r--r--  kernel/sched_features.h                       7
-rw-r--r--  kernel/sched_rt.c                           405
-rw-r--r--  kernel/sched_stats.h                         42
-rw-r--r--  kernel/sysctl.c                               8
-rw-r--r--  kernel/time/tick-sched.c                      2

18 files changed, 1555 insertions, 570 deletions
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt
index a9e990ab980f..373ceacc367e 100644
--- a/Documentation/scheduler/sched-domains.txt
+++ b/Documentation/scheduler/sched-domains.txt
@@ -61,10 +61,7 @@ builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
 arch_init_sched_domains function. This function will attach domains to all
 CPUs using cpu_attach_domain.
 
-Implementors should change the line
-#undef SCHED_DOMAIN_DEBUG
-to
-#define SCHED_DOMAIN_DEBUG
-in kernel/sched.c as this enables an error checking parse of the sched domains
+The sched-domains debugging infrastructure can be enabled by enabling
+CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
 which should catch most possible errors (described above). It also prints out
 the domain structure in a visual format.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 14f901f639ee..3ef339f491e0 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -51,9 +51,9 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
 0.00015s. So this group can be scheduled with a period of 0.005s and a run time
 of 0.00015s.
 
-The remaining CPU time will be used for user input and other tass. Because
+The remaining CPU time will be used for user input and other tasks. Because
 realtime tasks have explicitly allocated the CPU time they need to perform
-their tasks, buffer underruns in the graphocs or audio can be eliminated.
+their tasks, buffer underruns in the graphics or audio can be eliminated.
 
 NOTE: the above example is not fully implemented as of yet (2.6.25). We still
 lack an EDF scheduler to make non-uniform periods usable.
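Note on the arithmetic in the documentation above: a minimal stand-alone C sketch (not part of this patch, plain user-space code) of how the 0.005s period and the ~3% CPU need translate into the microsecond values such a group would be configured with:

#include <stdio.h>

int main(void)
{
        double period_s  = 0.005;   /* period chosen in the example above */
        double cpu_share = 0.03;    /* the task needs about 3% of the CPU */

        long period_us  = (long)(period_s * 1e6);               /* 5000 us */
        long runtime_us = (long)(period_s * cpu_share * 1e6);   /* 150 us = 0.00015s */

        printf("period=%ldus runtime=%ldus\n", period_us, runtime_us);
        return 0;
}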
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c5d3f847ca8d..f6cd60f2de63 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -134,7 +134,6 @@ extern unsigned long nr_running(void);
134extern unsigned long nr_uninterruptible(void); 134extern unsigned long nr_uninterruptible(void);
135extern unsigned long nr_active(void); 135extern unsigned long nr_active(void);
136extern unsigned long nr_iowait(void); 136extern unsigned long nr_iowait(void);
137extern unsigned long weighted_cpuload(const int cpu);
138 137
139struct seq_file; 138struct seq_file;
140struct cfs_rq; 139struct cfs_rq;
@@ -784,6 +783,8 @@ struct sched_domain {
784 unsigned int balance_interval; /* initialise to 1. units in ms. */ 783 unsigned int balance_interval; /* initialise to 1. units in ms. */
785 unsigned int nr_balance_failed; /* initialise to 0 */ 784 unsigned int nr_balance_failed; /* initialise to 0 */
786 785
786 u64 last_update;
787
787#ifdef CONFIG_SCHEDSTATS 788#ifdef CONFIG_SCHEDSTATS
788 /* load_balance() stats */ 789 /* load_balance() stats */
789 unsigned int lb_count[CPU_MAX_IDLE_TYPES]; 790 unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -823,23 +824,6 @@ extern int arch_reinit_sched_domains(void);
823 824
824#endif /* CONFIG_SMP */ 825#endif /* CONFIG_SMP */
825 826
826/*
827 * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
828 * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
829 * task of nice 0 or enough lower priority tasks to bring up the
830 * weighted_cpuload
831 */
832static inline int above_background_load(void)
833{
834 unsigned long cpu;
835
836 for_each_online_cpu(cpu) {
837 if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
838 return 1;
839 }
840 return 0;
841}
842
843struct io_context; /* See blkdev.h */ 827struct io_context; /* See blkdev.h */
844#define NGROUPS_SMALL 32 828#define NGROUPS_SMALL 32
845#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) 829#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
@@ -921,8 +905,8 @@ struct sched_class {
921 void (*set_cpus_allowed)(struct task_struct *p, 905 void (*set_cpus_allowed)(struct task_struct *p,
922 const cpumask_t *newmask); 906 const cpumask_t *newmask);
923 907
924 void (*join_domain)(struct rq *rq); 908 void (*rq_online)(struct rq *rq);
925 void (*leave_domain)(struct rq *rq); 909 void (*rq_offline)(struct rq *rq);
926 910
927 void (*switched_from) (struct rq *this_rq, struct task_struct *task, 911 void (*switched_from) (struct rq *this_rq, struct task_struct *task,
928 int running); 912 int running);
@@ -1039,6 +1023,7 @@ struct task_struct {
1039#endif 1023#endif
1040 1024
1041 int prio, static_prio, normal_prio; 1025 int prio, static_prio, normal_prio;
1026 unsigned int rt_priority;
1042 const struct sched_class *sched_class; 1027 const struct sched_class *sched_class;
1043 struct sched_entity se; 1028 struct sched_entity se;
1044 struct sched_rt_entity rt; 1029 struct sched_rt_entity rt;
@@ -1122,7 +1107,6 @@ struct task_struct {
1122 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1107 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1123 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1108 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1124 1109
1125 unsigned int rt_priority;
1126 cputime_t utime, stime, utimescaled, stimescaled; 1110 cputime_t utime, stime, utimescaled, stimescaled;
1127 cputime_t gtime; 1111 cputime_t gtime;
1128 cputime_t prev_utime, prev_stime; 1112 cputime_t prev_utime, prev_stime;
@@ -1141,12 +1125,12 @@ struct task_struct {
1141 gid_t gid,egid,sgid,fsgid; 1125 gid_t gid,egid,sgid,fsgid;
1142 struct group_info *group_info; 1126 struct group_info *group_info;
1143 kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; 1127 kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset;
1144 unsigned securebits;
1145 struct user_struct *user; 1128 struct user_struct *user;
1129 unsigned securebits;
1146#ifdef CONFIG_KEYS 1130#ifdef CONFIG_KEYS
1131 unsigned char jit_keyring; /* default keyring to attach requested keys to */
1147 struct key *request_key_auth; /* assumed request_key authority */ 1132 struct key *request_key_auth; /* assumed request_key authority */
1148 struct key *thread_keyring; /* keyring private to this thread */ 1133 struct key *thread_keyring; /* keyring private to this thread */
1149 unsigned char jit_keyring; /* default keyring to attach requested keys to */
1150#endif 1134#endif
1151 char comm[TASK_COMM_LEN]; /* executable name excluding path 1135 char comm[TASK_COMM_LEN]; /* executable name excluding path
1152 - access with [gs]et_task_comm (which lock 1136 - access with [gs]et_task_comm (which lock
@@ -1233,8 +1217,8 @@ struct task_struct {
1233# define MAX_LOCK_DEPTH 48UL 1217# define MAX_LOCK_DEPTH 48UL
1234 u64 curr_chain_key; 1218 u64 curr_chain_key;
1235 int lockdep_depth; 1219 int lockdep_depth;
1236 struct held_lock held_locks[MAX_LOCK_DEPTH];
1237 unsigned int lockdep_recursion; 1220 unsigned int lockdep_recursion;
1221 struct held_lock held_locks[MAX_LOCK_DEPTH];
1238#endif 1222#endif
1239 1223
1240/* journalling filesystem info */ 1224/* journalling filesystem info */
@@ -1262,10 +1246,6 @@ struct task_struct {
1262 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1246 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1263 cputime_t acct_stimexpd;/* stime since last update */ 1247 cputime_t acct_stimexpd;/* stime since last update */
1264#endif 1248#endif
1265#ifdef CONFIG_NUMA
1266 struct mempolicy *mempolicy;
1267 short il_next;
1268#endif
1269#ifdef CONFIG_CPUSETS 1249#ifdef CONFIG_CPUSETS
1270 nodemask_t mems_allowed; 1250 nodemask_t mems_allowed;
1271 int cpuset_mems_generation; 1251 int cpuset_mems_generation;
@@ -1285,6 +1265,10 @@ struct task_struct {
1285 struct list_head pi_state_list; 1265 struct list_head pi_state_list;
1286 struct futex_pi_state *pi_state_cache; 1266 struct futex_pi_state *pi_state_cache;
1287#endif 1267#endif
1268#ifdef CONFIG_NUMA
1269 struct mempolicy *mempolicy;
1270 short il_next;
1271#endif
1288 atomic_t fs_excl; /* holding fs exclusive resources */ 1272 atomic_t fs_excl; /* holding fs exclusive resources */
1289 struct rcu_head rcu; 1273 struct rcu_head rcu;
1290 1274
@@ -1504,6 +1488,7 @@ static inline void put_task_struct(struct task_struct *t)
1504#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1488#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1505#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1489#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1506#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1490#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1491#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
1507#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1492#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1508#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1493#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1509#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ 1494#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
@@ -1573,13 +1558,28 @@ static inline void sched_clock_idle_sleep_event(void)
1573static inline void sched_clock_idle_wakeup_event(u64 delta_ns) 1558static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
1574{ 1559{
1575} 1560}
1576#else 1561
1562#ifdef CONFIG_NO_HZ
1563static inline void sched_clock_tick_stop(int cpu)
1564{
1565}
1566
1567static inline void sched_clock_tick_start(int cpu)
1568{
1569}
1570#endif
1571
1572#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
1577extern void sched_clock_init(void); 1573extern void sched_clock_init(void);
1578extern u64 sched_clock_cpu(int cpu); 1574extern u64 sched_clock_cpu(int cpu);
1579extern void sched_clock_tick(void); 1575extern void sched_clock_tick(void);
1580extern void sched_clock_idle_sleep_event(void); 1576extern void sched_clock_idle_sleep_event(void);
1581extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1577extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1578#ifdef CONFIG_NO_HZ
1579extern void sched_clock_tick_stop(int cpu);
1580extern void sched_clock_tick_start(int cpu);
1582#endif 1581#endif
1582#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
1583 1583
1584/* 1584/*
1585 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 1585 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
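The hunk above follows the usual compile-out idiom: when the clock feature is not configured, empty static inline stubs keep every caller building without per-call-site #ifdefs. A stand-alone C illustration of that pattern (the feature macro and function name are invented for illustration, not taken from the kernel):

#include <stdio.h>

/* #define HAVE_FANCY_CLOCK 1 */   /* flip to select the real implementation */

#ifdef HAVE_FANCY_CLOCK
extern void fancy_clock_tick_stop(int cpu);             /* real version lives elsewhere */
#else
static inline void fancy_clock_tick_stop(int cpu) { }   /* compiled-out stub */
#endif

int main(void)
{
        fancy_clock_tick_stop(0);   /* callers never need their own #ifdef */
        printf("builds either way\n");
        return 0;
}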
@@ -1622,6 +1622,7 @@ extern unsigned int sysctl_sched_child_runs_first;
1622extern unsigned int sysctl_sched_features; 1622extern unsigned int sysctl_sched_features;
1623extern unsigned int sysctl_sched_migration_cost; 1623extern unsigned int sysctl_sched_migration_cost;
1624extern unsigned int sysctl_sched_nr_migrate; 1624extern unsigned int sysctl_sched_nr_migrate;
1625extern unsigned int sysctl_sched_shares_ratelimit;
1625 1626
1626int sched_nr_latency_handler(struct ctl_table *table, int write, 1627int sched_nr_latency_handler(struct ctl_table *table, int write,
1627 struct file *file, void __user *buffer, size_t *length, 1628 struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..6c55301112e0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
6 exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
@@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
27obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 27obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
28obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 28obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
29obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 29obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
30obj-$(CONFIG_SMP) += cpu.o spinlock.o 30obj-$(CONFIG_SMP) += spinlock.o
31obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 31obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
32obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 32obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
33obj-$(CONFIG_UID16) += uid16.o 33obj-$(CONFIG_UID16) += uid16.o
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
70obj-$(CONFIG_MARKERS) += marker.o 70obj-$(CONFIG_MARKERS) += marker.o
71obj-$(CONFIG_LATENCYTOP) += latencytop.o 71obj-$(CONFIG_LATENCYTOP) += latencytop.o
72obj-$(CONFIG_SMP) += sched_cpupri.o
72 73
73ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 74ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
74# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 75# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..b11f06dc149a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/*
19 * Represents all cpu's present in the system
20 * In systems capable of hotplug, this map could dynamically grow
21 * as new cpu's are detected in the system via any platform specific
22 * method, such as ACPI for e.g.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27#ifndef CONFIG_SMP
28
29/*
30 * Represents all cpu's that are currently online.
31 */
32cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
33EXPORT_SYMBOL(cpu_online_map);
34
35cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
36EXPORT_SYMBOL(cpu_possible_map);
37
38#else /* CONFIG_SMP */
39
18/* Serializes the updates to cpu_online_map, cpu_present_map */ 40/* Serializes the updates to cpu_online_map, cpu_present_map */
19static DEFINE_MUTEX(cpu_add_remove_lock); 41static DEFINE_MUTEX(cpu_add_remove_lock);
20 42
@@ -403,3 +425,5 @@ out:
403 cpu_maps_update_done(); 425 cpu_maps_update_done();
404} 426}
405#endif /* CONFIG_PM_SLEEP_SMP */ 427#endif /* CONFIG_PM_SLEEP_SMP */
428
429#endif /* CONFIG_SMP */
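The maps moved into kernel/cpu.c here (they are removed from kernel/sched.c further down in this diff) are the standard kernel cpumasks. Typical in-kernel users iterate them through the cpumask helpers rather than touching the bitmaps directly; a short kernel-style sketch, for illustration only:

#include <linux/cpumask.h>
#include <linux/kernel.h>

static void report_online_cpus(void)
{
        int cpu;

        /* walk every bit set in cpu_online_map */
        for_each_online_cpu(cpu)
                printk(KERN_INFO "cpu%d is online\n", cpu);

        printk(KERN_INFO "%d of %d possible cpus online\n",
               num_online_cpus(), num_possible_cpus());
}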
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 798b3ab054eb..459d601947a8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1194,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1194 1194
1195 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1195 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1196 return -ENOSPC; 1196 return -ENOSPC;
1197 if (tsk->flags & PF_THREAD_BOUND) {
1198 cpumask_t mask;
1199
1200 mutex_lock(&callback_mutex);
1201 mask = cs->cpus_allowed;
1202 mutex_unlock(&callback_mutex);
1203 if (!cpus_equal(tsk->cpus_allowed, mask))
1204 return -EINVAL;
1205 }
1197 1206
1198 return security_task_setscheduler(tsk, 0, NULL); 1207 return security_task_setscheduler(tsk, 0, NULL);
1199} 1208}
@@ -1207,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1207 struct mm_struct *mm; 1216 struct mm_struct *mm;
1208 struct cpuset *cs = cgroup_cs(cont); 1217 struct cpuset *cs = cgroup_cs(cont);
1209 struct cpuset *oldcs = cgroup_cs(oldcont); 1218 struct cpuset *oldcs = cgroup_cs(oldcont);
1219 int err;
1210 1220
1211 mutex_lock(&callback_mutex); 1221 mutex_lock(&callback_mutex);
1212 guarantee_online_cpus(cs, &cpus); 1222 guarantee_online_cpus(cs, &cpus);
1213 set_cpus_allowed_ptr(tsk, &cpus); 1223 err = set_cpus_allowed_ptr(tsk, &cpus);
1214 mutex_unlock(&callback_mutex); 1224 mutex_unlock(&callback_mutex);
1225 if (err)
1226 return;
1215 1227
1216 from = oldcs->mems_allowed; 1228 from = oldcs->mems_allowed;
1217 to = cs->mems_allowed; 1229 to = cs->mems_allowed;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..97747cdd37c9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
180 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 182 k->rt.nr_cpus_allowed = 1;
183 k->flags |= PF_THREAD_BOUND;
183} 184}
184EXPORT_SYMBOL(kthread_bind); 185EXPORT_SYMBOL(kthread_bind);
185 186
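For context, kthread_bind() is normally used on a freshly created, not-yet-running kthread; with the hunk above it also marks the task PF_THREAD_BOUND, so later attempts by other tasks (see the set_cpus_allowed_ptr() and cpuset changes in this same merge) to move it off its cpu are rejected. A hedged sketch of the usual calling pattern; the thread function and names are made up for illustration:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int my_percpu_worker(void *data)
{
        while (!kthread_should_stop())
                schedule_timeout_interruptible(HZ);     /* placeholder work loop */
        return 0;
}

static struct task_struct *start_bound_worker(int cpu)
{
        struct task_struct *tsk;

        tsk = kthread_create(my_percpu_worker, NULL, "myworker/%d", cpu);
        if (IS_ERR(tsk))
                return tsk;

        kthread_bind(tsk, cpu);   /* pins to 'cpu'; now also sets PF_THREAD_BOUND */
        wake_up_process(tsk);     /* only start it once it is bound */
        return tsk;
}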
diff --git a/kernel/sched.c b/kernel/sched.c
index 8402944f715b..591d5e7f757a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -74,6 +74,8 @@
74#include <asm/tlb.h> 74#include <asm/tlb.h>
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77#include "sched_cpupri.h"
78
77/* 79/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 80 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 81 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -289,15 +291,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 291static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 292/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 293static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 294#endif /* CONFIG_FAIR_GROUP_SCHED */
293 295
294#ifdef CONFIG_RT_GROUP_SCHED 296#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 297static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 298static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 299#endif /* CONFIG_RT_GROUP_SCHED */
298#else 300#else /* !CONFIG_FAIR_GROUP_SCHED */
299#define root_task_group init_task_group 301#define root_task_group init_task_group
300#endif 302#endif /* CONFIG_FAIR_GROUP_SCHED */
301 303
302/* task_group_lock serializes add/remove of task groups and also changes to 304/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 305 * a task group's cpu shares.
@@ -307,9 +309,9 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 309#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 310#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 311# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 312#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 313# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 314#endif /* CONFIG_USER_SCHED */
313 315
314/* 316/*
315 * A weight of 0 or 1 can cause arithmetics problems. 317 * A weight of 0 or 1 can cause arithmetics problems.
@@ -363,6 +365,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
363#else 365#else
364 366
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 367static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
368static inline struct task_group *task_group(struct task_struct *p)
369{
370 return NULL;
371}
366 372
367#endif /* CONFIG_GROUP_SCHED */ 373#endif /* CONFIG_GROUP_SCHED */
368 374
@@ -373,6 +379,7 @@ struct cfs_rq {
373 379
374 u64 exec_clock; 380 u64 exec_clock;
375 u64 min_vruntime; 381 u64 min_vruntime;
382 u64 pair_start;
376 383
377 struct rb_root tasks_timeline; 384 struct rb_root tasks_timeline;
378 struct rb_node *rb_leftmost; 385 struct rb_node *rb_leftmost;
@@ -401,6 +408,31 @@ struct cfs_rq {
401 */ 408 */
402 struct list_head leaf_cfs_rq_list; 409 struct list_head leaf_cfs_rq_list;
403 struct task_group *tg; /* group that "owns" this runqueue */ 410 struct task_group *tg; /* group that "owns" this runqueue */
411
412#ifdef CONFIG_SMP
413 /*
414 * the part of load.weight contributed by tasks
415 */
416 unsigned long task_weight;
417
418 /*
419 * h_load = weight * f(tg)
420 *
421 * Where f(tg) is the recursive weight fraction assigned to
422 * this group.
423 */
424 unsigned long h_load;
425
426 /*
427 * this cpu's part of tg->shares
428 */
429 unsigned long shares;
430
431 /*
432 * load.weight at the time we set shares
433 */
434 unsigned long rq_weight;
435#endif
404#endif 436#endif
405}; 437};
406 438
@@ -452,6 +484,9 @@ struct root_domain {
452 */ 484 */
453 cpumask_t rto_mask; 485 cpumask_t rto_mask;
454 atomic_t rto_count; 486 atomic_t rto_count;
487#ifdef CONFIG_SMP
488 struct cpupri cpupri;
489#endif
455}; 490};
456 491
457/* 492/*
@@ -526,6 +561,9 @@ struct rq {
526 int push_cpu; 561 int push_cpu;
527 /* cpu of this runqueue: */ 562 /* cpu of this runqueue: */
528 int cpu; 563 int cpu;
564 int online;
565
566 unsigned long avg_load_per_task;
529 567
530 struct task_struct *migration_thread; 568 struct task_struct *migration_thread;
531 struct list_head migration_queue; 569 struct list_head migration_queue;
@@ -749,6 +787,12 @@ late_initcall(sched_init_debug);
749const_debug unsigned int sysctl_sched_nr_migrate = 32; 787const_debug unsigned int sysctl_sched_nr_migrate = 32;
750 788
751/* 789/*
790 * ratelimit for updating the group shares.
791 * default: 0.5ms
792 */
793const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
794
795/*
752 * period over which we measure -rt task cpu usage in us. 796 * period over which we measure -rt task cpu usage in us.
753 * default: 1s 797 * default: 1s
754 */ 798 */
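The new knob is consumed later in this patch by update_shares(), which skips the expensive group-shares walk unless at least sysctl_sched_shares_ratelimit nanoseconds have elapsed since the last update. A stand-alone user-space C sketch of that rate-limit idea (the timestamps are caller-supplied nanoseconds, purely illustrative):

#include <stdio.h>
#include <stdint.h>

#define SHARES_RATELIMIT_NS 500000ULL   /* 0.5 ms, the default above */

static uint64_t last_update_ns;

/* returns 1 when enough time has passed and the expensive update should run */
static int shares_update_due(uint64_t now_ns)
{
        if (now_ns - last_update_ns >= SHARES_RATELIMIT_NS) {
                last_update_ns = now_ns;
                return 1;
        }
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               shares_update_due(500000),     /* 1: 0.5 ms since start */
               shares_update_due(700000),     /* 0: only 0.2 ms since last run */
               shares_update_due(1100000));   /* 1: 0.6 ms since last run */
        return 0;
}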
@@ -775,82 +819,6 @@ static inline u64 global_rt_runtime(void)
775 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 819 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
776} 820}
777 821
778unsigned long long time_sync_thresh = 100000;
779
780static DEFINE_PER_CPU(unsigned long long, time_offset);
781static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
782
783/*
784 * Global lock which we take every now and then to synchronize
785 * the CPUs time. This method is not warp-safe, but it's good
786 * enough to synchronize slowly diverging time sources and thus
787 * it's good enough for tracing:
788 */
789static DEFINE_SPINLOCK(time_sync_lock);
790static unsigned long long prev_global_time;
791
792static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
793{
794 /*
795 * We want this inlined, to not get tracer function calls
796 * in this critical section:
797 */
798 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
799 __raw_spin_lock(&time_sync_lock.raw_lock);
800
801 if (time < prev_global_time) {
802 per_cpu(time_offset, cpu) += prev_global_time - time;
803 time = prev_global_time;
804 } else {
805 prev_global_time = time;
806 }
807
808 __raw_spin_unlock(&time_sync_lock.raw_lock);
809 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
810
811 return time;
812}
813
814static unsigned long long __cpu_clock(int cpu)
815{
816 unsigned long long now;
817
818 /*
819 * Only call sched_clock() if the scheduler has already been
820 * initialized (some code might call cpu_clock() very early):
821 */
822 if (unlikely(!scheduler_running))
823 return 0;
824
825 now = sched_clock_cpu(cpu);
826
827 return now;
828}
829
830/*
831 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
832 * clock constructed from sched_clock():
833 */
834unsigned long long cpu_clock(int cpu)
835{
836 unsigned long long prev_cpu_time, time, delta_time;
837 unsigned long flags;
838
839 local_irq_save(flags);
840 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
841 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
842 delta_time = time-prev_cpu_time;
843
844 if (unlikely(delta_time > time_sync_thresh)) {
845 time = __sync_cpu_clock(time, cpu);
846 per_cpu(prev_cpu_time, cpu) = time;
847 }
848 local_irq_restore(flags);
849
850 return time;
851}
852EXPORT_SYMBOL_GPL(cpu_clock);
853
854#ifndef prepare_arch_switch 822#ifndef prepare_arch_switch
855# define prepare_arch_switch(next) do { } while (0) 823# define prepare_arch_switch(next) do { } while (0)
856#endif 824#endif
@@ -1313,15 +1281,15 @@ void wake_up_idle_cpu(int cpu)
1313 if (!tsk_is_polling(rq->idle)) 1281 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu); 1282 smp_send_reschedule(cpu);
1315} 1283}
1316#endif 1284#endif /* CONFIG_NO_HZ */
1317 1285
1318#else 1286#else /* !CONFIG_SMP */
1319static void __resched_task(struct task_struct *p, int tif_bit) 1287static void __resched_task(struct task_struct *p, int tif_bit)
1320{ 1288{
1321 assert_spin_locked(&task_rq(p)->lock); 1289 assert_spin_locked(&task_rq(p)->lock);
1322 set_tsk_thread_flag(p, tif_bit); 1290 set_tsk_thread_flag(p, tif_bit);
1323} 1291}
1324#endif 1292#endif /* CONFIG_SMP */
1325 1293
1326#if BITS_PER_LONG == 32 1294#if BITS_PER_LONG == 32
1327# define WMULT_CONST (~0UL) 1295# define WMULT_CONST (~0UL)
@@ -1336,6 +1304,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1336 */ 1304 */
1337#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1305#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1338 1306
1307/*
1308 * delta *= weight / lw
1309 */
1339static unsigned long 1310static unsigned long
1340calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1311calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341 struct load_weight *lw) 1312 struct load_weight *lw)
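Numerically, the comment added above says a raw runtime delta is scaled by the entity's own weight relative to a load weight. A simplified user-space sketch of that scaling; the real calc_delta_mine() avoids a full 64-bit division by working with a precomputed inverse weight and the SRR() rounding macro visible above:

#include <stdio.h>
#include <stdint.h>

/* simplified form of the comment above: delta *= weight / lw_weight */
static uint64_t scale_delta(uint64_t delta_exec, unsigned long weight,
                            unsigned long lw_weight)
{
        return delta_exec * weight / lw_weight;
}

int main(void)
{
        /* e.g. a nice-0 entity (weight 1024) against a total load weight of 3072 */
        printf("%llu\n",
               (unsigned long long)scale_delta(6000000, 1024, 3072));  /* 2000000 ns */
        return 0;
}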
@@ -1363,12 +1334,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1363 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1334 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1364} 1335}
1365 1336
1366static inline unsigned long
1367calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1368{
1369 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1370}
1371
1372static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1337static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1373{ 1338{
1374 lw->weight += inc; 1339 lw->weight += inc;
@@ -1479,17 +1444,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1479#ifdef CONFIG_SMP 1444#ifdef CONFIG_SMP
1480static unsigned long source_load(int cpu, int type); 1445static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type); 1446static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1447static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else /* CONFIG_SMP */ 1448
1449static unsigned long cpu_avg_load_per_task(int cpu)
1450{
1451 struct rq *rq = cpu_rq(cpu);
1452
1453 if (rq->nr_running)
1454 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1455
1456 return rq->avg_load_per_task;
1457}
1485 1458
1486#ifdef CONFIG_FAIR_GROUP_SCHED 1459#ifdef CONFIG_FAIR_GROUP_SCHED
1487static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1460
1461typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1462
1463/*
1464 * Iterate the full tree, calling @down when first entering a node and @up when
1465 * leaving it for the final time.
1466 */
1467static void
1468walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1469{
1470 struct task_group *parent, *child;
1471
1472 rcu_read_lock();
1473 parent = &root_task_group;
1474down:
1475 (*down)(parent, cpu, sd);
1476 list_for_each_entry_rcu(child, &parent->children, siblings) {
1477 parent = child;
1478 goto down;
1479
1480up:
1481 continue;
1482 }
1483 (*up)(parent, cpu, sd);
1484
1485 child = parent;
1486 parent = parent->parent;
1487 if (parent)
1488 goto up;
1489 rcu_read_unlock();
1490}
1491
1492static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1493
1494/*
1495 * Calculate and set the cpu's group shares.
1496 */
1497static void
1498__update_group_shares_cpu(struct task_group *tg, int cpu,
1499 unsigned long sd_shares, unsigned long sd_rq_weight)
1488{ 1500{
1501 int boost = 0;
1502 unsigned long shares;
1503 unsigned long rq_weight;
1504
1505 if (!tg->se[cpu])
1506 return;
1507
1508 rq_weight = tg->cfs_rq[cpu]->load.weight;
1509
1510 /*
1511 * If there are currently no tasks on the cpu pretend there is one of
1512 * average load so that when a new task gets to run here it will not
1513 * get delayed by group starvation.
1514 */
1515 if (!rq_weight) {
1516 boost = 1;
1517 rq_weight = NICE_0_LOAD;
1518 }
1519
1520 if (unlikely(rq_weight > sd_rq_weight))
1521 rq_weight = sd_rq_weight;
1522
1523 /*
1524 * \Sum shares * rq_weight
1525 * shares = -----------------------
1526 * \Sum rq_weight
1527 *
1528 */
1529 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1530
1531 /*
1532 * record the actual number of shares, not the boosted amount.
1533 */
1534 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1535 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1536
1537 if (shares < MIN_SHARES)
1538 shares = MIN_SHARES;
1539 else if (shares > MAX_SHARES)
1540 shares = MAX_SHARES;
1541
1542 __set_se_shares(tg->se[cpu], shares);
1489} 1543}
1544
1545/*
1546 * Re-compute the task group their per cpu shares over the given domain.
1547 * This needs to be done in a bottom-up fashion because the rq weight of a
1548 * parent group depends on the shares of its child groups.
1549 */
1550static void
1551tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1552{
1553 unsigned long rq_weight = 0;
1554 unsigned long shares = 0;
1555 int i;
1556
1557 for_each_cpu_mask(i, sd->span) {
1558 rq_weight += tg->cfs_rq[i]->load.weight;
1559 shares += tg->cfs_rq[i]->shares;
1560 }
1561
1562 if ((!shares && rq_weight) || shares > tg->shares)
1563 shares = tg->shares;
1564
1565 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1566 shares = tg->shares;
1567
1568 if (!rq_weight)
1569 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1570
1571 for_each_cpu_mask(i, sd->span) {
1572 struct rq *rq = cpu_rq(i);
1573 unsigned long flags;
1574
1575 spin_lock_irqsave(&rq->lock, flags);
1576 __update_group_shares_cpu(tg, i, shares, rq_weight);
1577 spin_unlock_irqrestore(&rq->lock, flags);
1578 }
1579}
1580
1581/*
1582 * Compute the cpu's hierarchical load factor for each task group.
1583 * This needs to be done in a top-down fashion because the load of a child
1584 * group is a fraction of its parents load.
1585 */
1586static void
1587tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1588{
1589 unsigned long load;
1590
1591 if (!tg->parent) {
1592 load = cpu_rq(cpu)->load.weight;
1593 } else {
1594 load = tg->parent->cfs_rq[cpu]->h_load;
1595 load *= tg->cfs_rq[cpu]->shares;
1596 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1597 }
1598
1599 tg->cfs_rq[cpu]->h_load = load;
1600}
1601
1602static void
1603tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1604{
1605}
1606
1607static void update_shares(struct sched_domain *sd)
1608{
1609 u64 now = cpu_clock(raw_smp_processor_id());
1610 s64 elapsed = now - sd->last_update;
1611
1612 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1613 sd->last_update = now;
1614 walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
1615 }
1616}
1617
1618static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1619{
1620 spin_unlock(&rq->lock);
1621 update_shares(sd);
1622 spin_lock(&rq->lock);
1623}
1624
1625static void update_h_load(int cpu)
1626{
1627 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1628}
1629
1630#else
1631
1632static inline void update_shares(struct sched_domain *sd)
1633{
1634}
1635
1636static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1637{
1638}
1639
1490#endif 1640#endif
1491 1641
1492#endif /* CONFIG_SMP */ 1642#endif
1643
1644#ifdef CONFIG_FAIR_GROUP_SCHED
1645static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1646{
1647#ifdef CONFIG_SMP
1648 cfs_rq->shares = shares;
1649#endif
1650}
1651#endif
1493 1652
1494#include "sched_stats.h" 1653#include "sched_stats.h"
1495#include "sched_idletask.c" 1654#include "sched_idletask.c"
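The per-cpu value computed in __update_group_shares_cpu() above is simply the group's total shares split in proportion to each cpu's runqueue weight, as the \Sum comment in the hunk spells out. A small user-space sketch of that formula with made-up numbers (three cpus, a group with tg->shares = 1024):

#include <stdio.h>

int main(void)
{
        unsigned long tg_shares   = 1024;                   /* group-wide shares */
        unsigned long rq_weight[] = { 2048, 1024, 1024 };   /* per-cpu cfs_rq load */
        unsigned long sum = 0, i;

        for (i = 0; i < 3; i++)
                sum += rq_weight[i];

        /* shares_i = tg_shares * rq_weight_i / sum(rq_weight), as in the hunk above;
         * the +1 mirrors the kernel's (sd_rq_weight + 1) divide-by-zero guard */
        for (i = 0; i < 3; i++)
                printf("cpu%lu gets %lu shares\n",
                       i, tg_shares * rq_weight[i] / (sum + 1));
        return 0;
}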
@@ -1500,27 +1659,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1500#endif 1659#endif
1501 1660
1502#define sched_class_highest (&rt_sched_class) 1661#define sched_class_highest (&rt_sched_class)
1662#define for_each_class(class) \
1663 for (class = sched_class_highest; class; class = class->next)
1503 1664
1504static inline void inc_load(struct rq *rq, const struct task_struct *p) 1665static void inc_nr_running(struct rq *rq)
1505{
1506 update_load_add(&rq->load, p->se.load.weight);
1507}
1508
1509static inline void dec_load(struct rq *rq, const struct task_struct *p)
1510{
1511 update_load_sub(&rq->load, p->se.load.weight);
1512}
1513
1514static void inc_nr_running(struct task_struct *p, struct rq *rq)
1515{ 1666{
1516 rq->nr_running++; 1667 rq->nr_running++;
1517 inc_load(rq, p);
1518} 1668}
1519 1669
1520static void dec_nr_running(struct task_struct *p, struct rq *rq) 1670static void dec_nr_running(struct rq *rq)
1521{ 1671{
1522 rq->nr_running--; 1672 rq->nr_running--;
1523 dec_load(rq, p);
1524} 1673}
1525 1674
1526static void set_load_weight(struct task_struct *p) 1675static void set_load_weight(struct task_struct *p)
@@ -1544,6 +1693,12 @@ static void set_load_weight(struct task_struct *p)
1544 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1693 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1545} 1694}
1546 1695
1696static void update_avg(u64 *avg, u64 sample)
1697{
1698 s64 diff = sample - *avg;
1699 *avg += diff >> 3;
1700}
1701
1547static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1702static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1548{ 1703{
1549 sched_info_queued(p); 1704 sched_info_queued(p);
@@ -1553,6 +1708,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1553 1708
1554static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1709static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1555{ 1710{
1711 if (sleep && p->se.last_wakeup) {
1712 update_avg(&p->se.avg_overlap,
1713 p->se.sum_exec_runtime - p->se.last_wakeup);
1714 p->se.last_wakeup = 0;
1715 }
1716
1717 sched_info_dequeued(p);
1556 p->sched_class->dequeue_task(rq, p, sleep); 1718 p->sched_class->dequeue_task(rq, p, sleep);
1557 p->se.on_rq = 0; 1719 p->se.on_rq = 0;
1558} 1720}
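update_avg(), added a little above, is a cheap exponential moving average: each sample moves the average one eighth of the way toward it (the >> 3). The dequeue path feeds it the CPU time a task kept running after its last wakeup of another task, which becomes se.avg_overlap. A user-space sketch of just the averaging step:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;
typedef int64_t  s64;

/* same arithmetic as the kernel's update_avg(): avg += (sample - avg) / 8 */
static void update_avg(u64 *avg, u64 sample)
{
        s64 diff = sample - *avg;
        *avg += diff >> 3;
}

int main(void)
{
        u64 avg = 0;
        u64 samples[] = { 800, 800, 800, 100 };
        unsigned int i;

        for (i = 0; i < 4; i++) {
                update_avg(&avg, samples[i]);
                printf("sample=%llu avg=%llu\n",
                       (unsigned long long)samples[i], (unsigned long long)avg);
        }
        return 0;
}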
@@ -1612,7 +1774,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1612 rq->nr_uninterruptible--; 1774 rq->nr_uninterruptible--;
1613 1775
1614 enqueue_task(rq, p, wakeup); 1776 enqueue_task(rq, p, wakeup);
1615 inc_nr_running(p, rq); 1777 inc_nr_running(rq);
1616} 1778}
1617 1779
1618/* 1780/*
@@ -1624,7 +1786,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1624 rq->nr_uninterruptible++; 1786 rq->nr_uninterruptible++;
1625 1787
1626 dequeue_task(rq, p, sleep); 1788 dequeue_task(rq, p, sleep);
1627 dec_nr_running(p, rq); 1789 dec_nr_running(rq);
1628} 1790}
1629 1791
1630/** 1792/**
@@ -1636,12 +1798,6 @@ inline int task_curr(const struct task_struct *p)
1636 return cpu_curr(task_cpu(p)) == p; 1798 return cpu_curr(task_cpu(p)) == p;
1637} 1799}
1638 1800
1639/* Used instead of source_load when we know the type == 0 */
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
1645static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1801static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1646{ 1802{
1647 set_task_rq(p, cpu); 1803 set_task_rq(p, cpu);
@@ -1670,6 +1826,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1670 1826
1671#ifdef CONFIG_SMP 1827#ifdef CONFIG_SMP
1672 1828
1829/* Used instead of source_load when we know the type == 0 */
1830static unsigned long weighted_cpuload(const int cpu)
1831{
1832 return cpu_rq(cpu)->load.weight;
1833}
1834
1673/* 1835/*
1674 * Is this task likely cache-hot: 1836 * Is this task likely cache-hot:
1675 */ 1837 */
@@ -1880,7 +2042,7 @@ static unsigned long source_load(int cpu, int type)
1880 struct rq *rq = cpu_rq(cpu); 2042 struct rq *rq = cpu_rq(cpu);
1881 unsigned long total = weighted_cpuload(cpu); 2043 unsigned long total = weighted_cpuload(cpu);
1882 2044
1883 if (type == 0) 2045 if (type == 0 || !sched_feat(LB_BIAS))
1884 return total; 2046 return total;
1885 2047
1886 return min(rq->cpu_load[type-1], total); 2048 return min(rq->cpu_load[type-1], total);
@@ -1895,25 +2057,13 @@ static unsigned long target_load(int cpu, int type)
1895 struct rq *rq = cpu_rq(cpu); 2057 struct rq *rq = cpu_rq(cpu);
1896 unsigned long total = weighted_cpuload(cpu); 2058 unsigned long total = weighted_cpuload(cpu);
1897 2059
1898 if (type == 0) 2060 if (type == 0 || !sched_feat(LB_BIAS))
1899 return total; 2061 return total;
1900 2062
1901 return max(rq->cpu_load[type-1], total); 2063 return max(rq->cpu_load[type-1], total);
1902} 2064}
1903 2065
1904/* 2066/*
1905 * Return the average load per task on the cpu's run queue
1906 */
1907static unsigned long cpu_avg_load_per_task(int cpu)
1908{
1909 struct rq *rq = cpu_rq(cpu);
1910 unsigned long total = weighted_cpuload(cpu);
1911 unsigned long n = rq->nr_running;
1912
1913 return n ? total / n : SCHED_LOAD_SCALE;
1914}
1915
1916/*
1917 * find_idlest_group finds and returns the least busy CPU group within the 2067 * find_idlest_group finds and returns the least busy CPU group within the
1918 * domain. 2068 * domain.
1919 */ 2069 */
@@ -2019,6 +2169,9 @@ static int sched_balance_self(int cpu, int flag)
2019 sd = tmp; 2169 sd = tmp;
2020 } 2170 }
2021 2171
2172 if (sd)
2173 update_shares(sd);
2174
2022 while (sd) { 2175 while (sd) {
2023 cpumask_t span, tmpmask; 2176 cpumask_t span, tmpmask;
2024 struct sched_group *group; 2177 struct sched_group *group;
@@ -2085,6 +2238,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2085 if (!sched_feat(SYNC_WAKEUPS)) 2238 if (!sched_feat(SYNC_WAKEUPS))
2086 sync = 0; 2239 sync = 0;
2087 2240
2241#ifdef CONFIG_SMP
2242 if (sched_feat(LB_WAKEUP_UPDATE)) {
2243 struct sched_domain *sd;
2244
2245 this_cpu = raw_smp_processor_id();
2246 cpu = task_cpu(p);
2247
2248 for_each_domain(this_cpu, sd) {
2249 if (cpu_isset(cpu, sd->span)) {
2250 update_shares(sd);
2251 break;
2252 }
2253 }
2254 }
2255#endif
2256
2088 smp_wmb(); 2257 smp_wmb();
2089 rq = task_rq_lock(p, &flags); 2258 rq = task_rq_lock(p, &flags);
2090 old_state = p->state; 2259 old_state = p->state;
@@ -2131,7 +2300,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2131 } 2300 }
2132 } 2301 }
2133 } 2302 }
2134#endif 2303#endif /* CONFIG_SCHEDSTATS */
2135 2304
2136out_activate: 2305out_activate:
2137#endif /* CONFIG_SMP */ 2306#endif /* CONFIG_SMP */
@@ -2157,6 +2326,8 @@ out_running:
2157 p->sched_class->task_wake_up(rq, p); 2326 p->sched_class->task_wake_up(rq, p);
2158#endif 2327#endif
2159out: 2328out:
2329 current->se.last_wakeup = current->se.sum_exec_runtime;
2330
2160 task_rq_unlock(rq, &flags); 2331 task_rq_unlock(rq, &flags);
2161 2332
2162 return success; 2333 return success;
@@ -2277,7 +2448,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2277 * management (if any): 2448 * management (if any):
2278 */ 2449 */
2279 p->sched_class->task_new(rq, p); 2450 p->sched_class->task_new(rq, p);
2280 inc_nr_running(p, rq); 2451 inc_nr_running(rq);
2281 } 2452 }
2282 check_preempt_curr(rq, p); 2453 check_preempt_curr(rq, p);
2283#ifdef CONFIG_SMP 2454#ifdef CONFIG_SMP
@@ -2331,7 +2502,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2331 notifier->ops->sched_out(notifier, next); 2502 notifier->ops->sched_out(notifier, next);
2332} 2503}
2333 2504
2334#else 2505#else /* !CONFIG_PREEMPT_NOTIFIERS */
2335 2506
2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2507static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2337{ 2508{
@@ -2343,7 +2514,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2343{ 2514{
2344} 2515}
2345 2516
2346#endif 2517#endif /* CONFIG_PREEMPT_NOTIFIERS */
2347 2518
2348/** 2519/**
2349 * prepare_task_switch - prepare to switch tasks 2520 * prepare_task_switch - prepare to switch tasks
@@ -2785,7 +2956,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2785 enum cpu_idle_type idle, int *all_pinned, 2956 enum cpu_idle_type idle, int *all_pinned,
2786 int *this_best_prio, struct rq_iterator *iterator) 2957 int *this_best_prio, struct rq_iterator *iterator)
2787{ 2958{
2788 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2959 int loops = 0, pulled = 0, pinned = 0;
2789 struct task_struct *p; 2960 struct task_struct *p;
2790 long rem_load_move = max_load_move; 2961 long rem_load_move = max_load_move;
2791 2962
@@ -2801,14 +2972,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2801next: 2972next:
2802 if (!p || loops++ > sysctl_sched_nr_migrate) 2973 if (!p || loops++ > sysctl_sched_nr_migrate)
2803 goto out; 2974 goto out;
2804 /* 2975
2805 * To help distribute high priority tasks across CPUs we don't 2976 if ((p->se.load.weight >> 1) > rem_load_move ||
2806 * skip a task if it will be the highest priority task (i.e. smallest
2807 * prio value) on its new queue regardless of its load weight
2808 */
2809 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2810 SCHED_LOAD_SCALE_FUZZ;
2811 if ((skip_for_load && p->prio >= *this_best_prio) ||
2812 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2977 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2813 p = iterator->next(iterator->arg); 2978 p = iterator->next(iterator->arg);
2814 goto next; 2979 goto next;
@@ -2863,6 +3028,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2863 max_load_move - total_load_moved, 3028 max_load_move - total_load_moved,
2864 sd, idle, all_pinned, &this_best_prio); 3029 sd, idle, all_pinned, &this_best_prio);
2865 class = class->next; 3030 class = class->next;
3031
3032 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3033 break;
3034
2866 } while (class && max_load_move > total_load_moved); 3035 } while (class && max_load_move > total_load_moved);
2867 3036
2868 return total_load_moved > 0; 3037 return total_load_moved > 0;
@@ -2939,6 +3108,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2939 max_load = this_load = total_load = total_pwr = 0; 3108 max_load = this_load = total_load = total_pwr = 0;
2940 busiest_load_per_task = busiest_nr_running = 0; 3109 busiest_load_per_task = busiest_nr_running = 0;
2941 this_load_per_task = this_nr_running = 0; 3110 this_load_per_task = this_nr_running = 0;
3111
2942 if (idle == CPU_NOT_IDLE) 3112 if (idle == CPU_NOT_IDLE)
2943 load_idx = sd->busy_idx; 3113 load_idx = sd->busy_idx;
2944 else if (idle == CPU_NEWLY_IDLE) 3114 else if (idle == CPU_NEWLY_IDLE)
@@ -2953,6 +3123,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2953 int __group_imb = 0; 3123 int __group_imb = 0;
2954 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3124 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2955 unsigned long sum_nr_running, sum_weighted_load; 3125 unsigned long sum_nr_running, sum_weighted_load;
3126 unsigned long sum_avg_load_per_task;
3127 unsigned long avg_load_per_task;
2956 3128
2957 local_group = cpu_isset(this_cpu, group->cpumask); 3129 local_group = cpu_isset(this_cpu, group->cpumask);
2958 3130
@@ -2961,6 +3133,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2961 3133
2962 /* Tally up the load of all CPUs in the group */ 3134 /* Tally up the load of all CPUs in the group */
2963 sum_weighted_load = sum_nr_running = avg_load = 0; 3135 sum_weighted_load = sum_nr_running = avg_load = 0;
3136 sum_avg_load_per_task = avg_load_per_task = 0;
3137
2964 max_cpu_load = 0; 3138 max_cpu_load = 0;
2965 min_cpu_load = ~0UL; 3139 min_cpu_load = ~0UL;
2966 3140
@@ -2994,6 +3168,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2994 avg_load += load; 3168 avg_load += load;
2995 sum_nr_running += rq->nr_running; 3169 sum_nr_running += rq->nr_running;
2996 sum_weighted_load += weighted_cpuload(i); 3170 sum_weighted_load += weighted_cpuload(i);
3171
3172 sum_avg_load_per_task += cpu_avg_load_per_task(i);
2997 } 3173 }
2998 3174
2999 /* 3175 /*
@@ -3015,7 +3191,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3015 avg_load = sg_div_cpu_power(group, 3191 avg_load = sg_div_cpu_power(group,
3016 avg_load * SCHED_LOAD_SCALE); 3192 avg_load * SCHED_LOAD_SCALE);
3017 3193
3018 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 3194
3195 /*
3196 * Consider the group unbalanced when the imbalance is larger
3197 * than the average weight of two tasks.
3198 *
3199 * APZ: with cgroup the avg task weight can vary wildly and
3200 * might not be a suitable number - should we keep a
3201 * normalized nr_running number somewhere that negates
3202 * the hierarchy?
3203 */
3204 avg_load_per_task = sg_div_cpu_power(group,
3205 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3206
3207 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3019 __group_imb = 1; 3208 __group_imb = 1;
3020 3209
3021 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3210 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
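With the change above, the "unbalanced group" test is no longer a fixed SCHED_LOAD_SCALE gap but twice the group's average task weight. A quick user-space sketch of the new condition, assuming nice-0 weights of 1024 (the SCHED_LOAD_SCALE value):

#include <stdio.h>

int main(void)
{
        unsigned long max_cpu_load       = 3072;   /* busiest cpu in the group */
        unsigned long min_cpu_load       = 1024;   /* least loaded cpu in the group */
        unsigned long avg_load_per_task  = 1024;   /* e.g. all nice-0 tasks */

        /* as in the hunk above: flag the group when the spread exceeds two average tasks */
        int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;

        printf("group_imb=%d\n", group_imb);   /* 0: 2048 is not greater than 2048 */
        return 0;
}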
@@ -3156,9 +3345,9 @@ small_imbalance:
3156 if (busiest_load_per_task > this_load_per_task) 3345 if (busiest_load_per_task > this_load_per_task)
3157 imbn = 1; 3346 imbn = 1;
3158 } else 3347 } else
3159 this_load_per_task = SCHED_LOAD_SCALE; 3348 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3160 3349
3161 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 3350 if (max_load - this_load + 2*busiest_load_per_task >=
3162 busiest_load_per_task * imbn) { 3351 busiest_load_per_task * imbn) {
3163 *imbalance = busiest_load_per_task; 3352 *imbalance = busiest_load_per_task;
3164 return busiest; 3353 return busiest;
@@ -3284,6 +3473,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3284 schedstat_inc(sd, lb_count[idle]); 3473 schedstat_inc(sd, lb_count[idle]);
3285 3474
3286redo: 3475redo:
3476 update_shares(sd);
3287 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3477 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3288 cpus, balance); 3478 cpus, balance);
3289 3479
@@ -3386,8 +3576,9 @@ redo:
3386 3576
3387 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3577 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3388 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3578 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3389 return -1; 3579 ld_moved = -1;
3390 return ld_moved; 3580
3581 goto out;
3391 3582
3392out_balanced: 3583out_balanced:
3393 schedstat_inc(sd, lb_balanced[idle]); 3584 schedstat_inc(sd, lb_balanced[idle]);
@@ -3402,8 +3593,13 @@ out_one_pinned:
3402 3593
3403 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3594 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3404 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3595 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3405 return -1; 3596 ld_moved = -1;
3406 return 0; 3597 else
3598 ld_moved = 0;
3599out:
3600 if (ld_moved)
3601 update_shares(sd);
3602 return ld_moved;
3407} 3603}
3408 3604
3409/* 3605/*
@@ -3438,6 +3634,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3438 3634
3439 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3635 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3440redo: 3636redo:
3637 update_shares_locked(this_rq, sd);
3441 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3638 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3442 &sd_idle, cpus, NULL); 3639 &sd_idle, cpus, NULL);
3443 if (!group) { 3640 if (!group) {
@@ -3481,6 +3678,7 @@ redo:
3481 } else 3678 } else
3482 sd->nr_balance_failed = 0; 3679 sd->nr_balance_failed = 0;
3483 3680
3681 update_shares_locked(this_rq, sd);
3484 return ld_moved; 3682 return ld_moved;
3485 3683
3486out_balanced: 3684out_balanced:
@@ -3672,6 +3870,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3672 /* Earliest time when we have to do rebalance again */ 3870 /* Earliest time when we have to do rebalance again */
3673 unsigned long next_balance = jiffies + 60*HZ; 3871 unsigned long next_balance = jiffies + 60*HZ;
3674 int update_next_balance = 0; 3872 int update_next_balance = 0;
3873 int need_serialize;
3675 cpumask_t tmp; 3874 cpumask_t tmp;
3676 3875
3677 for_each_domain(cpu, sd) { 3876 for_each_domain(cpu, sd) {
@@ -3689,8 +3888,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3689 if (interval > HZ*NR_CPUS/10) 3888 if (interval > HZ*NR_CPUS/10)
3690 interval = HZ*NR_CPUS/10; 3889 interval = HZ*NR_CPUS/10;
3691 3890
3891 need_serialize = sd->flags & SD_SERIALIZE;
3692 3892
3693 if (sd->flags & SD_SERIALIZE) { 3893 if (need_serialize) {
3694 if (!spin_trylock(&balancing)) 3894 if (!spin_trylock(&balancing))
3695 goto out; 3895 goto out;
3696 } 3896 }
@@ -3706,7 +3906,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3706 } 3906 }
3707 sd->last_balance = jiffies; 3907 sd->last_balance = jiffies;
3708 } 3908 }
3709 if (sd->flags & SD_SERIALIZE) 3909 if (need_serialize)
3710 spin_unlock(&balancing); 3910 spin_unlock(&balancing);
3711out: 3911out:
3712 if (time_after(next_balance, sd->last_balance + interval)) { 3912 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -4070,6 +4270,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4070 prev->comm, prev->pid, preempt_count()); 4270 prev->comm, prev->pid, preempt_count());
4071 4271
4072 debug_show_held_locks(prev); 4272 debug_show_held_locks(prev);
4273 print_modules();
4073 if (irqs_disabled()) 4274 if (irqs_disabled())
4074 print_irqtrace_events(prev); 4275 print_irqtrace_events(prev);
4075 4276
@@ -4143,7 +4344,7 @@ asmlinkage void __sched schedule(void)
4143 struct task_struct *prev, *next; 4344 struct task_struct *prev, *next;
4144 unsigned long *switch_count; 4345 unsigned long *switch_count;
4145 struct rq *rq; 4346 struct rq *rq;
4146 int cpu; 4347 int cpu, hrtick = sched_feat(HRTICK);
4147 4348
4148need_resched: 4349need_resched:
4149 preempt_disable(); 4350 preempt_disable();
@@ -4158,7 +4359,8 @@ need_resched_nonpreemptible:
4158 4359
4159 schedule_debug(prev); 4360 schedule_debug(prev);
4160 4361
4161 hrtick_clear(rq); 4362 if (hrtick)
4363 hrtick_clear(rq);
4162 4364
4163 /* 4365 /*
4164 * Do the rq-clock update outside the rq lock: 4366 * Do the rq-clock update outside the rq lock:
@@ -4204,7 +4406,8 @@ need_resched_nonpreemptible:
4204 } else 4406 } else
4205 spin_unlock_irq(&rq->lock); 4407 spin_unlock_irq(&rq->lock);
4206 4408
4207 hrtick_set(rq); 4409 if (hrtick)
4410 hrtick_set(rq);
4208 4411
4209 if (unlikely(reacquire_kernel_lock(current) < 0)) 4412 if (unlikely(reacquire_kernel_lock(current) < 0))
4210 goto need_resched_nonpreemptible; 4413 goto need_resched_nonpreemptible;
@@ -4586,10 +4789,8 @@ void set_user_nice(struct task_struct *p, long nice)
4586 goto out_unlock; 4789 goto out_unlock;
4587 } 4790 }
4588 on_rq = p->se.on_rq; 4791 on_rq = p->se.on_rq;
4589 if (on_rq) { 4792 if (on_rq)
4590 dequeue_task(rq, p, 0); 4793 dequeue_task(rq, p, 0);
4591 dec_load(rq, p);
4592 }
4593 4794
4594 p->static_prio = NICE_TO_PRIO(nice); 4795 p->static_prio = NICE_TO_PRIO(nice);
4595 set_load_weight(p); 4796 set_load_weight(p);
@@ -4599,7 +4800,6 @@ void set_user_nice(struct task_struct *p, long nice)
4599 4800
4600 if (on_rq) { 4801 if (on_rq) {
4601 enqueue_task(rq, p, 0); 4802 enqueue_task(rq, p, 0);
4602 inc_load(rq, p);
4603 /* 4803 /*
4604 * If the task increased its priority or is running and 4804 * If the task increased its priority or is running and
4605 * lowered its priority, then reschedule its CPU: 4805 * lowered its priority, then reschedule its CPU:
@@ -5070,24 +5270,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5070 return sched_setaffinity(pid, &new_mask); 5270 return sched_setaffinity(pid, &new_mask);
5071} 5271}
5072 5272
5073/*
5074 * Represents all cpu's present in the system
5075 * In systems capable of hotplug, this map could dynamically grow
5076 * as new cpu's are detected in the system via any platform specific
5077 * method, such as ACPI for e.g.
5078 */
5079
5080cpumask_t cpu_present_map __read_mostly;
5081EXPORT_SYMBOL(cpu_present_map);
5082
5083#ifndef CONFIG_SMP
5084cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5085EXPORT_SYMBOL(cpu_online_map);
5086
5087cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5088EXPORT_SYMBOL(cpu_possible_map);
5089#endif
5090
5091long sched_getaffinity(pid_t pid, cpumask_t *mask) 5273long sched_getaffinity(pid_t pid, cpumask_t *mask)
5092{ 5274{
5093 struct task_struct *p; 5275 struct task_struct *p;
@@ -5571,6 +5753,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5571 goto out; 5753 goto out;
5572 } 5754 }
5573 5755
5756 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5757 !cpus_equal(p->cpus_allowed, *new_mask))) {
5758 ret = -EINVAL;
5759 goto out;
5760 }
5761
5574 if (p->sched_class->set_cpus_allowed) 5762 if (p->sched_class->set_cpus_allowed)
5575 p->sched_class->set_cpus_allowed(p, new_mask); 5763 p->sched_class->set_cpus_allowed(p, new_mask);
5576 else { 5764 else {
@@ -6060,6 +6248,36 @@ static void unregister_sched_domain_sysctl(void)
6060} 6248}
6061#endif 6249#endif
6062 6250
6251static void set_rq_online(struct rq *rq)
6252{
6253 if (!rq->online) {
6254 const struct sched_class *class;
6255
6256 cpu_set(rq->cpu, rq->rd->online);
6257 rq->online = 1;
6258
6259 for_each_class(class) {
6260 if (class->rq_online)
6261 class->rq_online(rq);
6262 }
6263 }
6264}
6265
6266static void set_rq_offline(struct rq *rq)
6267{
6268 if (rq->online) {
6269 const struct sched_class *class;
6270
6271 for_each_class(class) {
6272 if (class->rq_offline)
6273 class->rq_offline(rq);
6274 }
6275
6276 cpu_clear(rq->cpu, rq->rd->online);
6277 rq->online = 0;
6278 }
6279}
6280
6063/* 6281/*
6064 * migration_call - callback that gets triggered when a CPU is added. 6282 * migration_call - callback that gets triggered when a CPU is added.
6065 * Here we can start up the necessary migration thread for the new CPU. 6283 * Here we can start up the necessary migration thread for the new CPU.
@@ -6097,7 +6315,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6097 spin_lock_irqsave(&rq->lock, flags); 6315 spin_lock_irqsave(&rq->lock, flags);
6098 if (rq->rd) { 6316 if (rq->rd) {
6099 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6317 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6100 cpu_set(cpu, rq->rd->online); 6318
6319 set_rq_online(rq);
6101 } 6320 }
6102 spin_unlock_irqrestore(&rq->lock, flags); 6321 spin_unlock_irqrestore(&rq->lock, flags);
6103 break; 6322 break;
@@ -6158,7 +6377,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6158 spin_lock_irqsave(&rq->lock, flags); 6377 spin_lock_irqsave(&rq->lock, flags);
6159 if (rq->rd) { 6378 if (rq->rd) {
6160 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6379 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6161 cpu_clear(cpu, rq->rd->online); 6380 set_rq_offline(rq);
6162 } 6381 }
6163 spin_unlock_irqrestore(&rq->lock, flags); 6382 spin_unlock_irqrestore(&rq->lock, flags);
6164 break; 6383 break;
@@ -6192,6 +6411,28 @@ void __init migration_init(void)
6192 6411
6193#ifdef CONFIG_SCHED_DEBUG 6412#ifdef CONFIG_SCHED_DEBUG
6194 6413
6414static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6415{
6416 switch (lvl) {
6417 case SD_LV_NONE:
6418 return "NONE";
6419 case SD_LV_SIBLING:
6420 return "SIBLING";
6421 case SD_LV_MC:
6422 return "MC";
6423 case SD_LV_CPU:
6424 return "CPU";
6425 case SD_LV_NODE:
6426 return "NODE";
6427 case SD_LV_ALLNODES:
6428 return "ALLNODES";
6429 case SD_LV_MAX:
6430 return "MAX";
6431
6432 }
6433 return "MAX";
6434}
6435
6195static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6436static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6196 cpumask_t *groupmask) 6437 cpumask_t *groupmask)
6197{ 6438{
@@ -6211,7 +6452,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6211 return -1; 6452 return -1;
6212 } 6453 }
6213 6454
6214 printk(KERN_CONT "span %s\n", str); 6455 printk(KERN_CONT "span %s level %s\n",
6456 str, sd_level_to_string(sd->level));
6215 6457
6216 if (!cpu_isset(cpu, sd->span)) { 6458 if (!cpu_isset(cpu, sd->span)) {
6217 printk(KERN_ERR "ERROR: domain->span does not contain " 6459 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6295,9 +6537,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6295 } 6537 }
6296 kfree(groupmask); 6538 kfree(groupmask);
6297} 6539}
6298#else 6540#else /* !CONFIG_SCHED_DEBUG */
6299# define sched_domain_debug(sd, cpu) do { } while (0) 6541# define sched_domain_debug(sd, cpu) do { } while (0)
6300#endif 6542#endif /* CONFIG_SCHED_DEBUG */
6301 6543
6302static int sd_degenerate(struct sched_domain *sd) 6544static int sd_degenerate(struct sched_domain *sd)
6303{ 6545{
@@ -6357,20 +6599,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6357static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6599static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6358{ 6600{
6359 unsigned long flags; 6601 unsigned long flags;
6360 const struct sched_class *class;
6361 6602
6362 spin_lock_irqsave(&rq->lock, flags); 6603 spin_lock_irqsave(&rq->lock, flags);
6363 6604
6364 if (rq->rd) { 6605 if (rq->rd) {
6365 struct root_domain *old_rd = rq->rd; 6606 struct root_domain *old_rd = rq->rd;
6366 6607
6367 for (class = sched_class_highest; class; class = class->next) { 6608 if (cpu_isset(rq->cpu, old_rd->online))
6368 if (class->leave_domain) 6609 set_rq_offline(rq);
6369 class->leave_domain(rq);
6370 }
6371 6610
6372 cpu_clear(rq->cpu, old_rd->span); 6611 cpu_clear(rq->cpu, old_rd->span);
6373 cpu_clear(rq->cpu, old_rd->online);
6374 6612
6375 if (atomic_dec_and_test(&old_rd->refcount)) 6613 if (atomic_dec_and_test(&old_rd->refcount))
6376 kfree(old_rd); 6614 kfree(old_rd);
@@ -6381,12 +6619,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6381 6619
6382 cpu_set(rq->cpu, rd->span); 6620 cpu_set(rq->cpu, rd->span);
6383 if (cpu_isset(rq->cpu, cpu_online_map)) 6621 if (cpu_isset(rq->cpu, cpu_online_map))
6384 cpu_set(rq->cpu, rd->online); 6622 set_rq_online(rq);
6385
6386 for (class = sched_class_highest; class; class = class->next) {
6387 if (class->join_domain)
6388 class->join_domain(rq);
6389 }
6390 6623
6391 spin_unlock_irqrestore(&rq->lock, flags); 6624 spin_unlock_irqrestore(&rq->lock, flags);
6392} 6625}
@@ -6397,6 +6630,8 @@ static void init_rootdomain(struct root_domain *rd)
6397 6630
6398 cpus_clear(rd->span); 6631 cpus_clear(rd->span);
6399 cpus_clear(rd->online); 6632 cpus_clear(rd->online);
6633
6634 cpupri_init(&rd->cpupri);
6400} 6635}
6401 6636
6402static void init_defrootdomain(void) 6637static void init_defrootdomain(void)
@@ -6591,7 +6826,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6591 cpus_or(*span, *span, *nodemask); 6826 cpus_or(*span, *span, *nodemask);
6592 } 6827 }
6593} 6828}
6594#endif 6829#endif /* CONFIG_NUMA */
6595 6830
6596int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6831int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6597 6832
@@ -6610,7 +6845,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6610 *sg = &per_cpu(sched_group_cpus, cpu); 6845 *sg = &per_cpu(sched_group_cpus, cpu);
6611 return cpu; 6846 return cpu;
6612} 6847}
6613#endif 6848#endif /* CONFIG_SCHED_SMT */
6614 6849
6615/* 6850/*
6616 * multi-core sched-domains: 6851 * multi-core sched-domains:
@@ -6618,7 +6853,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6618#ifdef CONFIG_SCHED_MC 6853#ifdef CONFIG_SCHED_MC
6619static DEFINE_PER_CPU(struct sched_domain, core_domains); 6854static DEFINE_PER_CPU(struct sched_domain, core_domains);
6620static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6855static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6621#endif 6856#endif /* CONFIG_SCHED_MC */
6622 6857
6623#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6858#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6624static int 6859static int
@@ -6720,7 +6955,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6720 sg = sg->next; 6955 sg = sg->next;
6721 } while (sg != group_head); 6956 } while (sg != group_head);
6722} 6957}
6723#endif 6958#endif /* CONFIG_NUMA */
6724 6959
6725#ifdef CONFIG_NUMA 6960#ifdef CONFIG_NUMA
6726/* Free memory allocated for various sched_group structures */ 6961/* Free memory allocated for various sched_group structures */
@@ -6757,11 +6992,11 @@ next_sg:
6757 sched_group_nodes_bycpu[cpu] = NULL; 6992 sched_group_nodes_bycpu[cpu] = NULL;
6758 } 6993 }
6759} 6994}
6760#else 6995#else /* !CONFIG_NUMA */
6761static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 6996static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6762{ 6997{
6763} 6998}
6764#endif 6999#endif /* CONFIG_NUMA */
6765 7000
6766/* 7001/*
6767 * Initialize sched groups cpu_power. 7002 * Initialize sched groups cpu_power.
@@ -7470,7 +7705,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7470#endif 7705#endif
7471 return err; 7706 return err;
7472} 7707}
7473#endif 7708#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7474 7709
7475/* 7710/*
7476 * Force a reinitialization of the sched domains hierarchy. The domains 7711 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7481,21 +7716,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7481static int update_sched_domains(struct notifier_block *nfb, 7716static int update_sched_domains(struct notifier_block *nfb,
7482 unsigned long action, void *hcpu) 7717 unsigned long action, void *hcpu)
7483{ 7718{
7719 int cpu = (int)(long)hcpu;
7720
7484 switch (action) { 7721 switch (action) {
7485 case CPU_UP_PREPARE:
7486 case CPU_UP_PREPARE_FROZEN:
7487 case CPU_DOWN_PREPARE: 7722 case CPU_DOWN_PREPARE:
7488 case CPU_DOWN_PREPARE_FROZEN: 7723 case CPU_DOWN_PREPARE_FROZEN:
7724 disable_runtime(cpu_rq(cpu));
7725 /* fall-through */
7726 case CPU_UP_PREPARE:
7727 case CPU_UP_PREPARE_FROZEN:
7489 detach_destroy_domains(&cpu_online_map); 7728 detach_destroy_domains(&cpu_online_map);
7490 free_sched_domains(); 7729 free_sched_domains();
7491 return NOTIFY_OK; 7730 return NOTIFY_OK;
7492 7731
7493 case CPU_UP_CANCELED: 7732
7494 case CPU_UP_CANCELED_FROZEN:
7495 case CPU_DOWN_FAILED: 7733 case CPU_DOWN_FAILED:
7496 case CPU_DOWN_FAILED_FROZEN: 7734 case CPU_DOWN_FAILED_FROZEN:
7497 case CPU_ONLINE: 7735 case CPU_ONLINE:
7498 case CPU_ONLINE_FROZEN: 7736 case CPU_ONLINE_FROZEN:
7737 enable_runtime(cpu_rq(cpu));
7738 /* fall-through */
7739 case CPU_UP_CANCELED:
7740 case CPU_UP_CANCELED_FROZEN:
7499 case CPU_DEAD: 7741 case CPU_DEAD:
7500 case CPU_DEAD_FROZEN: 7742 case CPU_DEAD_FROZEN:
7501 /* 7743 /*
@@ -7695,8 +7937,8 @@ void __init sched_init(void)
7695 7937
7696 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7938 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7697 ptr += nr_cpu_ids * sizeof(void **); 7939 ptr += nr_cpu_ids * sizeof(void **);
7698#endif 7940#endif /* CONFIG_USER_SCHED */
7699#endif 7941#endif /* CONFIG_FAIR_GROUP_SCHED */
7700#ifdef CONFIG_RT_GROUP_SCHED 7942#ifdef CONFIG_RT_GROUP_SCHED
7701 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7943 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7702 ptr += nr_cpu_ids * sizeof(void **); 7944 ptr += nr_cpu_ids * sizeof(void **);
@@ -7710,8 +7952,8 @@ void __init sched_init(void)
7710 7952
7711 root_task_group.rt_rq = (struct rt_rq **)ptr; 7953 root_task_group.rt_rq = (struct rt_rq **)ptr;
7712 ptr += nr_cpu_ids * sizeof(void **); 7954 ptr += nr_cpu_ids * sizeof(void **);
7713#endif 7955#endif /* CONFIG_USER_SCHED */
7714#endif 7956#endif /* CONFIG_RT_GROUP_SCHED */
7715 } 7957 }
7716 7958
7717#ifdef CONFIG_SMP 7959#ifdef CONFIG_SMP
@@ -7727,8 +7969,8 @@ void __init sched_init(void)
7727#ifdef CONFIG_USER_SCHED 7969#ifdef CONFIG_USER_SCHED
7728 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7970 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7729 global_rt_period(), RUNTIME_INF); 7971 global_rt_period(), RUNTIME_INF);
7730#endif 7972#endif /* CONFIG_USER_SCHED */
7731#endif 7973#endif /* CONFIG_RT_GROUP_SCHED */
7732 7974
7733#ifdef CONFIG_GROUP_SCHED 7975#ifdef CONFIG_GROUP_SCHED
7734 list_add(&init_task_group.list, &task_groups); 7976 list_add(&init_task_group.list, &task_groups);
@@ -7738,8 +7980,8 @@ void __init sched_init(void)
7738 INIT_LIST_HEAD(&root_task_group.children); 7980 INIT_LIST_HEAD(&root_task_group.children);
7739 init_task_group.parent = &root_task_group; 7981 init_task_group.parent = &root_task_group;
7740 list_add(&init_task_group.siblings, &root_task_group.children); 7982 list_add(&init_task_group.siblings, &root_task_group.children);
7741#endif 7983#endif /* CONFIG_USER_SCHED */
7742#endif 7984#endif /* CONFIG_GROUP_SCHED */
7743 7985
7744 for_each_possible_cpu(i) { 7986 for_each_possible_cpu(i) {
7745 struct rq *rq; 7987 struct rq *rq;
@@ -7819,6 +8061,7 @@ void __init sched_init(void)
7819 rq->next_balance = jiffies; 8061 rq->next_balance = jiffies;
7820 rq->push_cpu = 0; 8062 rq->push_cpu = 0;
7821 rq->cpu = i; 8063 rq->cpu = i;
8064 rq->online = 0;
7822 rq->migration_thread = NULL; 8065 rq->migration_thread = NULL;
7823 INIT_LIST_HEAD(&rq->migration_queue); 8066 INIT_LIST_HEAD(&rq->migration_queue);
7824 rq_attach_root(rq, &def_root_domain); 8067 rq_attach_root(rq, &def_root_domain);
@@ -8058,7 +8301,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8058{ 8301{
8059 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8302 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8060} 8303}
8061#else 8304#else /* !CONFIG_FAIR_GROUP_SCHED */
8062static inline void free_fair_sched_group(struct task_group *tg) 8305static inline void free_fair_sched_group(struct task_group *tg)
8063{ 8306{
8064} 8307}
@@ -8076,7 +8319,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8076static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8319static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8077{ 8320{
8078} 8321}
8079#endif 8322#endif /* CONFIG_FAIR_GROUP_SCHED */
8080 8323
8081#ifdef CONFIG_RT_GROUP_SCHED 8324#ifdef CONFIG_RT_GROUP_SCHED
8082static void free_rt_sched_group(struct task_group *tg) 8325static void free_rt_sched_group(struct task_group *tg)
@@ -8147,7 +8390,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8147{ 8390{
8148 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8391 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8149} 8392}
8150#else 8393#else /* !CONFIG_RT_GROUP_SCHED */
8151static inline void free_rt_sched_group(struct task_group *tg) 8394static inline void free_rt_sched_group(struct task_group *tg)
8152{ 8395{
8153} 8396}
@@ -8165,7 +8408,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8165static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8408static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8166{ 8409{
8167} 8410}
8168#endif 8411#endif /* CONFIG_RT_GROUP_SCHED */
8169 8412
8170#ifdef CONFIG_GROUP_SCHED 8413#ifdef CONFIG_GROUP_SCHED
8171static void free_sched_group(struct task_group *tg) 8414static void free_sched_group(struct task_group *tg)
@@ -8276,17 +8519,14 @@ void sched_move_task(struct task_struct *tsk)
8276 8519
8277 task_rq_unlock(rq, &flags); 8520 task_rq_unlock(rq, &flags);
8278} 8521}
8279#endif 8522#endif /* CONFIG_GROUP_SCHED */
8280 8523
8281#ifdef CONFIG_FAIR_GROUP_SCHED 8524#ifdef CONFIG_FAIR_GROUP_SCHED
8282static void set_se_shares(struct sched_entity *se, unsigned long shares) 8525static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8283{ 8526{
8284 struct cfs_rq *cfs_rq = se->cfs_rq; 8527 struct cfs_rq *cfs_rq = se->cfs_rq;
8285 struct rq *rq = cfs_rq->rq;
8286 int on_rq; 8528 int on_rq;
8287 8529
8288 spin_lock_irq(&rq->lock);
8289
8290 on_rq = se->on_rq; 8530 on_rq = se->on_rq;
8291 if (on_rq) 8531 if (on_rq)
8292 dequeue_entity(cfs_rq, se, 0); 8532 dequeue_entity(cfs_rq, se, 0);
@@ -8296,8 +8536,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
8296 8536
8297 if (on_rq) 8537 if (on_rq)
8298 enqueue_entity(cfs_rq, se, 0); 8538 enqueue_entity(cfs_rq, se, 0);
8539}
8299 8540
8300 spin_unlock_irq(&rq->lock); 8541static void set_se_shares(struct sched_entity *se, unsigned long shares)
8542{
8543 struct cfs_rq *cfs_rq = se->cfs_rq;
8544 struct rq *rq = cfs_rq->rq;
8545 unsigned long flags;
8546
8547 spin_lock_irqsave(&rq->lock, flags);
8548 __set_se_shares(se, shares);
8549 spin_unlock_irqrestore(&rq->lock, flags);
8301} 8550}
8302 8551
8303static DEFINE_MUTEX(shares_mutex); 8552static DEFINE_MUTEX(shares_mutex);
@@ -8336,8 +8585,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8336 * w/o tripping rebalance_share or load_balance_fair. 8585 * w/o tripping rebalance_share or load_balance_fair.
8337 */ 8586 */
8338 tg->shares = shares; 8587 tg->shares = shares;
8339 for_each_possible_cpu(i) 8588 for_each_possible_cpu(i) {
8589 /*
8590 * force a rebalance
8591 */
8592 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8340 set_se_shares(tg->se[i], shares); 8593 set_se_shares(tg->se[i], shares);
8594 }
8341 8595
8342 /* 8596 /*
8343 * Enable load balance activity on this group, by inserting it back on 8597 * Enable load balance activity on this group, by inserting it back on
@@ -8376,7 +8630,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8376#ifdef CONFIG_CGROUP_SCHED 8630#ifdef CONFIG_CGROUP_SCHED
8377static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8631static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8378{ 8632{
8379 struct task_group *tgi, *parent = tg ? tg->parent : NULL; 8633 struct task_group *tgi, *parent = tg->parent;
8380 unsigned long total = 0; 8634 unsigned long total = 0;
8381 8635
8382 if (!parent) { 8636 if (!parent) {
@@ -8400,7 +8654,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8400 } 8654 }
8401 rcu_read_unlock(); 8655 rcu_read_unlock();
8402 8656
8403 return total + to_ratio(period, runtime) < 8657 return total + to_ratio(period, runtime) <=
8404 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8658 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8405 parent->rt_bandwidth.rt_runtime); 8659 parent->rt_bandwidth.rt_runtime);
8406} 8660}
@@ -8520,16 +8774,21 @@ long sched_group_rt_period(struct task_group *tg)
8520 8774
8521static int sched_rt_global_constraints(void) 8775static int sched_rt_global_constraints(void)
8522{ 8776{
8777 struct task_group *tg = &root_task_group;
8778 u64 rt_runtime, rt_period;
8523 int ret = 0; 8779 int ret = 0;
8524 8780
8781 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8782 rt_runtime = tg->rt_bandwidth.rt_runtime;
8783
8525 mutex_lock(&rt_constraints_mutex); 8784 mutex_lock(&rt_constraints_mutex);
8526 if (!__rt_schedulable(NULL, 1, 0)) 8785 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8527 ret = -EINVAL; 8786 ret = -EINVAL;
8528 mutex_unlock(&rt_constraints_mutex); 8787 mutex_unlock(&rt_constraints_mutex);
8529 8788
8530 return ret; 8789 return ret;
8531} 8790}
8532#else 8791#else /* !CONFIG_RT_GROUP_SCHED */
8533static int sched_rt_global_constraints(void) 8792static int sched_rt_global_constraints(void)
8534{ 8793{
8535 unsigned long flags; 8794 unsigned long flags;
@@ -8547,7 +8806,7 @@ static int sched_rt_global_constraints(void)
8547 8806
8548 return 0; 8807 return 0;
8549} 8808}
8550#endif 8809#endif /* CONFIG_RT_GROUP_SCHED */
8551 8810
8552int sched_rt_handler(struct ctl_table *table, int write, 8811int sched_rt_handler(struct ctl_table *table, int write,
8553 struct file *filp, void __user *buffer, size_t *lenp, 8812 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8655,7 +8914,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8655 8914
8656 return (u64) tg->shares; 8915 return (u64) tg->shares;
8657} 8916}
8658#endif 8917#endif /* CONFIG_FAIR_GROUP_SCHED */
8659 8918
8660#ifdef CONFIG_RT_GROUP_SCHED 8919#ifdef CONFIG_RT_GROUP_SCHED
8661static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8920static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8679,7 +8938,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8679{ 8938{
8680 return sched_group_rt_period(cgroup_tg(cgrp)); 8939 return sched_group_rt_period(cgroup_tg(cgrp));
8681} 8940}
8682#endif 8941#endif /* CONFIG_RT_GROUP_SCHED */
8683 8942
8684static struct cftype cpu_files[] = { 8943static struct cftype cpu_files[] = {
8685#ifdef CONFIG_FAIR_GROUP_SCHED 8944#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ce05271219ab..22ed55d1167f 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -3,6 +3,9 @@
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 * 5 *
6 * Updates and enhancements:
7 * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
8 *
6 * Based on code by: 9 * Based on code by:
7 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
8 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
@@ -32,6 +35,11 @@
32 35
33#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 36#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
34 37
38#define MULTI_SHIFT 15
39/* Max is double, Min is 1/2 */
40#define MAX_MULTI (2LL << MULTI_SHIFT)
41#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
42
35struct sched_clock_data { 43struct sched_clock_data {
36 /* 44 /*
37 * Raw spinlock - this is a special case: this might be called 45 * Raw spinlock - this is a special case: this might be called
@@ -40,11 +48,15 @@ struct sched_clock_data {
40 */ 48 */
41 raw_spinlock_t lock; 49 raw_spinlock_t lock;
42 50
43 unsigned long prev_jiffies; 51 unsigned long tick_jiffies;
44 u64 prev_raw; 52 u64 prev_raw;
45 u64 tick_raw; 53 u64 tick_raw;
46 u64 tick_gtod; 54 u64 tick_gtod;
47 u64 clock; 55 u64 clock;
56 s64 multi;
57#ifdef CONFIG_NO_HZ
58 int check_max;
59#endif
48}; 60};
49 61
50static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); 62static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
@@ -71,41 +83,91 @@ void sched_clock_init(void)
71 struct sched_clock_data *scd = cpu_sdc(cpu); 83 struct sched_clock_data *scd = cpu_sdc(cpu);
72 84
73 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 85 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
74 scd->prev_jiffies = now_jiffies; 86 scd->tick_jiffies = now_jiffies;
75 scd->prev_raw = 0; 87 scd->prev_raw = 0;
76 scd->tick_raw = 0; 88 scd->tick_raw = 0;
77 scd->tick_gtod = ktime_now; 89 scd->tick_gtod = ktime_now;
78 scd->clock = ktime_now; 90 scd->clock = ktime_now;
91 scd->multi = 1 << MULTI_SHIFT;
92#ifdef CONFIG_NO_HZ
93 scd->check_max = 1;
94#endif
79 } 95 }
80 96
81 sched_clock_running = 1; 97 sched_clock_running = 1;
82} 98}
83 99
100#ifdef CONFIG_NO_HZ
101/*
102 * Dynamic ticks make the delta jiffies inaccurate, which
103 * prevents us from checking the maximum time update.
104 * Disable the maximum check while the tick is stopped.
105 */
106void sched_clock_tick_stop(int cpu)
107{
108 struct sched_clock_data *scd = cpu_sdc(cpu);
109
110 scd->check_max = 0;
111}
112
113void sched_clock_tick_start(int cpu)
114{
115 struct sched_clock_data *scd = cpu_sdc(cpu);
116
117 scd->check_max = 1;
118}
119
120static int check_max(struct sched_clock_data *scd)
121{
122 return scd->check_max;
123}
124#else
125static int check_max(struct sched_clock_data *scd)
126{
127 return 1;
128}
129#endif /* CONFIG_NO_HZ */
130
84/* 131/*
85 * update the percpu scd from the raw @now value 132 * update the percpu scd from the raw @now value
86 * 133 *
87 * - filter out backward motion 134 * - filter out backward motion
88 * - use jiffies to generate a min,max window to clip the raw values 135 * - use jiffies to generate a min,max window to clip the raw values
89 */ 136 */
90static void __update_sched_clock(struct sched_clock_data *scd, u64 now) 137static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
91{ 138{
92 unsigned long now_jiffies = jiffies; 139 unsigned long now_jiffies = jiffies;
93 long delta_jiffies = now_jiffies - scd->prev_jiffies; 140 long delta_jiffies = now_jiffies - scd->tick_jiffies;
94 u64 clock = scd->clock; 141 u64 clock = scd->clock;
95 u64 min_clock, max_clock; 142 u64 min_clock, max_clock;
96 s64 delta = now - scd->prev_raw; 143 s64 delta = now - scd->prev_raw;
97 144
98 WARN_ON_ONCE(!irqs_disabled()); 145 WARN_ON_ONCE(!irqs_disabled());
99 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; 146
147 /*
148 * At the scheduler tick the clock can be just under the gtod. We
149 * don't want to push it forward prematurely.
150 */
151 min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
152 if (min_clock > TICK_NSEC)
153 min_clock -= TICK_NSEC / 2;
100 154
101 if (unlikely(delta < 0)) { 155 if (unlikely(delta < 0)) {
102 clock++; 156 clock++;
103 goto out; 157 goto out;
104 } 158 }
105 159
106 max_clock = min_clock + TICK_NSEC; 160 /*
161 * The clock must stay within a jiffy of the gtod,
162 * but since we may be at the start or the end of a jiffy
163 * we add another jiffy of buffer.
164 */
165 max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
166
167 delta *= scd->multi;
168 delta >>= MULTI_SHIFT;
107 169
108 if (unlikely(clock + delta > max_clock)) { 170 if (unlikely(clock + delta > max_clock) && check_max(scd)) {
109 if (clock < max_clock) 171 if (clock < max_clock)
110 clock = max_clock; 172 clock = max_clock;
111 else 173 else
@@ -118,9 +180,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
118 if (unlikely(clock < min_clock)) 180 if (unlikely(clock < min_clock))
119 clock = min_clock; 181 clock = min_clock;
120 182
121 scd->prev_raw = now; 183 if (time)
122 scd->prev_jiffies = now_jiffies; 184 *time = clock;
123 scd->clock = clock; 185 else {
186 scd->prev_raw = now;
187 scd->clock = clock;
188 }
124} 189}
125 190
126static void lock_double_clock(struct sched_clock_data *data1, 191static void lock_double_clock(struct sched_clock_data *data1,
@@ -160,25 +225,30 @@ u64 sched_clock_cpu(int cpu)
160 now -= my_scd->tick_raw; 225 now -= my_scd->tick_raw;
161 now += scd->tick_raw; 226 now += scd->tick_raw;
162 227
163 now -= my_scd->tick_gtod; 228 now += my_scd->tick_gtod;
164 now += scd->tick_gtod; 229 now -= scd->tick_gtod;
165 230
166 __raw_spin_unlock(&my_scd->lock); 231 __raw_spin_unlock(&my_scd->lock);
232
233 __update_sched_clock(scd, now, &clock);
234
235 __raw_spin_unlock(&scd->lock);
236
167 } else { 237 } else {
168 __raw_spin_lock(&scd->lock); 238 __raw_spin_lock(&scd->lock);
239 __update_sched_clock(scd, now, NULL);
240 clock = scd->clock;
241 __raw_spin_unlock(&scd->lock);
169 } 242 }
170 243
171 __update_sched_clock(scd, now);
172 clock = scd->clock;
173
174 __raw_spin_unlock(&scd->lock);
175
176 return clock; 244 return clock;
177} 245}
178 246
179void sched_clock_tick(void) 247void sched_clock_tick(void)
180{ 248{
181 struct sched_clock_data *scd = this_scd(); 249 struct sched_clock_data *scd = this_scd();
250 unsigned long now_jiffies = jiffies;
251 s64 mult, delta_gtod, delta_raw;
182 u64 now, now_gtod; 252 u64 now, now_gtod;
183 253
184 if (unlikely(!sched_clock_running)) 254 if (unlikely(!sched_clock_running))
@@ -186,18 +256,33 @@ void sched_clock_tick(void)
186 256
187 WARN_ON_ONCE(!irqs_disabled()); 257 WARN_ON_ONCE(!irqs_disabled());
188 258
189 now = sched_clock();
190 now_gtod = ktime_to_ns(ktime_get()); 259 now_gtod = ktime_to_ns(ktime_get());
260 now = sched_clock();
191 261
192 __raw_spin_lock(&scd->lock); 262 __raw_spin_lock(&scd->lock);
193 __update_sched_clock(scd, now); 263 __update_sched_clock(scd, now, NULL);
194 /* 264 /*
195 * update tick_gtod after __update_sched_clock() because that will 265 * update tick_gtod after __update_sched_clock() because that will
196 * already observe 1 new jiffy; adding a new tick_gtod to that would 266 * already observe 1 new jiffy; adding a new tick_gtod to that would
197 * increase the clock 2 jiffies. 267 * increase the clock 2 jiffies.
198 */ 268 */
269 delta_gtod = now_gtod - scd->tick_gtod;
270 delta_raw = now - scd->tick_raw;
271
272 if ((long)delta_raw > 0) {
273 mult = delta_gtod << MULTI_SHIFT;
274 do_div(mult, delta_raw);
275 scd->multi = mult;
276 if (scd->multi > MAX_MULTI)
277 scd->multi = MAX_MULTI;
278 else if (scd->multi < MIN_MULTI)
279 scd->multi = MIN_MULTI;
280 } else
281 scd->multi = 1 << MULTI_SHIFT;
282
199 scd->tick_raw = now; 283 scd->tick_raw = now;
200 scd->tick_gtod = now_gtod; 284 scd->tick_gtod = now_gtod;
285 scd->tick_jiffies = now_jiffies;
201 __raw_spin_unlock(&scd->lock); 286 __raw_spin_unlock(&scd->lock);
202} 287}
203 288
@@ -227,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
227 __raw_spin_lock(&scd->lock); 312 __raw_spin_lock(&scd->lock);
228 scd->prev_raw = now; 313 scd->prev_raw = now;
229 scd->clock += delta_ns; 314 scd->clock += delta_ns;
315 scd->multi = 1 << MULTI_SHIFT;
230 __raw_spin_unlock(&scd->lock); 316 __raw_spin_unlock(&scd->lock);
231 317
232 touch_softlockup_watchdog(); 318 touch_softlockup_watchdog();
@@ -244,3 +330,16 @@ unsigned long long __attribute__((weak)) sched_clock(void)
244{ 330{
245 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 331 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
246} 332}
333
334unsigned long long cpu_clock(int cpu)
335{
336 unsigned long long clock;
337 unsigned long flags;
338
339 local_irq_save(flags);
340 clock = sched_clock_cpu(cpu);
341 local_irq_restore(flags);
342
343 return clock;
344}
345EXPORT_SYMBOL_GPL(cpu_clock);
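
The multiplier introduced above is a fixed-point ratio (15 fractional bits) of gtod time to raw sched_clock() time, recomputed on every tick and clamped to the [0.5x, 2x] window. A rough userspace sketch of that arithmetic (illustrative only; the harness and the sample deltas are invented, not part of the patch):

/*
 * Userspace sketch of the fixed-point multiplier logic in
 * sched_clock_tick() / __update_sched_clock(); not kernel code.
 */
#include <stdio.h>
#include <stdint.h>

#define MULTI_SHIFT 15
#define MAX_MULTI   (2LL << MULTI_SHIFT)
#define MIN_MULTI   (1LL << (MULTI_SHIFT - 1))

/* Recompute the multiplier from the gtod and raw deltas seen in one tick. */
static int64_t compute_multi(int64_t delta_gtod, int64_t delta_raw)
{
	int64_t mult;

	if (delta_raw <= 0)
		return 1LL << MULTI_SHIFT;	/* fall back to 1.0 */

	mult = (delta_gtod << MULTI_SHIFT) / delta_raw;
	if (mult > MAX_MULTI)
		mult = MAX_MULTI;
	else if (mult < MIN_MULTI)
		mult = MIN_MULTI;
	return mult;
}

/* Scale a raw clock delta by the current multiplier, as the patch does. */
static int64_t scale_delta(int64_t delta, int64_t multi)
{
	return (delta * multi) >> MULTI_SHIFT;
}

int main(void)
{
	/* gtod advanced 1,000,000 ns while the raw clock advanced 1,250,000 ns */
	int64_t multi = compute_multi(1000000, 1250000);

	printf("multi = %lld (%.3fx)\n", (long long)multi,
	       (double)multi / (1 << MULTI_SHIFT));
	printf("raw delta 500000 ns -> %lld ns\n",
	       (long long)scale_delta(500000, multi));
	return 0;
}

With those sample deltas the multiplier comes out to 26214 (about 0.8x), so a raw delta of 500,000 ns is reported as roughly 400,000 ns; that is how a TSC that runs fast or slow relative to gtod is kept in line between ticks.
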
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
1/*
2 * kernel/sched_cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include "sched_cpupri.h"
31
32/* Convert between a 140 based task->prio, and our 102 based cpupri */
33static int convert_prio(int prio)
34{
35 int cpupri;
36
37 if (prio == CPUPRI_INVALID)
38 cpupri = CPUPRI_INVALID;
39 else if (prio == MAX_PRIO)
40 cpupri = CPUPRI_IDLE;
41 else if (prio >= MAX_RT_PRIO)
42 cpupri = CPUPRI_NORMAL;
43 else
44 cpupri = MAX_RT_PRIO - prio + 1;
45
46 return cpupri;
47}
48
49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53
54/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context
57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs
59 *
60 * Note: This function returns the recommended CPUs as calculated during the
61 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current
65 * priority configuration.
66 *
67 * Returns: (int)bool - CPUs were found
68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask)
71{
72 int idx = 0;
73 int task_pri = convert_prio(p->prio);
74
75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78
79 if (idx >= task_pri)
80 break;
81
82 cpus_and(mask, p->cpus_allowed, vec->mask);
83
84 if (cpus_empty(mask))
85 continue;
86
87 *lowest_mask = mask;
88 return 1;
89 }
90
91 return 0;
92}
93
94/**
95 * cpupri_set - update the cpu priority setting
96 * @cp: The cpupri context
97 * @cpu: The target cpu
98 * @pri: The priority (INVALID-RT99) to assign to this CPU
99 *
100 * Note: Assumes cpu_rq(cpu)->lock is locked
101 *
102 * Returns: (void)
103 */
104void cpupri_set(struct cpupri *cp, int cpu, int newpri)
105{
106 int *currpri = &cp->cpu_to_pri[cpu];
107 int oldpri = *currpri;
108 unsigned long flags;
109
110 newpri = convert_prio(newpri);
111
112 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
113
114 if (newpri == oldpri)
115 return;
116
117 /*
118 * If the cpu was currently mapped to a different value, we
119 * first need to unmap the old value
120 */
121 if (likely(oldpri != CPUPRI_INVALID)) {
122 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
123
124 spin_lock_irqsave(&vec->lock, flags);
125
126 vec->count--;
127 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask);
130
131 spin_unlock_irqrestore(&vec->lock, flags);
132 }
133
134 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136
137 spin_lock_irqsave(&vec->lock, flags);
138
139 cpu_set(cpu, vec->mask);
140 vec->count++;
141 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active);
143
144 spin_unlock_irqrestore(&vec->lock, flags);
145 }
146
147 *currpri = newpri;
148}
149
150/**
151 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context
153 *
154 * Returns: (void)
155 */
156void cpupri_init(struct cpupri *cp)
157{
158 int i;
159
160 memset(cp, 0, sizeof(*cp));
161
162 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
163 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
164
165 spin_lock_init(&vec->lock);
166 vec->count = 0;
167 cpus_clear(vec->mask);
168 }
169
170 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID;
172}
173
174
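
The 140-to-102 mapping described in the file header can be exercised with a small standalone program (userspace illustration only; it assumes the stock MAX_RT_PRIO of 100 and MAX_PRIO of 140 and simply mirrors convert_prio() above):

/* Standalone demo of the cpupri priority mapping; not kernel code. */
#include <stdio.h>

#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)

#define CPUPRI_INVALID	-1
#define CPUPRI_IDLE	0
#define CPUPRI_NORMAL	1
/* values 2-101 are RT priorities 0-99 */

static int convert_prio(int prio)
{
	if (prio == CPUPRI_INVALID)
		return CPUPRI_INVALID;
	if (prio == MAX_PRIO)
		return CPUPRI_IDLE;
	if (prio >= MAX_RT_PRIO)
		return CPUPRI_NORMAL;
	return MAX_RT_PRIO - prio + 1;
}

int main(void)
{
	int prios[] = { MAX_PRIO, 120, 99, 50, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(prios) / sizeof(prios[0]); i++)
		printf("task prio %3d -> cpupri %3d\n",
		       prios[i], convert_prio(prios[i]));
	return 0;
}

An idle CPU (prio 140) lands in slot 0, any SCHED_OTHER priority in slot 1, and RT priorities 99..0 spread over slots 2..101; this fixed layout is what lets cpupri_find() locate a suitable lower-priority CPU with a couple of bitmap searches for tasks without affinity restrictions.
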
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..f25811b0f931
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8
9#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */
13
14struct cpupri_vec {
15 spinlock_t lock;
16 int count;
17 cpumask_t mask;
18};
19
20struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS];
24};
25
26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp);
31#else
32#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0)
34#endif
35
36#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8bb713040ac9..bbe6b31c3c56 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
119 struct sched_entity *last; 119 struct sched_entity *last;
120 unsigned long flags; 120 unsigned long flags;
121 121
122#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) 122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
124#else
125 char path[128] = ""; 123 char path[128] = "";
126 struct cgroup *cgroup = NULL; 124 struct cgroup *cgroup = NULL;
127 struct task_group *tg = cfs_rq->tg; 125 struct task_group *tg = cfs_rq->tg;
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
133 cgroup_path(cgroup, path, sizeof(path)); 131 cgroup_path(cgroup, path, sizeof(path));
134 132
135 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
134#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 136#endif
137 137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
@@ -162,11 +162,64 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
164#ifdef CONFIG_SCHEDSTATS 164#ifdef CONFIG_SCHEDSTATS
165 SEQ_printf(m, " .%-30s: %d\n", "bkl_count", 165#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
166 rq->bkl_count); 166
167 P(yld_exp_empty);
168 P(yld_act_empty);
169 P(yld_both_empty);
170 P(yld_count);
171
172 P(sched_switch);
173 P(sched_count);
174 P(sched_goidle);
175
176 P(ttwu_count);
177 P(ttwu_local);
178
179 P(bkl_count);
180
181#undef P
167#endif 182#endif
168 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", 183 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
169 cfs_rq->nr_spread_over); 184 cfs_rq->nr_spread_over);
185#ifdef CONFIG_FAIR_GROUP_SCHED
186#ifdef CONFIG_SMP
187 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
188#endif
189#endif
190}
191
192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
193{
194#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
195 char path[128] = "";
196 struct cgroup *cgroup = NULL;
197 struct task_group *tg = rt_rq->tg;
198
199 if (tg)
200 cgroup = tg->css.cgroup;
201
202 if (cgroup)
203 cgroup_path(cgroup, path, sizeof(path));
204
205 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
206#else
207 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
208#endif
209
210
211#define P(x) \
212 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
213#define PN(x) \
214 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
215
216 P(rt_nr_running);
217 P(rt_throttled);
218 PN(rt_time);
219 PN(rt_runtime);
220
221#undef PN
222#undef P
170} 223}
171 224
172static void print_cpu(struct seq_file *m, int cpu) 225static void print_cpu(struct seq_file *m, int cpu)
@@ -208,6 +261,7 @@ static void print_cpu(struct seq_file *m, int cpu)
208#undef PN 261#undef PN
209 262
210 print_cfs_stats(m, cpu); 263 print_cfs_stats(m, cpu);
264 print_rt_stats(m, cpu);
211 265
212 print_rq(m, rq, cpu); 266 print_rq(m, rq, cpu);
213} 267}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..f2aa987027d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 10000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -334,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
334#endif 334#endif
335 335
336/* 336/*
337 * delta *= w / rw
338 */
339static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se)
341{
342 for_each_sched_entity(se) {
343 delta = calc_delta_mine(delta,
344 se->load.weight, &cfs_rq_of(se)->load);
345 }
346
347 return delta;
348}
349
350/*
351 * delta *= rw / w
352 */
353static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{
356 for_each_sched_entity(se) {
357 delta = calc_delta_mine(delta,
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360
361 return delta;
362}
363
364/*
337 * The idea is to set a period in which each task runs once. 365 * The idea is to set a period in which each task runs once.
338 * 366 *
339 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 367 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
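
For a single level of the hierarchy the two helpers above are inverses of each other. A userspace sketch, assuming calc_delta_mine() approximates delta * weight / lw->weight (the kernel version uses fixed-point arithmetic); the period and weights are made-up values:

/* Demo of the w/rw and rw/w scaling in calc_delta_weight()/calc_delta_fair(). */
#include <stdio.h>

static unsigned long long scale(unsigned long long delta,
				unsigned long weight,
				unsigned long divisor)
{
	return delta * weight / divisor;
}

int main(void)
{
	unsigned long w = 1024;			/* one nice-0 entity         */
	unsigned long rw = 3 * 1024;		/* runqueue of three of them */
	unsigned long long period = 20000000ULL;	/* 20 ms scheduling period */

	/* calc_delta_weight(): delta * w / rw, the entity's share of the period */
	unsigned long long slice = scale(period, w, rw);

	/* calc_delta_fair(): delta * rw / w, runtime mapped back to virtual time */
	unsigned long long vdelta = scale(slice, rw, w);

	printf("slice  = %llu ns\n", slice);	/* ~6.67 ms */
	printf("vdelta = %llu ns\n", vdelta);	/* back to ~20 ms */
	return 0;
}

So a nice-0 entity among three equal entities is handed about a third of the 20 ms period by calc_delta_weight(), and calc_delta_fair() maps the runtime it actually consumed back onto the shared virtual timeline.
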
@@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running)
362 */ 390 */
363static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
364{ 392{
365 u64 slice = __sched_period(cfs_rq->nr_running); 393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
366
367 for_each_sched_entity(se) {
368 cfs_rq = cfs_rq_of(se);
369
370 slice *= se->load.weight;
371 do_div(slice, cfs_rq->load.weight);
372 }
373
374
375 return slice;
376} 394}
377 395
378/* 396/*
379 * We calculate the vruntime slice of a to be inserted task 397 * We calculate the vruntime slice of a to be inserted task
380 * 398 *
381 * vs = s/w = p/rw 399 * vs = s*rw/w = p
382 */ 400 */
383static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
384{ 402{
385 unsigned long nr_running = cfs_rq->nr_running; 403 unsigned long nr_running = cfs_rq->nr_running;
386 unsigned long weight;
387 u64 vslice;
388 404
389 if (!se->on_rq) 405 if (!se->on_rq)
390 nr_running++; 406 nr_running++;
391 407
392 vslice = __sched_period(nr_running); 408 return __sched_period(nr_running);
409}
410
411/*
412 * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in
413 * that it favours nice >= 0 over nice < 0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{
425 struct load_weight lw = {
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
393 429
394 for_each_sched_entity(se) { 430 for_each_sched_entity(se) {
395 cfs_rq = cfs_rq_of(se); 431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
434#ifdef CONFIG_FAIR_GROUP_SCHED
435 struct cfs_rq *cfs_rq = se->my_q;
436 struct task_group *tg = NULL;
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
452
453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
396 457
397 weight = cfs_rq->load.weight; 458 if (se->load.weight < NICE_0_LOAD) {
398 if (!se->on_rq) 459 se_lw = &lw;
399 weight += se->load.weight; 460 rw += NICE_0_LOAD - se->load.weight;
461 }
400 462
401 vslice *= NICE_0_LOAD; 463 delta = calc_delta_mine(delta, rw, se_lw);
402 do_div(vslice, weight);
403 } 464 }
404 465
405 return vslice; 466 return delta;
406} 467}
407 468
408/* 469/*
@@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
419 480
420 curr->sum_exec_runtime += delta_exec; 481 curr->sum_exec_runtime += delta_exec;
421 schedstat_add(cfs_rq, exec_clock, delta_exec); 482 schedstat_add(cfs_rq, exec_clock, delta_exec);
422 delta_exec_weighted = delta_exec; 483 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
423 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
424 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
425 &curr->load);
426 }
427 curr->vruntime += delta_exec_weighted; 484 curr->vruntime += delta_exec_weighted;
428} 485}
429 486
@@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
510 * Scheduling class queueing methods: 567 * Scheduling class queueing methods:
511 */ 568 */
512 569
570#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
571static void
572add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
573{
574 cfs_rq->task_weight += weight;
575}
576#else
577static inline void
578add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
579{
580}
581#endif
582
513static void 583static void
514account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 584account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
515{ 585{
516 update_load_add(&cfs_rq->load, se->load.weight); 586 update_load_add(&cfs_rq->load, se->load.weight);
587 if (!parent_entity(se))
588 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
589 if (entity_is_task(se))
590 add_cfs_task_weight(cfs_rq, se->load.weight);
517 cfs_rq->nr_running++; 591 cfs_rq->nr_running++;
518 se->on_rq = 1; 592 se->on_rq = 1;
519 list_add(&se->group_node, &cfs_rq->tasks); 593 list_add(&se->group_node, &cfs_rq->tasks);
@@ -523,6 +597,10 @@ static void
523account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 597account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
524{ 598{
525 update_load_sub(&cfs_rq->load, se->load.weight); 599 update_load_sub(&cfs_rq->load, se->load.weight);
600 if (!parent_entity(se))
601 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
602 if (entity_is_task(se))
603 add_cfs_task_weight(cfs_rq, -se->load.weight);
526 cfs_rq->nr_running--; 604 cfs_rq->nr_running--;
527 se->on_rq = 0; 605 se->on_rq = 0;
528 list_del_init(&se->group_node); 606 list_del_init(&se->group_node);
@@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
609 687
610 if (!initial) { 688 if (!initial) {
611 /* sleeps up to a single latency don't count. */ 689 /* sleeps up to a single latency don't count. */
612 if (sched_feat(NEW_FAIR_SLEEPERS)) 690 if (sched_feat(NEW_FAIR_SLEEPERS)) {
613 vruntime -= sysctl_sched_latency; 691 unsigned long thresh = sysctl_sched_latency;
692
693 /*
694 * convert the sleeper threshold into virtual time
695 */
696 if (sched_feat(NORMALIZED_SLEEPER))
697 thresh = calc_delta_fair(thresh, se);
698
699 vruntime -= thresh;
700 }
614 701
615 /* ensure we never gain time by being placed backwards. */ 702 /* ensure we never gain time by being placed backwards. */
616 vruntime = max_vruntime(se->vruntime, vruntime); 703 vruntime = max_vruntime(se->vruntime, vruntime);
@@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
639 __enqueue_entity(cfs_rq, se); 726 __enqueue_entity(cfs_rq, se);
640} 727}
641 728
642static void update_avg(u64 *avg, u64 sample)
643{
644 s64 diff = sample - *avg;
645 *avg += diff >> 3;
646}
647
648static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
649{
650 if (!se->last_wakeup)
651 return;
652
653 update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
654 se->last_wakeup = 0;
655}
656
657static void 729static void
658dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 730dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
659{ 731{
@@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
664 736
665 update_stats_dequeue(cfs_rq, se); 737 update_stats_dequeue(cfs_rq, se);
666 if (sleep) { 738 if (sleep) {
667 update_avg_stats(cfs_rq, se);
668#ifdef CONFIG_SCHEDSTATS 739#ifdef CONFIG_SCHEDSTATS
669 if (entity_is_task(se)) { 740 if (entity_is_task(se)) {
670 struct task_struct *tsk = task_of(se); 741 struct task_struct *tsk = task_of(se);
@@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
726 se->prev_sum_exec_runtime = se->sum_exec_runtime; 797 se->prev_sum_exec_runtime = se->sum_exec_runtime;
727} 798}
728 799
729static int
730wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
731
732static struct sched_entity * 800static struct sched_entity *
733pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) 801pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
734{ 802{
735 if (!cfs_rq->next) 803 struct rq *rq = rq_of(cfs_rq);
736 return se; 804 u64 pair_slice = rq->clock - cfs_rq->pair_start;
737 805
738 if (wakeup_preempt_entity(cfs_rq->next, se) != 0) 806 if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
807 cfs_rq->pair_start = rq->clock;
739 return se; 808 return se;
809 }
740 810
741 return cfs_rq->next; 811 return cfs_rq->next;
742} 812}
@@ -835,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
835 hrtick_start(rq, delta, requeue); 905 hrtick_start(rq, delta, requeue);
836 } 906 }
837} 907}
838#else 908#else /* !CONFIG_SCHED_HRTICK */
839static inline void 909static inline void
840hrtick_start_fair(struct rq *rq, struct task_struct *p) 910hrtick_start_fair(struct rq *rq, struct task_struct *p)
841{ 911{
@@ -976,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p)
976 } 1046 }
977 return cpu; 1047 return cpu;
978} 1048}
979#else 1049#else /* !ARCH_HAS_SCHED_WAKE_IDLE */
980static inline int wake_idle(int cpu, struct task_struct *p) 1050static inline int wake_idle(int cpu, struct task_struct *p)
981{ 1051{
982 return cpu; 1052 return cpu;
@@ -987,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p)
987 1057
988static const struct sched_class fair_sched_class; 1058static const struct sched_class fair_sched_class;
989 1059
1060#ifdef CONFIG_FAIR_GROUP_SCHED
1061/*
1062 * effective_load() calculates the load change as seen from the root_task_group
1063 *
1064 * Adding load to a group doesn't make a group heavier, but can cause movement
1065 * of group shares between cpus. Assuming the shares were perfectly aligned one
1066 * can calculate the shift in shares.
1067 *
1068 * The problem is that perfectly aligning the shares is rather expensive, hence
1069 * we try to avoid doing that too often - see update_shares(), which ratelimits
1070 * this change.
1071 *
1072 * We compensate this by not only taking the current delta into account, but
1073 * also considering the delta between when the shares were last adjusted and
1074 * now.
1075 *
1076 * We still saw a performance dip; some tracing showed that between
1077 * cgroup:/ and cgroup:/foo balancing, the number of affine wakeups increased
1078 * significantly. Therefore try to bias the error in the direction of failing
1079 * the affine wakeup.
1080 *
1081 */
1082static long effective_load(struct task_group *tg, int cpu,
1083 long wl, long wg)
1084{
1085 struct sched_entity *se = tg->se[cpu];
1086 long more_w;
1087
1088 if (!tg->parent)
1089 return wl;
1090
1091 /*
1092 * By not taking the decrease of shares on the other cpu into
1093 * account our error leans towards reducing the affine wakeups.
1094 */
1095 if (!wl && sched_feat(ASYM_EFF_LOAD))
1096 return wl;
1097
1098 /*
1099 * Instead of using this increment, also add the difference
1100 * between when the shares were last updated and now.
1101 */
1102 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1103 wl += more_w;
1104 wg += more_w;
1105
1106 for_each_sched_entity(se) {
1107#define D(n) (likely(n) ? (n) : 1)
1108
1109 long S, rw, s, a, b;
1110
1111 S = se->my_q->tg->shares;
1112 s = se->my_q->shares;
1113 rw = se->my_q->rq_weight;
1114
1115 a = S*(rw + wl);
1116 b = S*rw + s*wg;
1117
1118 wl = s*(a-b)/D(b);
1119 /*
1120 * Assume the group is already running and will
1121 * thus already be accounted for in the weight.
1122 *
1123 * That is, moving shares between CPUs, does not
1124 * alter the group weight.
1125 */
1126 wg = 0;
1127#undef D
1128 }
1129
1130 return wl;
1131}
1132
1133#else
1134
1135static inline unsigned long effective_load(struct task_group *tg, int cpu,
1136 unsigned long wl, unsigned long wg)
1137{
1138 return wl;
1139}
1140
1141#endif
1142
990static int 1143static int
991wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1144wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
992 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1145 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
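
A worked example of the per-level arithmetic in effective_load(), with invented numbers (userspace only; S is the group's total shares, s the portion currently held on this cpu, rw this cpu's group runqueue weight):

/* Demo of the shares-shift estimate computed inside effective_load(). */
#include <stdio.h>

int main(void)
{
	long S = 1024;		/* tg->shares: total shares of the group    */
	long s = 512;		/* shares this cpu currently holds          */
	long rw = 2048;		/* weight of the group's cfs_rq on this cpu */
	long wl = 1024;		/* weight of the task being woken here      */
	long wg = 1024;		/* same weight, as added to the whole group */
	long a, b;

	a = S * (rw + wl);
	b = S * rw + s * wg;
	wl = s * (a - b) / (b ? b : 1);	/* the D(n) guard from the patch */

	printf("effective load change seen at the root = %ld\n", wl);
	return 0;
}

Waking a weight-1024 task into this group raises the root-visible load by only about 102 here, because the group's total shares are fixed and merely shift between CPUs; that smaller figure is what the reworked wake_affine() below now feeds into its balance check.
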
@@ -994,8 +1147,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
994 unsigned int imbalance) 1147 unsigned int imbalance)
995{ 1148{
996 struct task_struct *curr = this_rq->curr; 1149 struct task_struct *curr = this_rq->curr;
1150 struct task_group *tg;
997 unsigned long tl = this_load; 1151 unsigned long tl = this_load;
998 unsigned long tl_per_task; 1152 unsigned long tl_per_task;
1153 unsigned long weight;
999 int balanced; 1154 int balanced;
1000 1155
1001 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1156 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1006,19 +1161,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1006 * effect of the currently running task from the load 1161 * effect of the currently running task from the load
1007 * of the current CPU: 1162 * of the current CPU:
1008 */ 1163 */
1009 if (sync) 1164 if (sync) {
1010 tl -= current->se.load.weight; 1165 tg = task_group(current);
1166 weight = current->se.load.weight;
1167
1168 tl += effective_load(tg, this_cpu, -weight, -weight);
1169 load += effective_load(tg, prev_cpu, 0, -weight);
1170 }
1011 1171
1012 balanced = 100*(tl + p->se.load.weight) <= imbalance*load; 1172 tg = task_group(p);
1173 weight = p->se.load.weight;
1174
1175 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
1176 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1013 1177
1014 /* 1178 /*
1015 * If the currently running task will sleep within 1179 * If the currently running task will sleep within
1016 * a reasonable amount of time then attract this newly 1180 * a reasonable amount of time then attract this newly
1017 * woken task: 1181 * woken task:
1018 */ 1182 */
1019 if (sync && balanced && curr->sched_class == &fair_sched_class) { 1183 if (sync && balanced) {
1020 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1184 if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1021 p->se.avg_overlap < sysctl_sched_migration_cost) 1185 p->se.avg_overlap < sysctl_sched_migration_cost)
1022 return 1; 1186 return 1;
1023 } 1187 }
1024 1188
@@ -1111,11 +1275,13 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1111 unsigned long gran = sysctl_sched_wakeup_granularity; 1275 unsigned long gran = sysctl_sched_wakeup_granularity;
1112 1276
1113 /* 1277 /*
1114 * More easily preempt - nice tasks, while not making 1278 * More easily preempt - nice tasks, while not making it harder for
1115 * it harder for + nice tasks. 1279 * + nice tasks.
1116 */ 1280 */
1117 if (unlikely(se->load.weight > NICE_0_LOAD)) 1281 if (sched_feat(ASYM_GRAN))
1118 gran = calc_delta_fair(gran, &se->load); 1282 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
1283 else
1284 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1119 1285
1120 return gran; 1286 return gran;
1121} 1287}
@@ -1177,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1177 return; 1343 return;
1178 } 1344 }
1179 1345
1180 se->last_wakeup = se->sum_exec_runtime;
1181 if (unlikely(se == pse)) 1346 if (unlikely(se == pse))
1182 return; 1347 return;
1183 1348
@@ -1275,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1275 struct task_struct *p = NULL; 1440 struct task_struct *p = NULL;
1276 struct sched_entity *se; 1441 struct sched_entity *se;
1277 1442
1278 if (next == &cfs_rq->tasks) 1443 while (next != &cfs_rq->tasks) {
1279 return NULL;
1280
1281 /* Skip over entities that are not tasks */
1282 do {
1283 se = list_entry(next, struct sched_entity, group_node); 1444 se = list_entry(next, struct sched_entity, group_node);
1284 next = next->next; 1445 next = next->next;
1285 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1286 1446
1287 if (next == &cfs_rq->tasks) 1447 /* Skip over entities that are not tasks */
1288 return NULL; 1448 if (entity_is_task(se)) {
1449 p = task_of(se);
1450 break;
1451 }
1452 }
1289 1453
1290 cfs_rq->balance_iterator = next; 1454 cfs_rq->balance_iterator = next;
1291
1292 if (entity_is_task(se))
1293 p = task_of(se);
1294
1295 return p; 1455 return p;
1296} 1456}
1297 1457
@@ -1309,75 +1469,82 @@ static struct task_struct *load_balance_next_fair(void *arg)
1309 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1469 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
1310} 1470}
1311 1471
1312#ifdef CONFIG_FAIR_GROUP_SCHED 1472static unsigned long
1313static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) 1473__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1474 unsigned long max_load_move, struct sched_domain *sd,
1475 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
1476 struct cfs_rq *cfs_rq)
1314{ 1477{
1315 struct sched_entity *curr; 1478 struct rq_iterator cfs_rq_iterator;
1316 struct task_struct *p;
1317
1318 if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1319 return MAX_PRIO;
1320
1321 curr = cfs_rq->curr;
1322 if (!curr)
1323 curr = __pick_next_entity(cfs_rq);
1324 1479
1325 p = task_of(curr); 1480 cfs_rq_iterator.start = load_balance_start_fair;
1481 cfs_rq_iterator.next = load_balance_next_fair;
1482 cfs_rq_iterator.arg = cfs_rq;
1326 1483
1327 return p->prio; 1484 return balance_tasks(this_rq, this_cpu, busiest,
1485 max_load_move, sd, idle, all_pinned,
1486 this_best_prio, &cfs_rq_iterator);
1328} 1487}
1329#endif
1330 1488
1489#ifdef CONFIG_FAIR_GROUP_SCHED
1331static unsigned long 1490static unsigned long
1332load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1491load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1333 unsigned long max_load_move, 1492 unsigned long max_load_move,
1334 struct sched_domain *sd, enum cpu_idle_type idle, 1493 struct sched_domain *sd, enum cpu_idle_type idle,
1335 int *all_pinned, int *this_best_prio) 1494 int *all_pinned, int *this_best_prio)
1336{ 1495{
1337 struct cfs_rq *busy_cfs_rq;
1338 long rem_load_move = max_load_move; 1496 long rem_load_move = max_load_move;
1339 struct rq_iterator cfs_rq_iterator; 1497 int busiest_cpu = cpu_of(busiest);
1340 1498 struct task_group *tg;
1341 cfs_rq_iterator.start = load_balance_start_fair;
1342 cfs_rq_iterator.next = load_balance_next_fair;
1343 1499
1344 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1500 rcu_read_lock();
1345#ifdef CONFIG_FAIR_GROUP_SCHED 1501 update_h_load(busiest_cpu);
1346 struct cfs_rq *this_cfs_rq;
1347 long imbalance;
1348 unsigned long maxload;
1349 1502
1350 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1503 list_for_each_entry(tg, &task_groups, list) {
1504 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1505 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1506 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
1507 u64 rem_load, moved_load;
1351 1508
1352 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1509 /*
1353 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 1510 * empty group
1354 if (imbalance <= 0) 1511 */
1512 if (!busiest_cfs_rq->task_weight)
1355 continue; 1513 continue;
1356 1514
1357 /* Don't pull more than imbalance/2 */ 1515 rem_load = (u64)rem_load_move * busiest_weight;
1358 imbalance /= 2; 1516 rem_load = div_u64(rem_load, busiest_h_load + 1);
1359 maxload = min(rem_load_move, imbalance);
1360 1517
1361 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1518 moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
1362#else 1519 rem_load, sd, idle, all_pinned, this_best_prio,
1363# define maxload rem_load_move 1520 tg->cfs_rq[busiest_cpu]);
1364#endif 1521
1365 /* 1522 if (!moved_load)
1366 * pass busy_cfs_rq argument into 1523 continue;
1367 * load_balance_[start|next]_fair iterators 1524
1368 */ 1525 moved_load *= busiest_h_load;
1369 cfs_rq_iterator.arg = busy_cfs_rq; 1526 moved_load = div_u64(moved_load, busiest_weight + 1);
1370 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1371 maxload, sd, idle, all_pinned,
1372 this_best_prio,
1373 &cfs_rq_iterator);
1374 1527
1375 if (rem_load_move <= 0) 1528 rem_load_move -= moved_load;
1529 if (rem_load_move < 0)
1376 break; 1530 break;
1377 } 1531 }
1532 rcu_read_unlock();
1378 1533
1379 return max_load_move - rem_load_move; 1534 return max_load_move - rem_load_move;
1380} 1535}
1536#else
1537static unsigned long
1538load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1539 unsigned long max_load_move,
1540 struct sched_domain *sd, enum cpu_idle_type idle,
1541 int *all_pinned, int *this_best_prio)
1542{
1543 return __load_balance_fair(this_rq, this_cpu, busiest,
1544 max_load_move, sd, idle, all_pinned,
1545 this_best_prio, &busiest->cfs);
1546}
1547#endif
1381 1548
1382static int 1549static int
1383move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1550move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
@@ -1402,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1402 1569
1403 return 0; 1570 return 0;
1404} 1571}
1405#endif 1572#endif /* CONFIG_SMP */
1406 1573
1407/* 1574/*
1408 * scheduler tick hitting a task of our scheduling class: 1575 * scheduler tick hitting a task of our scheduling class:
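With CONFIG_FAIR_GROUP_SCHED, load_balance_fair() now walks the task groups and rescales the load to move by each group's h_load: the rq-level target is expressed in the group's own weight units before pulling tasks, and the amount actually moved is converted back. A back-of-the-envelope sketch of that scaling, with all numbers invented for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long long rem_load_move = 512;         /* rq-level load still to move */
        unsigned long long busiest_weight = 2048;       /* group's cfs_rq->load.weight */
        unsigned long long busiest_h_load = 256;        /* group's contribution at rq level */

        /* rq-level goal expressed in group-internal weight units */
        unsigned long long rem_load =
                rem_load_move * busiest_weight / (busiest_h_load + 1);

        /* pretend the balancer managed to move half of that */
        unsigned long long moved_load = rem_load / 2;

        /* convert what was moved back into rq-level units */
        moved_load = moved_load * busiest_h_load / (busiest_weight + 1);

        printf("rem_load=%llu moved (rq units)=%llu\n", rem_load, moved_load);
        return 0;
}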
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1c7283cb9581..862b06bd560a 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,5 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(NORMALIZED_SLEEPER, 1)
2SCHED_FEAT(WAKEUP_PREEMPT, 1) 3SCHED_FEAT(WAKEUP_PREEMPT, 1)
3SCHED_FEAT(START_DEBIT, 1) 4SCHED_FEAT(START_DEBIT, 1)
4SCHED_FEAT(AFFINE_WAKEUPS, 1) 5SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@ -6,5 +7,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
6SCHED_FEAT(SYNC_WAKEUPS, 1) 7SCHED_FEAT(SYNC_WAKEUPS, 1)
7SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 1)
8SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
9SCHED_FEAT(NORMALIZED_SLEEPER, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
10SCHED_FEAT(DEADLINE, 1) 11SCHED_FEAT(LB_BIAS, 0)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1)
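For reference, sched_features.h is consumed by kernel/sched.c by expanding the list more than once under different definitions of SCHED_FEAT(): once to build an enum of bit positions and once to build the default sysctl_sched_features mask, with sched_feat(x) testing the corresponding bit. The sketch below shows that pattern in a self-contained form; it uses a local X-macro list instead of re-including a header so it compiles on its own, and the feature names are just a subset of the ones above.

#include <stdio.h>

#define FEATURES(F)                     \
        F(NEW_FAIR_SLEEPERS, 1)         \
        F(ASYM_GRAN, 1)                 \
        F(LB_BIAS, 0)

/* pass 1: bit positions */
#define SCHED_FEAT_ENUM(name, enabled)  __SCHED_FEAT_##name,
enum { FEATURES(SCHED_FEAT_ENUM) };

/* pass 2: default bitmask */
#define SCHED_FEAT_MASK(name, enabled)  (1U << __SCHED_FEAT_##name) * (enabled) |
static const unsigned int sysctl_sched_features = FEATURES(SCHED_FEAT_MASK) 0;

#define sched_feat(x) (sysctl_sched_features & (1U << __SCHED_FEAT_##x))

int main(void)
{
        printf("ASYM_GRAN=%d LB_BIAS=%d\n",
               !!sched_feat(ASYM_GRAN), !!sched_feat(LB_BIAS));
        return 0;
}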
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0f3c19197fa4..47ceac9e8552 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
12 12
13static inline void rt_set_overload(struct rq *rq) 13static inline void rt_set_overload(struct rq *rq)
14{ 14{
15 if (!rq->online)
16 return;
17
15 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /* 19 /*
17 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
26 29
27static inline void rt_clear_overload(struct rq *rq) 30static inline void rt_clear_overload(struct rq *rq)
28{ 31{
32 if (!rq->online)
33 return;
34
29 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -155,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
155 return &rt_rq->tg->rt_bandwidth; 161 return &rt_rq->tg->rt_bandwidth;
156} 162}
157 163
158#else 164#else /* !CONFIG_RT_GROUP_SCHED */
159 165
160static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 166static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
161{ 167{
@@ -220,49 +226,10 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
220 return &def_rt_bandwidth; 226 return &def_rt_bandwidth;
221} 227}
222 228
223#endif 229#endif /* CONFIG_RT_GROUP_SCHED */
224
225static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
226{
227 int i, idle = 1;
228 cpumask_t span;
229
230 if (rt_b->rt_runtime == RUNTIME_INF)
231 return 1;
232
233 span = sched_rt_period_mask();
234 for_each_cpu_mask(i, span) {
235 int enqueue = 0;
236 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
237 struct rq *rq = rq_of_rt_rq(rt_rq);
238
239 spin_lock(&rq->lock);
240 if (rt_rq->rt_time) {
241 u64 runtime;
242
243 spin_lock(&rt_rq->rt_runtime_lock);
244 runtime = rt_rq->rt_runtime;
245 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
246 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
247 rt_rq->rt_throttled = 0;
248 enqueue = 1;
249 }
250 if (rt_rq->rt_time || rt_rq->rt_nr_running)
251 idle = 0;
252 spin_unlock(&rt_rq->rt_runtime_lock);
253 } else if (rt_rq->rt_nr_running)
254 idle = 0;
255
256 if (enqueue)
257 sched_rt_rq_enqueue(rt_rq);
258 spin_unlock(&rq->lock);
259 }
260
261 return idle;
262}
263 230
264#ifdef CONFIG_SMP 231#ifdef CONFIG_SMP
265static int balance_runtime(struct rt_rq *rt_rq) 232static int do_balance_runtime(struct rt_rq *rt_rq)
266{ 233{
267 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 234 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
268 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 235 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
@@ -281,6 +248,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
281 continue; 248 continue;
282 249
283 spin_lock(&iter->rt_runtime_lock); 250 spin_lock(&iter->rt_runtime_lock);
251 if (iter->rt_runtime == RUNTIME_INF)
252 goto next;
253
284 diff = iter->rt_runtime - iter->rt_time; 254 diff = iter->rt_runtime - iter->rt_time;
285 if (diff > 0) { 255 if (diff > 0) {
286 do_div(diff, weight); 256 do_div(diff, weight);
@@ -294,13 +264,163 @@ static int balance_runtime(struct rt_rq *rt_rq)
294 break; 264 break;
295 } 265 }
296 } 266 }
267next:
297 spin_unlock(&iter->rt_runtime_lock); 268 spin_unlock(&iter->rt_runtime_lock);
298 } 269 }
299 spin_unlock(&rt_b->rt_runtime_lock); 270 spin_unlock(&rt_b->rt_runtime_lock);
300 271
301 return more; 272 return more;
302} 273}
303#endif 274
275static void __disable_runtime(struct rq *rq)
276{
277 struct root_domain *rd = rq->rd;
278 struct rt_rq *rt_rq;
279
280 if (unlikely(!scheduler_running))
281 return;
282
283 for_each_leaf_rt_rq(rt_rq, rq) {
284 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
285 s64 want;
286 int i;
287
288 spin_lock(&rt_b->rt_runtime_lock);
289 spin_lock(&rt_rq->rt_runtime_lock);
290 if (rt_rq->rt_runtime == RUNTIME_INF ||
291 rt_rq->rt_runtime == rt_b->rt_runtime)
292 goto balanced;
293 spin_unlock(&rt_rq->rt_runtime_lock);
294
295 want = rt_b->rt_runtime - rt_rq->rt_runtime;
296
297 for_each_cpu_mask(i, rd->span) {
298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
299 s64 diff;
300
301 if (iter == rt_rq)
302 continue;
303
304 spin_lock(&iter->rt_runtime_lock);
305 if (want > 0) {
306 diff = min_t(s64, iter->rt_runtime, want);
307 iter->rt_runtime -= diff;
308 want -= diff;
309 } else {
310 iter->rt_runtime -= want;
311 want -= want;
312 }
313 spin_unlock(&iter->rt_runtime_lock);
314
315 if (!want)
316 break;
317 }
318
319 spin_lock(&rt_rq->rt_runtime_lock);
320 BUG_ON(want);
321balanced:
322 rt_rq->rt_runtime = RUNTIME_INF;
323 spin_unlock(&rt_rq->rt_runtime_lock);
324 spin_unlock(&rt_b->rt_runtime_lock);
325 }
326}
327
328static void disable_runtime(struct rq *rq)
329{
330 unsigned long flags;
331
332 spin_lock_irqsave(&rq->lock, flags);
333 __disable_runtime(rq);
334 spin_unlock_irqrestore(&rq->lock, flags);
335}
336
337static void __enable_runtime(struct rq *rq)
338{
339 struct rt_rq *rt_rq;
340
341 if (unlikely(!scheduler_running))
342 return;
343
344 for_each_leaf_rt_rq(rt_rq, rq) {
345 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
346
347 spin_lock(&rt_b->rt_runtime_lock);
348 spin_lock(&rt_rq->rt_runtime_lock);
349 rt_rq->rt_runtime = rt_b->rt_runtime;
350 rt_rq->rt_time = 0;
351 spin_unlock(&rt_rq->rt_runtime_lock);
352 spin_unlock(&rt_b->rt_runtime_lock);
353 }
354}
355
356static void enable_runtime(struct rq *rq)
357{
358 unsigned long flags;
359
360 spin_lock_irqsave(&rq->lock, flags);
361 __enable_runtime(rq);
362 spin_unlock_irqrestore(&rq->lock, flags);
363}
364
365static int balance_runtime(struct rt_rq *rt_rq)
366{
367 int more = 0;
368
369 if (rt_rq->rt_time > rt_rq->rt_runtime) {
370 spin_unlock(&rt_rq->rt_runtime_lock);
371 more = do_balance_runtime(rt_rq);
372 spin_lock(&rt_rq->rt_runtime_lock);
373 }
374
375 return more;
376}
377#else /* !CONFIG_SMP */
378static inline int balance_runtime(struct rt_rq *rt_rq)
379{
380 return 0;
381}
382#endif /* CONFIG_SMP */
383
384static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
385{
386 int i, idle = 1;
387 cpumask_t span;
388
389 if (rt_b->rt_runtime == RUNTIME_INF)
390 return 1;
391
392 span = sched_rt_period_mask();
393 for_each_cpu_mask(i, span) {
394 int enqueue = 0;
395 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
396 struct rq *rq = rq_of_rt_rq(rt_rq);
397
398 spin_lock(&rq->lock);
399 if (rt_rq->rt_time) {
400 u64 runtime;
401
402 spin_lock(&rt_rq->rt_runtime_lock);
403 if (rt_rq->rt_throttled)
404 balance_runtime(rt_rq);
405 runtime = rt_rq->rt_runtime;
406 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
407 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
408 rt_rq->rt_throttled = 0;
409 enqueue = 1;
410 }
411 if (rt_rq->rt_time || rt_rq->rt_nr_running)
412 idle = 0;
413 spin_unlock(&rt_rq->rt_runtime_lock);
414 } else if (rt_rq->rt_nr_running)
415 idle = 0;
416
417 if (enqueue)
418 sched_rt_rq_enqueue(rt_rq);
419 spin_unlock(&rq->lock);
420 }
421
422 return idle;
423}
304 424
305static inline int rt_se_prio(struct sched_rt_entity *rt_se) 425static inline int rt_se_prio(struct sched_rt_entity *rt_se)
306{ 426{
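do_sched_rt_period_timer() now also calls balance_runtime() for throttled queues before replenishing; the replenishment itself pays rt_time down by overrun * runtime and releases a throttled queue once it is back under budget. A small sketch of that arithmetic, with invented numbers:

#include <stdio.h>

int main(void)
{
        unsigned long long rt_time = 1200000;   /* RT time consumed so far */
        unsigned long long runtime = 950000;    /* budget per period */
        int overrun = 1, throttled = 1;

        unsigned long long pay = (unsigned long long)overrun * runtime;

        rt_time -= rt_time < pay ? rt_time : pay;       /* min(rt_time, overrun*runtime) */

        if (throttled && rt_time < runtime) {
                throttled = 0;          /* sched_rt_rq_enqueue() in the kernel */
                printf("unthrottled, rt_time now %llu\n", rt_time);
        }
        return throttled;
}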
@@ -327,18 +447,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
327 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 447 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
328 return 0; 448 return 0;
329 449
330#ifdef CONFIG_SMP 450 balance_runtime(rt_rq);
331 if (rt_rq->rt_time > runtime) { 451 runtime = sched_rt_runtime(rt_rq);
332 int more; 452 if (runtime == RUNTIME_INF)
333 453 return 0;
334 spin_unlock(&rt_rq->rt_runtime_lock);
335 more = balance_runtime(rt_rq);
336 spin_lock(&rt_rq->rt_runtime_lock);
337
338 if (more)
339 runtime = sched_rt_runtime(rt_rq);
340 }
341#endif
342 454
343 if (rt_rq->rt_time > runtime) { 455 if (rt_rq->rt_time > runtime) {
344 rt_rq->rt_throttled = 1; 456 rt_rq->rt_throttled = 1;
@@ -392,12 +504,21 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
392 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 504 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
393 rt_rq->rt_nr_running++; 505 rt_rq->rt_nr_running++;
394#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 506#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
395 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 507 if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
508 struct rq *rq = rq_of_rt_rq(rt_rq);
509
396 rt_rq->highest_prio = rt_se_prio(rt_se); 510 rt_rq->highest_prio = rt_se_prio(rt_se);
511#ifdef CONFIG_SMP
512 if (rq->online)
513 cpupri_set(&rq->rd->cpupri, rq->cpu,
514 rt_se_prio(rt_se));
515#endif
516 }
397#endif 517#endif
398#ifdef CONFIG_SMP 518#ifdef CONFIG_SMP
399 if (rt_se->nr_cpus_allowed > 1) { 519 if (rt_se->nr_cpus_allowed > 1) {
400 struct rq *rq = rq_of_rt_rq(rt_rq); 520 struct rq *rq = rq_of_rt_rq(rt_rq);
521
401 rq->rt.rt_nr_migratory++; 522 rq->rt.rt_nr_migratory++;
402 } 523 }
403 524
@@ -417,6 +538,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
417static inline 538static inline
418void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 539void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
419{ 540{
541#ifdef CONFIG_SMP
542 int highest_prio = rt_rq->highest_prio;
543#endif
544
420 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 545 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
421 WARN_ON(!rt_rq->rt_nr_running); 546 WARN_ON(!rt_rq->rt_nr_running);
422 rt_rq->rt_nr_running--; 547 rt_rq->rt_nr_running--;
@@ -440,6 +565,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
440 rq->rt.rt_nr_migratory--; 565 rq->rt.rt_nr_migratory--;
441 } 566 }
442 567
568 if (rt_rq->highest_prio != highest_prio) {
569 struct rq *rq = rq_of_rt_rq(rt_rq);
570
571 if (rq->online)
572 cpupri_set(&rq->rd->cpupri, rq->cpu,
573 rt_rq->highest_prio);
574 }
575
443 update_rt_migration(rq_of_rt_rq(rt_rq)); 576 update_rt_migration(rq_of_rt_rq(rt_rq));
444#endif /* CONFIG_SMP */ 577#endif /* CONFIG_SMP */
445#ifdef CONFIG_RT_GROUP_SCHED 578#ifdef CONFIG_RT_GROUP_SCHED
@@ -455,6 +588,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
455 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 588 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
456 struct rt_prio_array *array = &rt_rq->active; 589 struct rt_prio_array *array = &rt_rq->active;
457 struct rt_rq *group_rq = group_rt_rq(rt_se); 590 struct rt_rq *group_rq = group_rt_rq(rt_se);
591 struct list_head *queue = array->queue + rt_se_prio(rt_se);
458 592
459 /* 593 /*
460 * Don't enqueue the group if its throttled, or when empty. 594 * Don't enqueue the group if its throttled, or when empty.
@@ -465,7 +599,11 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
465 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 599 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
466 return; 600 return;
467 601
468 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 602 if (rt_se->nr_cpus_allowed == 1)
603 list_add(&rt_se->run_list, queue);
604 else
605 list_add_tail(&rt_se->run_list, queue);
606
469 __set_bit(rt_se_prio(rt_se), array->bitmap); 607 __set_bit(rt_se_prio(rt_se), array->bitmap);
470 608
471 inc_rt_tasks(rt_se, rt_rq); 609 inc_rt_tasks(rt_se, rt_rq);
@@ -532,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
532 rt_se->timeout = 0; 670 rt_se->timeout = 0;
533 671
534 enqueue_rt_entity(rt_se); 672 enqueue_rt_entity(rt_se);
673
674 inc_cpu_load(rq, p->se.load.weight);
535} 675}
536 676
537static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 677static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -540,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
540 680
541 update_curr_rt(rq); 681 update_curr_rt(rq);
542 dequeue_rt_entity(rt_se); 682 dequeue_rt_entity(rt_se);
683
684 dec_cpu_load(rq, p->se.load.weight);
543} 685}
544 686
545/* 687/*
@@ -550,10 +692,12 @@ static
550void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 692void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
551{ 693{
552 struct rt_prio_array *array = &rt_rq->active; 694 struct rt_prio_array *array = &rt_rq->active;
553 struct list_head *queue = array->queue + rt_se_prio(rt_se);
554 695
555 if (on_rt_rq(rt_se)) 696 if (on_rt_rq(rt_se)) {
556 list_move_tail(&rt_se->run_list, queue); 697 list_del_init(&rt_se->run_list);
698 list_add_tail(&rt_se->run_list,
699 array->queue + rt_se_prio(rt_se));
700 }
557} 701}
558 702
559static void requeue_task_rt(struct rq *rq, struct task_struct *p) 703static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -616,8 +760,37 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
616 */ 760 */
617static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 761static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
618{ 762{
619 if (p->prio < rq->curr->prio) 763 if (p->prio < rq->curr->prio) {
620 resched_task(rq->curr); 764 resched_task(rq->curr);
765 return;
766 }
767
768#ifdef CONFIG_SMP
769 /*
770 * If:
771 *
772 * - the newly woken task is of equal priority to the current task
773 * - the newly woken task is non-migratable while current is migratable
774 * - current will be preempted on the next reschedule
775 *
776 * we should check to see if current can readily move to a different
777 * cpu. If so, we will reschedule to allow the push logic to try
778 * to move current somewhere else, making room for our non-migratable
779 * task.
780 */
 781 if ((p->prio == rq->curr->prio)
782 && p->rt.nr_cpus_allowed == 1
783 && rq->curr->rt.nr_cpus_allowed != 1) {
784 cpumask_t mask;
785
786 if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
787 /*
 788 * There appear to be other cpus that can accept
 789 * current, so let's reschedule to try and push it away
790 */
791 resched_task(rq->curr);
792 }
793#endif
621} 794}
622 795
623static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 796static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -720,73 +893,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
720 893
721static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 894static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
722 895
723static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
724{
725 int lowest_prio = -1;
726 int lowest_cpu = -1;
727 int count = 0;
728 int cpu;
729
730 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
731
732 /*
733 * Scan each rq for the lowest prio.
734 */
735 for_each_cpu_mask(cpu, *lowest_mask) {
736 struct rq *rq = cpu_rq(cpu);
737
738 /* We look for lowest RT prio or non-rt CPU */
739 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
740 /*
741 * if we already found a low RT queue
742 * and now we found this non-rt queue
743 * clear the mask and set our bit.
744 * Otherwise just return the queue as is
745 * and the count==1 will cause the algorithm
746 * to use the first bit found.
747 */
748 if (lowest_cpu != -1) {
749 cpus_clear(*lowest_mask);
750 cpu_set(rq->cpu, *lowest_mask);
751 }
752 return 1;
753 }
754
755 /* no locking for now */
756 if ((rq->rt.highest_prio > task->prio)
757 && (rq->rt.highest_prio >= lowest_prio)) {
758 if (rq->rt.highest_prio > lowest_prio) {
759 /* new low - clear old data */
760 lowest_prio = rq->rt.highest_prio;
761 lowest_cpu = cpu;
762 count = 0;
763 }
764 count++;
765 } else
766 cpu_clear(cpu, *lowest_mask);
767 }
768
769 /*
770 * Clear out all the set bits that represent
771 * runqueues that were of higher prio than
772 * the lowest_prio.
773 */
774 if (lowest_cpu > 0) {
775 /*
776 * Perhaps we could add another cpumask op to
777 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
778 * Then that could be optimized to use memset and such.
779 */
780 for_each_cpu_mask(cpu, *lowest_mask) {
781 if (cpu >= lowest_cpu)
782 break;
783 cpu_clear(cpu, *lowest_mask);
784 }
785 }
786
787 return count;
788}
789
790static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 896static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
791{ 897{
792 int first; 898 int first;
@@ -808,17 +914,12 @@ static int find_lowest_rq(struct task_struct *task)
808 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 914 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
809 int this_cpu = smp_processor_id(); 915 int this_cpu = smp_processor_id();
810 int cpu = task_cpu(task); 916 int cpu = task_cpu(task);
811 int count = find_lowest_cpus(task, lowest_mask);
812 917
813 if (!count) 918 if (task->rt.nr_cpus_allowed == 1)
814 return -1; /* No targets found */ 919 return -1; /* No other targets possible */
815 920
816 /* 921 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
817 * There is no sense in performing an optimal search if only one 922 return -1; /* No targets found */
818 * target is found.
819 */
820 if (count == 1)
821 return first_cpu(*lowest_mask);
822 923
823 /* 924 /*
824 * At this point we have built a mask of cpus representing the 925 * At this point we have built a mask of cpus representing the
@@ -1163,17 +1264,25 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1163} 1264}
1164 1265
1165/* Assumes rq->lock is held */ 1266/* Assumes rq->lock is held */
1166static void join_domain_rt(struct rq *rq) 1267static void rq_online_rt(struct rq *rq)
1167{ 1268{
1168 if (rq->rt.overloaded) 1269 if (rq->rt.overloaded)
1169 rt_set_overload(rq); 1270 rt_set_overload(rq);
1271
1272 __enable_runtime(rq);
1273
1274 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
1170} 1275}
1171 1276
1172/* Assumes rq->lock is held */ 1277/* Assumes rq->lock is held */
1173static void leave_domain_rt(struct rq *rq) 1278static void rq_offline_rt(struct rq *rq)
1174{ 1279{
1175 if (rq->rt.overloaded) 1280 if (rq->rt.overloaded)
1176 rt_clear_overload(rq); 1281 rt_clear_overload(rq);
1282
1283 __disable_runtime(rq);
1284
1285 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1177} 1286}
1178 1287
1179/* 1288/*
@@ -1336,8 +1445,8 @@ static const struct sched_class rt_sched_class = {
1336 .load_balance = load_balance_rt, 1445 .load_balance = load_balance_rt,
1337 .move_one_task = move_one_task_rt, 1446 .move_one_task = move_one_task_rt,
1338 .set_cpus_allowed = set_cpus_allowed_rt, 1447 .set_cpus_allowed = set_cpus_allowed_rt,
1339 .join_domain = join_domain_rt, 1448 .rq_online = rq_online_rt,
1340 .leave_domain = leave_domain_rt, 1449 .rq_offline = rq_offline_rt,
1341 .pre_schedule = pre_schedule_rt, 1450 .pre_schedule = pre_schedule_rt,
1342 .post_schedule = post_schedule_rt, 1451 .post_schedule = post_schedule_rt,
1343 .task_wake_up = task_wake_up_rt, 1452 .task_wake_up = task_wake_up_rt,
@@ -1350,3 +1459,17 @@ static const struct sched_class rt_sched_class = {
1350 .prio_changed = prio_changed_rt, 1459 .prio_changed = prio_changed_rt,
1351 .switched_to = switched_to_rt, 1460 .switched_to = switched_to_rt,
1352}; 1461};
1462
1463#ifdef CONFIG_SCHED_DEBUG
1464extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1465
1466static void print_rt_stats(struct seq_file *m, int cpu)
1467{
1468 struct rt_rq *rt_rq;
1469
1470 rcu_read_lock();
1471 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1472 print_rt_rq(m, cpu, rt_rq);
1473 rcu_read_unlock();
1474}
1475#endif /* CONFIG_SCHED_DEBUG */
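The new borrow/return machinery (do_balance_runtime(), __disable_runtime(), __enable_runtime()) lets a CPU that has exhausted its RT budget take a share of the unused budget of its peers, never growing past the period. A userspace sketch of the borrowing step is below; it leaves out the locking and the RUNTIME_INF handling, and every value is invented.

#include <stdio.h>

#define NCPUS   4

int main(void)
{
        long long rt_period = 1000000;                  /* shared period */
        long long rt_runtime[NCPUS] = { 950000, 950000, 950000, 950000 };
        long long rt_time[NCPUS]    = { 950000, 100000, 200000, 0 };
        int this_cpu = 0, weight = NCPUS - 1, i;

        for (i = 0; i < NCPUS; i++) {
                long long diff;

                if (i == this_cpu)
                        continue;

                diff = rt_runtime[i] - rt_time[i];      /* peer's spare budget */
                if (diff <= 0)
                        continue;

                diff /= weight;                         /* take only a fair share */
                if (rt_runtime[this_cpu] + diff > rt_period)
                        diff = rt_period - rt_runtime[this_cpu];

                rt_runtime[i] -= diff;
                rt_runtime[this_cpu] += diff;
                if (rt_runtime[this_cpu] == rt_period)
                        break;
        }

        printf("cpu0 runtime is now %lld of period %lld\n",
               rt_runtime[this_cpu], rt_period);
        return 0;
}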
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 80179ef7450e..8385d43987e2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
118 if (rq) 118 if (rq)
119 rq->rq_sched_info.cpu_time += delta; 119 rq->rq_sched_info.cpu_time += delta;
120} 120}
121
122static inline void
123rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
124{
125 if (rq)
126 rq->rq_sched_info.run_delay += delta;
127}
121# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 128# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
122# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 129# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
123# define schedstat_set(var, val) do { var = (val); } while (0) 130# define schedstat_set(var, val) do { var = (val); } while (0)
@@ -126,6 +133,9 @@ static inline void
126rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 133rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
127{} 134{}
128static inline void 135static inline void
136rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
137{}
138static inline void
129rq_sched_info_depart(struct rq *rq, unsigned long long delta) 139rq_sched_info_depart(struct rq *rq, unsigned long long delta)
130{} 140{}
131# define schedstat_inc(rq, field) do { } while (0) 141# define schedstat_inc(rq, field) do { } while (0)
@@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
134#endif 144#endif
135 145
136#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 146#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
147static inline void sched_info_reset_dequeued(struct task_struct *t)
148{
149 t->sched_info.last_queued = 0;
150}
151
137/* 152/*
138 * Called when a process is dequeued from the active array and given 153 * Called when a process is dequeued from the active array and given
139 * the cpu. We should note that with the exception of interactive 154 * the cpu. We should note that with the exception of interactive
@@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
143 * active queue, thus delaying tasks in the expired queue from running; 158 * active queue, thus delaying tasks in the expired queue from running;
144 * see scheduler_tick()). 159 * see scheduler_tick()).
145 * 160 *
146 * This function is only called from sched_info_arrive(), rather than 161 * Though we are interested in knowing how long it was from the *first* time a
147 * dequeue_task(). Even though a task may be queued and dequeued multiple 162 * task was queued to the time that it finally hit a cpu, we call this routine
148 * times as it is shuffled about, we're really interested in knowing how 163 * from dequeue_task() to account for possible rq->clock skew across cpus. The
149 * long it was from the *first* time it was queued to the time that it 164 * delta taken on each cpu would annul the skew.
150 * finally hit a cpu.
151 */ 165 */
152static inline void sched_info_dequeued(struct task_struct *t) 166static inline void sched_info_dequeued(struct task_struct *t)
153{ 167{
154 t->sched_info.last_queued = 0; 168 unsigned long long now = task_rq(t)->clock, delta = 0;
169
170 if (unlikely(sched_info_on()))
171 if (t->sched_info.last_queued)
172 delta = now - t->sched_info.last_queued;
173 sched_info_reset_dequeued(t);
174 t->sched_info.run_delay += delta;
175
176 rq_sched_info_dequeued(task_rq(t), delta);
155} 177}
156 178
157/* 179/*
@@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t)
165 187
166 if (t->sched_info.last_queued) 188 if (t->sched_info.last_queued)
167 delta = now - t->sched_info.last_queued; 189 delta = now - t->sched_info.last_queued;
168 sched_info_dequeued(t); 190 sched_info_reset_dequeued(t);
169 t->sched_info.run_delay += delta; 191 t->sched_info.run_delay += delta;
170 t->sched_info.last_arrival = now; 192 t->sched_info.last_arrival = now;
171 t->sched_info.pcount++; 193 t->sched_info.pcount++;
@@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
242 __sched_info_switch(prev, next); 264 __sched_info_switch(prev, next);
243} 265}
244#else 266#else
245#define sched_info_queued(t) do { } while (0) 267#define sched_info_queued(t) do { } while (0)
246#define sched_info_switch(t, next) do { } while (0) 268#define sched_info_reset_dequeued(t) do { } while (0)
269#define sched_info_dequeued(t) do { } while (0)
270#define sched_info_switch(t, next) do { } while (0)
247#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 271#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
248 272
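sched_info_dequeued() now measures how long the task sat queued before being dequeued, instead of only resetting last_queued, and charges the delta to both the task's and the rq's run_delay. A sketch of that accounting, with plain numbers standing in for rq->clock:

#include <stdio.h>

struct sched_info_sketch {
        unsigned long long last_queued;         /* 0 means "not queued" */
        unsigned long long run_delay;
};

static void account_dequeue(struct sched_info_sketch *si, unsigned long long now)
{
        unsigned long long delta = 0;

        if (si->last_queued)
                delta = now - si->last_queued;
        si->last_queued = 0;                    /* sched_info_reset_dequeued() */
        si->run_delay += delta;
}

int main(void)
{
        struct sched_info_sketch si = { .last_queued = 100, .run_delay = 0 };

        account_dequeue(&si, 175);
        printf("run_delay=%llu\n", si.run_delay);       /* 75 */
        return 0;
}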
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..fe8cdc80ff02 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -266,6 +266,14 @@ static struct ctl_table kern_table[] = {
266 }, 266 },
267 { 267 {
268 .ctl_name = CTL_UNNUMBERED, 268 .ctl_name = CTL_UNNUMBERED,
269 .procname = "sched_shares_ratelimit",
270 .data = &sysctl_sched_shares_ratelimit,
271 .maxlen = sizeof(unsigned int),
272 .mode = 0644,
273 .proc_handler = &proc_dointvec,
274 },
275 {
276 .ctl_name = CTL_UNNUMBERED,
269 .procname = "sched_child_runs_first", 277 .procname = "sched_child_runs_first",
270 .data = &sysctl_sched_child_runs_first, 278 .data = &sysctl_sched_child_runs_first,
271 .maxlen = sizeof(unsigned int), 279 .maxlen = sizeof(unsigned int),
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b854a895591e..d63008b09a4c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void)
276 ts->tick_stopped = 1; 276 ts->tick_stopped = 1;
277 ts->idle_jiffies = last_jiffies; 277 ts->idle_jiffies = last_jiffies;
278 rcu_enter_nohz(); 278 rcu_enter_nohz();
279 sched_clock_tick_stop(cpu);
279 } 280 }
280 281
281 /* 282 /*
@@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void)
375 select_nohz_load_balancer(0); 376 select_nohz_load_balancer(0);
376 now = ktime_get(); 377 now = ktime_get();
377 tick_do_update_jiffies64(now); 378 tick_do_update_jiffies64(now);
379 sched_clock_tick_start(cpu);
378 cpu_clear(cpu, nohz_cpu_mask); 380 cpu_clear(cpu, nohz_cpu_mask);
379 381
380 /* 382 /*