 Documentation/scheduler/sched-domains.txt  |   7
 Documentation/scheduler/sched-rt-group.txt |   4
 include/linux/sched.h                      |  39
 kernel/Makefile                            |   5
 kernel/cpu.c                               |  24
 kernel/cpuset.c                            |  14
 kernel/kthread.c                           |   1
 kernel/sched.c                             | 247
 kernel/sched_cpupri.c                      | 174
 kernel/sched_cpupri.h                      |  36
 kernel/sched_debug.c                       |  40
 kernel/sched_fair.c                        |  19
 kernel/sched_features.h                    |   2
 kernel/sched_rt.c                          | 395
 14 files changed, 718 insertions(+), 289 deletions(-)
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt
index a9e990ab980f..373ceacc367e 100644
--- a/Documentation/scheduler/sched-domains.txt
+++ b/Documentation/scheduler/sched-domains.txt
@@ -61,10 +61,7 @@ builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
 arch_init_sched_domains function. This function will attach domains to all
 CPUs using cpu_attach_domain.
 
-Implementors should change the line
-#undef SCHED_DOMAIN_DEBUG
-to
-#define SCHED_DOMAIN_DEBUG
-in kernel/sched.c as this enables an error checking parse of the sched domains
+The sched-domains debugging infrastructure can be enabled by enabling
+CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
 which should catch most possible errors (described above). It also prints out
 the domain structure in a visual format.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 14f901f639ee..3ef339f491e0 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -51,9 +51,9 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
 0.00015s. So this group can be scheduled with a period of 0.005s and a run time
 of 0.00015s.
 
-The remaining CPU time will be used for user input and other tass. Because
+The remaining CPU time will be used for user input and other tasks. Because
 realtime tasks have explicitly allocated the CPU time they need to perform
-their tasks, buffer underruns in the graphocs or audio can be eliminated.
+their tasks, buffer underruns in the graphics or audio can be eliminated.
 
 NOTE: the above example is not fully implemented as of yet (2.6.25). We still
 lack an EDF scheduler to make non-uniform periods usable.
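
As a quick illustration of the arithmetic in the hunk above, a minimal userspace sketch (hypothetical helper name, not part of this patch): a group needing 3% of the CPU within a 0.005s period gets 0.03 * 0.005s = 0.00015s of runtime.

#include <stdio.h>

/*
 * Hypothetical helper, not part of this patch: derive a group's runtime
 * budget from a CPU share (in permille, to stay in integer math) and a
 * period in nanoseconds.
 */
static unsigned long long rt_runtime_ns(unsigned int share_permille,
					unsigned long long period_ns)
{
	return period_ns * share_permille / 1000;
}

int main(void)
{
	unsigned long long period_ns = 5000000ULL;	/* 0.005 s */

	/* 3% (30 permille) of 0.005 s -> prints "runtime = 150000 ns" */
	printf("runtime = %llu ns\n", rt_runtime_ns(30, period_ns));
	return 0;
}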
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c5d3f847ca8d..eaf821072dbd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -134,7 +134,6 @@ extern unsigned long nr_running(void);
134extern unsigned long nr_uninterruptible(void); 134extern unsigned long nr_uninterruptible(void);
135extern unsigned long nr_active(void); 135extern unsigned long nr_active(void);
136extern unsigned long nr_iowait(void); 136extern unsigned long nr_iowait(void);
137extern unsigned long weighted_cpuload(const int cpu);
138 137
139struct seq_file; 138struct seq_file;
140struct cfs_rq; 139struct cfs_rq;
@@ -823,23 +822,6 @@ extern int arch_reinit_sched_domains(void);
823 822
824#endif /* CONFIG_SMP */ 823#endif /* CONFIG_SMP */
825 824
826/*
827 * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
828 * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
829 * task of nice 0 or enough lower priority tasks to bring up the
830 * weighted_cpuload
831 */
832static inline int above_background_load(void)
833{
834 unsigned long cpu;
835
836 for_each_online_cpu(cpu) {
837 if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
838 return 1;
839 }
840 return 0;
841}
842
843struct io_context; /* See blkdev.h */ 825struct io_context; /* See blkdev.h */
844#define NGROUPS_SMALL 32 826#define NGROUPS_SMALL 32
845#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) 827#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
@@ -921,8 +903,8 @@ struct sched_class {
921 void (*set_cpus_allowed)(struct task_struct *p, 903 void (*set_cpus_allowed)(struct task_struct *p,
922 const cpumask_t *newmask); 904 const cpumask_t *newmask);
923 905
924 void (*join_domain)(struct rq *rq); 906 void (*rq_online)(struct rq *rq);
925 void (*leave_domain)(struct rq *rq); 907 void (*rq_offline)(struct rq *rq);
926 908
927 void (*switched_from) (struct rq *this_rq, struct task_struct *task, 909 void (*switched_from) (struct rq *this_rq, struct task_struct *task,
928 int running); 910 int running);
@@ -1039,6 +1021,7 @@ struct task_struct {
1039#endif 1021#endif
1040 1022
1041 int prio, static_prio, normal_prio; 1023 int prio, static_prio, normal_prio;
1024 unsigned int rt_priority;
1042 const struct sched_class *sched_class; 1025 const struct sched_class *sched_class;
1043 struct sched_entity se; 1026 struct sched_entity se;
1044 struct sched_rt_entity rt; 1027 struct sched_rt_entity rt;
@@ -1122,7 +1105,6 @@ struct task_struct {
1122 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1105 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1123 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1106 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1124 1107
1125 unsigned int rt_priority;
1126 cputime_t utime, stime, utimescaled, stimescaled; 1108 cputime_t utime, stime, utimescaled, stimescaled;
1127 cputime_t gtime; 1109 cputime_t gtime;
1128 cputime_t prev_utime, prev_stime; 1110 cputime_t prev_utime, prev_stime;
@@ -1141,12 +1123,12 @@ struct task_struct {
1141 gid_t gid,egid,sgid,fsgid; 1123 gid_t gid,egid,sgid,fsgid;
1142 struct group_info *group_info; 1124 struct group_info *group_info;
1143 kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; 1125 kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset;
1144 unsigned securebits;
1145 struct user_struct *user; 1126 struct user_struct *user;
1127 unsigned securebits;
1146#ifdef CONFIG_KEYS 1128#ifdef CONFIG_KEYS
1129 unsigned char jit_keyring; /* default keyring to attach requested keys to */
1147 struct key *request_key_auth; /* assumed request_key authority */ 1130 struct key *request_key_auth; /* assumed request_key authority */
1148 struct key *thread_keyring; /* keyring private to this thread */ 1131 struct key *thread_keyring; /* keyring private to this thread */
1149 unsigned char jit_keyring; /* default keyring to attach requested keys to */
1150#endif 1132#endif
1151 char comm[TASK_COMM_LEN]; /* executable name excluding path 1133 char comm[TASK_COMM_LEN]; /* executable name excluding path
1152 - access with [gs]et_task_comm (which lock 1134 - access with [gs]et_task_comm (which lock
@@ -1233,8 +1215,8 @@ struct task_struct {
1233# define MAX_LOCK_DEPTH 48UL 1215# define MAX_LOCK_DEPTH 48UL
1234 u64 curr_chain_key; 1216 u64 curr_chain_key;
1235 int lockdep_depth; 1217 int lockdep_depth;
1236 struct held_lock held_locks[MAX_LOCK_DEPTH];
1237 unsigned int lockdep_recursion; 1218 unsigned int lockdep_recursion;
1219 struct held_lock held_locks[MAX_LOCK_DEPTH];
1238#endif 1220#endif
1239 1221
1240/* journalling filesystem info */ 1222/* journalling filesystem info */
@@ -1262,10 +1244,6 @@ struct task_struct {
1262 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1244 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1263 cputime_t acct_stimexpd;/* stime since last update */ 1245 cputime_t acct_stimexpd;/* stime since last update */
1264#endif 1246#endif
1265#ifdef CONFIG_NUMA
1266 struct mempolicy *mempolicy;
1267 short il_next;
1268#endif
1269#ifdef CONFIG_CPUSETS 1247#ifdef CONFIG_CPUSETS
1270 nodemask_t mems_allowed; 1248 nodemask_t mems_allowed;
1271 int cpuset_mems_generation; 1249 int cpuset_mems_generation;
@@ -1285,6 +1263,10 @@ struct task_struct {
1285 struct list_head pi_state_list; 1263 struct list_head pi_state_list;
1286 struct futex_pi_state *pi_state_cache; 1264 struct futex_pi_state *pi_state_cache;
1287#endif 1265#endif
1266#ifdef CONFIG_NUMA
1267 struct mempolicy *mempolicy;
1268 short il_next;
1269#endif
1288 atomic_t fs_excl; /* holding fs exclusive resources */ 1270 atomic_t fs_excl; /* holding fs exclusive resources */
1289 struct rcu_head rcu; 1271 struct rcu_head rcu;
1290 1272
@@ -1504,6 +1486,7 @@ static inline void put_task_struct(struct task_struct *t)
1504#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1486#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1505#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1487#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1506#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1488#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1489#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
1507#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1490#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1508#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1491#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1509#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ 1492#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..6c55301112e0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
6 exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
@@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
27obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 27obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
28obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 28obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
29obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 29obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
30obj-$(CONFIG_SMP) += cpu.o spinlock.o 30obj-$(CONFIG_SMP) += spinlock.o
31obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 31obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
32obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 32obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
33obj-$(CONFIG_UID16) += uid16.o 33obj-$(CONFIG_UID16) += uid16.o
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
70obj-$(CONFIG_MARKERS) += marker.o 70obj-$(CONFIG_MARKERS) += marker.o
71obj-$(CONFIG_LATENCYTOP) += latencytop.o 71obj-$(CONFIG_LATENCYTOP) += latencytop.o
72obj-$(CONFIG_SMP) += sched_cpupri.o
72 73
73ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 74ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
74# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 75# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..b11f06dc149a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/*
19 * Represents all cpu's present in the system
20 * In systems capable of hotplug, this map could dynamically grow
21 * as new cpu's are detected in the system via any platform specific
22 * method, such as ACPI for e.g.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27#ifndef CONFIG_SMP
28
29/*
30 * Represents all cpu's that are currently online.
31 */
32cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
33EXPORT_SYMBOL(cpu_online_map);
34
35cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
36EXPORT_SYMBOL(cpu_possible_map);
37
38#else /* CONFIG_SMP */
39
18/* Serializes the updates to cpu_online_map, cpu_present_map */ 40/* Serializes the updates to cpu_online_map, cpu_present_map */
19static DEFINE_MUTEX(cpu_add_remove_lock); 41static DEFINE_MUTEX(cpu_add_remove_lock);
20 42
@@ -403,3 +425,5 @@ out:
403 cpu_maps_update_done(); 425 cpu_maps_update_done();
404} 426}
405#endif /* CONFIG_PM_SLEEP_SMP */ 427#endif /* CONFIG_PM_SLEEP_SMP */
428
429#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fceb97e989c..64a05da9bc4c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1194,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1194 1194
1195 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1195 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1196 return -ENOSPC; 1196 return -ENOSPC;
1197 if (tsk->flags & PF_THREAD_BOUND) {
1198 cpumask_t mask;
1199
1200 mutex_lock(&callback_mutex);
1201 mask = cs->cpus_allowed;
1202 mutex_unlock(&callback_mutex);
1203 if (!cpus_equal(tsk->cpus_allowed, mask))
1204 return -EINVAL;
1205 }
1197 1206
1198 return security_task_setscheduler(tsk, 0, NULL); 1207 return security_task_setscheduler(tsk, 0, NULL);
1199} 1208}
@@ -1207,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1207 struct mm_struct *mm; 1216 struct mm_struct *mm;
1208 struct cpuset *cs = cgroup_cs(cont); 1217 struct cpuset *cs = cgroup_cs(cont);
1209 struct cpuset *oldcs = cgroup_cs(oldcont); 1218 struct cpuset *oldcs = cgroup_cs(oldcont);
1219 int err;
1210 1220
1211 mutex_lock(&callback_mutex); 1221 mutex_lock(&callback_mutex);
1212 guarantee_online_cpus(cs, &cpus); 1222 guarantee_online_cpus(cs, &cpus);
1213 set_cpus_allowed_ptr(tsk, &cpus); 1223 err = set_cpus_allowed_ptr(tsk, &cpus);
1214 mutex_unlock(&callback_mutex); 1224 mutex_unlock(&callback_mutex);
1225 if (err)
1226 return;
1215 1227
1216 from = oldcs->mems_allowed; 1228 from = oldcs->mems_allowed;
1217 to = cs->mems_allowed; 1229 to = cs->mems_allowed;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..97747cdd37c9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
180 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 182 k->rt.nr_cpus_allowed = 1;
183 k->flags |= PF_THREAD_BOUND;
183} 184}
184EXPORT_SYMBOL(kthread_bind); 185EXPORT_SYMBOL(kthread_bind);
185 186
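
Taken together, the PF_THREAD_BOUND hunks above (include/linux/sched.h, kernel/cpuset.c, kernel/kthread.c, plus the set_cpus_allowed_ptr() check further down) mean that once kthread_bind() has pinned a kthread, later attempts by other tasks to change its affinity fail. A toy userspace model of that guard, with hypothetical names and simplified logic (the real check also exempts the task itself, p != current):

#include <stdio.h>
#include <errno.h>

/* Toy model of the PF_THREAD_BOUND guard; names are hypothetical. */
#define MODEL_PF_THREAD_BOUND	0x04000000

struct task_model {
	unsigned int flags;
	unsigned long cpus_allowed;	/* one bit per cpu */
};

static void model_kthread_bind(struct task_model *k, int cpu)
{
	k->cpus_allowed = 1UL << cpu;
	k->flags |= MODEL_PF_THREAD_BOUND;
}

static int model_set_cpus_allowed(struct task_model *p, unsigned long new_mask)
{
	/* A bound kthread may not have its affinity changed from outside. */
	if ((p->flags & MODEL_PF_THREAD_BOUND) && p->cpus_allowed != new_mask)
		return -EINVAL;
	p->cpus_allowed = new_mask;
	return 0;
}

int main(void)
{
	struct task_model k = { 0, ~0UL };

	model_kthread_bind(&k, 1);
	/* Trying to widen the mask fails: prints -22 (-EINVAL on Linux). */
	printf("%d\n", model_set_cpus_allowed(&k, 0x3));
	return 0;
}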
diff --git a/kernel/sched.c b/kernel/sched.c
index 3aaa5c8cb421..c51d9fae8cd8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -74,6 +74,8 @@
74#include <asm/tlb.h> 74#include <asm/tlb.h>
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77#include "sched_cpupri.h"
78
77/* 79/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 80 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 81 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -289,15 +291,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 291static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 292/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 293static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 294#endif /* CONFIG_FAIR_GROUP_SCHED */
293 295
294#ifdef CONFIG_RT_GROUP_SCHED 296#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 297static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 298static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 299#endif /* CONFIG_RT_GROUP_SCHED */
298#else 300#else /* !CONFIG_FAIR_GROUP_SCHED */
299#define root_task_group init_task_group 301#define root_task_group init_task_group
300#endif 302#endif /* CONFIG_FAIR_GROUP_SCHED */
301 303
302/* task_group_lock serializes add/remove of task groups and also changes to 304/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 305 * a task group's cpu shares.
@@ -307,9 +309,9 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 309#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 310#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 311# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 312#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 313# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 314#endif /* CONFIG_USER_SCHED */
313 315
314/* 316/*
315 * A weight of 0 or 1 can cause arithmetics problems. 317 * A weight of 0 or 1 can cause arithmetics problems.
@@ -452,6 +454,9 @@ struct root_domain {
452 */ 454 */
453 cpumask_t rto_mask; 455 cpumask_t rto_mask;
454 atomic_t rto_count; 456 atomic_t rto_count;
457#ifdef CONFIG_SMP
458 struct cpupri cpupri;
459#endif
455}; 460};
456 461
457/* 462/*
@@ -526,6 +531,7 @@ struct rq {
526 int push_cpu; 531 int push_cpu;
527 /* cpu of this runqueue: */ 532 /* cpu of this runqueue: */
528 int cpu; 533 int cpu;
534 int online;
529 535
530 struct task_struct *migration_thread; 536 struct task_struct *migration_thread;
531 struct list_head migration_queue; 537 struct list_head migration_queue;
@@ -1313,15 +1319,15 @@ void wake_up_idle_cpu(int cpu)
1313 if (!tsk_is_polling(rq->idle)) 1319 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu); 1320 smp_send_reschedule(cpu);
1315} 1321}
1316#endif 1322#endif /* CONFIG_NO_HZ */
1317 1323
1318#else 1324#else /* !CONFIG_SMP */
1319static void __resched_task(struct task_struct *p, int tif_bit) 1325static void __resched_task(struct task_struct *p, int tif_bit)
1320{ 1326{
1321 assert_spin_locked(&task_rq(p)->lock); 1327 assert_spin_locked(&task_rq(p)->lock);
1322 set_tsk_thread_flag(p, tif_bit); 1328 set_tsk_thread_flag(p, tif_bit);
1323} 1329}
1324#endif 1330#endif /* CONFIG_SMP */
1325 1331
1326#if BITS_PER_LONG == 32 1332#if BITS_PER_LONG == 32
1327# define WMULT_CONST (~0UL) 1333# define WMULT_CONST (~0UL)
@@ -1481,16 +1487,8 @@ static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type); 1487static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu); 1488static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1489static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else /* CONFIG_SMP */
1485
1486#ifdef CONFIG_FAIR_GROUP_SCHED
1487static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1488{
1489}
1490#endif 1490#endif
1491 1491
1492#endif /* CONFIG_SMP */
1493
1494#include "sched_stats.h" 1492#include "sched_stats.h"
1495#include "sched_idletask.c" 1493#include "sched_idletask.c"
1496#include "sched_fair.c" 1494#include "sched_fair.c"
@@ -1500,6 +1498,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1500#endif 1498#endif
1501 1499
1502#define sched_class_highest (&rt_sched_class) 1500#define sched_class_highest (&rt_sched_class)
1501#define for_each_class(class) \
1502 for (class = sched_class_highest; class; class = class->next)
1503 1503
1504static inline void inc_load(struct rq *rq, const struct task_struct *p) 1504static inline void inc_load(struct rq *rq, const struct task_struct *p)
1505{ 1505{
@@ -1636,12 +1636,6 @@ inline int task_curr(const struct task_struct *p)
1636 return cpu_curr(task_cpu(p)) == p; 1636 return cpu_curr(task_cpu(p)) == p;
1637} 1637}
1638 1638
1639/* Used instead of source_load when we know the type == 0 */
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
1645static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1639static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1646{ 1640{
1647 set_task_rq(p, cpu); 1641 set_task_rq(p, cpu);
@@ -1670,6 +1664,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1670 1664
1671#ifdef CONFIG_SMP 1665#ifdef CONFIG_SMP
1672 1666
1667/* Used instead of source_load when we know the type == 0 */
1668static unsigned long weighted_cpuload(const int cpu)
1669{
1670 return cpu_rq(cpu)->load.weight;
1671}
1672
1673/* 1673/*
1674 * Is this task likely cache-hot: 1674 * Is this task likely cache-hot:
1675 */ 1675 */
@@ -2131,7 +2131,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2131 } 2131 }
2132 } 2132 }
2133 } 2133 }
2134#endif 2134#endif /* CONFIG_SCHEDSTATS */
2135 2135
2136out_activate: 2136out_activate:
2137#endif /* CONFIG_SMP */ 2137#endif /* CONFIG_SMP */
@@ -2331,7 +2331,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2331 notifier->ops->sched_out(notifier, next); 2331 notifier->ops->sched_out(notifier, next);
2332} 2332}
2333 2333
2334#else 2334#else /* !CONFIG_PREEMPT_NOTIFIERS */
2335 2335
2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2337{ 2337{
@@ -2343,7 +2343,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2343{ 2343{
2344} 2344}
2345 2345
2346#endif 2346#endif /* CONFIG_PREEMPT_NOTIFIERS */
2347 2347
2348/** 2348/**
2349 * prepare_task_switch - prepare to switch tasks 2349 * prepare_task_switch - prepare to switch tasks
@@ -3672,6 +3672,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3672 /* Earliest time when we have to do rebalance again */ 3672 /* Earliest time when we have to do rebalance again */
3673 unsigned long next_balance = jiffies + 60*HZ; 3673 unsigned long next_balance = jiffies + 60*HZ;
3674 int update_next_balance = 0; 3674 int update_next_balance = 0;
3675 int need_serialize;
3675 cpumask_t tmp; 3676 cpumask_t tmp;
3676 3677
3677 for_each_domain(cpu, sd) { 3678 for_each_domain(cpu, sd) {
@@ -3689,8 +3690,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3689 if (interval > HZ*NR_CPUS/10) 3690 if (interval > HZ*NR_CPUS/10)
3690 interval = HZ*NR_CPUS/10; 3691 interval = HZ*NR_CPUS/10;
3691 3692
3693 need_serialize = sd->flags & SD_SERIALIZE;
3692 3694
3693 if (sd->flags & SD_SERIALIZE) { 3695 if (need_serialize) {
3694 if (!spin_trylock(&balancing)) 3696 if (!spin_trylock(&balancing))
3695 goto out; 3697 goto out;
3696 } 3698 }
@@ -3706,7 +3708,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3706 } 3708 }
3707 sd->last_balance = jiffies; 3709 sd->last_balance = jiffies;
3708 } 3710 }
3709 if (sd->flags & SD_SERIALIZE) 3711 if (need_serialize)
3710 spin_unlock(&balancing); 3712 spin_unlock(&balancing);
3711out: 3713out:
3712 if (time_after(next_balance, sd->last_balance + interval)) { 3714 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -4070,6 +4072,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4070 prev->comm, prev->pid, preempt_count()); 4072 prev->comm, prev->pid, preempt_count());
4071 4073
4072 debug_show_held_locks(prev); 4074 debug_show_held_locks(prev);
4075 print_modules();
4073 if (irqs_disabled()) 4076 if (irqs_disabled())
4074 print_irqtrace_events(prev); 4077 print_irqtrace_events(prev);
4075 4078
@@ -4143,7 +4146,7 @@ asmlinkage void __sched schedule(void)
4143 struct task_struct *prev, *next; 4146 struct task_struct *prev, *next;
4144 unsigned long *switch_count; 4147 unsigned long *switch_count;
4145 struct rq *rq; 4148 struct rq *rq;
4146 int cpu; 4149 int cpu, hrtick = sched_feat(HRTICK);
4147 4150
4148need_resched: 4151need_resched:
4149 preempt_disable(); 4152 preempt_disable();
@@ -4158,7 +4161,8 @@ need_resched_nonpreemptible:
4158 4161
4159 schedule_debug(prev); 4162 schedule_debug(prev);
4160 4163
4161 hrtick_clear(rq); 4164 if (hrtick)
4165 hrtick_clear(rq);
4162 4166
4163 /* 4167 /*
4164 * Do the rq-clock update outside the rq lock: 4168 * Do the rq-clock update outside the rq lock:
@@ -4204,7 +4208,8 @@ need_resched_nonpreemptible:
4204 } else 4208 } else
4205 spin_unlock_irq(&rq->lock); 4209 spin_unlock_irq(&rq->lock);
4206 4210
4207 hrtick_set(rq); 4211 if (hrtick)
4212 hrtick_set(rq);
4208 4213
4209 if (unlikely(reacquire_kernel_lock(current) < 0)) 4214 if (unlikely(reacquire_kernel_lock(current) < 0))
4210 goto need_resched_nonpreemptible; 4215 goto need_resched_nonpreemptible;
@@ -5070,24 +5075,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5070 return sched_setaffinity(pid, &new_mask); 5075 return sched_setaffinity(pid, &new_mask);
5071} 5076}
5072 5077
5073/*
5074 * Represents all cpu's present in the system
5075 * In systems capable of hotplug, this map could dynamically grow
5076 * as new cpu's are detected in the system via any platform specific
5077 * method, such as ACPI for e.g.
5078 */
5079
5080cpumask_t cpu_present_map __read_mostly;
5081EXPORT_SYMBOL(cpu_present_map);
5082
5083#ifndef CONFIG_SMP
5084cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5085EXPORT_SYMBOL(cpu_online_map);
5086
5087cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5088EXPORT_SYMBOL(cpu_possible_map);
5089#endif
5090
5091long sched_getaffinity(pid_t pid, cpumask_t *mask) 5078long sched_getaffinity(pid_t pid, cpumask_t *mask)
5092{ 5079{
5093 struct task_struct *p; 5080 struct task_struct *p;
@@ -5571,6 +5558,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5571 goto out; 5558 goto out;
5572 } 5559 }
5573 5560
5561 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5562 !cpus_equal(p->cpus_allowed, *new_mask))) {
5563 ret = -EINVAL;
5564 goto out;
5565 }
5566
5574 if (p->sched_class->set_cpus_allowed) 5567 if (p->sched_class->set_cpus_allowed)
5575 p->sched_class->set_cpus_allowed(p, new_mask); 5568 p->sched_class->set_cpus_allowed(p, new_mask);
5576 else { 5569 else {
@@ -6058,6 +6051,36 @@ static void unregister_sched_domain_sysctl(void)
6058} 6051}
6059#endif 6052#endif
6060 6053
6054static void set_rq_online(struct rq *rq)
6055{
6056 if (!rq->online) {
6057 const struct sched_class *class;
6058
6059 cpu_set(rq->cpu, rq->rd->online);
6060 rq->online = 1;
6061
6062 for_each_class(class) {
6063 if (class->rq_online)
6064 class->rq_online(rq);
6065 }
6066 }
6067}
6068
6069static void set_rq_offline(struct rq *rq)
6070{
6071 if (rq->online) {
6072 const struct sched_class *class;
6073
6074 for_each_class(class) {
6075 if (class->rq_offline)
6076 class->rq_offline(rq);
6077 }
6078
6079 cpu_clear(rq->cpu, rq->rd->online);
6080 rq->online = 0;
6081 }
6082}
6083
6061/* 6084/*
6062 * migration_call - callback that gets triggered when a CPU is added. 6085 * migration_call - callback that gets triggered when a CPU is added.
6063 * Here we can start up the necessary migration thread for the new CPU. 6086 * Here we can start up the necessary migration thread for the new CPU.
@@ -6095,7 +6118,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6095 spin_lock_irqsave(&rq->lock, flags); 6118 spin_lock_irqsave(&rq->lock, flags);
6096 if (rq->rd) { 6119 if (rq->rd) {
6097 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6120 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6098 cpu_set(cpu, rq->rd->online); 6121
6122 set_rq_online(rq);
6099 } 6123 }
6100 spin_unlock_irqrestore(&rq->lock, flags); 6124 spin_unlock_irqrestore(&rq->lock, flags);
6101 break; 6125 break;
@@ -6156,7 +6180,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6156 spin_lock_irqsave(&rq->lock, flags); 6180 spin_lock_irqsave(&rq->lock, flags);
6157 if (rq->rd) { 6181 if (rq->rd) {
6158 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6182 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6159 cpu_clear(cpu, rq->rd->online); 6183 set_rq_offline(rq);
6160 } 6184 }
6161 spin_unlock_irqrestore(&rq->lock, flags); 6185 spin_unlock_irqrestore(&rq->lock, flags);
6162 break; 6186 break;
@@ -6190,6 +6214,28 @@ void __init migration_init(void)
6190 6214
6191#ifdef CONFIG_SCHED_DEBUG 6215#ifdef CONFIG_SCHED_DEBUG
6192 6216
6217static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6218{
6219 switch (lvl) {
6220 case SD_LV_NONE:
6221 return "NONE";
6222 case SD_LV_SIBLING:
6223 return "SIBLING";
6224 case SD_LV_MC:
6225 return "MC";
6226 case SD_LV_CPU:
6227 return "CPU";
6228 case SD_LV_NODE:
6229 return "NODE";
6230 case SD_LV_ALLNODES:
6231 return "ALLNODES";
6232 case SD_LV_MAX:
6233 return "MAX";
6234
6235 }
6236 return "MAX";
6237}
6238
6193static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6239static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6194 cpumask_t *groupmask) 6240 cpumask_t *groupmask)
6195{ 6241{
@@ -6209,7 +6255,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6209 return -1; 6255 return -1;
6210 } 6256 }
6211 6257
6212 printk(KERN_CONT "span %s\n", str); 6258 printk(KERN_CONT "span %s level %s\n",
6259 str, sd_level_to_string(sd->level));
6213 6260
6214 if (!cpu_isset(cpu, sd->span)) { 6261 if (!cpu_isset(cpu, sd->span)) {
6215 printk(KERN_ERR "ERROR: domain->span does not contain " 6262 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6293,9 +6340,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6293 } 6340 }
6294 kfree(groupmask); 6341 kfree(groupmask);
6295} 6342}
6296#else 6343#else /* !CONFIG_SCHED_DEBUG */
6297# define sched_domain_debug(sd, cpu) do { } while (0) 6344# define sched_domain_debug(sd, cpu) do { } while (0)
6298#endif 6345#endif /* CONFIG_SCHED_DEBUG */
6299 6346
6300static int sd_degenerate(struct sched_domain *sd) 6347static int sd_degenerate(struct sched_domain *sd)
6301{ 6348{
@@ -6355,20 +6402,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6355static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6402static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6356{ 6403{
6357 unsigned long flags; 6404 unsigned long flags;
6358 const struct sched_class *class;
6359 6405
6360 spin_lock_irqsave(&rq->lock, flags); 6406 spin_lock_irqsave(&rq->lock, flags);
6361 6407
6362 if (rq->rd) { 6408 if (rq->rd) {
6363 struct root_domain *old_rd = rq->rd; 6409 struct root_domain *old_rd = rq->rd;
6364 6410
6365 for (class = sched_class_highest; class; class = class->next) { 6411 if (cpu_isset(rq->cpu, old_rd->online))
6366 if (class->leave_domain) 6412 set_rq_offline(rq);
6367 class->leave_domain(rq);
6368 }
6369 6413
6370 cpu_clear(rq->cpu, old_rd->span); 6414 cpu_clear(rq->cpu, old_rd->span);
6371 cpu_clear(rq->cpu, old_rd->online);
6372 6415
6373 if (atomic_dec_and_test(&old_rd->refcount)) 6416 if (atomic_dec_and_test(&old_rd->refcount))
6374 kfree(old_rd); 6417 kfree(old_rd);
@@ -6379,12 +6422,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6379 6422
6380 cpu_set(rq->cpu, rd->span); 6423 cpu_set(rq->cpu, rd->span);
6381 if (cpu_isset(rq->cpu, cpu_online_map)) 6424 if (cpu_isset(rq->cpu, cpu_online_map))
6382 cpu_set(rq->cpu, rd->online); 6425 set_rq_online(rq);
6383
6384 for (class = sched_class_highest; class; class = class->next) {
6385 if (class->join_domain)
6386 class->join_domain(rq);
6387 }
6388 6426
6389 spin_unlock_irqrestore(&rq->lock, flags); 6427 spin_unlock_irqrestore(&rq->lock, flags);
6390} 6428}
@@ -6395,6 +6433,8 @@ static void init_rootdomain(struct root_domain *rd)
6395 6433
6396 cpus_clear(rd->span); 6434 cpus_clear(rd->span);
6397 cpus_clear(rd->online); 6435 cpus_clear(rd->online);
6436
6437 cpupri_init(&rd->cpupri);
6398} 6438}
6399 6439
6400static void init_defrootdomain(void) 6440static void init_defrootdomain(void)
@@ -6589,7 +6629,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6589 cpus_or(*span, *span, *nodemask); 6629 cpus_or(*span, *span, *nodemask);
6590 } 6630 }
6591} 6631}
6592#endif 6632#endif /* CONFIG_NUMA */
6593 6633
6594int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6634int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6595 6635
@@ -6608,7 +6648,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6608 *sg = &per_cpu(sched_group_cpus, cpu); 6648 *sg = &per_cpu(sched_group_cpus, cpu);
6609 return cpu; 6649 return cpu;
6610} 6650}
6611#endif 6651#endif /* CONFIG_SCHED_SMT */
6612 6652
6613/* 6653/*
6614 * multi-core sched-domains: 6654 * multi-core sched-domains:
@@ -6616,7 +6656,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6616#ifdef CONFIG_SCHED_MC 6656#ifdef CONFIG_SCHED_MC
6617static DEFINE_PER_CPU(struct sched_domain, core_domains); 6657static DEFINE_PER_CPU(struct sched_domain, core_domains);
6618static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6658static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6619#endif 6659#endif /* CONFIG_SCHED_MC */
6620 6660
6621#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6661#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6622static int 6662static int
@@ -6718,7 +6758,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6718 sg = sg->next; 6758 sg = sg->next;
6719 } while (sg != group_head); 6759 } while (sg != group_head);
6720} 6760}
6721#endif 6761#endif /* CONFIG_NUMA */
6722 6762
6723#ifdef CONFIG_NUMA 6763#ifdef CONFIG_NUMA
6724/* Free memory allocated for various sched_group structures */ 6764/* Free memory allocated for various sched_group structures */
@@ -6755,11 +6795,11 @@ next_sg:
6755 sched_group_nodes_bycpu[cpu] = NULL; 6795 sched_group_nodes_bycpu[cpu] = NULL;
6756 } 6796 }
6757} 6797}
6758#else 6798#else /* !CONFIG_NUMA */
6759static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 6799static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6760{ 6800{
6761} 6801}
6762#endif 6802#endif /* CONFIG_NUMA */
6763 6803
6764/* 6804/*
6765 * Initialize sched groups cpu_power. 6805 * Initialize sched groups cpu_power.
@@ -7468,7 +7508,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7468#endif 7508#endif
7469 return err; 7509 return err;
7470} 7510}
7471#endif 7511#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7472 7512
7473/* 7513/*
7474 * Force a reinitialization of the sched domains hierarchy. The domains 7514 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7479,21 +7519,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7479static int update_sched_domains(struct notifier_block *nfb, 7519static int update_sched_domains(struct notifier_block *nfb,
7480 unsigned long action, void *hcpu) 7520 unsigned long action, void *hcpu)
7481{ 7521{
7522 int cpu = (int)(long)hcpu;
7523
7482 switch (action) { 7524 switch (action) {
7483 case CPU_UP_PREPARE:
7484 case CPU_UP_PREPARE_FROZEN:
7485 case CPU_DOWN_PREPARE: 7525 case CPU_DOWN_PREPARE:
7486 case CPU_DOWN_PREPARE_FROZEN: 7526 case CPU_DOWN_PREPARE_FROZEN:
7527 disable_runtime(cpu_rq(cpu));
7528 /* fall-through */
7529 case CPU_UP_PREPARE:
7530 case CPU_UP_PREPARE_FROZEN:
7487 detach_destroy_domains(&cpu_online_map); 7531 detach_destroy_domains(&cpu_online_map);
7488 free_sched_domains(); 7532 free_sched_domains();
7489 return NOTIFY_OK; 7533 return NOTIFY_OK;
7490 7534
7491 case CPU_UP_CANCELED: 7535
7492 case CPU_UP_CANCELED_FROZEN:
7493 case CPU_DOWN_FAILED: 7536 case CPU_DOWN_FAILED:
7494 case CPU_DOWN_FAILED_FROZEN: 7537 case CPU_DOWN_FAILED_FROZEN:
7495 case CPU_ONLINE: 7538 case CPU_ONLINE:
7496 case CPU_ONLINE_FROZEN: 7539 case CPU_ONLINE_FROZEN:
7540 enable_runtime(cpu_rq(cpu));
7541 /* fall-through */
7542 case CPU_UP_CANCELED:
7543 case CPU_UP_CANCELED_FROZEN:
7497 case CPU_DEAD: 7544 case CPU_DEAD:
7498 case CPU_DEAD_FROZEN: 7545 case CPU_DEAD_FROZEN:
7499 /* 7546 /*
@@ -7693,8 +7740,8 @@ void __init sched_init(void)
7693 7740
7694 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7741 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7695 ptr += nr_cpu_ids * sizeof(void **); 7742 ptr += nr_cpu_ids * sizeof(void **);
7696#endif 7743#endif /* CONFIG_USER_SCHED */
7697#endif 7744#endif /* CONFIG_FAIR_GROUP_SCHED */
7698#ifdef CONFIG_RT_GROUP_SCHED 7745#ifdef CONFIG_RT_GROUP_SCHED
7699 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7746 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7700 ptr += nr_cpu_ids * sizeof(void **); 7747 ptr += nr_cpu_ids * sizeof(void **);
@@ -7708,8 +7755,8 @@ void __init sched_init(void)
7708 7755
7709 root_task_group.rt_rq = (struct rt_rq **)ptr; 7756 root_task_group.rt_rq = (struct rt_rq **)ptr;
7710 ptr += nr_cpu_ids * sizeof(void **); 7757 ptr += nr_cpu_ids * sizeof(void **);
7711#endif 7758#endif /* CONFIG_USER_SCHED */
7712#endif 7759#endif /* CONFIG_RT_GROUP_SCHED */
7713 } 7760 }
7714 7761
7715#ifdef CONFIG_SMP 7762#ifdef CONFIG_SMP
@@ -7725,8 +7772,8 @@ void __init sched_init(void)
7725#ifdef CONFIG_USER_SCHED 7772#ifdef CONFIG_USER_SCHED
7726 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7773 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7727 global_rt_period(), RUNTIME_INF); 7774 global_rt_period(), RUNTIME_INF);
7728#endif 7775#endif /* CONFIG_USER_SCHED */
7729#endif 7776#endif /* CONFIG_RT_GROUP_SCHED */
7730 7777
7731#ifdef CONFIG_GROUP_SCHED 7778#ifdef CONFIG_GROUP_SCHED
7732 list_add(&init_task_group.list, &task_groups); 7779 list_add(&init_task_group.list, &task_groups);
@@ -7736,8 +7783,8 @@ void __init sched_init(void)
7736 INIT_LIST_HEAD(&root_task_group.children); 7783 INIT_LIST_HEAD(&root_task_group.children);
7737 init_task_group.parent = &root_task_group; 7784 init_task_group.parent = &root_task_group;
7738 list_add(&init_task_group.siblings, &root_task_group.children); 7785 list_add(&init_task_group.siblings, &root_task_group.children);
7739#endif 7786#endif /* CONFIG_USER_SCHED */
7740#endif 7787#endif /* CONFIG_GROUP_SCHED */
7741 7788
7742 for_each_possible_cpu(i) { 7789 for_each_possible_cpu(i) {
7743 struct rq *rq; 7790 struct rq *rq;
@@ -7817,6 +7864,7 @@ void __init sched_init(void)
7817 rq->next_balance = jiffies; 7864 rq->next_balance = jiffies;
7818 rq->push_cpu = 0; 7865 rq->push_cpu = 0;
7819 rq->cpu = i; 7866 rq->cpu = i;
7867 rq->online = 0;
7820 rq->migration_thread = NULL; 7868 rq->migration_thread = NULL;
7821 INIT_LIST_HEAD(&rq->migration_queue); 7869 INIT_LIST_HEAD(&rq->migration_queue);
7822 rq_attach_root(rq, &def_root_domain); 7870 rq_attach_root(rq, &def_root_domain);
@@ -8056,7 +8104,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8056{ 8104{
8057 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8105 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8058} 8106}
8059#else 8107#else /* !CONFG_FAIR_GROUP_SCHED */
8060static inline void free_fair_sched_group(struct task_group *tg) 8108static inline void free_fair_sched_group(struct task_group *tg)
8061{ 8109{
8062} 8110}
@@ -8074,7 +8122,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8074static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8122static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8075{ 8123{
8076} 8124}
8077#endif 8125#endif /* CONFIG_FAIR_GROUP_SCHED */
8078 8126
8079#ifdef CONFIG_RT_GROUP_SCHED 8127#ifdef CONFIG_RT_GROUP_SCHED
8080static void free_rt_sched_group(struct task_group *tg) 8128static void free_rt_sched_group(struct task_group *tg)
@@ -8145,7 +8193,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8145{ 8193{
8146 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8194 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8147} 8195}
8148#else 8196#else /* !CONFIG_RT_GROUP_SCHED */
8149static inline void free_rt_sched_group(struct task_group *tg) 8197static inline void free_rt_sched_group(struct task_group *tg)
8150{ 8198{
8151} 8199}
@@ -8163,7 +8211,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8163static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8211static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8164{ 8212{
8165} 8213}
8166#endif 8214#endif /* CONFIG_RT_GROUP_SCHED */
8167 8215
8168#ifdef CONFIG_GROUP_SCHED 8216#ifdef CONFIG_GROUP_SCHED
8169static void free_sched_group(struct task_group *tg) 8217static void free_sched_group(struct task_group *tg)
@@ -8274,7 +8322,7 @@ void sched_move_task(struct task_struct *tsk)
8274 8322
8275 task_rq_unlock(rq, &flags); 8323 task_rq_unlock(rq, &flags);
8276} 8324}
8277#endif 8325#endif /* CONFIG_GROUP_SCHED */
8278 8326
8279#ifdef CONFIG_FAIR_GROUP_SCHED 8327#ifdef CONFIG_FAIR_GROUP_SCHED
8280static void set_se_shares(struct sched_entity *se, unsigned long shares) 8328static void set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -8374,7 +8422,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8374#ifdef CONFIG_CGROUP_SCHED 8422#ifdef CONFIG_CGROUP_SCHED
8375static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8423static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8376{ 8424{
8377 struct task_group *tgi, *parent = tg ? tg->parent : NULL; 8425 struct task_group *tgi, *parent = tg->parent;
8378 unsigned long total = 0; 8426 unsigned long total = 0;
8379 8427
8380 if (!parent) { 8428 if (!parent) {
@@ -8398,7 +8446,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8398 } 8446 }
8399 rcu_read_unlock(); 8447 rcu_read_unlock();
8400 8448
8401 return total + to_ratio(period, runtime) < 8449 return total + to_ratio(period, runtime) <=
8402 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8450 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8403 parent->rt_bandwidth.rt_runtime); 8451 parent->rt_bandwidth.rt_runtime);
8404} 8452}
@@ -8515,16 +8563,21 @@ long sched_group_rt_period(struct task_group *tg)
8515 8563
8516static int sched_rt_global_constraints(void) 8564static int sched_rt_global_constraints(void)
8517{ 8565{
8566 struct task_group *tg = &root_task_group;
8567 u64 rt_runtime, rt_period;
8518 int ret = 0; 8568 int ret = 0;
8519 8569
8570 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8571 rt_runtime = tg->rt_bandwidth.rt_runtime;
8572
8520 mutex_lock(&rt_constraints_mutex); 8573 mutex_lock(&rt_constraints_mutex);
8521 if (!__rt_schedulable(NULL, 1, 0)) 8574 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8522 ret = -EINVAL; 8575 ret = -EINVAL;
8523 mutex_unlock(&rt_constraints_mutex); 8576 mutex_unlock(&rt_constraints_mutex);
8524 8577
8525 return ret; 8578 return ret;
8526} 8579}
8527#else 8580#else /* !CONFIG_RT_GROUP_SCHED */
8528static int sched_rt_global_constraints(void) 8581static int sched_rt_global_constraints(void)
8529{ 8582{
8530 unsigned long flags; 8583 unsigned long flags;
@@ -8542,7 +8595,7 @@ static int sched_rt_global_constraints(void)
8542 8595
8543 return 0; 8596 return 0;
8544} 8597}
8545#endif 8598#endif /* CONFIG_RT_GROUP_SCHED */
8546 8599
8547int sched_rt_handler(struct ctl_table *table, int write, 8600int sched_rt_handler(struct ctl_table *table, int write,
8548 struct file *filp, void __user *buffer, size_t *lenp, 8601 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8650,7 +8703,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8650 8703
8651 return (u64) tg->shares; 8704 return (u64) tg->shares;
8652} 8705}
8653#endif 8706#endif /* CONFIG_FAIR_GROUP_SCHED */
8654 8707
8655#ifdef CONFIG_RT_GROUP_SCHED 8708#ifdef CONFIG_RT_GROUP_SCHED
8656static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8709static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8674,7 +8727,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8674{ 8727{
8675 return sched_group_rt_period(cgroup_tg(cgrp)); 8728 return sched_group_rt_period(cgroup_tg(cgrp));
8676} 8729}
8677#endif 8730#endif /* CONFIG_RT_GROUP_SCHED */
8678 8731
8679static struct cftype cpu_files[] = { 8732static struct cftype cpu_files[] = {
8680#ifdef CONFIG_FAIR_GROUP_SCHED 8733#ifdef CONFIG_FAIR_GROUP_SCHED
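
The join_domain/leave_domain to rq_online/rq_offline conversion in the kernel/sched.c hunks above boils down to walking the sched_class list and calling an optional per-class hook once a runqueue changes online state. A self-contained model of that walk, with hypothetical struct and function names rather than kernel code:

#include <stdio.h>
#include <stddef.h>

/* Toy model of the for_each_class() walk done by set_rq_online() above. */
struct rq_model { int cpu; int online; };

struct class_model {
	const char *name;
	const struct class_model *next;
	void (*rq_online)(struct rq_model *rq);
};

static void rt_rq_online(struct rq_model *rq)
{
	printf("rt: cpu%d online\n", rq->cpu);	/* e.g. update cpupri here */
}

static const struct class_model fair_class = { "fair", NULL, NULL };
static const struct class_model rt_class = { "rt", &fair_class, rt_rq_online };

#define model_class_highest (&rt_class)
#define for_each_model_class(class) \
	for (class = model_class_highest; class; class = class->next)

static void model_set_rq_online(struct rq_model *rq)
{
	if (!rq->online) {
		const struct class_model *class;

		rq->online = 1;
		for_each_model_class(class) {
			if (class->rq_online)
				class->rq_online(rq);
		}
	}
}

int main(void)
{
	struct rq_model rq = { .cpu = 2, .online = 0 };

	model_set_rq_online(&rq);	/* prints "rt: cpu2 online" */
	return 0;
}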
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
1/*
2 * kernel/sched_cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include "sched_cpupri.h"
31
32/* Convert between a 140 based task->prio, and our 102 based cpupri */
33static int convert_prio(int prio)
34{
35 int cpupri;
36
37 if (prio == CPUPRI_INVALID)
38 cpupri = CPUPRI_INVALID;
39 else if (prio == MAX_PRIO)
40 cpupri = CPUPRI_IDLE;
41 else if (prio >= MAX_RT_PRIO)
42 cpupri = CPUPRI_NORMAL;
43 else
44 cpupri = MAX_RT_PRIO - prio + 1;
45
46 return cpupri;
47}
48
49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53
54/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context
57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs
59 *
60 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current
65 * priority configuration.
66 *
67 * Returns: (int)bool - CPUs were found
68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask)
71{
72 int idx = 0;
73 int task_pri = convert_prio(p->prio);
74
75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78
79 if (idx >= task_pri)
80 break;
81
82 cpus_and(mask, p->cpus_allowed, vec->mask);
83
84 if (cpus_empty(mask))
85 continue;
86
87 *lowest_mask = mask;
88 return 1;
89 }
90
91 return 0;
92}
93
94/**
95 * cpupri_set - update the cpu priority setting
96 * @cp: The cpupri context
97 * @cpu: The target cpu
98 * @pri: The priority (INVALID-RT99) to assign to this CPU
99 *
100 * Note: Assumes cpu_rq(cpu)->lock is locked
101 *
102 * Returns: (void)
103 */
104void cpupri_set(struct cpupri *cp, int cpu, int newpri)
105{
106 int *currpri = &cp->cpu_to_pri[cpu];
107 int oldpri = *currpri;
108 unsigned long flags;
109
110 newpri = convert_prio(newpri);
111
112 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
113
114 if (newpri == oldpri)
115 return;
116
117 /*
118 * If the cpu was currently mapped to a different value, we
119 * first need to unmap the old value
120 */
121 if (likely(oldpri != CPUPRI_INVALID)) {
122 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
123
124 spin_lock_irqsave(&vec->lock, flags);
125
126 vec->count--;
127 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask);
130
131 spin_unlock_irqrestore(&vec->lock, flags);
132 }
133
134 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136
137 spin_lock_irqsave(&vec->lock, flags);
138
139 cpu_set(cpu, vec->mask);
140 vec->count++;
141 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active);
143
144 spin_unlock_irqrestore(&vec->lock, flags);
145 }
146
147 *currpri = newpri;
148}
149
150/**
151 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context
153 *
154 * Returns: (void)
155 */
156void cpupri_init(struct cpupri *cp)
157{
158 int i;
159
160 memset(cp, 0, sizeof(*cp));
161
162 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
163 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
164
165 spin_lock_init(&vec->lock);
166 vec->count = 0;
167 cpus_clear(vec->mask);
168 }
169
170 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID;
172}
173
174
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..f25811b0f931
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8
9#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */
13
14struct cpupri_vec {
15 spinlock_t lock;
16 int count;
17 cpumask_t mask;
18};
19
20struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS];
24};
25
26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp);
31#else
32#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0)
34#endif
35
36#endif /* _LINUX_CPUPRI_H */
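
To make the two-level lookup described in sched_cpupri.c concrete, here is a small self-contained userspace model of the cpupri idea (hypothetical and heavily simplified: 8 CPUs, no per-vector locking, priority indices following convert_prio() from the new file above):

#include <stdio.h>

/*
 * Userspace model of cpupri (hypothetical, simplified: 8 CPUs, no locking).
 * Index 0 = IDLE, 1 = NORMAL, 2..101 = RT0..RT99, as per convert_prio().
 */
#define MODEL_NR_CPUS	8
#define MODEL_NR_PRIS	102

static unsigned char pri_to_cpus[MODEL_NR_PRIS];	/* one bit per cpu */
static int cpu_to_pri[MODEL_NR_CPUS];

static void model_cpupri_set(int cpu, int newpri)
{
	int oldpri = cpu_to_pri[cpu];

	if (oldpri >= 0)
		pri_to_cpus[oldpri] &= ~(1u << cpu);	/* unmap the old value */
	pri_to_cpus[newpri] |= 1u << cpu;
	cpu_to_pri[cpu] = newpri;
}

/* Return CPUs running strictly below task_pri, lowest priority class first. */
static unsigned char model_cpupri_find(int task_pri, unsigned char affinity)
{
	int idx;

	for (idx = 0; idx < task_pri; idx++) {
		unsigned char mask = pri_to_cpus[idx] & affinity;

		if (mask)
			return mask;
	}
	return 0;
}

int main(void)
{
	int i;

	for (i = 0; i < MODEL_NR_CPUS; i++) {
		cpu_to_pri[i] = -1;
		model_cpupri_set(i, 1);		/* all CPUs start out NORMAL */
	}
	model_cpupri_set(3, 52);		/* cpu3 now runs an RT50 task */

	/* Best targets for an RT80 task (index 82): the NORMAL CPUs, not cpu3. */
	printf("lowest mask: 0x%02x\n", model_cpupri_find(82, 0xff));	/* 0xf7 */
	return 0;
}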
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8bb713040ac9..8e077b9c91cb 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
119 struct sched_entity *last; 119 struct sched_entity *last;
120 unsigned long flags; 120 unsigned long flags;
121 121
122#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) 122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
124#else
125 char path[128] = ""; 123 char path[128] = "";
126 struct cgroup *cgroup = NULL; 124 struct cgroup *cgroup = NULL;
127 struct task_group *tg = cfs_rq->tg; 125 struct task_group *tg = cfs_rq->tg;
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
133 cgroup_path(cgroup, path, sizeof(path)); 131 cgroup_path(cgroup, path, sizeof(path));
134 132
135 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
134#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 136#endif
137 137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
@@ -169,6 +169,39 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
169 cfs_rq->nr_spread_over); 169 cfs_rq->nr_spread_over);
170} 170}
171 171
172void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
173{
174#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
175 char path[128] = "";
176 struct cgroup *cgroup = NULL;
177 struct task_group *tg = rt_rq->tg;
178
179 if (tg)
180 cgroup = tg->css.cgroup;
181
182 if (cgroup)
183 cgroup_path(cgroup, path, sizeof(path));
184
185 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
186#else
187 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
188#endif
189
190
191#define P(x) \
192 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
193#define PN(x) \
194 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
195
196 P(rt_nr_running);
197 P(rt_throttled);
198 PN(rt_time);
199 PN(rt_runtime);
200
201#undef PN
202#undef P
203}
204
172static void print_cpu(struct seq_file *m, int cpu) 205static void print_cpu(struct seq_file *m, int cpu)
173{ 206{
174 struct rq *rq = &per_cpu(runqueues, cpu); 207 struct rq *rq = &per_cpu(runqueues, cpu);
@@ -208,6 +241,7 @@ static void print_cpu(struct seq_file *m, int cpu)
208#undef PN 241#undef PN
209 242
210 print_cfs_stats(m, cpu); 243 print_cfs_stats(m, cpu);
244 print_rt_stats(m, cpu);
211 245
212 print_rq(m, rq, cpu); 246 print_rq(m, rq, cpu);
213} 247}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..1fe4c65a8170 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1275,23 +1275,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1275 struct task_struct *p = NULL; 1275 struct task_struct *p = NULL;
1276 struct sched_entity *se; 1276 struct sched_entity *se;
1277 1277
1278 if (next == &cfs_rq->tasks) 1278 while (next != &cfs_rq->tasks) {
1279 return NULL;
1280
1281 /* Skip over entities that are not tasks */
1282 do {
1283 se = list_entry(next, struct sched_entity, group_node); 1279 se = list_entry(next, struct sched_entity, group_node);
1284 next = next->next; 1280 next = next->next;
1285 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1286 1281
1287 if (next == &cfs_rq->tasks) 1282 /* Skip over entities that are not tasks */
1288 return NULL; 1283 if (entity_is_task(se)) {
1284 p = task_of(se);
1285 break;
1286 }
1287 }
1289 1288
1290 cfs_rq->balance_iterator = next; 1289 cfs_rq->balance_iterator = next;
1291
1292 if (entity_is_task(se))
1293 p = task_of(se);
1294
1295 return p; 1290 return p;
1296} 1291}
1297 1292
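
The rewritten iterator above simply scans forward for the next list entry that is a task, saving its position as it goes. A standalone model of that loop, using hypothetical types rather than the kernel list API:

#include <stdio.h>
#include <stddef.h>

/* Standalone model of the reworked __load_balance_iterator() loop above. */
struct ent { int is_task; int id; struct ent *next; };

/* Walk from *next until the sentinel 'head', returning the first task. */
static struct ent *next_task(struct ent *head, struct ent **next)
{
	while (*next != head) {
		struct ent *se = *next;

		*next = se->next;		/* advance the saved iterator */
		if (se->is_task)		/* skip entities that are not tasks */
			return se;
	}
	return NULL;
}

int main(void)
{
	struct ent head, group = { 0, -1, NULL }, task = { 1, 7, NULL };
	struct ent *it, *p;

	/* circular list: head -> group (not a task) -> task -> head */
	head.next = &group;
	group.next = &task;
	task.next = &head;

	it = head.next;
	p = next_task(&head, &it);
	printf("%d\n", p ? p->id : -1);		/* prints 7 */
	return 0;
}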
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1c7283cb9581..62b39ca92ebd 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -6,5 +6,3 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
6SCHED_FEAT(SYNC_WAKEUPS, 1) 6SCHED_FEAT(SYNC_WAKEUPS, 1)
7SCHED_FEAT(HRTICK, 1) 7SCHED_FEAT(HRTICK, 1)
8SCHED_FEAT(DOUBLE_TICK, 0) 8SCHED_FEAT(DOUBLE_TICK, 0)
9SCHED_FEAT(NORMALIZED_SLEEPER, 1)
10SCHED_FEAT(DEADLINE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0f3c19197fa4..bd90c8bb0739 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
12 12
13static inline void rt_set_overload(struct rq *rq) 13static inline void rt_set_overload(struct rq *rq)
14{ 14{
15 if (!rq->online)
16 return;
17
15 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /* 19 /*
17 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
26 29
27static inline void rt_clear_overload(struct rq *rq) 30static inline void rt_clear_overload(struct rq *rq)
28{ 31{
32 if (!rq->online)
33 return;
34
29 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -222,47 +228,8 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
222 228
223#endif 229#endif
224 230
225static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
226{
227 int i, idle = 1;
228 cpumask_t span;
229
230 if (rt_b->rt_runtime == RUNTIME_INF)
231 return 1;
232
233 span = sched_rt_period_mask();
234 for_each_cpu_mask(i, span) {
235 int enqueue = 0;
236 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
237 struct rq *rq = rq_of_rt_rq(rt_rq);
238
239 spin_lock(&rq->lock);
240 if (rt_rq->rt_time) {
241 u64 runtime;
242
243 spin_lock(&rt_rq->rt_runtime_lock);
244 runtime = rt_rq->rt_runtime;
245 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
246 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
247 rt_rq->rt_throttled = 0;
248 enqueue = 1;
249 }
250 if (rt_rq->rt_time || rt_rq->rt_nr_running)
251 idle = 0;
252 spin_unlock(&rt_rq->rt_runtime_lock);
253 } else if (rt_rq->rt_nr_running)
254 idle = 0;
255
256 if (enqueue)
257 sched_rt_rq_enqueue(rt_rq);
258 spin_unlock(&rq->lock);
259 }
260
261 return idle;
262}
263
264#ifdef CONFIG_SMP 231#ifdef CONFIG_SMP
265static int balance_runtime(struct rt_rq *rt_rq) 232static int do_balance_runtime(struct rt_rq *rt_rq)
266{ 233{
267 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 234 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
268 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 235 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
@@ -281,6 +248,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
281 continue; 248 continue;
282 249
283 spin_lock(&iter->rt_runtime_lock); 250 spin_lock(&iter->rt_runtime_lock);
251 if (iter->rt_runtime == RUNTIME_INF)
252 goto next;
253
284 diff = iter->rt_runtime - iter->rt_time; 254 diff = iter->rt_runtime - iter->rt_time;
285 if (diff > 0) { 255 if (diff > 0) {
286 do_div(diff, weight); 256 do_div(diff, weight);
@@ -294,14 +264,165 @@ static int balance_runtime(struct rt_rq *rt_rq)
294 break; 264 break;
295 } 265 }
296 } 266 }
267next:
297 spin_unlock(&iter->rt_runtime_lock); 268 spin_unlock(&iter->rt_runtime_lock);
298 } 269 }
299 spin_unlock(&rt_b->rt_runtime_lock); 270 spin_unlock(&rt_b->rt_runtime_lock);
300 271
301 return more; 272 return more;
302} 273}
274
275static void __disable_runtime(struct rq *rq)
276{
277 struct root_domain *rd = rq->rd;
278 struct rt_rq *rt_rq;
279
280 if (unlikely(!scheduler_running))
281 return;
282
283 for_each_leaf_rt_rq(rt_rq, rq) {
284 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
285 s64 want;
286 int i;
287
288 spin_lock(&rt_b->rt_runtime_lock);
289 spin_lock(&rt_rq->rt_runtime_lock);
290 if (rt_rq->rt_runtime == RUNTIME_INF ||
291 rt_rq->rt_runtime == rt_b->rt_runtime)
292 goto balanced;
293 spin_unlock(&rt_rq->rt_runtime_lock);
294
295 want = rt_b->rt_runtime - rt_rq->rt_runtime;
296
297 for_each_cpu_mask(i, rd->span) {
298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
299 s64 diff;
300
301 if (iter == rt_rq)
302 continue;
303
304 spin_lock(&iter->rt_runtime_lock);
305 if (want > 0) {
306 diff = min_t(s64, iter->rt_runtime, want);
307 iter->rt_runtime -= diff;
308 want -= diff;
309 } else {
310 iter->rt_runtime -= want;
311 want -= want;
312 }
313 spin_unlock(&iter->rt_runtime_lock);
314
315 if (!want)
316 break;
317 }
318
319 spin_lock(&rt_rq->rt_runtime_lock);
320 BUG_ON(want);
321balanced:
322 rt_rq->rt_runtime = RUNTIME_INF;
323 spin_unlock(&rt_rq->rt_runtime_lock);
324 spin_unlock(&rt_b->rt_runtime_lock);
325 }
326}
327
328static void disable_runtime(struct rq *rq)
329{
330 unsigned long flags;
331
332 spin_lock_irqsave(&rq->lock, flags);
333 __disable_runtime(rq);
334 spin_unlock_irqrestore(&rq->lock, flags);
335}
336
337static void __enable_runtime(struct rq *rq)
338{
339 struct root_domain *rd = rq->rd;
340 struct rt_rq *rt_rq;
341
342 if (unlikely(!scheduler_running))
343 return;
344
345 for_each_leaf_rt_rq(rt_rq, rq) {
346 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
347
348 spin_lock(&rt_b->rt_runtime_lock);
349 spin_lock(&rt_rq->rt_runtime_lock);
350 rt_rq->rt_runtime = rt_b->rt_runtime;
351 rt_rq->rt_time = 0;
352 spin_unlock(&rt_rq->rt_runtime_lock);
353 spin_unlock(&rt_b->rt_runtime_lock);
354 }
355}
356
357static void enable_runtime(struct rq *rq)
358{
359 unsigned long flags;
360
361 spin_lock_irqsave(&rq->lock, flags);
362 __enable_runtime(rq);
363 spin_unlock_irqrestore(&rq->lock, flags);
364}
365
366static int balance_runtime(struct rt_rq *rt_rq)
367{
368 int more = 0;
369
370 if (rt_rq->rt_time > rt_rq->rt_runtime) {
371 spin_unlock(&rt_rq->rt_runtime_lock);
372 more = do_balance_runtime(rt_rq);
373 spin_lock(&rt_rq->rt_runtime_lock);
374 }
375
376 return more;
377}
378#else
379static inline int balance_runtime(struct rt_rq *rt_rq)
380{
381 return 0;
382}
303#endif 383#endif
304 384
385static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
386{
387 int i, idle = 1;
388 cpumask_t span;
389
390 if (rt_b->rt_runtime == RUNTIME_INF)
391 return 1;
392
393 span = sched_rt_period_mask();
394 for_each_cpu_mask(i, span) {
395 int enqueue = 0;
396 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
397 struct rq *rq = rq_of_rt_rq(rt_rq);
398
399 spin_lock(&rq->lock);
400 if (rt_rq->rt_time) {
401 u64 runtime;
402
403 spin_lock(&rt_rq->rt_runtime_lock);
404 if (rt_rq->rt_throttled)
405 balance_runtime(rt_rq);
406 runtime = rt_rq->rt_runtime;
407 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
408 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
409 rt_rq->rt_throttled = 0;
410 enqueue = 1;
411 }
412 if (rt_rq->rt_time || rt_rq->rt_nr_running)
413 idle = 0;
414 spin_unlock(&rt_rq->rt_runtime_lock);
415 } else if (rt_rq->rt_nr_running)
416 idle = 0;
417
418 if (enqueue)
419 sched_rt_rq_enqueue(rt_rq);
420 spin_unlock(&rq->lock);
421 }
422
423 return idle;
424}
425
305static inline int rt_se_prio(struct sched_rt_entity *rt_se) 426static inline int rt_se_prio(struct sched_rt_entity *rt_se)
306{ 427{
307#ifdef CONFIG_RT_GROUP_SCHED 428#ifdef CONFIG_RT_GROUP_SCHED
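do_balance_runtime() above lends runtime to an over-committed rt_rq by taking, from each neighbour in the root domain, at most a 1/weight share of that neighbour's spare budget (rt_runtime - rt_time), and never growing the local budget beyond the period. A standalone sketch of that proportional borrowing, with plain arrays standing in for the per-CPU rt_rq fields and all locking omitted:

#include <stdio.h>

#define NCPUS 4

/* Simplified per-CPU budgets (arbitrary units); stand-ins for rt_rq fields. */
static long long rt_runtime[NCPUS] = { 300, 950, 950, 950 };
static long long rt_time[NCPUS]    = { 300, 100, 200, 400 };
static const long long rt_period   = 1000;

/* Borrow spare budget for 'cpu' from the other CPUs: at most a 1/NCPUS
 * share of each donor's surplus, never exceeding the local period. */
static int balance_runtime(int cpu)
{
	int i, more = 0;

	for (i = 0; i < NCPUS; i++) {
		long long diff;

		if (i == cpu)
			continue;

		diff = rt_runtime[i] - rt_time[i];	/* donor's spare budget */
		if (diff <= 0)
			continue;

		diff /= NCPUS;				/* donor keeps most of it */
		if (rt_runtime[cpu] + diff > rt_period)
			diff = rt_period - rt_runtime[cpu];
		if (diff > 0) {
			rt_runtime[i]   -= diff;
			rt_runtime[cpu] += diff;
			more = 1;
		}
		if (rt_runtime[cpu] == rt_period)
			break;
	}
	return more;
}

int main(void)
{
	int i;

	balance_runtime(0);
	for (i = 0; i < NCPUS; i++)
		printf("cpu%d runtime=%lld\n", i, rt_runtime[i]);
	return 0;
}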
@@ -327,18 +448,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
327 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 448 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
328 return 0; 449 return 0;
329 450
330#ifdef CONFIG_SMP 451 balance_runtime(rt_rq);
331 if (rt_rq->rt_time > runtime) { 452 runtime = sched_rt_runtime(rt_rq);
332 int more; 453 if (runtime == RUNTIME_INF)
333 454 return 0;
334 spin_unlock(&rt_rq->rt_runtime_lock);
335 more = balance_runtime(rt_rq);
336 spin_lock(&rt_rq->rt_runtime_lock);
337
338 if (more)
339 runtime = sched_rt_runtime(rt_rq);
340 }
341#endif
342 455
343 if (rt_rq->rt_time > runtime) { 456 if (rt_rq->rt_time > runtime) {
344 rt_rq->rt_throttled = 1; 457 rt_rq->rt_throttled = 1;
@@ -392,12 +505,21 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
392 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 505 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
393 rt_rq->rt_nr_running++; 506 rt_rq->rt_nr_running++;
394#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 507#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
395 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 508 if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
509 struct rq *rq = rq_of_rt_rq(rt_rq);
510
396 rt_rq->highest_prio = rt_se_prio(rt_se); 511 rt_rq->highest_prio = rt_se_prio(rt_se);
512#ifdef CONFIG_SMP
513 if (rq->online)
514 cpupri_set(&rq->rd->cpupri, rq->cpu,
515 rt_se_prio(rt_se));
516#endif
517 }
397#endif 518#endif
398#ifdef CONFIG_SMP 519#ifdef CONFIG_SMP
399 if (rt_se->nr_cpus_allowed > 1) { 520 if (rt_se->nr_cpus_allowed > 1) {
400 struct rq *rq = rq_of_rt_rq(rt_rq); 521 struct rq *rq = rq_of_rt_rq(rt_rq);
522
401 rq->rt.rt_nr_migratory++; 523 rq->rt.rt_nr_migratory++;
402 } 524 }
403 525
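The cpupri_set() call above feeds the new CPU-priority map introduced by this patch (kernel/sched_cpupri.c in the diffstat): each CPU is filed under the priority class it is currently running, so a later search for "a CPU running something lower-priority than this task" scans a handful of classes instead of every runqueue. A rough userspace sketch of that idea; the class layout and the two helper signatures here are simplified assumptions, not the kernel's implementation:

#include <stdio.h>

#define NR_CPUS  8
#define NR_PRIO  102	/* assumed layout: idle, normal, then RT levels */

/* One CPU mask per priority class; cpu_to_prio remembers the last class. */
static unsigned int prio_mask[NR_PRIO];
static int cpu_to_prio[NR_CPUS];

static void cpupri_set(int cpu, int prio)
{
	prio_mask[cpu_to_prio[cpu]] &= ~(1u << cpu);	/* leave old class */
	prio_mask[prio] |= 1u << cpu;			/* join new class */
	cpu_to_prio[cpu] = prio;
}

/* Find the mask of CPUs whose current class is strictly below 'prio'. */
static int cpupri_find(int prio, unsigned int *lowest_mask)
{
	int p;

	for (p = 0; p < prio; p++) {
		if (prio_mask[p]) {
			*lowest_mask = prio_mask[p];
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	unsigned int mask;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		cpupri_set(cpu, 1);	/* everyone starts "normal" */
	cpupri_set(2, 50);		/* cpu2 now runs an RT task */

	if (cpupri_find(50, &mask))	/* where can another RT task go? */
		printf("candidate cpus: 0x%x\n", mask);
	return 0;
}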
@@ -417,6 +539,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
417static inline 539static inline
418void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 540void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
419{ 541{
542#ifdef CONFIG_SMP
543 int highest_prio = rt_rq->highest_prio;
544#endif
545
420 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 546 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
421 WARN_ON(!rt_rq->rt_nr_running); 547 WARN_ON(!rt_rq->rt_nr_running);
422 rt_rq->rt_nr_running--; 548 rt_rq->rt_nr_running--;
@@ -440,6 +566,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
440 rq->rt.rt_nr_migratory--; 566 rq->rt.rt_nr_migratory--;
441 } 567 }
442 568
569 if (rt_rq->highest_prio != highest_prio) {
570 struct rq *rq = rq_of_rt_rq(rt_rq);
571
572 if (rq->online)
573 cpupri_set(&rq->rd->cpupri, rq->cpu,
574 rt_rq->highest_prio);
575 }
576
443 update_rt_migration(rq_of_rt_rq(rt_rq)); 577 update_rt_migration(rq_of_rt_rq(rt_rq));
444#endif /* CONFIG_SMP */ 578#endif /* CONFIG_SMP */
445#ifdef CONFIG_RT_GROUP_SCHED 579#ifdef CONFIG_RT_GROUP_SCHED
@@ -455,6 +589,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
455 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 589 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
456 struct rt_prio_array *array = &rt_rq->active; 590 struct rt_prio_array *array = &rt_rq->active;
457 struct rt_rq *group_rq = group_rt_rq(rt_se); 591 struct rt_rq *group_rq = group_rt_rq(rt_se);
592 struct list_head *queue = array->queue + rt_se_prio(rt_se);
458 593
459 /* 594 /*
460 * Don't enqueue the group if its throttled, or when empty. 595 * Don't enqueue the group if its throttled, or when empty.
@@ -465,7 +600,11 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
465 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 600 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
466 return; 601 return;
467 602
468 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 603 if (rt_se->nr_cpus_allowed == 1)
604 list_add(&rt_se->run_list, queue);
605 else
606 list_add_tail(&rt_se->run_list, queue);
607
469 __set_bit(rt_se_prio(rt_se), array->bitmap); 608 __set_bit(rt_se_prio(rt_se), array->bitmap);
470 609
471 inc_rt_tasks(rt_se, rt_rq); 610 inc_rt_tasks(rt_se, rt_rq);
@@ -552,8 +691,11 @@ void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
552 struct rt_prio_array *array = &rt_rq->active; 691 struct rt_prio_array *array = &rt_rq->active;
553 struct list_head *queue = array->queue + rt_se_prio(rt_se); 692 struct list_head *queue = array->queue + rt_se_prio(rt_se);
554 693
555 if (on_rt_rq(rt_se)) 694 if (on_rt_rq(rt_se)) {
556 list_move_tail(&rt_se->run_list, queue); 695 list_del_init(&rt_se->run_list);
696 list_add_tail(&rt_se->run_list,
697 array->queue + rt_se_prio(rt_se));
698 }
557} 699}
558 700
559static void requeue_task_rt(struct rq *rq, struct task_struct *p) 701static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -616,8 +758,37 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
616 */ 758 */
617static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 759static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
618{ 760{
619 if (p->prio < rq->curr->prio) 761 if (p->prio < rq->curr->prio) {
620 resched_task(rq->curr); 762 resched_task(rq->curr);
763 return;
764 }
765
766#ifdef CONFIG_SMP
767 /*
768 * If:
769 *
770 * - the newly woken task is of equal priority to the current task
771 * - the newly woken task is non-migratable while current is migratable
772 * - current will be preempted on the next reschedule
773 *
774 * we should check to see if current can readily move to a different
775 * cpu. If so, we will reschedule to allow the push logic to try
776 * to move current somewhere else, making room for our non-migratable
777 * task.
778 */
 779 if ((p->prio == rq->curr->prio)

780 && p->rt.nr_cpus_allowed == 1
781 && rq->curr->rt.nr_cpus_allowed != 1) {
782 cpumask_t mask;
783
784 if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
785 /*
 786 * There appear to be other cpus that can accept
 787 * current, so let's reschedule to try and push it away
788 */
789 resched_task(rq->curr);
790 }
791#endif
621} 792}
622 793
623static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 794static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -720,73 +891,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
720 891
721static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 892static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
722 893
723static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
724{
725 int lowest_prio = -1;
726 int lowest_cpu = -1;
727 int count = 0;
728 int cpu;
729
730 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
731
732 /*
733 * Scan each rq for the lowest prio.
734 */
735 for_each_cpu_mask(cpu, *lowest_mask) {
736 struct rq *rq = cpu_rq(cpu);
737
738 /* We look for lowest RT prio or non-rt CPU */
739 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
740 /*
741 * if we already found a low RT queue
742 * and now we found this non-rt queue
743 * clear the mask and set our bit.
744 * Otherwise just return the queue as is
745 * and the count==1 will cause the algorithm
746 * to use the first bit found.
747 */
748 if (lowest_cpu != -1) {
749 cpus_clear(*lowest_mask);
750 cpu_set(rq->cpu, *lowest_mask);
751 }
752 return 1;
753 }
754
755 /* no locking for now */
756 if ((rq->rt.highest_prio > task->prio)
757 && (rq->rt.highest_prio >= lowest_prio)) {
758 if (rq->rt.highest_prio > lowest_prio) {
759 /* new low - clear old data */
760 lowest_prio = rq->rt.highest_prio;
761 lowest_cpu = cpu;
762 count = 0;
763 }
764 count++;
765 } else
766 cpu_clear(cpu, *lowest_mask);
767 }
768
769 /*
770 * Clear out all the set bits that represent
771 * runqueues that were of higher prio than
772 * the lowest_prio.
773 */
774 if (lowest_cpu > 0) {
775 /*
776 * Perhaps we could add another cpumask op to
777 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
778 * Then that could be optimized to use memset and such.
779 */
780 for_each_cpu_mask(cpu, *lowest_mask) {
781 if (cpu >= lowest_cpu)
782 break;
783 cpu_clear(cpu, *lowest_mask);
784 }
785 }
786
787 return count;
788}
789
790static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 894static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
791{ 895{
792 int first; 896 int first;
@@ -808,17 +912,12 @@ static int find_lowest_rq(struct task_struct *task)
808 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 912 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
809 int this_cpu = smp_processor_id(); 913 int this_cpu = smp_processor_id();
810 int cpu = task_cpu(task); 914 int cpu = task_cpu(task);
811 int count = find_lowest_cpus(task, lowest_mask);
812 915
813 if (!count) 916 if (task->rt.nr_cpus_allowed == 1)
814 return -1; /* No targets found */ 917 return -1; /* No other targets possible */
815 918
816 /* 919 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
817 * There is no sense in performing an optimal search if only one 920 return -1; /* No targets found */
818 * target is found.
819 */
820 if (count == 1)
821 return first_cpu(*lowest_mask);
822 921
823 /* 922 /*
824 * At this point we have built a mask of cpus representing the 923 * At this point we have built a mask of cpus representing the
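Once cpupri_find() has filled lowest_mask, find_lowest_rq() still has to settle on a single CPU from it. A minimal standalone sketch of one plausible preference order (the task's current CPU first, then the CPU we are running on, then any set bit); the ordering is an illustrative assumption, not lifted verbatim from the rest of the function, which is outside this hunk:

#include <stdio.h>

#define NR_CPUS 8

/* Pick a CPU out of 'mask', preferring the task's current CPU, then the
 * CPU we are running on, then simply the first bit set. */
static int pick_cpu(unsigned int mask, int task_cpu, int this_cpu)
{
	int cpu;

	if (mask & (1u << task_cpu))
		return task_cpu;	/* cheapest: no migration at all */
	if (mask & (1u << this_cpu))
		return this_cpu;	/* next best: stay cache-local */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask & (1u << cpu))
			return cpu;	/* otherwise take anything */
	return -1;			/* no targets found */
}

int main(void)
{
	printf("%d\n", pick_cpu(0x0c, 1, 3));	/* -> 3, this_cpu is in the mask */
	printf("%d\n", pick_cpu(0x10, 1, 3));	/* -> 4, fall back to first bit */
	return 0;
}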
@@ -1163,17 +1262,25 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1163} 1262}
1164 1263
1165/* Assumes rq->lock is held */ 1264/* Assumes rq->lock is held */
1166static void join_domain_rt(struct rq *rq) 1265static void rq_online_rt(struct rq *rq)
1167{ 1266{
1168 if (rq->rt.overloaded) 1267 if (rq->rt.overloaded)
1169 rt_set_overload(rq); 1268 rt_set_overload(rq);
1269
1270 __enable_runtime(rq);
1271
1272 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
1170} 1273}
1171 1274
1172/* Assumes rq->lock is held */ 1275/* Assumes rq->lock is held */
1173static void leave_domain_rt(struct rq *rq) 1276static void rq_offline_rt(struct rq *rq)
1174{ 1277{
1175 if (rq->rt.overloaded) 1278 if (rq->rt.overloaded)
1176 rt_clear_overload(rq); 1279 rt_clear_overload(rq);
1280
1281 __disable_runtime(rq);
1282
1283 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1177} 1284}
1178 1285
1179/* 1286/*
@@ -1336,8 +1443,8 @@ static const struct sched_class rt_sched_class = {
1336 .load_balance = load_balance_rt, 1443 .load_balance = load_balance_rt,
1337 .move_one_task = move_one_task_rt, 1444 .move_one_task = move_one_task_rt,
1338 .set_cpus_allowed = set_cpus_allowed_rt, 1445 .set_cpus_allowed = set_cpus_allowed_rt,
1339 .join_domain = join_domain_rt, 1446 .rq_online = rq_online_rt,
1340 .leave_domain = leave_domain_rt, 1447 .rq_offline = rq_offline_rt,
1341 .pre_schedule = pre_schedule_rt, 1448 .pre_schedule = pre_schedule_rt,
1342 .post_schedule = post_schedule_rt, 1449 .post_schedule = post_schedule_rt,
1343 .task_wake_up = task_wake_up_rt, 1450 .task_wake_up = task_wake_up_rt,
@@ -1350,3 +1457,17 @@ static const struct sched_class rt_sched_class = {
1350 .prio_changed = prio_changed_rt, 1457 .prio_changed = prio_changed_rt,
1351 .switched_to = switched_to_rt, 1458 .switched_to = switched_to_rt,
1352}; 1459};
1460
1461#ifdef CONFIG_SCHED_DEBUG
1462extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1463
1464static void print_rt_stats(struct seq_file *m, int cpu)
1465{
1466 struct rt_rq *rt_rq;
1467
1468 rcu_read_lock();
1469 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1470 print_rt_rq(m, cpu, rt_rq);
1471 rcu_read_unlock();
1472}
1473#endif