Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile          |   5
-rw-r--r--  kernel/cpu.c             |  24
-rw-r--r--  kernel/cpuset.c          |  14
-rw-r--r--  kernel/kthread.c         |   1
-rw-r--r--  kernel/sched.c           | 247
-rw-r--r--  kernel/sched_cpupri.c    | 174
-rw-r--r--  kernel/sched_cpupri.h    |  36
-rw-r--r--  kernel/sched_debug.c     |  40
-rw-r--r--  kernel/sched_fair.c      |  19
-rw-r--r--  kernel/sched_features.h  |   2
-rw-r--r--  kernel/sched_rt.c        | 394
11 files changed, 703 insertions, 253 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..6c55301112e0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
6 exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
@@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
27obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 27obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
28obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 28obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
29obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 29obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
30obj-$(CONFIG_SMP) += cpu.o spinlock.o 30obj-$(CONFIG_SMP) += spinlock.o
31obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 31obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
32obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 32obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
33obj-$(CONFIG_UID16) += uid16.o 33obj-$(CONFIG_UID16) += uid16.o
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
70obj-$(CONFIG_MARKERS) += marker.o 70obj-$(CONFIG_MARKERS) += marker.o
71obj-$(CONFIG_LATENCYTOP) += latencytop.o 71obj-$(CONFIG_LATENCYTOP) += latencytop.o
72obj-$(CONFIG_SMP) += sched_cpupri.o
72 73
73ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 74ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
74# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 75# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..b11f06dc149a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/*
19 * Represents all cpu's present in the system
20 * In systems capable of hotplug, this map could dynamically grow
21 * as new cpu's are detected in the system via any platform specific
22 * method, such as ACPI for e.g.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27#ifndef CONFIG_SMP
28
29/*
30 * Represents all cpu's that are currently online.
31 */
32cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
33EXPORT_SYMBOL(cpu_online_map);
34
35cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
36EXPORT_SYMBOL(cpu_possible_map);
37
38#else /* CONFIG_SMP */
39
18/* Serializes the updates to cpu_online_map, cpu_present_map */ 40/* Serializes the updates to cpu_online_map, cpu_present_map */
19static DEFINE_MUTEX(cpu_add_remove_lock); 41static DEFINE_MUTEX(cpu_add_remove_lock);
20 42
@@ -403,3 +425,5 @@ out:
403 cpu_maps_update_done(); 425 cpu_maps_update_done();
404} 426}
405#endif /* CONFIG_PM_SLEEP_SMP */ 427#endif /* CONFIG_PM_SLEEP_SMP */
428
429#endif /* CONFIG_SMP */
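
With kernel/cpu.c now built on UP as well as SMP, cpu_present_map, cpu_online_map
and cpu_possible_map live in one place and the UP variants default to CPU_MASK_ALL.
A minimal sketch of the consumer side, using only the long-standing cpumask
iterators (not part of this patch):

#include <linux/cpumask.h>
#include <linux/kernel.h>

static void report_cpus(void)
{
	int cpu;

	/* every CPU detected so far, whether or not it has been brought online */
	for_each_present_cpu(cpu)
		printk(KERN_INFO "cpu%d: present\n", cpu);

	/* only CPUs currently able to run tasks */
	for_each_online_cpu(cpu)
		printk(KERN_INFO "cpu%d: online\n", cpu);
}
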
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fceb97e989c..64a05da9bc4c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1194,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1194 1194
1195 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1195 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1196 return -ENOSPC; 1196 return -ENOSPC;
1197 if (tsk->flags & PF_THREAD_BOUND) {
1198 cpumask_t mask;
1199
1200 mutex_lock(&callback_mutex);
1201 mask = cs->cpus_allowed;
1202 mutex_unlock(&callback_mutex);
1203 if (!cpus_equal(tsk->cpus_allowed, mask))
1204 return -EINVAL;
1205 }
1197 1206
1198 return security_task_setscheduler(tsk, 0, NULL); 1207 return security_task_setscheduler(tsk, 0, NULL);
1199} 1208}
@@ -1207,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1207 struct mm_struct *mm; 1216 struct mm_struct *mm;
1208 struct cpuset *cs = cgroup_cs(cont); 1217 struct cpuset *cs = cgroup_cs(cont);
1209 struct cpuset *oldcs = cgroup_cs(oldcont); 1218 struct cpuset *oldcs = cgroup_cs(oldcont);
1219 int err;
1210 1220
1211 mutex_lock(&callback_mutex); 1221 mutex_lock(&callback_mutex);
1212 guarantee_online_cpus(cs, &cpus); 1222 guarantee_online_cpus(cs, &cpus);
1213 set_cpus_allowed_ptr(tsk, &cpus); 1223 err = set_cpus_allowed_ptr(tsk, &cpus);
1214 mutex_unlock(&callback_mutex); 1224 mutex_unlock(&callback_mutex);
1225 if (err)
1226 return;
1215 1227
1216 from = oldcs->mems_allowed; 1228 from = oldcs->mems_allowed;
1217 to = cs->mems_allowed; 1229 to = cs->mems_allowed;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..97747cdd37c9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
180 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 182 k->rt.nr_cpus_allowed = 1;
183 k->flags |= PF_THREAD_BOUND;
183} 184}
184EXPORT_SYMBOL(kthread_bind); 185EXPORT_SYMBOL(kthread_bind);
185 186
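
Together with the cpuset and set_cpus_allowed_ptr() checks above, PF_THREAD_BOUND
makes the affinity of a kthread_bind()ed thread immutable from the outside. A
minimal sketch of the intended flow; my_worker_fn() and start_bound_worker() are
hypothetical names used only for illustration:

#include <linux/kthread.h>
#include <linux/err.h>

static int my_worker_fn(void *unused);	/* hypothetical per-CPU worker */

static int start_bound_worker(int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(my_worker_fn, NULL, "my_worker/%d", cpu);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	kthread_bind(tsk, cpu);	/* pins the thread and sets PF_THREAD_BOUND */
	wake_up_process(tsk);

	/*
	 * From now on another task calling set_cpus_allowed_ptr() with a
	 * different mask gets -EINVAL, and attaching this thread to a
	 * cpuset whose cpus_allowed differs from cpumask_of_cpu(cpu) is
	 * rejected in cpuset_can_attach().
	 */
	return 0;
}
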
diff --git a/kernel/sched.c b/kernel/sched.c
index b048ad8a11af..adb2d01fccc2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -74,6 +74,8 @@
74#include <asm/tlb.h> 74#include <asm/tlb.h>
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77#include "sched_cpupri.h"
78
77/* 79/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 80 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 81 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -289,15 +291,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 291static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 292/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 293static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 294#endif /* CONFIG_FAIR_GROUP_SCHED */
293 295
294#ifdef CONFIG_RT_GROUP_SCHED 296#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 297static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 298static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 299#endif /* CONFIG_RT_GROUP_SCHED */
298#else 300#else /* !CONFIG_FAIR_GROUP_SCHED */
299#define root_task_group init_task_group 301#define root_task_group init_task_group
300#endif 302#endif /* CONFIG_FAIR_GROUP_SCHED */
301 303
302/* task_group_lock serializes add/remove of task groups and also changes to 304/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 305 * a task group's cpu shares.
@@ -307,9 +309,9 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 309#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 310#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 311# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 312#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 313# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 314#endif /* CONFIG_USER_SCHED */
313 315
314/* 316/*
315 * A weight of 0 or 1 can cause arithmetics problems. 317 * A weight of 0 or 1 can cause arithmetics problems.
@@ -452,6 +454,9 @@ struct root_domain {
452 */ 454 */
453 cpumask_t rto_mask; 455 cpumask_t rto_mask;
454 atomic_t rto_count; 456 atomic_t rto_count;
457#ifdef CONFIG_SMP
458 struct cpupri cpupri;
459#endif
455}; 460};
456 461
457/* 462/*
@@ -526,6 +531,7 @@ struct rq {
526 int push_cpu; 531 int push_cpu;
527 /* cpu of this runqueue: */ 532 /* cpu of this runqueue: */
528 int cpu; 533 int cpu;
534 int online;
529 535
530 struct task_struct *migration_thread; 536 struct task_struct *migration_thread;
531 struct list_head migration_queue; 537 struct list_head migration_queue;
@@ -1313,15 +1319,15 @@ void wake_up_idle_cpu(int cpu)
1313 if (!tsk_is_polling(rq->idle)) 1319 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu); 1320 smp_send_reschedule(cpu);
1315} 1321}
1316#endif 1322#endif /* CONFIG_NO_HZ */
1317 1323
1318#else 1324#else /* !CONFIG_SMP */
1319static void __resched_task(struct task_struct *p, int tif_bit) 1325static void __resched_task(struct task_struct *p, int tif_bit)
1320{ 1326{
1321 assert_spin_locked(&task_rq(p)->lock); 1327 assert_spin_locked(&task_rq(p)->lock);
1322 set_tsk_thread_flag(p, tif_bit); 1328 set_tsk_thread_flag(p, tif_bit);
1323} 1329}
1324#endif 1330#endif /* CONFIG_SMP */
1325 1331
1326#if BITS_PER_LONG == 32 1332#if BITS_PER_LONG == 32
1327# define WMULT_CONST (~0UL) 1333# define WMULT_CONST (~0UL)
@@ -1481,16 +1487,8 @@ static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type); 1487static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu); 1488static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1489static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else /* CONFIG_SMP */
1485
1486#ifdef CONFIG_FAIR_GROUP_SCHED
1487static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1488{
1489}
1490#endif 1490#endif
1491 1491
1492#endif /* CONFIG_SMP */
1493
1494#include "sched_stats.h" 1492#include "sched_stats.h"
1495#include "sched_idletask.c" 1493#include "sched_idletask.c"
1496#include "sched_fair.c" 1494#include "sched_fair.c"
@@ -1500,6 +1498,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1500#endif 1498#endif
1501 1499
1502#define sched_class_highest (&rt_sched_class) 1500#define sched_class_highest (&rt_sched_class)
1501#define for_each_class(class) \
1502 for (class = sched_class_highest; class; class = class->next)
1503 1503
1504static inline void inc_load(struct rq *rq, const struct task_struct *p) 1504static inline void inc_load(struct rq *rq, const struct task_struct *p)
1505{ 1505{
@@ -1636,12 +1636,6 @@ inline int task_curr(const struct task_struct *p)
1636 return cpu_curr(task_cpu(p)) == p; 1636 return cpu_curr(task_cpu(p)) == p;
1637} 1637}
1638 1638
1639/* Used instead of source_load when we know the type == 0 */
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
1645static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1639static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1646{ 1640{
1647 set_task_rq(p, cpu); 1641 set_task_rq(p, cpu);
@@ -1670,6 +1664,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1670 1664
1671#ifdef CONFIG_SMP 1665#ifdef CONFIG_SMP
1672 1666
1667/* Used instead of source_load when we know the type == 0 */
1668static unsigned long weighted_cpuload(const int cpu)
1669{
1670 return cpu_rq(cpu)->load.weight;
1671}
1672
1673/* 1673/*
1674 * Is this task likely cache-hot: 1674 * Is this task likely cache-hot:
1675 */ 1675 */
@@ -2131,7 +2131,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2131 } 2131 }
2132 } 2132 }
2133 } 2133 }
2134#endif 2134#endif /* CONFIG_SCHEDSTATS */
2135 2135
2136out_activate: 2136out_activate:
2137#endif /* CONFIG_SMP */ 2137#endif /* CONFIG_SMP */
@@ -2331,7 +2331,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2331 notifier->ops->sched_out(notifier, next); 2331 notifier->ops->sched_out(notifier, next);
2332} 2332}
2333 2333
2334#else 2334#else /* !CONFIG_PREEMPT_NOTIFIERS */
2335 2335
2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2337{ 2337{
@@ -2343,7 +2343,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2343{ 2343{
2344} 2344}
2345 2345
2346#endif 2346#endif /* CONFIG_PREEMPT_NOTIFIERS */
2347 2347
2348/** 2348/**
2349 * prepare_task_switch - prepare to switch tasks 2349 * prepare_task_switch - prepare to switch tasks
@@ -3672,6 +3672,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3672 /* Earliest time when we have to do rebalance again */ 3672 /* Earliest time when we have to do rebalance again */
3673 unsigned long next_balance = jiffies + 60*HZ; 3673 unsigned long next_balance = jiffies + 60*HZ;
3674 int update_next_balance = 0; 3674 int update_next_balance = 0;
3675 int need_serialize;
3675 cpumask_t tmp; 3676 cpumask_t tmp;
3676 3677
3677 for_each_domain(cpu, sd) { 3678 for_each_domain(cpu, sd) {
@@ -3689,8 +3690,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3689 if (interval > HZ*NR_CPUS/10) 3690 if (interval > HZ*NR_CPUS/10)
3690 interval = HZ*NR_CPUS/10; 3691 interval = HZ*NR_CPUS/10;
3691 3692
3693 need_serialize = sd->flags & SD_SERIALIZE;
3692 3694
3693 if (sd->flags & SD_SERIALIZE) { 3695 if (need_serialize) {
3694 if (!spin_trylock(&balancing)) 3696 if (!spin_trylock(&balancing))
3695 goto out; 3697 goto out;
3696 } 3698 }
@@ -3706,7 +3708,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3706 } 3708 }
3707 sd->last_balance = jiffies; 3709 sd->last_balance = jiffies;
3708 } 3710 }
3709 if (sd->flags & SD_SERIALIZE) 3711 if (need_serialize)
3710 spin_unlock(&balancing); 3712 spin_unlock(&balancing);
3711out: 3713out:
3712 if (time_after(next_balance, sd->last_balance + interval)) { 3714 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -4070,6 +4072,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4070 prev->comm, prev->pid, preempt_count()); 4072 prev->comm, prev->pid, preempt_count());
4071 4073
4072 debug_show_held_locks(prev); 4074 debug_show_held_locks(prev);
4075 print_modules();
4073 if (irqs_disabled()) 4076 if (irqs_disabled())
4074 print_irqtrace_events(prev); 4077 print_irqtrace_events(prev);
4075 4078
@@ -4143,7 +4146,7 @@ asmlinkage void __sched schedule(void)
4143 struct task_struct *prev, *next; 4146 struct task_struct *prev, *next;
4144 unsigned long *switch_count; 4147 unsigned long *switch_count;
4145 struct rq *rq; 4148 struct rq *rq;
4146 int cpu; 4149 int cpu, hrtick = sched_feat(HRTICK);
4147 4150
4148need_resched: 4151need_resched:
4149 preempt_disable(); 4152 preempt_disable();
@@ -4158,7 +4161,8 @@ need_resched_nonpreemptible:
4158 4161
4159 schedule_debug(prev); 4162 schedule_debug(prev);
4160 4163
4161 hrtick_clear(rq); 4164 if (hrtick)
4165 hrtick_clear(rq);
4162 4166
4163 /* 4167 /*
4164 * Do the rq-clock update outside the rq lock: 4168 * Do the rq-clock update outside the rq lock:
@@ -4204,7 +4208,8 @@ need_resched_nonpreemptible:
4204 } else 4208 } else
4205 spin_unlock_irq(&rq->lock); 4209 spin_unlock_irq(&rq->lock);
4206 4210
4207 hrtick_set(rq); 4211 if (hrtick)
4212 hrtick_set(rq);
4208 4213
4209 if (unlikely(reacquire_kernel_lock(current) < 0)) 4214 if (unlikely(reacquire_kernel_lock(current) < 0))
4210 goto need_resched_nonpreemptible; 4215 goto need_resched_nonpreemptible;
@@ -5072,24 +5077,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5072 return sched_setaffinity(pid, &new_mask); 5077 return sched_setaffinity(pid, &new_mask);
5073} 5078}
5074 5079
5075/*
5076 * Represents all cpu's present in the system
5077 * In systems capable of hotplug, this map could dynamically grow
5078 * as new cpu's are detected in the system via any platform specific
5079 * method, such as ACPI for e.g.
5080 */
5081
5082cpumask_t cpu_present_map __read_mostly;
5083EXPORT_SYMBOL(cpu_present_map);
5084
5085#ifndef CONFIG_SMP
5086cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5087EXPORT_SYMBOL(cpu_online_map);
5088
5089cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5090EXPORT_SYMBOL(cpu_possible_map);
5091#endif
5092
5093long sched_getaffinity(pid_t pid, cpumask_t *mask) 5080long sched_getaffinity(pid_t pid, cpumask_t *mask)
5094{ 5081{
5095 struct task_struct *p; 5082 struct task_struct *p;
@@ -5573,6 +5560,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5573 goto out; 5560 goto out;
5574 } 5561 }
5575 5562
5563 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5564 !cpus_equal(p->cpus_allowed, *new_mask))) {
5565 ret = -EINVAL;
5566 goto out;
5567 }
5568
5576 if (p->sched_class->set_cpus_allowed) 5569 if (p->sched_class->set_cpus_allowed)
5577 p->sched_class->set_cpus_allowed(p, new_mask); 5570 p->sched_class->set_cpus_allowed(p, new_mask);
5578 else { 5571 else {
@@ -6060,6 +6053,36 @@ static void unregister_sched_domain_sysctl(void)
6060} 6053}
6061#endif 6054#endif
6062 6055
6056static void set_rq_online(struct rq *rq)
6057{
6058 if (!rq->online) {
6059 const struct sched_class *class;
6060
6061 cpu_set(rq->cpu, rq->rd->online);
6062 rq->online = 1;
6063
6064 for_each_class(class) {
6065 if (class->rq_online)
6066 class->rq_online(rq);
6067 }
6068 }
6069}
6070
6071static void set_rq_offline(struct rq *rq)
6072{
6073 if (rq->online) {
6074 const struct sched_class *class;
6075
6076 for_each_class(class) {
6077 if (class->rq_offline)
6078 class->rq_offline(rq);
6079 }
6080
6081 cpu_clear(rq->cpu, rq->rd->online);
6082 rq->online = 0;
6083 }
6084}
6085
6063/* 6086/*
6064 * migration_call - callback that gets triggered when a CPU is added. 6087 * migration_call - callback that gets triggered when a CPU is added.
6065 * Here we can start up the necessary migration thread for the new CPU. 6088 * Here we can start up the necessary migration thread for the new CPU.
@@ -6097,7 +6120,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6097 spin_lock_irqsave(&rq->lock, flags); 6120 spin_lock_irqsave(&rq->lock, flags);
6098 if (rq->rd) { 6121 if (rq->rd) {
6099 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6122 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6100 cpu_set(cpu, rq->rd->online); 6123
6124 set_rq_online(rq);
6101 } 6125 }
6102 spin_unlock_irqrestore(&rq->lock, flags); 6126 spin_unlock_irqrestore(&rq->lock, flags);
6103 break; 6127 break;
@@ -6158,7 +6182,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6158 spin_lock_irqsave(&rq->lock, flags); 6182 spin_lock_irqsave(&rq->lock, flags);
6159 if (rq->rd) { 6183 if (rq->rd) {
6160 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6184 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6161 cpu_clear(cpu, rq->rd->online); 6185 set_rq_offline(rq);
6162 } 6186 }
6163 spin_unlock_irqrestore(&rq->lock, flags); 6187 spin_unlock_irqrestore(&rq->lock, flags);
6164 break; 6188 break;
@@ -6192,6 +6216,28 @@ void __init migration_init(void)
6192 6216
6193#ifdef CONFIG_SCHED_DEBUG 6217#ifdef CONFIG_SCHED_DEBUG
6194 6218
6219static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6220{
6221 switch (lvl) {
6222 case SD_LV_NONE:
6223 return "NONE";
6224 case SD_LV_SIBLING:
6225 return "SIBLING";
6226 case SD_LV_MC:
6227 return "MC";
6228 case SD_LV_CPU:
6229 return "CPU";
6230 case SD_LV_NODE:
6231 return "NODE";
6232 case SD_LV_ALLNODES:
6233 return "ALLNODES";
6234 case SD_LV_MAX:
6235 return "MAX";
6236
6237 }
6238 return "MAX";
6239}
6240
6195static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6241static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6196 cpumask_t *groupmask) 6242 cpumask_t *groupmask)
6197{ 6243{
@@ -6211,7 +6257,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6211 return -1; 6257 return -1;
6212 } 6258 }
6213 6259
6214 printk(KERN_CONT "span %s\n", str); 6260 printk(KERN_CONT "span %s level %s\n",
6261 str, sd_level_to_string(sd->level));
6215 6262
6216 if (!cpu_isset(cpu, sd->span)) { 6263 if (!cpu_isset(cpu, sd->span)) {
6217 printk(KERN_ERR "ERROR: domain->span does not contain " 6264 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6295,9 +6342,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6295 } 6342 }
6296 kfree(groupmask); 6343 kfree(groupmask);
6297} 6344}
6298#else 6345#else /* !CONFIG_SCHED_DEBUG */
6299# define sched_domain_debug(sd, cpu) do { } while (0) 6346# define sched_domain_debug(sd, cpu) do { } while (0)
6300#endif 6347#endif /* CONFIG_SCHED_DEBUG */
6301 6348
6302static int sd_degenerate(struct sched_domain *sd) 6349static int sd_degenerate(struct sched_domain *sd)
6303{ 6350{
@@ -6357,20 +6404,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6357static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6404static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6358{ 6405{
6359 unsigned long flags; 6406 unsigned long flags;
6360 const struct sched_class *class;
6361 6407
6362 spin_lock_irqsave(&rq->lock, flags); 6408 spin_lock_irqsave(&rq->lock, flags);
6363 6409
6364 if (rq->rd) { 6410 if (rq->rd) {
6365 struct root_domain *old_rd = rq->rd; 6411 struct root_domain *old_rd = rq->rd;
6366 6412
6367 for (class = sched_class_highest; class; class = class->next) { 6413 if (cpu_isset(rq->cpu, old_rd->online))
6368 if (class->leave_domain) 6414 set_rq_offline(rq);
6369 class->leave_domain(rq);
6370 }
6371 6415
6372 cpu_clear(rq->cpu, old_rd->span); 6416 cpu_clear(rq->cpu, old_rd->span);
6373 cpu_clear(rq->cpu, old_rd->online);
6374 6417
6375 if (atomic_dec_and_test(&old_rd->refcount)) 6418 if (atomic_dec_and_test(&old_rd->refcount))
6376 kfree(old_rd); 6419 kfree(old_rd);
@@ -6381,12 +6424,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6381 6424
6382 cpu_set(rq->cpu, rd->span); 6425 cpu_set(rq->cpu, rd->span);
6383 if (cpu_isset(rq->cpu, cpu_online_map)) 6426 if (cpu_isset(rq->cpu, cpu_online_map))
6384 cpu_set(rq->cpu, rd->online); 6427 set_rq_online(rq);
6385
6386 for (class = sched_class_highest; class; class = class->next) {
6387 if (class->join_domain)
6388 class->join_domain(rq);
6389 }
6390 6428
6391 spin_unlock_irqrestore(&rq->lock, flags); 6429 spin_unlock_irqrestore(&rq->lock, flags);
6392} 6430}
@@ -6397,6 +6435,8 @@ static void init_rootdomain(struct root_domain *rd)
6397 6435
6398 cpus_clear(rd->span); 6436 cpus_clear(rd->span);
6399 cpus_clear(rd->online); 6437 cpus_clear(rd->online);
6438
6439 cpupri_init(&rd->cpupri);
6400} 6440}
6401 6441
6402static void init_defrootdomain(void) 6442static void init_defrootdomain(void)
@@ -6591,7 +6631,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6591 cpus_or(*span, *span, *nodemask); 6631 cpus_or(*span, *span, *nodemask);
6592 } 6632 }
6593} 6633}
6594#endif 6634#endif /* CONFIG_NUMA */
6595 6635
6596int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6636int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6597 6637
@@ -6610,7 +6650,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6610 *sg = &per_cpu(sched_group_cpus, cpu); 6650 *sg = &per_cpu(sched_group_cpus, cpu);
6611 return cpu; 6651 return cpu;
6612} 6652}
6613#endif 6653#endif /* CONFIG_SCHED_SMT */
6614 6654
6615/* 6655/*
6616 * multi-core sched-domains: 6656 * multi-core sched-domains:
@@ -6618,7 +6658,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6618#ifdef CONFIG_SCHED_MC 6658#ifdef CONFIG_SCHED_MC
6619static DEFINE_PER_CPU(struct sched_domain, core_domains); 6659static DEFINE_PER_CPU(struct sched_domain, core_domains);
6620static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6660static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6621#endif 6661#endif /* CONFIG_SCHED_MC */
6622 6662
6623#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6663#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6624static int 6664static int
@@ -6720,7 +6760,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6720 sg = sg->next; 6760 sg = sg->next;
6721 } while (sg != group_head); 6761 } while (sg != group_head);
6722} 6762}
6723#endif 6763#endif /* CONFIG_NUMA */
6724 6764
6725#ifdef CONFIG_NUMA 6765#ifdef CONFIG_NUMA
6726/* Free memory allocated for various sched_group structures */ 6766/* Free memory allocated for various sched_group structures */
@@ -6757,11 +6797,11 @@ next_sg:
6757 sched_group_nodes_bycpu[cpu] = NULL; 6797 sched_group_nodes_bycpu[cpu] = NULL;
6758 } 6798 }
6759} 6799}
6760#else 6800#else /* !CONFIG_NUMA */
6761static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 6801static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6762{ 6802{
6763} 6803}
6764#endif 6804#endif /* CONFIG_NUMA */
6765 6805
6766/* 6806/*
6767 * Initialize sched groups cpu_power. 6807 * Initialize sched groups cpu_power.
@@ -7470,7 +7510,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7470#endif 7510#endif
7471 return err; 7511 return err;
7472} 7512}
7473#endif 7513#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7474 7514
7475/* 7515/*
7476 * Force a reinitialization of the sched domains hierarchy. The domains 7516 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7481,21 +7521,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7481static int update_sched_domains(struct notifier_block *nfb, 7521static int update_sched_domains(struct notifier_block *nfb,
7482 unsigned long action, void *hcpu) 7522 unsigned long action, void *hcpu)
7483{ 7523{
7524 int cpu = (int)(long)hcpu;
7525
7484 switch (action) { 7526 switch (action) {
7485 case CPU_UP_PREPARE:
7486 case CPU_UP_PREPARE_FROZEN:
7487 case CPU_DOWN_PREPARE: 7527 case CPU_DOWN_PREPARE:
7488 case CPU_DOWN_PREPARE_FROZEN: 7528 case CPU_DOWN_PREPARE_FROZEN:
7529 disable_runtime(cpu_rq(cpu));
7530 /* fall-through */
7531 case CPU_UP_PREPARE:
7532 case CPU_UP_PREPARE_FROZEN:
7489 detach_destroy_domains(&cpu_online_map); 7533 detach_destroy_domains(&cpu_online_map);
7490 free_sched_domains(); 7534 free_sched_domains();
7491 return NOTIFY_OK; 7535 return NOTIFY_OK;
7492 7536
7493 case CPU_UP_CANCELED: 7537
7494 case CPU_UP_CANCELED_FROZEN:
7495 case CPU_DOWN_FAILED: 7538 case CPU_DOWN_FAILED:
7496 case CPU_DOWN_FAILED_FROZEN: 7539 case CPU_DOWN_FAILED_FROZEN:
7497 case CPU_ONLINE: 7540 case CPU_ONLINE:
7498 case CPU_ONLINE_FROZEN: 7541 case CPU_ONLINE_FROZEN:
7542 enable_runtime(cpu_rq(cpu));
7543 /* fall-through */
7544 case CPU_UP_CANCELED:
7545 case CPU_UP_CANCELED_FROZEN:
7499 case CPU_DEAD: 7546 case CPU_DEAD:
7500 case CPU_DEAD_FROZEN: 7547 case CPU_DEAD_FROZEN:
7501 /* 7548 /*
@@ -7695,8 +7742,8 @@ void __init sched_init(void)
7695 7742
7696 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7743 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7697 ptr += nr_cpu_ids * sizeof(void **); 7744 ptr += nr_cpu_ids * sizeof(void **);
7698#endif 7745#endif /* CONFIG_USER_SCHED */
7699#endif 7746#endif /* CONFIG_FAIR_GROUP_SCHED */
7700#ifdef CONFIG_RT_GROUP_SCHED 7747#ifdef CONFIG_RT_GROUP_SCHED
7701 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7748 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7702 ptr += nr_cpu_ids * sizeof(void **); 7749 ptr += nr_cpu_ids * sizeof(void **);
@@ -7710,8 +7757,8 @@ void __init sched_init(void)
7710 7757
7711 root_task_group.rt_rq = (struct rt_rq **)ptr; 7758 root_task_group.rt_rq = (struct rt_rq **)ptr;
7712 ptr += nr_cpu_ids * sizeof(void **); 7759 ptr += nr_cpu_ids * sizeof(void **);
7713#endif 7760#endif /* CONFIG_USER_SCHED */
7714#endif 7761#endif /* CONFIG_RT_GROUP_SCHED */
7715 } 7762 }
7716 7763
7717#ifdef CONFIG_SMP 7764#ifdef CONFIG_SMP
@@ -7727,8 +7774,8 @@ void __init sched_init(void)
7727#ifdef CONFIG_USER_SCHED 7774#ifdef CONFIG_USER_SCHED
7728 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7775 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7729 global_rt_period(), RUNTIME_INF); 7776 global_rt_period(), RUNTIME_INF);
7730#endif 7777#endif /* CONFIG_USER_SCHED */
7731#endif 7778#endif /* CONFIG_RT_GROUP_SCHED */
7732 7779
7733#ifdef CONFIG_GROUP_SCHED 7780#ifdef CONFIG_GROUP_SCHED
7734 list_add(&init_task_group.list, &task_groups); 7781 list_add(&init_task_group.list, &task_groups);
@@ -7738,8 +7785,8 @@ void __init sched_init(void)
7738 INIT_LIST_HEAD(&root_task_group.children); 7785 INIT_LIST_HEAD(&root_task_group.children);
7739 init_task_group.parent = &root_task_group; 7786 init_task_group.parent = &root_task_group;
7740 list_add(&init_task_group.siblings, &root_task_group.children); 7787 list_add(&init_task_group.siblings, &root_task_group.children);
7741#endif 7788#endif /* CONFIG_USER_SCHED */
7742#endif 7789#endif /* CONFIG_GROUP_SCHED */
7743 7790
7744 for_each_possible_cpu(i) { 7791 for_each_possible_cpu(i) {
7745 struct rq *rq; 7792 struct rq *rq;
@@ -7819,6 +7866,7 @@ void __init sched_init(void)
7819 rq->next_balance = jiffies; 7866 rq->next_balance = jiffies;
7820 rq->push_cpu = 0; 7867 rq->push_cpu = 0;
7821 rq->cpu = i; 7868 rq->cpu = i;
7869 rq->online = 0;
7822 rq->migration_thread = NULL; 7870 rq->migration_thread = NULL;
7823 INIT_LIST_HEAD(&rq->migration_queue); 7871 INIT_LIST_HEAD(&rq->migration_queue);
7824 rq_attach_root(rq, &def_root_domain); 7872 rq_attach_root(rq, &def_root_domain);
@@ -8058,7 +8106,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8058{ 8106{
8059 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8107 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8060} 8108}
 8061#else 8109#else /* !CONFIG_FAIR_GROUP_SCHED */
8062static inline void free_fair_sched_group(struct task_group *tg) 8110static inline void free_fair_sched_group(struct task_group *tg)
8063{ 8111{
8064} 8112}
@@ -8076,7 +8124,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8076static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8124static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8077{ 8125{
8078} 8126}
8079#endif 8127#endif /* CONFIG_FAIR_GROUP_SCHED */
8080 8128
8081#ifdef CONFIG_RT_GROUP_SCHED 8129#ifdef CONFIG_RT_GROUP_SCHED
8082static void free_rt_sched_group(struct task_group *tg) 8130static void free_rt_sched_group(struct task_group *tg)
@@ -8147,7 +8195,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8147{ 8195{
8148 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8196 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8149} 8197}
8150#else 8198#else /* !CONFIG_RT_GROUP_SCHED */
8151static inline void free_rt_sched_group(struct task_group *tg) 8199static inline void free_rt_sched_group(struct task_group *tg)
8152{ 8200{
8153} 8201}
@@ -8165,7 +8213,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8165static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8213static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8166{ 8214{
8167} 8215}
8168#endif 8216#endif /* CONFIG_RT_GROUP_SCHED */
8169 8217
8170#ifdef CONFIG_GROUP_SCHED 8218#ifdef CONFIG_GROUP_SCHED
8171static void free_sched_group(struct task_group *tg) 8219static void free_sched_group(struct task_group *tg)
@@ -8276,7 +8324,7 @@ void sched_move_task(struct task_struct *tsk)
8276 8324
8277 task_rq_unlock(rq, &flags); 8325 task_rq_unlock(rq, &flags);
8278} 8326}
8279#endif 8327#endif /* CONFIG_GROUP_SCHED */
8280 8328
8281#ifdef CONFIG_FAIR_GROUP_SCHED 8329#ifdef CONFIG_FAIR_GROUP_SCHED
8282static void set_se_shares(struct sched_entity *se, unsigned long shares) 8330static void set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -8376,7 +8424,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8376#ifdef CONFIG_CGROUP_SCHED 8424#ifdef CONFIG_CGROUP_SCHED
8377static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8425static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8378{ 8426{
8379 struct task_group *tgi, *parent = tg ? tg->parent : NULL; 8427 struct task_group *tgi, *parent = tg->parent;
8380 unsigned long total = 0; 8428 unsigned long total = 0;
8381 8429
8382 if (!parent) { 8430 if (!parent) {
@@ -8400,7 +8448,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8400 } 8448 }
8401 rcu_read_unlock(); 8449 rcu_read_unlock();
8402 8450
8403 return total + to_ratio(period, runtime) < 8451 return total + to_ratio(period, runtime) <=
8404 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8452 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8405 parent->rt_bandwidth.rt_runtime); 8453 parent->rt_bandwidth.rt_runtime);
8406} 8454}
@@ -8517,16 +8565,21 @@ long sched_group_rt_period(struct task_group *tg)
8517 8565
8518static int sched_rt_global_constraints(void) 8566static int sched_rt_global_constraints(void)
8519{ 8567{
8568 struct task_group *tg = &root_task_group;
8569 u64 rt_runtime, rt_period;
8520 int ret = 0; 8570 int ret = 0;
8521 8571
8572 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8573 rt_runtime = tg->rt_bandwidth.rt_runtime;
8574
8522 mutex_lock(&rt_constraints_mutex); 8575 mutex_lock(&rt_constraints_mutex);
8523 if (!__rt_schedulable(NULL, 1, 0)) 8576 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8524 ret = -EINVAL; 8577 ret = -EINVAL;
8525 mutex_unlock(&rt_constraints_mutex); 8578 mutex_unlock(&rt_constraints_mutex);
8526 8579
8527 return ret; 8580 return ret;
8528} 8581}
8529#else 8582#else /* !CONFIG_RT_GROUP_SCHED */
8530static int sched_rt_global_constraints(void) 8583static int sched_rt_global_constraints(void)
8531{ 8584{
8532 unsigned long flags; 8585 unsigned long flags;
@@ -8544,7 +8597,7 @@ static int sched_rt_global_constraints(void)
8544 8597
8545 return 0; 8598 return 0;
8546} 8599}
8547#endif 8600#endif /* CONFIG_RT_GROUP_SCHED */
8548 8601
8549int sched_rt_handler(struct ctl_table *table, int write, 8602int sched_rt_handler(struct ctl_table *table, int write,
8550 struct file *filp, void __user *buffer, size_t *lenp, 8603 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8652,7 +8705,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8652 8705
8653 return (u64) tg->shares; 8706 return (u64) tg->shares;
8654} 8707}
8655#endif 8708#endif /* CONFIG_FAIR_GROUP_SCHED */
8656 8709
8657#ifdef CONFIG_RT_GROUP_SCHED 8710#ifdef CONFIG_RT_GROUP_SCHED
8658static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8711static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8676,7 +8729,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8676{ 8729{
8677 return sched_group_rt_period(cgroup_tg(cgrp)); 8730 return sched_group_rt_period(cgroup_tg(cgrp));
8678} 8731}
8679#endif 8732#endif /* CONFIG_RT_GROUP_SCHED */
8680 8733
8681static struct cftype cpu_files[] = { 8734static struct cftype cpu_files[] = {
8682#ifdef CONFIG_FAIR_GROUP_SCHED 8735#ifdef CONFIG_FAIR_GROUP_SCHED
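
Root-domain attach/detach no longer calls class-specific join_domain()/leave_domain()
hooks; set_rq_online()/set_rq_offline() walk every class with the new for_each_class()
iterator and invoke optional rq_online/rq_offline methods. A sketch of how a class
hooks in (illustrative only; my_sched_class is not a real class, but the rt class in
this series uses the same hooks to keep rd->cpupri current):

/* Illustrative only: a class publishing per-CPU state on online/offline. */
static void my_rq_online(struct rq *rq)
{
	/* e.g. advertise this CPU's highest RT priority for push/pull */
	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
}

static void my_rq_offline(struct rq *rq)
{
	/* withdraw the CPU from global routing decisions */
	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}

static const struct sched_class my_sched_class = {
	/* .enqueue_task, .dequeue_task, .pick_next_task, ... */
	.rq_online	= my_rq_online,
	.rq_offline	= my_rq_offline,
};
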
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
1/*
2 * kernel/sched_cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include "sched_cpupri.h"
31
32/* Convert between a 140 based task->prio, and our 102 based cpupri */
33static int convert_prio(int prio)
34{
35 int cpupri;
36
37 if (prio == CPUPRI_INVALID)
38 cpupri = CPUPRI_INVALID;
39 else if (prio == MAX_PRIO)
40 cpupri = CPUPRI_IDLE;
41 else if (prio >= MAX_RT_PRIO)
42 cpupri = CPUPRI_NORMAL;
43 else
44 cpupri = MAX_RT_PRIO - prio + 1;
45
46 return cpupri;
47}
48
49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53
54/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context
57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs
59 *
60 * Note: This function returns the recommended CPUs as calculated during the
 61 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current
65 * priority configuration.
66 *
67 * Returns: (int)bool - CPUs were found
68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask)
71{
72 int idx = 0;
73 int task_pri = convert_prio(p->prio);
74
75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78
79 if (idx >= task_pri)
80 break;
81
82 cpus_and(mask, p->cpus_allowed, vec->mask);
83
84 if (cpus_empty(mask))
85 continue;
86
87 *lowest_mask = mask;
88 return 1;
89 }
90
91 return 0;
92}
93
94/**
95 * cpupri_set - update the cpu priority setting
96 * @cp: The cpupri context
97 * @cpu: The target cpu
98 * @pri: The priority (INVALID-RT99) to assign to this CPU
99 *
100 * Note: Assumes cpu_rq(cpu)->lock is locked
101 *
102 * Returns: (void)
103 */
104void cpupri_set(struct cpupri *cp, int cpu, int newpri)
105{
106 int *currpri = &cp->cpu_to_pri[cpu];
107 int oldpri = *currpri;
108 unsigned long flags;
109
110 newpri = convert_prio(newpri);
111
112 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
113
114 if (newpri == oldpri)
115 return;
116
117 /*
118 * If the cpu was currently mapped to a different value, we
119 * first need to unmap the old value
120 */
121 if (likely(oldpri != CPUPRI_INVALID)) {
122 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
123
124 spin_lock_irqsave(&vec->lock, flags);
125
126 vec->count--;
127 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask);
130
131 spin_unlock_irqrestore(&vec->lock, flags);
132 }
133
134 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136
137 spin_lock_irqsave(&vec->lock, flags);
138
139 cpu_set(cpu, vec->mask);
140 vec->count++;
141 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active);
143
144 spin_unlock_irqrestore(&vec->lock, flags);
145 }
146
147 *currpri = newpri;
148}
149
150/**
151 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context
153 *
154 * Returns: (void)
155 */
156void cpupri_init(struct cpupri *cp)
157{
158 int i;
159
160 memset(cp, 0, sizeof(*cp));
161
162 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
163 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
164
165 spin_lock_init(&vec->lock);
166 vec->count = 0;
167 cpus_clear(vec->mask);
168 }
169
170 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID;
172}
173
174
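
A worked example of the 102-level mapping: convert_prio() sends prio == MAX_PRIO
(the idle task) to CPUPRI_IDLE (0), any non-RT prio to CPUPRI_NORMAL (1), and an
RT prio p to MAX_RT_PRIO - p + 1, so RT99 (prio 0) lands on index 101. The sketch
below walks one push decision; the values and the helper name are invented for
illustration:

/* Illustrative only: choose a push target for a newly-woken RT task p. */
static int find_push_target(struct rq *rq, struct task_struct *p)
{
	cpumask_t lowest_mask;

	/*
	 * Example state of a 3-CPU root domain:
	 *   cpu0 runs RT-90  (prio  9) -> cpupri 92
	 *   cpu1 runs a CFS task       -> cpupri  1 (CPUPRI_NORMAL)
	 *   cpu2 is idle               -> cpupri  0 (CPUPRI_IDLE)
	 *
	 * For an RT-50 waker (prio 49 -> task_pri 52) the scan of
	 * pri_active stops at the first set bit below 52: bit 0, owned
	 * by idle cpu2, so lowest_mask comes back as just { cpu2 } --
	 * two bitmap searches, no per-runqueue scan.
	 */
	if (!cpupri_find(&rq->rd->cpupri, p, &lowest_mask))
		return -1;	/* nowhere better than where we are */

	return first_cpu(lowest_mask);
}
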
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..f25811b0f931
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8
9#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */
13
14struct cpupri_vec {
15 spinlock_t lock;
16 int count;
17 cpumask_t mask;
18};
19
20struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS];
24};
25
26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp);
31#else
32#define cpupri_set(cp, cpu, pri) do { } while (0)
 33#define cpupri_init(cp) do { } while (0)
34#endif
35
36#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8bb713040ac9..8e077b9c91cb 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
119 struct sched_entity *last; 119 struct sched_entity *last;
120 unsigned long flags; 120 unsigned long flags;
121 121
122#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) 122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
124#else
125 char path[128] = ""; 123 char path[128] = "";
126 struct cgroup *cgroup = NULL; 124 struct cgroup *cgroup = NULL;
127 struct task_group *tg = cfs_rq->tg; 125 struct task_group *tg = cfs_rq->tg;
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
133 cgroup_path(cgroup, path, sizeof(path)); 131 cgroup_path(cgroup, path, sizeof(path));
134 132
135 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
134#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 136#endif
137 137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
@@ -169,6 +169,39 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
169 cfs_rq->nr_spread_over); 169 cfs_rq->nr_spread_over);
170} 170}
171 171
172void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
173{
174#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
175 char path[128] = "";
176 struct cgroup *cgroup = NULL;
177 struct task_group *tg = rt_rq->tg;
178
179 if (tg)
180 cgroup = tg->css.cgroup;
181
182 if (cgroup)
183 cgroup_path(cgroup, path, sizeof(path));
184
185 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
186#else
187 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
188#endif
189
190
191#define P(x) \
192 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
193#define PN(x) \
194 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
195
196 P(rt_nr_running);
197 P(rt_throttled);
198 PN(rt_time);
199 PN(rt_runtime);
200
201#undef PN
202#undef P
203}
204
172static void print_cpu(struct seq_file *m, int cpu) 205static void print_cpu(struct seq_file *m, int cpu)
173{ 206{
174 struct rq *rq = &per_cpu(runqueues, cpu); 207 struct rq *rq = &per_cpu(runqueues, cpu);
@@ -208,6 +241,7 @@ static void print_cpu(struct seq_file *m, int cpu)
208#undef PN 241#undef PN
209 242
210 print_cfs_stats(m, cpu); 243 print_cfs_stats(m, cpu);
244 print_rt_stats(m, cpu);
211 245
212 print_rq(m, rq, cpu); 246 print_rq(m, rq, cpu);
213} 247}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..1fe4c65a8170 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1275,23 +1275,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1275 struct task_struct *p = NULL; 1275 struct task_struct *p = NULL;
1276 struct sched_entity *se; 1276 struct sched_entity *se;
1277 1277
1278 if (next == &cfs_rq->tasks) 1278 while (next != &cfs_rq->tasks) {
1279 return NULL;
1280
1281 /* Skip over entities that are not tasks */
1282 do {
1283 se = list_entry(next, struct sched_entity, group_node); 1279 se = list_entry(next, struct sched_entity, group_node);
1284 next = next->next; 1280 next = next->next;
1285 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1286 1281
1287 if (next == &cfs_rq->tasks) 1282 /* Skip over entities that are not tasks */
1288 return NULL; 1283 if (entity_is_task(se)) {
1284 p = task_of(se);
1285 break;
1286 }
1287 }
1289 1288
1290 cfs_rq->balance_iterator = next; 1289 cfs_rq->balance_iterator = next;
1291
1292 if (entity_is_task(se))
1293 p = task_of(se);
1294
1295 return p; 1290 return p;
1296} 1291}
1297 1292
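
The iterator rewrite above folds a do/while plus two end-of-list checks into one
loop: advance, skip entities that are not tasks, remember the resume point. The
same shape in isolation (a stand-alone sketch with hypothetical types, not the
scheduler's own structures):

#include <linux/list.h>

struct item {
	struct list_head node;
	int is_task;		/* stands in for entity_is_task() */
	void *payload;
};

/* Return the next task-like item starting at *pos, or NULL at the end. */
static void *next_task_item(struct list_head *head, struct list_head **pos)
{
	struct list_head *next = *pos;
	void *payload = NULL;

	while (next != head) {
		struct item *it = list_entry(next, struct item, node);

		next = next->next;
		if (it->is_task) {
			payload = it->payload;
			break;
		}
	}

	*pos = next;	/* resume point, like cfs_rq->balance_iterator */
	return payload;
}
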
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1c7283cb9581..62b39ca92ebd 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -6,5 +6,3 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
6SCHED_FEAT(SYNC_WAKEUPS, 1) 6SCHED_FEAT(SYNC_WAKEUPS, 1)
7SCHED_FEAT(HRTICK, 1) 7SCHED_FEAT(HRTICK, 1)
8SCHED_FEAT(DOUBLE_TICK, 0) 8SCHED_FEAT(DOUBLE_TICK, 0)
9SCHED_FEAT(NORMALIZED_SLEEPER, 1)
10SCHED_FEAT(DEADLINE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1dad5bbb59b6..bd90c8bb0739 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
12 12
13static inline void rt_set_overload(struct rq *rq) 13static inline void rt_set_overload(struct rq *rq)
14{ 14{
15 if (!rq->online)
16 return;
17
15 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /* 19 /*
17 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
26 29
27static inline void rt_clear_overload(struct rq *rq) 30static inline void rt_clear_overload(struct rq *rq)
28{ 31{
32 if (!rq->online)
33 return;
34
29 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -222,46 +228,8 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
222 228
223#endif 229#endif
224 230
225static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
226{
227 int i, idle = 1;
228 cpumask_t span;
229
230 if (rt_b->rt_runtime == RUNTIME_INF)
231 return 1;
232
233 span = sched_rt_period_mask();
234 for_each_cpu_mask(i, span) {
235 int enqueue = 0;
236 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
237 struct rq *rq = rq_of_rt_rq(rt_rq);
238
239 spin_lock(&rq->lock);
240 if (rt_rq->rt_time) {
241 u64 runtime;
242
243 spin_lock(&rt_rq->rt_runtime_lock);
244 runtime = rt_rq->rt_runtime;
245 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
246 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
247 rt_rq->rt_throttled = 0;
248 enqueue = 1;
249 }
250 if (rt_rq->rt_time || rt_rq->rt_nr_running)
251 idle = 0;
252 spin_unlock(&rt_rq->rt_runtime_lock);
253 }
254
255 if (enqueue)
256 sched_rt_rq_enqueue(rt_rq);
257 spin_unlock(&rq->lock);
258 }
259
260 return idle;
261}
262
263#ifdef CONFIG_SMP 231#ifdef CONFIG_SMP
264static int balance_runtime(struct rt_rq *rt_rq) 232static int do_balance_runtime(struct rt_rq *rt_rq)
265{ 233{
266 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 234 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
267 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 235 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
@@ -280,6 +248,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
280 continue; 248 continue;
281 249
282 spin_lock(&iter->rt_runtime_lock); 250 spin_lock(&iter->rt_runtime_lock);
251 if (iter->rt_runtime == RUNTIME_INF)
252 goto next;
253
283 diff = iter->rt_runtime - iter->rt_time; 254 diff = iter->rt_runtime - iter->rt_time;
284 if (diff > 0) { 255 if (diff > 0) {
285 do_div(diff, weight); 256 do_div(diff, weight);
@@ -293,14 +264,165 @@ static int balance_runtime(struct rt_rq *rt_rq)
293 break; 264 break;
294 } 265 }
295 } 266 }
267next:
296 spin_unlock(&iter->rt_runtime_lock); 268 spin_unlock(&iter->rt_runtime_lock);
297 } 269 }
298 spin_unlock(&rt_b->rt_runtime_lock); 270 spin_unlock(&rt_b->rt_runtime_lock);
299 271
300 return more; 272 return more;
301} 273}
274
275static void __disable_runtime(struct rq *rq)
276{
277 struct root_domain *rd = rq->rd;
278 struct rt_rq *rt_rq;
279
280 if (unlikely(!scheduler_running))
281 return;
282
283 for_each_leaf_rt_rq(rt_rq, rq) {
284 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
285 s64 want;
286 int i;
287
288 spin_lock(&rt_b->rt_runtime_lock);
289 spin_lock(&rt_rq->rt_runtime_lock);
290 if (rt_rq->rt_runtime == RUNTIME_INF ||
291 rt_rq->rt_runtime == rt_b->rt_runtime)
292 goto balanced;
293 spin_unlock(&rt_rq->rt_runtime_lock);
294
295 want = rt_b->rt_runtime - rt_rq->rt_runtime;
296
297 for_each_cpu_mask(i, rd->span) {
298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
299 s64 diff;
300
301 if (iter == rt_rq)
302 continue;
303
304 spin_lock(&iter->rt_runtime_lock);
305 if (want > 0) {
306 diff = min_t(s64, iter->rt_runtime, want);
307 iter->rt_runtime -= diff;
308 want -= diff;
309 } else {
310 iter->rt_runtime -= want;
311 want -= want;
312 }
313 spin_unlock(&iter->rt_runtime_lock);
314
315 if (!want)
316 break;
317 }
318
319 spin_lock(&rt_rq->rt_runtime_lock);
320 BUG_ON(want);
321balanced:
322 rt_rq->rt_runtime = RUNTIME_INF;
323 spin_unlock(&rt_rq->rt_runtime_lock);
324 spin_unlock(&rt_b->rt_runtime_lock);
325 }
326}
327
328static void disable_runtime(struct rq *rq)
329{
330 unsigned long flags;
331
332 spin_lock_irqsave(&rq->lock, flags);
333 __disable_runtime(rq);
334 spin_unlock_irqrestore(&rq->lock, flags);
335}
336
337static void __enable_runtime(struct rq *rq)
338{
339 struct root_domain *rd = rq->rd;
340 struct rt_rq *rt_rq;
341
342 if (unlikely(!scheduler_running))
343 return;
344
345 for_each_leaf_rt_rq(rt_rq, rq) {
346 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
347
348 spin_lock(&rt_b->rt_runtime_lock);
349 spin_lock(&rt_rq->rt_runtime_lock);
350 rt_rq->rt_runtime = rt_b->rt_runtime;
351 rt_rq->rt_time = 0;
352 spin_unlock(&rt_rq->rt_runtime_lock);
353 spin_unlock(&rt_b->rt_runtime_lock);
354 }
355}
356
357static void enable_runtime(struct rq *rq)
358{
359 unsigned long flags;
360
361 spin_lock_irqsave(&rq->lock, flags);
362 __enable_runtime(rq);
363 spin_unlock_irqrestore(&rq->lock, flags);
364}
365
366static int balance_runtime(struct rt_rq *rt_rq)
367{
368 int more = 0;
369
370 if (rt_rq->rt_time > rt_rq->rt_runtime) {
371 spin_unlock(&rt_rq->rt_runtime_lock);
372 more = do_balance_runtime(rt_rq);
373 spin_lock(&rt_rq->rt_runtime_lock);
374 }
375
376 return more;
377}
378#else
379static inline int balance_runtime(struct rt_rq *rt_rq)
380{
381 return 0;
382}
302#endif 383#endif
303 384
385static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
386{
387 int i, idle = 1;
388 cpumask_t span;
389
390 if (rt_b->rt_runtime == RUNTIME_INF)
391 return 1;
392
393 span = sched_rt_period_mask();
394 for_each_cpu_mask(i, span) {
395 int enqueue = 0;
396 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
397 struct rq *rq = rq_of_rt_rq(rt_rq);
398
399 spin_lock(&rq->lock);
400 if (rt_rq->rt_time) {
401 u64 runtime;
402
403 spin_lock(&rt_rq->rt_runtime_lock);
404 if (rt_rq->rt_throttled)
405 balance_runtime(rt_rq);
406 runtime = rt_rq->rt_runtime;
407 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
408 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
409 rt_rq->rt_throttled = 0;
410 enqueue = 1;
411 }
412 if (rt_rq->rt_time || rt_rq->rt_nr_running)
413 idle = 0;
414 spin_unlock(&rt_rq->rt_runtime_lock);
415 } else if (rt_rq->rt_nr_running)
416 idle = 0;
417
418 if (enqueue)
419 sched_rt_rq_enqueue(rt_rq);
420 spin_unlock(&rq->lock);
421 }
422
423 return idle;
424}
425
304static inline int rt_se_prio(struct sched_rt_entity *rt_se) 426static inline int rt_se_prio(struct sched_rt_entity *rt_se)
305{ 427{
306#ifdef CONFIG_RT_GROUP_SCHED 428#ifdef CONFIG_RT_GROUP_SCHED
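
do_balance_runtime() lets a throttled rt_rq borrow bandwidth from its peers in the
same root domain: every donor with headroom gives up (rt_runtime - rt_time) / weight,
clamped so the borrower never exceeds its period. A rough, purely illustrative
calculation:

/*
 * Example (invented values), rt_period = 1000 ms, weight = 4 CPUs:
 *
 *   this rt_rq : rt_runtime = 400 ms, rt_time = 400 ms -> throttled
 *   each peer  : rt_runtime = 400 ms, rt_time = 100 ms
 *
 *   per-donor loan = (400 - 100) / 4 = 75 ms
 *
 * After the three peers are visited the throttled queue holds
 * 400 + 3 * 75 = 625 ms (it would have been clamped at 1000 ms), each
 * donor drops to 325 ms, and sched_rt_runtime_exceeded() no longer has
 * to keep the queue throttled.  When a CPU goes down, __disable_runtime()
 * settles the books the other way -- pulling back or returning the
 * difference from the peers -- and parks the runqueue at RUNTIME_INF;
 * __enable_runtime() restores the defaults when the CPU comes back.
 */
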
@@ -326,18 +448,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
326 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 448 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
327 return 0; 449 return 0;
328 450
329#ifdef CONFIG_SMP 451 balance_runtime(rt_rq);
330 if (rt_rq->rt_time > runtime) { 452 runtime = sched_rt_runtime(rt_rq);
331 int more; 453 if (runtime == RUNTIME_INF)
332 454 return 0;
333 spin_unlock(&rt_rq->rt_runtime_lock);
334 more = balance_runtime(rt_rq);
335 spin_lock(&rt_rq->rt_runtime_lock);
336
337 if (more)
338 runtime = sched_rt_runtime(rt_rq);
339 }
340#endif
341 455
342 if (rt_rq->rt_time > runtime) { 456 if (rt_rq->rt_time > runtime) {
343 rt_rq->rt_throttled = 1; 457 rt_rq->rt_throttled = 1;
@@ -391,12 +505,21 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
391 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 505 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
392 rt_rq->rt_nr_running++; 506 rt_rq->rt_nr_running++;
393#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 507#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
394 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 508 if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
509 struct rq *rq = rq_of_rt_rq(rt_rq);
510
395 rt_rq->highest_prio = rt_se_prio(rt_se); 511 rt_rq->highest_prio = rt_se_prio(rt_se);
512#ifdef CONFIG_SMP
513 if (rq->online)
514 cpupri_set(&rq->rd->cpupri, rq->cpu,
515 rt_se_prio(rt_se));
516#endif
517 }
396#endif 518#endif
397#ifdef CONFIG_SMP 519#ifdef CONFIG_SMP
398 if (rt_se->nr_cpus_allowed > 1) { 520 if (rt_se->nr_cpus_allowed > 1) {
399 struct rq *rq = rq_of_rt_rq(rt_rq); 521 struct rq *rq = rq_of_rt_rq(rt_rq);
522
400 rq->rt.rt_nr_migratory++; 523 rq->rt.rt_nr_migratory++;
401 } 524 }
402 525
@@ -416,6 +539,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
416static inline 539static inline
417void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 540void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
418{ 541{
542#ifdef CONFIG_SMP
543 int highest_prio = rt_rq->highest_prio;
544#endif
545
419 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 546 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
420 WARN_ON(!rt_rq->rt_nr_running); 547 WARN_ON(!rt_rq->rt_nr_running);
421 rt_rq->rt_nr_running--; 548 rt_rq->rt_nr_running--;
@@ -439,6 +566,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
439 rq->rt.rt_nr_migratory--; 566 rq->rt.rt_nr_migratory--;
440 } 567 }
441 568
569 if (rt_rq->highest_prio != highest_prio) {
570 struct rq *rq = rq_of_rt_rq(rt_rq);
571
572 if (rq->online)
573 cpupri_set(&rq->rd->cpupri, rq->cpu,
574 rt_rq->highest_prio);
575 }
576
442 update_rt_migration(rq_of_rt_rq(rt_rq)); 577 update_rt_migration(rq_of_rt_rq(rt_rq));
443#endif /* CONFIG_SMP */ 578#endif /* CONFIG_SMP */
444#ifdef CONFIG_RT_GROUP_SCHED 579#ifdef CONFIG_RT_GROUP_SCHED
@@ -454,6 +589,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
454 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 589 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
455 struct rt_prio_array *array = &rt_rq->active; 590 struct rt_prio_array *array = &rt_rq->active;
456 struct rt_rq *group_rq = group_rt_rq(rt_se); 591 struct rt_rq *group_rq = group_rt_rq(rt_se);
592 struct list_head *queue = array->queue + rt_se_prio(rt_se);
457 593
458 /* 594 /*
459 * Don't enqueue the group if its throttled, or when empty. 595 * Don't enqueue the group if its throttled, or when empty.
@@ -464,7 +600,11 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
464 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 600 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
465 return; 601 return;
466 602
467 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 603 if (rt_se->nr_cpus_allowed == 1)
604 list_add(&rt_se->run_list, queue);
605 else
606 list_add_tail(&rt_se->run_list, queue);
607
468 __set_bit(rt_se_prio(rt_se), array->bitmap); 608 __set_bit(rt_se_prio(rt_se), array->bitmap);
469 609
470 inc_rt_tasks(rt_se, rt_rq); 610 inc_rt_tasks(rt_se, rt_rq);
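The behavioural change in this hunk is only the insertion point within the per-priority list: an entity pinned to a single CPU is queued at the head, everything else keeps the old tail insertion, the idea presumably being that tasks the push logic cannot migrate should get to run first. A minimal sketch of that choice, isolated from the rest of __enqueue_rt_entity():

/* Pinned entities jump their priority queue; migratable ones keep FIFO order. */
static void enqueue_on_prio_queue(struct sched_rt_entity *rt_se,
				  struct list_head *queue)
{
	if (rt_se->nr_cpus_allowed == 1)
		list_add(&rt_se->run_list, queue);	/* head: run before migratable peers */
	else
		list_add_tail(&rt_se->run_list, queue);	/* tail: unchanged behaviour */
}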
@@ -551,8 +691,11 @@ void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
551 struct rt_prio_array *array = &rt_rq->active; 691 struct rt_prio_array *array = &rt_rq->active;
552 struct list_head *queue = array->queue + rt_se_prio(rt_se); 692 struct list_head *queue = array->queue + rt_se_prio(rt_se);
553 693
554 if (on_rt_rq(rt_se)) 694 if (on_rt_rq(rt_se)) {
555 list_move_tail(&rt_se->run_list, queue); 695 list_del_init(&rt_se->run_list);
696 list_add_tail(&rt_se->run_list,
697 array->queue + rt_se_prio(rt_se));
698 }
556} 699}
557 700
558static void requeue_task_rt(struct rq *rq, struct task_struct *p) 701static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -615,8 +758,37 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
615 */ 758 */
616static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 759static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
617{ 760{
618 if (p->prio < rq->curr->prio) 761 if (p->prio < rq->curr->prio) {
619 resched_task(rq->curr); 762 resched_task(rq->curr);
763 return;
764 }
765
766#ifdef CONFIG_SMP
767 /*
768 * If:
769 *
770 * - the newly woken task is of equal priority to the current task
771 * - the newly woken task is non-migratable while current is migratable
772 * - current will be preempted on the next reschedule
773 *
774 * we should check to see if current can readily move to a different
775 * cpu. If so, we will reschedule to allow the push logic to try
776 * to move current somewhere else, making room for our non-migratable
777 * task.
778 */
 779	if ((p->prio == rq->curr->prio)
780 && p->rt.nr_cpus_allowed == 1
781 && rq->curr->rt.nr_cpus_allowed != 1) {
782 cpumask_t mask;
783
784 if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
785 /*
 786	 * There appear to be other cpus that can accept
 787	 * current, so let's reschedule to try and push it away
788 */
789 resched_task(rq->curr);
790 }
791#endif
620} 792}
621 793
622static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 794static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
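The comment block added above describes when an equal-priority wakeup should still preempt; the test reads naturally as a small predicate. A sketch of that predicate, assuming (as the diff suggests) that cpupri_find() returns non-zero when some other CPU could accept current; the helper name is illustrative:

static int should_yield_to_pinned_waker(struct rq *rq, struct task_struct *p)
{
	cpumask_t mask;

	if (p->prio != rq->curr->prio)		/* handled by the plain priority test */
		return 0;
	if (p->rt.nr_cpus_allowed != 1)		/* the waker can migrate itself */
		return 0;
	if (rq->curr->rt.nr_cpus_allowed == 1)	/* current cannot be pushed anyway */
		return 0;

	/* Is there anywhere current could go after a push? */
	return cpupri_find(&rq->rd->cpupri, rq->curr, &mask);
}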
@@ -719,73 +891,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
719 891
720static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 892static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
721 893
722static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
723{
724 int lowest_prio = -1;
725 int lowest_cpu = -1;
726 int count = 0;
727 int cpu;
728
729 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
730
731 /*
732 * Scan each rq for the lowest prio.
733 */
734 for_each_cpu_mask(cpu, *lowest_mask) {
735 struct rq *rq = cpu_rq(cpu);
736
737 /* We look for lowest RT prio or non-rt CPU */
738 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
739 /*
740 * if we already found a low RT queue
741 * and now we found this non-rt queue
742 * clear the mask and set our bit.
743 * Otherwise just return the queue as is
744 * and the count==1 will cause the algorithm
745 * to use the first bit found.
746 */
747 if (lowest_cpu != -1) {
748 cpus_clear(*lowest_mask);
749 cpu_set(rq->cpu, *lowest_mask);
750 }
751 return 1;
752 }
753
754 /* no locking for now */
755 if ((rq->rt.highest_prio > task->prio)
756 && (rq->rt.highest_prio >= lowest_prio)) {
757 if (rq->rt.highest_prio > lowest_prio) {
758 /* new low - clear old data */
759 lowest_prio = rq->rt.highest_prio;
760 lowest_cpu = cpu;
761 count = 0;
762 }
763 count++;
764 } else
765 cpu_clear(cpu, *lowest_mask);
766 }
767
768 /*
769 * Clear out all the set bits that represent
770 * runqueues that were of higher prio than
771 * the lowest_prio.
772 */
773 if (lowest_cpu > 0) {
774 /*
775 * Perhaps we could add another cpumask op to
776 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
777 * Then that could be optimized to use memset and such.
778 */
779 for_each_cpu_mask(cpu, *lowest_mask) {
780 if (cpu >= lowest_cpu)
781 break;
782 cpu_clear(cpu, *lowest_mask);
783 }
784 }
785
786 return count;
787}
788
789static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 894static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
790{ 895{
791 int first; 896 int first;
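The removed find_lowest_cpus() scanned every candidate runqueue (with no locking, per its own comment) to build a mask of lowest-priority CPUs. Its replacement, cpupri (added in kernel/sched_cpupri.c per the diffstat), inverts the bookkeeping: the root domain keeps a CPU mask per priority level, so the search walks a fixed number of priority buckets instead of every runqueue. A rough sketch of that idea only; the field and helper names below are illustrative and do not match kernel/sched_cpupri.c:

struct cpupri_sketch {
	cpumask_t mask[MAX_RT_PRIO + 2];	/* one bucket per priority level, lowest first */
};

static int cpupri_find_sketch(struct cpupri_sketch *cp, int task_level,
			      struct task_struct *p, cpumask_t *lowest_mask)
{
	int level;

	/* Walk from the most preferable bucket upward; stop before the
	 * task's own level (mapping of prio to bucket omitted here). */
	for (level = 0; level < task_level; level++) {
		cpus_and(*lowest_mask, cp->mask[level], p->cpus_allowed);
		if (!cpus_empty(*lowest_mask))
			return 1;	/* found CPUs running at a lower priority */
	}

	return 0;
}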
@@ -807,17 +912,12 @@ static int find_lowest_rq(struct task_struct *task)
807 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 912 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
808 int this_cpu = smp_processor_id(); 913 int this_cpu = smp_processor_id();
809 int cpu = task_cpu(task); 914 int cpu = task_cpu(task);
810 int count = find_lowest_cpus(task, lowest_mask);
811 915
812 if (!count) 916 if (task->rt.nr_cpus_allowed == 1)
813 return -1; /* No targets found */ 917 return -1; /* No other targets possible */
814 918
815 /* 919 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
816 * There is no sense in performing an optimal search if only one 920 return -1; /* No targets found */
817 * target is found.
818 */
819 if (count == 1)
820 return first_cpu(*lowest_mask);
821 921
822 /* 922 /*
823 * At this point we have built a mask of cpus representing the 923 * At this point we have built a mask of cpus representing the
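With the scan gone, find_lowest_rq() reduces to two early exits plus the (unchanged) selection of the best CPU out of the mask cpupri filled in. A condensed sketch of that flow, with the locality-aware tail of the function elided:

static int find_lowest_rq_sketch(struct task_struct *task)
{
	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);

	if (task->rt.nr_cpus_allowed == 1)
		return -1;	/* pinned: no point looking for another CPU */

	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1;	/* no CPU is running at a lower priority */

	/* ... prefer task_cpu(task), then this_cpu, then the sched-domain
	 *     walk, exactly as in the untouched remainder of the function. */
	return first_cpu(*lowest_mask);	/* placeholder for the elided selection */
}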
@@ -1162,17 +1262,25 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1162} 1262}
1163 1263
1164/* Assumes rq->lock is held */ 1264/* Assumes rq->lock is held */
1165static void join_domain_rt(struct rq *rq) 1265static void rq_online_rt(struct rq *rq)
1166{ 1266{
1167 if (rq->rt.overloaded) 1267 if (rq->rt.overloaded)
1168 rt_set_overload(rq); 1268 rt_set_overload(rq);
1269
1270 __enable_runtime(rq);
1271
1272 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
1169} 1273}
1170 1274
1171/* Assumes rq->lock is held */ 1275/* Assumes rq->lock is held */
1172static void leave_domain_rt(struct rq *rq) 1276static void rq_offline_rt(struct rq *rq)
1173{ 1277{
1174 if (rq->rt.overloaded) 1278 if (rq->rt.overloaded)
1175 rt_clear_overload(rq); 1279 rt_clear_overload(rq);
1280
1281 __disable_runtime(rq);
1282
1283 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1176} 1284}
1177 1285
1178/* 1286/*
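These callbacks tie runqueue availability to cpupri: coming online publishes the runqueue's current highest RT priority and re-enables its runtime, going offline withdraws the priority by writing CPUPRI_INVALID and disables the runtime. A sketch of how the core scheduler would be expected to drive the hooks; the set_rq_*_sketch callers are assumptions for illustration, only the rq_online/rq_offline class methods come from this diff:

static void set_rq_online_sketch(struct rq *rq)
{
	const struct sched_class *class;

	rq->online = 1;
	for (class = sched_class_highest; class; class = class->next)
		if (class->rq_online)
			class->rq_online(rq);	/* rt: set overload state, cpupri_set() */
}

static void set_rq_offline_sketch(struct rq *rq)
{
	const struct sched_class *class;

	for (class = sched_class_highest; class; class = class->next)
		if (class->rq_offline)
			class->rq_offline(rq);	/* rt: cpupri_set(..., CPUPRI_INVALID) */

	rq->online = 0;
}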
@@ -1335,8 +1443,8 @@ static const struct sched_class rt_sched_class = {
1335 .load_balance = load_balance_rt, 1443 .load_balance = load_balance_rt,
1336 .move_one_task = move_one_task_rt, 1444 .move_one_task = move_one_task_rt,
1337 .set_cpus_allowed = set_cpus_allowed_rt, 1445 .set_cpus_allowed = set_cpus_allowed_rt,
1338 .join_domain = join_domain_rt, 1446 .rq_online = rq_online_rt,
1339 .leave_domain = leave_domain_rt, 1447 .rq_offline = rq_offline_rt,
1340 .pre_schedule = pre_schedule_rt, 1448 .pre_schedule = pre_schedule_rt,
1341 .post_schedule = post_schedule_rt, 1449 .post_schedule = post_schedule_rt,
1342 .task_wake_up = task_wake_up_rt, 1450 .task_wake_up = task_wake_up_rt,
@@ -1349,3 +1457,17 @@ static const struct sched_class rt_sched_class = {
1349 .prio_changed = prio_changed_rt, 1457 .prio_changed = prio_changed_rt,
1350 .switched_to = switched_to_rt, 1458 .switched_to = switched_to_rt,
1351}; 1459};
1460
1461#ifdef CONFIG_SCHED_DEBUG
1462extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1463
1464static void print_rt_stats(struct seq_file *m, int cpu)
1465{
1466 struct rt_rq *rt_rq;
1467
1468 rcu_read_lock();
1469 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1470 print_rt_rq(m, cpu, rt_rq);
1471 rcu_read_unlock();
1472}
1473#endif