aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2008-05-05 20:31:14 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2008-05-05 20:31:14 -0400
commit bb896afe2089575ca1bb1fbf3f07b934e1ba999b (patch)
tree 7300f9b4e3e267fe97d898440c72ff2c4c327f23 /kernel
parent 2e83fc4df5f27dfc1b53044c4f142b2f9d1db08c (diff)
parent aac6abca858386438d9a7233c3471d2ecfa2f704 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-fixes
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-fixes:
  sched: default to n for GROUP_SCHED and FAIR_GROUP_SCHED
  sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
  sched, x86: add HAVE_UNSTABLE_SCHED_CLOCK
  sched: fix cpu clock
  sched: fair-group: fix a Div0 error of the fair group scheduler
  sched: fix missing locking in sched_domains code
  sched: make clock sync tunable by architecture code
  sched: fix debugging
  sched: fix sched_info_switch not being called according to documentation
  sched: fix hrtick_start_fair and CPU-Hotplug
  sched: fix SCHED_FAIR wake-idle logic error
  sched: fix RT task-wakeup logic
  sched: add statics, don't return void expressions
  sched: add debug checks to idle functions
  sched: remove old sched doc
  sched: make rt_sched_class, idle_sched_class static
  sched: optimize calc_delta_mine()
  sched: fix normalized sleeper
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 2
-rw-r--r-- kernel/sched.c 323
-rw-r--r-- kernel/sched_clock.c 236
-rw-r--r-- kernel/sched_debug.c 7
-rw-r--r-- kernel/sched_fair.c 39
-rw-r--r-- kernel/sched_idletask.c 2
-rw-r--r-- kernel/sched_rt.c 9
7 files changed, 380 insertions, 238 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 188c43223f52..1c9938addb9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13 13
14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
15obj-$(CONFIG_STACKTRACE) += stacktrace.o 15obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/sched.c b/kernel/sched.c
index 34bcc5bc120e..58fb8af15776 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,16 +75,6 @@
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77/* 77/*
78 * Scheduler clock - returns current time in nanosec units.
79 * This is default implementation.
80 * Architectures and sub-architectures can override this.
81 */
82unsigned long long __attribute__((weak)) sched_clock(void)
83{
84 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
85}
86
87/*
88 * Convert user-nice values [ -20 ... 0 ... 19 ] 78 * Convert user-nice values [ -20 ... 0 ... 19 ]
89 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
90 * and back. 80 * and back.
@@ -242,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
242} 232}
243#endif 233#endif
244 234
235/*
236 * sched_domains_mutex serializes calls to arch_init_sched_domains,
237 * detach_destroy_domains and partition_sched_domains.
238 */
239static DEFINE_MUTEX(sched_domains_mutex);
240
245#ifdef CONFIG_GROUP_SCHED 241#ifdef CONFIG_GROUP_SCHED
246 242
247#include <linux/cgroup.h> 243#include <linux/cgroup.h>
@@ -308,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
308 */ 304 */
309static DEFINE_SPINLOCK(task_group_lock); 305static DEFINE_SPINLOCK(task_group_lock);
310 306
311/* doms_cur_mutex serializes access to doms_cur[] array */
312static DEFINE_MUTEX(doms_cur_mutex);
313
314#ifdef CONFIG_FAIR_GROUP_SCHED 307#ifdef CONFIG_FAIR_GROUP_SCHED
315#ifdef CONFIG_USER_SCHED 308#ifdef CONFIG_USER_SCHED
316# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -318,7 +311,13 @@ static DEFINE_MUTEX(doms_cur_mutex);
318# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
319#endif 312#endif
320 313
314/*
315 * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
316 * (The default weight is 1024 - so there's no practical
317 * limitation from this.)
318 */
321#define MIN_SHARES 2 319#define MIN_SHARES 2
320#define MAX_SHARES (ULONG_MAX - 1)
322 321
323static int init_task_group_load = INIT_TASK_GROUP_LOAD; 322static int init_task_group_load = INIT_TASK_GROUP_LOAD;
324#endif 323#endif
@@ -358,21 +357,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
358#endif 357#endif
359} 358}
360 359
361static inline void lock_doms_cur(void)
362{
363 mutex_lock(&doms_cur_mutex);
364}
365
366static inline void unlock_doms_cur(void)
367{
368 mutex_unlock(&doms_cur_mutex);
369}
370
371#else 360#else
372 361
373static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 362static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
374static inline void lock_doms_cur(void) { }
375static inline void unlock_doms_cur(void) { }
376 363
377#endif /* CONFIG_GROUP_SCHED */ 364#endif /* CONFIG_GROUP_SCHED */
378 365
@@ -560,13 +547,7 @@ struct rq {
560 unsigned long next_balance; 547 unsigned long next_balance;
561 struct mm_struct *prev_mm; 548 struct mm_struct *prev_mm;
562 549
563 u64 clock, prev_clock_raw; 550 u64 clock;
564 s64 clock_max_delta;
565
566 unsigned int clock_warps, clock_overflows, clock_underflows;
567 u64 idle_clock;
568 unsigned int clock_deep_idle_events;
569 u64 tick_timestamp;
570 551
571 atomic_t nr_iowait; 552 atomic_t nr_iowait;
572 553
@@ -631,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
631#endif 612#endif
632} 613}
633 614
634#ifdef CONFIG_NO_HZ
635static inline bool nohz_on(int cpu)
636{
637 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
638}
639
640static inline u64 max_skipped_ticks(struct rq *rq)
641{
642 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
643}
644
645static inline void update_last_tick_seen(struct rq *rq)
646{
647 rq->last_tick_seen = jiffies;
648}
649#else
650static inline u64 max_skipped_ticks(struct rq *rq)
651{
652 return 1;
653}
654
655static inline void update_last_tick_seen(struct rq *rq)
656{
657}
658#endif
659
660/*
661 * Update the per-runqueue clock, as finegrained as the platform can give
662 * us, but without assuming monotonicity, etc.:
663 */
664static void __update_rq_clock(struct rq *rq)
665{
666 u64 prev_raw = rq->prev_clock_raw;
667 u64 now = sched_clock();
668 s64 delta = now - prev_raw;
669 u64 clock = rq->clock;
670
671#ifdef CONFIG_SCHED_DEBUG
672 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
673#endif
674 /*
675 * Protect against sched_clock() occasionally going backwards:
676 */
677 if (unlikely(delta < 0)) {
678 clock++;
679 rq->clock_warps++;
680 } else {
681 /*
682 * Catch too large forward jumps too:
683 */
684 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
685 u64 max_time = rq->tick_timestamp + max_jump;
686
687 if (unlikely(clock + delta > max_time)) {
688 if (clock < max_time)
689 clock = max_time;
690 else
691 clock++;
692 rq->clock_overflows++;
693 } else {
694 if (unlikely(delta > rq->clock_max_delta))
695 rq->clock_max_delta = delta;
696 clock += delta;
697 }
698 }
699
700 rq->prev_clock_raw = now;
701 rq->clock = clock;
702}
703
704static void update_rq_clock(struct rq *rq)
705{
706 if (likely(smp_processor_id() == cpu_of(rq)))
707 __update_rq_clock(rq);
708}
709
710/* 615/*
711 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 616 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
712 * See detach_destroy_domains: synchronize_sched for details. 617 * See detach_destroy_domains: synchronize_sched for details.
@@ -722,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
722#define task_rq(p) cpu_rq(task_cpu(p)) 627#define task_rq(p) cpu_rq(task_cpu(p))
723#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 628#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
724 629
630static inline void update_rq_clock(struct rq *rq)
631{
632 rq->clock = sched_clock_cpu(cpu_of(rq));
633}
634
725/* 635/*
726 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 636 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
727 */ 637 */
@@ -757,14 +667,14 @@ const_debug unsigned int sysctl_sched_features =
757#define SCHED_FEAT(name, enabled) \ 667#define SCHED_FEAT(name, enabled) \
758 #name , 668 #name ,
759 669
760__read_mostly char *sched_feat_names[] = { 670static __read_mostly char *sched_feat_names[] = {
761#include "sched_features.h" 671#include "sched_features.h"
762 NULL 672 NULL
763}; 673};
764 674
765#undef SCHED_FEAT 675#undef SCHED_FEAT
766 676
767int sched_feat_open(struct inode *inode, struct file *filp) 677static int sched_feat_open(struct inode *inode, struct file *filp)
768{ 678{
769 filp->private_data = inode->i_private; 679 filp->private_data = inode->i_private;
770 return 0; 680 return 0;
@@ -899,7 +809,7 @@ static inline u64 global_rt_runtime(void)
899 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 809 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
900} 810}
901 811
902static const unsigned long long time_sync_thresh = 100000; 812unsigned long long time_sync_thresh = 100000;
903 813
904static DEFINE_PER_CPU(unsigned long long, time_offset); 814static DEFINE_PER_CPU(unsigned long long, time_offset);
905static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); 815static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
@@ -913,11 +823,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
913static DEFINE_SPINLOCK(time_sync_lock); 823static DEFINE_SPINLOCK(time_sync_lock);
914static unsigned long long prev_global_time; 824static unsigned long long prev_global_time;
915 825
916static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) 826static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
917{ 827{
918 unsigned long flags; 828 /*
919 829 * We want this inlined, to not get tracer function calls
920 spin_lock_irqsave(&time_sync_lock, flags); 830 * in this critical section:
831 */
832 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
833 __raw_spin_lock(&time_sync_lock.raw_lock);
921 834
922 if (time < prev_global_time) { 835 if (time < prev_global_time) {
923 per_cpu(time_offset, cpu) += prev_global_time - time; 836 per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -926,7 +839,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
926 prev_global_time = time; 839 prev_global_time = time;
927 } 840 }
928 841
929 spin_unlock_irqrestore(&time_sync_lock, flags); 842 __raw_spin_unlock(&time_sync_lock.raw_lock);
843 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
930 844
931 return time; 845 return time;
932} 846}
@@ -934,8 +848,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
934static unsigned long long __cpu_clock(int cpu) 848static unsigned long long __cpu_clock(int cpu)
935{ 849{
936 unsigned long long now; 850 unsigned long long now;
937 unsigned long flags;
938 struct rq *rq;
939 851
940 /* 852 /*
941 * Only call sched_clock() if the scheduler has already been 853 * Only call sched_clock() if the scheduler has already been
@@ -944,11 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
944 if (unlikely(!scheduler_running)) 856 if (unlikely(!scheduler_running))
945 return 0; 857 return 0;
946 858
947 local_irq_save(flags); 859 now = sched_clock_cpu(cpu);
948 rq = cpu_rq(cpu);
949 update_rq_clock(rq);
950 now = rq->clock;
951 local_irq_restore(flags);
952 860
953 return now; 861 return now;
954} 862}
@@ -960,13 +868,18 @@ static unsigned long long __cpu_clock(int cpu)
960unsigned long long cpu_clock(int cpu) 868unsigned long long cpu_clock(int cpu)
961{ 869{
962 unsigned long long prev_cpu_time, time, delta_time; 870 unsigned long long prev_cpu_time, time, delta_time;
871 unsigned long flags;
963 872
873 local_irq_save(flags);
964 prev_cpu_time = per_cpu(prev_cpu_time, cpu); 874 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
965 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); 875 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
966 delta_time = time-prev_cpu_time; 876 delta_time = time-prev_cpu_time;
967 877
968 if (unlikely(delta_time > time_sync_thresh)) 878 if (unlikely(delta_time > time_sync_thresh)) {
969 time = __sync_cpu_clock(time, cpu); 879 time = __sync_cpu_clock(time, cpu);
880 per_cpu(prev_cpu_time, cpu) = time;
881 }
882 local_irq_restore(flags);
970 883
971 return time; 884 return time;
972} 885}
@@ -1117,43 +1030,6 @@ static struct rq *this_rq_lock(void)
1117 return rq; 1030 return rq;
1118} 1031}
1119 1032
1120/*
1121 * We are going deep-idle (irqs are disabled):
1122 */
1123void sched_clock_idle_sleep_event(void)
1124{
1125 struct rq *rq = cpu_rq(smp_processor_id());
1126
1127 spin_lock(&rq->lock);
1128 __update_rq_clock(rq);
1129 spin_unlock(&rq->lock);
1130 rq->clock_deep_idle_events++;
1131}
1132EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
1133
1134/*
1135 * We just idled delta nanoseconds (called with irqs disabled):
1136 */
1137void sched_clock_idle_wakeup_event(u64 delta_ns)
1138{
1139 struct rq *rq = cpu_rq(smp_processor_id());
1140 u64 now = sched_clock();
1141
1142 rq->idle_clock += delta_ns;
1143 /*
1144 * Override the previous timestamp and ignore all
1145 * sched_clock() deltas that occured while we idled,
1146 * and use the PM-provided delta_ns to advance the
1147 * rq clock:
1148 */
1149 spin_lock(&rq->lock);
1150 rq->prev_clock_raw = now;
1151 rq->clock += delta_ns;
1152 spin_unlock(&rq->lock);
1153 touch_softlockup_watchdog();
1154}
1155EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
1156
1157static void __resched_task(struct task_struct *p, int tif_bit); 1033static void __resched_task(struct task_struct *p, int tif_bit);
1158 1034
1159static inline void resched_task(struct task_struct *p) 1035static inline void resched_task(struct task_struct *p)
@@ -1189,6 +1065,7 @@ static inline void resched_rq(struct rq *rq)
1189enum { 1065enum {
1190 HRTICK_SET, /* re-programm hrtick_timer */ 1066 HRTICK_SET, /* re-programm hrtick_timer */
1191 HRTICK_RESET, /* not a new slice */ 1067 HRTICK_RESET, /* not a new slice */
1068 HRTICK_BLOCK, /* stop hrtick operations */
1192}; 1069};
1193 1070
1194/* 1071/*
@@ -1200,6 +1077,8 @@ static inline int hrtick_enabled(struct rq *rq)
1200{ 1077{
1201 if (!sched_feat(HRTICK)) 1078 if (!sched_feat(HRTICK))
1202 return 0; 1079 return 0;
1080 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
1081 return 0;
1203 return hrtimer_is_hres_active(&rq->hrtick_timer); 1082 return hrtimer_is_hres_active(&rq->hrtick_timer);
1204} 1083}
1205 1084
@@ -1275,14 +1154,70 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1275 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1154 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1276 1155
1277 spin_lock(&rq->lock); 1156 spin_lock(&rq->lock);
1278 __update_rq_clock(rq); 1157 update_rq_clock(rq);
1279 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1158 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1280 spin_unlock(&rq->lock); 1159 spin_unlock(&rq->lock);
1281 1160
1282 return HRTIMER_NORESTART; 1161 return HRTIMER_NORESTART;
1283} 1162}
1284 1163
1285static inline void init_rq_hrtick(struct rq *rq) 1164static void hotplug_hrtick_disable(int cpu)
1165{
1166 struct rq *rq = cpu_rq(cpu);
1167 unsigned long flags;
1168
1169 spin_lock_irqsave(&rq->lock, flags);
1170 rq->hrtick_flags = 0;
1171 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1172 spin_unlock_irqrestore(&rq->lock, flags);
1173
1174 hrtick_clear(rq);
1175}
1176
1177static void hotplug_hrtick_enable(int cpu)
1178{
1179 struct rq *rq = cpu_rq(cpu);
1180 unsigned long flags;
1181
1182 spin_lock_irqsave(&rq->lock, flags);
1183 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1184 spin_unlock_irqrestore(&rq->lock, flags);
1185}
1186
1187static int
1188hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1189{
1190 int cpu = (int)(long)hcpu;
1191
1192 switch (action) {
1193 case CPU_UP_CANCELED:
1194 case CPU_UP_CANCELED_FROZEN:
1195 case CPU_DOWN_PREPARE:
1196 case CPU_DOWN_PREPARE_FROZEN:
1197 case CPU_DEAD:
1198 case CPU_DEAD_FROZEN:
1199 hotplug_hrtick_disable(cpu);
1200 return NOTIFY_OK;
1201
1202 case CPU_UP_PREPARE:
1203 case CPU_UP_PREPARE_FROZEN:
1204 case CPU_DOWN_FAILED:
1205 case CPU_DOWN_FAILED_FROZEN:
1206 case CPU_ONLINE:
1207 case CPU_ONLINE_FROZEN:
1208 hotplug_hrtick_enable(cpu);
1209 return NOTIFY_OK;
1210 }
1211
1212 return NOTIFY_DONE;
1213}
1214
1215static void init_hrtick(void)
1216{
1217 hotcpu_notifier(hotplug_hrtick, 0);
1218}
1219
1220static void init_rq_hrtick(struct rq *rq)
1286{ 1221{
1287 rq->hrtick_flags = 0; 1222 rq->hrtick_flags = 0;
1288 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1223 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -1319,6 +1254,10 @@ static inline void init_rq_hrtick(struct rq *rq)
1319void hrtick_resched(void) 1254void hrtick_resched(void)
1320{ 1255{
1321} 1256}
1257
1258static inline void init_hrtick(void)
1259{
1260}
1322#endif 1261#endif
1323 1262
1324/* 1263/*
@@ -1438,8 +1377,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1438{ 1377{
1439 u64 tmp; 1378 u64 tmp;
1440 1379
1441 if (unlikely(!lw->inv_weight)) 1380 if (!lw->inv_weight)
1442 lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); 1381 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
1443 1382
1444 tmp = (u64)delta_exec * weight; 1383 tmp = (u64)delta_exec * weight;
1445 /* 1384 /*
@@ -1748,6 +1687,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1748 1687
1749 if (shares < MIN_SHARES) 1688 if (shares < MIN_SHARES)
1750 shares = MIN_SHARES; 1689 shares = MIN_SHARES;
1690 else if (shares > MAX_SHARES)
1691 shares = MAX_SHARES;
1751 1692
1752 __set_se_shares(tg->se[tcpu], shares); 1693 __set_se_shares(tg->se[tcpu], shares);
1753} 1694}
@@ -4339,8 +4280,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4339 struct rq *rq = this_rq(); 4280 struct rq *rq = this_rq();
4340 cputime64_t tmp; 4281 cputime64_t tmp;
4341 4282
4342 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) 4283 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4343 return account_guest_time(p, cputime); 4284 account_guest_time(p, cputime);
4285 return;
4286 }
4344 4287
4345 p->stime = cputime_add(p->stime, cputime); 4288 p->stime = cputime_add(p->stime, cputime);
4346 4289
@@ -4404,19 +4347,11 @@ void scheduler_tick(void)
4404 int cpu = smp_processor_id(); 4347 int cpu = smp_processor_id();
4405 struct rq *rq = cpu_rq(cpu); 4348 struct rq *rq = cpu_rq(cpu);
4406 struct task_struct *curr = rq->curr; 4349 struct task_struct *curr = rq->curr;
4407 u64 next_tick = rq->tick_timestamp + TICK_NSEC; 4350
4351 sched_clock_tick();
4408 4352
4409 spin_lock(&rq->lock); 4353 spin_lock(&rq->lock);
4410 __update_rq_clock(rq); 4354 update_rq_clock(rq);
4411 /*
4412 * Let rq->clock advance by at least TICK_NSEC:
4413 */
4414 if (unlikely(rq->clock < next_tick)) {
4415 rq->clock = next_tick;
4416 rq->clock_underflows++;
4417 }
4418 rq->tick_timestamp = rq->clock;
4419 update_last_tick_seen(rq);
4420 update_cpu_load(rq); 4355 update_cpu_load(rq);
4421 curr->sched_class->task_tick(rq, curr, 0); 4356 curr->sched_class->task_tick(rq, curr, 0);
4422 spin_unlock(&rq->lock); 4357 spin_unlock(&rq->lock);
@@ -4570,7 +4505,7 @@ need_resched_nonpreemptible:
4570 * Do the rq-clock update outside the rq lock: 4505 * Do the rq-clock update outside the rq lock:
4571 */ 4506 */
4572 local_irq_disable(); 4507 local_irq_disable();
4573 __update_rq_clock(rq); 4508 update_rq_clock(rq);
4574 spin_lock(&rq->lock); 4509 spin_lock(&rq->lock);
4575 clear_tsk_need_resched(prev); 4510 clear_tsk_need_resched(prev);
4576 4511
@@ -4595,9 +4530,9 @@ need_resched_nonpreemptible:
4595 prev->sched_class->put_prev_task(rq, prev); 4530 prev->sched_class->put_prev_task(rq, prev);
4596 next = pick_next_task(rq, prev); 4531 next = pick_next_task(rq, prev);
4597 4532
4598 sched_info_switch(prev, next);
4599
4600 if (likely(prev != next)) { 4533 if (likely(prev != next)) {
4534 sched_info_switch(prev, next);
4535
4601 rq->nr_switches++; 4536 rq->nr_switches++;
4602 rq->curr = next; 4537 rq->curr = next;
4603 ++*switch_count; 4538 ++*switch_count;
@@ -7755,7 +7690,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7755{ 7690{
7756 int i, j; 7691 int i, j;
7757 7692
7758 lock_doms_cur(); 7693 mutex_lock(&sched_domains_mutex);
7759 7694
7760 /* always unregister in case we don't destroy any domains */ 7695 /* always unregister in case we don't destroy any domains */
7761 unregister_sched_domain_sysctl(); 7696 unregister_sched_domain_sysctl();
@@ -7804,7 +7739,7 @@ match2:
7804 7739
7805 register_sched_domain_sysctl(); 7740 register_sched_domain_sysctl();
7806 7741
7807 unlock_doms_cur(); 7742 mutex_unlock(&sched_domains_mutex);
7808} 7743}
7809 7744
7810#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7745#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7813,8 +7748,10 @@ int arch_reinit_sched_domains(void)
7813 int err; 7748 int err;
7814 7749
7815 get_online_cpus(); 7750 get_online_cpus();
7751 mutex_lock(&sched_domains_mutex);
7816 detach_destroy_domains(&cpu_online_map); 7752 detach_destroy_domains(&cpu_online_map);
7817 err = arch_init_sched_domains(&cpu_online_map); 7753 err = arch_init_sched_domains(&cpu_online_map);
7754 mutex_unlock(&sched_domains_mutex);
7818 put_online_cpus(); 7755 put_online_cpus();
7819 7756
7820 return err; 7757 return err;
@@ -7932,13 +7869,16 @@ void __init sched_init_smp(void)
7932 BUG_ON(sched_group_nodes_bycpu == NULL); 7869 BUG_ON(sched_group_nodes_bycpu == NULL);
7933#endif 7870#endif
7934 get_online_cpus(); 7871 get_online_cpus();
7872 mutex_lock(&sched_domains_mutex);
7935 arch_init_sched_domains(&cpu_online_map); 7873 arch_init_sched_domains(&cpu_online_map);
7936 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7874 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7937 if (cpus_empty(non_isolated_cpus)) 7875 if (cpus_empty(non_isolated_cpus))
7938 cpu_set(smp_processor_id(), non_isolated_cpus); 7876 cpu_set(smp_processor_id(), non_isolated_cpus);
7877 mutex_unlock(&sched_domains_mutex);
7939 put_online_cpus(); 7878 put_online_cpus();
7940 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7879 /* XXX: Theoretical race here - CPU may be hotplugged now */
7941 hotcpu_notifier(update_sched_domains, 0); 7880 hotcpu_notifier(update_sched_domains, 0);
7881 init_hrtick();
7942 7882
7943 /* Move init over to a non-isolated CPU */ 7883 /* Move init over to a non-isolated CPU */
7944 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 7884 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
@@ -8025,7 +7965,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8025 7965
8026 se->my_q = cfs_rq; 7966 se->my_q = cfs_rq;
8027 se->load.weight = tg->shares; 7967 se->load.weight = tg->shares;
8028 se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight); 7968 se->load.inv_weight = 0;
8029 se->parent = parent; 7969 se->parent = parent;
8030} 7970}
8031#endif 7971#endif
@@ -8149,8 +8089,6 @@ void __init sched_init(void)
8149 spin_lock_init(&rq->lock); 8089 spin_lock_init(&rq->lock);
8150 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8090 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8151 rq->nr_running = 0; 8091 rq->nr_running = 0;
8152 rq->clock = 1;
8153 update_last_tick_seen(rq);
8154 init_cfs_rq(&rq->cfs, rq); 8092 init_cfs_rq(&rq->cfs, rq);
8155 init_rt_rq(&rq->rt, rq); 8093 init_rt_rq(&rq->rt, rq);
8156#ifdef CONFIG_FAIR_GROUP_SCHED 8094#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8294,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
8294static void normalize_task(struct rq *rq, struct task_struct *p) 8232static void normalize_task(struct rq *rq, struct task_struct *p)
8295{ 8233{
8296 int on_rq; 8234 int on_rq;
8235
8297 update_rq_clock(rq); 8236 update_rq_clock(rq);
8298 on_rq = p->se.on_rq; 8237 on_rq = p->se.on_rq;
8299 if (on_rq) 8238 if (on_rq)
@@ -8325,7 +8264,6 @@ void normalize_rt_tasks(void)
8325 p->se.sleep_start = 0; 8264 p->se.sleep_start = 0;
8326 p->se.block_start = 0; 8265 p->se.block_start = 0;
8327#endif 8266#endif
8328 task_rq(p)->clock = 0;
8329 8267
8330 if (!rt_task(p)) { 8268 if (!rt_task(p)) {
8331 /* 8269 /*
@@ -8692,7 +8630,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8692 dequeue_entity(cfs_rq, se, 0); 8630 dequeue_entity(cfs_rq, se, 0);
8693 8631
8694 se->load.weight = shares; 8632 se->load.weight = shares;
8695 se->load.inv_weight = div64_u64((1ULL<<32), shares); 8633 se->load.inv_weight = 0;
8696 8634
8697 if (on_rq) 8635 if (on_rq)
8698 enqueue_entity(cfs_rq, se, 0); 8636 enqueue_entity(cfs_rq, se, 0);
@@ -8722,13 +8660,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8722 if (!tg->se[0]) 8660 if (!tg->se[0])
8723 return -EINVAL; 8661 return -EINVAL;
8724 8662
8725 /*
8726 * A weight of 0 or 1 can cause arithmetics problems.
8727 * (The default weight is 1024 - so there's no practical
8728 * limitation from this.)
8729 */
8730 if (shares < MIN_SHARES) 8663 if (shares < MIN_SHARES)
8731 shares = MIN_SHARES; 8664 shares = MIN_SHARES;
8665 else if (shares > MAX_SHARES)
8666 shares = MAX_SHARES;
8732 8667
8733 mutex_lock(&shares_mutex); 8668 mutex_lock(&shares_mutex);
8734 if (tg->shares == shares) 8669 if (tg->shares == shares)
@@ -8753,7 +8688,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8753 * force a rebalance 8688 * force a rebalance
8754 */ 8689 */
8755 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8690 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8756 set_se_shares(tg->se[i], shares/nr_cpu_ids); 8691 set_se_shares(tg->se[i], shares);
8757 } 8692 }
8758 8693
8759 /* 8694 /*
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 000000000000..9c597e37f7de
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,236 @@
1/*
2 * sched_clock for unstable cpu clocks
3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Based on code by:
7 * Ingo Molnar <mingo@redhat.com>
8 * Guillaume Chazarain <guichaz@gmail.com>
9 *
10 * Create a semi stable clock from a mixture of other events, including:
11 * - gtod
12 * - jiffies
13 * - sched_clock()
14 * - explicit idle events
15 *
16 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
17 * making it monotonic and keeping it within an expected window. This window
18 * is set up using jiffies.
19 *
20 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
21 * that is otherwise invisible (TSC gets stopped).
22 *
23 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
24 * consistent between cpus (never more than 1 jiffies difference).
25 */
26#include <linux/sched.h>
27#include <linux/percpu.h>
28#include <linux/spinlock.h>
29#include <linux/ktime.h>
30#include <linux/module.h>
31
32
33#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
34
35struct sched_clock_data {
36 /*
37 * Raw spinlock - this is a special case: this might be called
38 * from within instrumentation code so we don't want to do any
39 * instrumentation ourselves.
40 */
41 raw_spinlock_t lock;
42
43 unsigned long prev_jiffies;
44 u64 prev_raw;
45 u64 tick_raw;
46 u64 tick_gtod;
47 u64 clock;
48};
49
50static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
51
52static inline struct sched_clock_data *this_scd(void)
53{
54 return &__get_cpu_var(sched_clock_data);
55}
56
57static inline struct sched_clock_data *cpu_sdc(int cpu)
58{
59 return &per_cpu(sched_clock_data, cpu);
60}
61
62void sched_clock_init(void)
63{
64 u64 ktime_now = ktime_to_ns(ktime_get());
65 u64 now = 0;
66 int cpu;
67
68 for_each_possible_cpu(cpu) {
69 struct sched_clock_data *scd = cpu_sdc(cpu);
70
71 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
72 scd->prev_jiffies = jiffies;
73 scd->prev_raw = now;
74 scd->tick_raw = now;
75 scd->tick_gtod = ktime_now;
76 scd->clock = ktime_now;
77 }
78}
79
80/*
81 * update the percpu scd from the raw @now value
82 *
83 * - filter out backward motion
84 * - use jiffies to generate a min,max window to clip the raw values
85 */
86static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
87{
88 unsigned long now_jiffies = jiffies;
89 long delta_jiffies = now_jiffies - scd->prev_jiffies;
90 u64 clock = scd->clock;
91 u64 min_clock, max_clock;
92 s64 delta = now - scd->prev_raw;
93
94 WARN_ON_ONCE(!irqs_disabled());
95 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
96
97 if (unlikely(delta < 0)) {
98 clock++;
99 goto out;
100 }
101
102 max_clock = min_clock + TICK_NSEC;
103
104 if (unlikely(clock + delta > max_clock)) {
105 if (clock < max_clock)
106 clock = max_clock;
107 else
108 clock++;
109 } else {
110 clock += delta;
111 }
112
113 out:
114 if (unlikely(clock < min_clock))
115 clock = min_clock;
116
117 scd->prev_raw = now;
118 scd->prev_jiffies = now_jiffies;
119 scd->clock = clock;
120}
121
122static void lock_double_clock(struct sched_clock_data *data1,
123 struct sched_clock_data *data2)
124{
125 if (data1 < data2) {
126 __raw_spin_lock(&data1->lock);
127 __raw_spin_lock(&data2->lock);
128 } else {
129 __raw_spin_lock(&data2->lock);
130 __raw_spin_lock(&data1->lock);
131 }
132}
133
134u64 sched_clock_cpu(int cpu)
135{
136 struct sched_clock_data *scd = cpu_sdc(cpu);
137 u64 now, clock;
138
139 WARN_ON_ONCE(!irqs_disabled());
140 now = sched_clock();
141
142 if (cpu != raw_smp_processor_id()) {
143 /*
144 * in order to update a remote cpu's clock based on our
145 * unstable raw time rebase it against:
146 * tick_raw (offset between raw counters)
147 * tick_gtod (tick offset between cpus)
148 */
149 struct sched_clock_data *my_scd = this_scd();
150
151 lock_double_clock(scd, my_scd);
152
153 now -= my_scd->tick_raw;
154 now += scd->tick_raw;
155
156 now -= my_scd->tick_gtod;
157 now += scd->tick_gtod;
158
159 __raw_spin_unlock(&my_scd->lock);
160 } else {
161 __raw_spin_lock(&scd->lock);
162 }
163
164 __update_sched_clock(scd, now);
165 clock = scd->clock;
166
167 __raw_spin_unlock(&scd->lock);
168
169 return clock;
170}
171
172void sched_clock_tick(void)
173{
174 struct sched_clock_data *scd = this_scd();
175 u64 now, now_gtod;
176
177 WARN_ON_ONCE(!irqs_disabled());
178
179 now = sched_clock();
180 now_gtod = ktime_to_ns(ktime_get());
181
182 __raw_spin_lock(&scd->lock);
183 __update_sched_clock(scd, now);
184 /*
185 * update tick_gtod after __update_sched_clock() because that will
186 * already observe 1 new jiffy; adding a new tick_gtod to that would
187 * increase the clock 2 jiffies.
188 */
189 scd->tick_raw = now;
190 scd->tick_gtod = now_gtod;
191 __raw_spin_unlock(&scd->lock);
192}
193
194/*
195 * We are going deep-idle (irqs are disabled):
196 */
197void sched_clock_idle_sleep_event(void)
198{
199 sched_clock_cpu(smp_processor_id());
200}
201EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
202
203/*
204 * We just idled delta nanoseconds (called with irqs disabled):
205 */
206void sched_clock_idle_wakeup_event(u64 delta_ns)
207{
208 struct sched_clock_data *scd = this_scd();
209 u64 now = sched_clock();
210
211 /*
212 * Override the previous timestamp and ignore all
213 * sched_clock() deltas that occurred while we idled,
214 * and use the PM-provided delta_ns to advance the
215 * rq clock:
216 */
217 __raw_spin_lock(&scd->lock);
218 scd->prev_raw = now;
219 scd->clock += delta_ns;
220 __raw_spin_unlock(&scd->lock);
221
222 touch_softlockup_watchdog();
223}
224EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
225
226#endif
227
228/*
229 * Scheduler clock - returns current time in nanosec units.
230 * This is default implementation.
231 * Architectures and sub-architectures can override this.
232 */
233unsigned long long __attribute__((weak)) sched_clock(void)
234{
235 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
236}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6b4a12558e88..5f06118fbc31 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
204 PN(next_balance); 204 PN(next_balance);
205 P(curr->pid); 205 P(curr->pid);
206 PN(clock); 206 PN(clock);
207 PN(idle_clock);
208 PN(prev_clock_raw);
209 P(clock_warps);
210 P(clock_overflows);
211 P(clock_underflows);
212 P(clock_deep_idle_events);
213 PN(clock_max_delta);
214 P(cpu_load[0]); 207 P(cpu_load[0]);
215 P(cpu_load[1]); 208 P(cpu_load[1]);
216 P(cpu_load[2]); 209 P(cpu_load[2]);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 89fa32b4edf2..c863663d204d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
682 * Update run-time statistics of the 'current'. 682 * Update run-time statistics of the 'current'.
683 */ 683 */
684 update_curr(cfs_rq); 684 update_curr(cfs_rq);
685 account_entity_enqueue(cfs_rq, se);
685 686
686 if (wakeup) { 687 if (wakeup) {
687 place_entity(cfs_rq, se, 0); 688 place_entity(cfs_rq, se, 0);
@@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
692 check_spread(cfs_rq, se); 693 check_spread(cfs_rq, se);
693 if (se != cfs_rq->curr) 694 if (se != cfs_rq->curr)
694 __enqueue_entity(cfs_rq, se); 695 __enqueue_entity(cfs_rq, se);
695 account_entity_enqueue(cfs_rq, se);
696} 696}
697 697
698static void update_avg(u64 *avg, u64 sample) 698static void update_avg(u64 *avg, u64 sample)
@@ -841,8 +841,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
841 * queued ticks are scheduled to match the slice, so don't bother 841 * queued ticks are scheduled to match the slice, so don't bother
842 * validating it and just reschedule. 842 * validating it and just reschedule.
843 */ 843 */
844 if (queued) 844 if (queued) {
845 return resched_task(rq_of(cfs_rq)->curr); 845 resched_task(rq_of(cfs_rq)->curr);
846 return;
847 }
846 /* 848 /*
847 * don't let the period tick interfere with the hrtick preemption 849 * don't let the period tick interfere with the hrtick preemption
848 */ 850 */
@@ -957,7 +959,7 @@ static void yield_task_fair(struct rq *rq)
957 return; 959 return;
958 960
959 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 961 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
960 __update_rq_clock(rq); 962 update_rq_clock(rq);
961 /* 963 /*
962 * Update run-time statistics of the 'current'. 964 * Update run-time statistics of the 'current'.
963 */ 965 */
@@ -1007,7 +1009,7 @@ static int wake_idle(int cpu, struct task_struct *p)
1007 * sibling runqueue info. This will avoid the checks and cache miss 1009 * sibling runqueue info. This will avoid the checks and cache miss
1008 * penalties associated with that. 1010 * penalties associated with that.
1009 */ 1011 */
1010 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) 1012 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1011 return cpu; 1013 return cpu;
1012 1014
1013 for_each_domain(cpu, sd) { 1015 for_each_domain(cpu, sd) {
@@ -1611,30 +1613,6 @@ static const struct sched_class fair_sched_class = {
1611}; 1613};
1612 1614
1613#ifdef CONFIG_SCHED_DEBUG 1615#ifdef CONFIG_SCHED_DEBUG
1614static void
1615print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
1616{
1617 struct sched_entity *se;
1618
1619 if (!cfs_rq)
1620 return;
1621
1622 list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
1623 int i;
1624
1625 for (i = depth; i; i--)
1626 seq_puts(m, " ");
1627
1628 seq_printf(m, "%lu %s %lu\n",
1629 se->load.weight,
1630 entity_is_task(se) ? "T" : "G",
1631 calc_delta_weight(SCHED_LOAD_SCALE, se)
1632 );
1633 if (!entity_is_task(se))
1634 print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
1635 }
1636}
1637
1638static void print_cfs_stats(struct seq_file *m, int cpu) 1616static void print_cfs_stats(struct seq_file *m, int cpu)
1639{ 1617{
1640 struct cfs_rq *cfs_rq; 1618 struct cfs_rq *cfs_rq;
@@ -1642,9 +1620,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1642 rcu_read_lock(); 1620 rcu_read_lock();
1643 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1621 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1644 print_cfs_rq(m, cpu, cfs_rq); 1622 print_cfs_rq(m, cpu, cfs_rq);
1645
1646 seq_printf(m, "\nWeight tree:\n");
1647 print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
1648 rcu_read_unlock(); 1623 rcu_read_unlock();
1649} 1624}
1650#endif 1625#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 2bcafa375633..3a4f92dbbe66 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
99/* 99/*
100 * Simple, special scheduling class for the per-CPU idle tasks: 100 * Simple, special scheduling class for the per-CPU idle tasks:
101 */ 101 */
102const struct sched_class idle_sched_class = { 102static const struct sched_class idle_sched_class = {
103 /* .next is NULL */ 103 /* .next is NULL */
104 /* no enqueue/yield_task for idle tasks */ 104 /* no enqueue/yield_task for idle tasks */
105 105
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c2730a5a4f05..060e87b0cb1c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1098,11 +1098,14 @@ static void post_schedule_rt(struct rq *rq)
1098 } 1098 }
1099} 1099}
1100 1100
1101 1101/*
1102 * If we are not running and we are not going to reschedule soon, we should
1103 * try to push tasks away now
1104 */
1102static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1105static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1103{ 1106{
1104 if (!task_running(rq, p) && 1107 if (!task_running(rq, p) &&
1105 (p->prio >= rq->rt.highest_prio) && 1108 !test_tsk_need_resched(rq->curr) &&
1106 rq->rt.overloaded) 1109 rq->rt.overloaded)
1107 push_rt_tasks(rq); 1110 push_rt_tasks(rq);
1108} 1111}
@@ -1309,7 +1312,7 @@ static void set_curr_task_rt(struct rq *rq)
1309 p->se.exec_start = rq->clock; 1312 p->se.exec_start = rq->clock;
1310} 1313}
1311 1314
1312const struct sched_class rt_sched_class = { 1315static const struct sched_class rt_sched_class = {
1313 .next = &fair_sched_class, 1316 .next = &fair_sched_class,
1314 .enqueue_task = enqueue_task_rt, 1317 .enqueue_task = enqueue_task_rt,
1315 .dequeue_task = dequeue_task_rt, 1318 .dequeue_task = dequeue_task_rt,