path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  686
1 file changed, 551 insertions, 135 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index da19c1e05a5a..21c1cf2e27aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4,6 +4,7 @@
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * 8 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 9 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 10 * make semaphores SMP safe
@@ -16,6 +17,7 @@
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 17 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 18 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 19 * 2004-04-02 Scheduler domains code by Nick Piggin
20 * 2004-10-13 Real-Time Preemption support by Ingo Molnar
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 21 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 22 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 23 * 2007-05-05 Load balancing (smp-nice) and other improvements
@@ -61,6 +63,7 @@
61#include <linux/sysctl.h> 63#include <linux/sysctl.h>
62#include <linux/syscalls.h> 64#include <linux/syscalls.h>
63#include <linux/times.h> 65#include <linux/times.h>
66#include <linux/kallsyms.h>
64#include <linux/tsacct_kern.h> 67#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 68#include <linux/kprobes.h>
66#include <linux/delayacct.h> 69#include <linux/delayacct.h>
@@ -106,6 +109,20 @@
106#define NICE_0_LOAD SCHED_LOAD_SCALE 109#define NICE_0_LOAD SCHED_LOAD_SCALE
107#define NICE_0_SHIFT SCHED_LOAD_SHIFT 110#define NICE_0_SHIFT SCHED_LOAD_SHIFT
108 111
112#if (BITS_PER_LONG < 64)
113#define JIFFIES_TO_NS64(TIME) \
114 ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ)))
115
116#define NS64_TO_JIFFIES(TIME) \
117 ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \
118 (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME)))
119#else /* BITS_PER_LONG < 64 */
120
121#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME)
122#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME)
123
124#endif /* BITS_PER_LONG < 64 */
125
109/* 126/*
110 * These are the 'tuning knobs' of the scheduler: 127 * These are the 'tuning knobs' of the scheduler:
111 * 128 *
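The 32-bit branch above avoids full 64-bit divisions by splitting the nanosecond value into word-sized halves: the low word is converted directly, and the high word is scaled by the number of jiffies contained in one full word's worth of nanoseconds (1 + NS_TO_JIFFIES(~0UL)). A standalone userspace sketch of the same arithmetic follows; HZ = 100, the uint32_t "long", and the helper names are assumptions for illustration, not taken from the patch.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define HZ            100
#define NSEC_PER_SEC  1000000000ULL
#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)

/* Single-word helper; NS_TO_JIFFIES()/JIFFIES_TO_NS() are assumed to exist in the kernel. */
static uint64_t jiffies_to_ns64(uint64_t jiffies)
{
        return jiffies * NSEC_PER_TICK;
}

/*
 * 64-bit ns -> jiffies with only 32-bit "longs": convert the two halves
 * separately and scale the high half by the jiffies per 2^32 ns span.
 * Approximate, mirroring the macro's rounding.
 */
static uint64_t ns64_to_jiffies(uint64_t ns)
{
        uint64_t per_word = 1 + (uint64_t)UINT32_MAX / NSEC_PER_TICK;

        return (ns >> 32) * per_word + (uint32_t)ns / NSEC_PER_TICK;
}

int main(void)
{
        uint64_t ns = 90ULL * 60 * NSEC_PER_SEC;        /* 90 minutes */

        printf("%" PRIu64 " ns -> %" PRIu64 " jiffies\n", ns, ns64_to_jiffies(ns));
        printf("%u jiffies -> %" PRIu64 " ns\n", 540000u, jiffies_to_ns64(540000));
        return 0;
}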
@@ -131,6 +148,9 @@ static inline int task_has_rt_policy(struct task_struct *p)
131 return rt_policy(p->policy); 148 return rt_policy(p->policy);
132} 149}
133 150
151#define TASK_PREEMPTS_CURR(p, rq) \
152 ((p)->prio < (rq)->curr->prio)
153
134/* 154/*
135 * This is the priority-queue data structure of the RT scheduling class: 155 * This is the priority-queue data structure of the RT scheduling class:
136 */ 156 */
@@ -182,6 +202,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
182 202
183 hrtimer_init(&rt_b->rt_period_timer, 203 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 204 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 rt_b->rt_period_timer.irqsafe = 1;
185 rt_b->rt_period_timer.function = sched_rt_period_timer; 206 rt_b->rt_period_timer.function = sched_rt_period_timer;
186} 207}
187 208
@@ -389,6 +410,7 @@ static inline struct task_group *task_group(struct task_struct *p)
389struct cfs_rq { 410struct cfs_rq {
390 struct load_weight load; 411 struct load_weight load;
391 unsigned long nr_running; 412 unsigned long nr_running;
413 unsigned long nr_enqueued;
392 414
393 u64 exec_clock; 415 u64 exec_clock;
394 u64 min_vruntime; 416 u64 min_vruntime;
@@ -466,6 +488,7 @@ struct rt_rq {
466 int overloaded; 488 int overloaded;
467 struct plist_head pushable_tasks; 489 struct plist_head pushable_tasks;
468#endif 490#endif
491 unsigned long rt_nr_uninterruptible;
469 int rt_throttled; 492 int rt_throttled;
470 u64 rt_time; 493 u64 rt_time;
471 u64 rt_runtime; 494 u64 rt_runtime;
@@ -561,6 +584,8 @@ struct rq {
561 */ 584 */
562 unsigned long nr_uninterruptible; 585 unsigned long nr_uninterruptible;
563 586
587 unsigned long switch_timestamp;
588 unsigned long slice_avg;
564 struct task_struct *curr, *idle; 589 struct task_struct *curr, *idle;
565 unsigned long next_balance; 590 unsigned long next_balance;
566 struct mm_struct *prev_mm; 591 struct mm_struct *prev_mm;
@@ -625,9 +650,21 @@ struct rq {
625 650
626 /* BKL stats */ 651 /* BKL stats */
627 unsigned int bkl_count; 652 unsigned int bkl_count;
653
654 /* RT-overload stats: */
655 unsigned long rto_schedule;
656 unsigned long rto_schedule_tail;
657 unsigned long rto_wakeup;
658 unsigned long rto_pulled;
659 unsigned long rto_pushed;
628#endif 660#endif
629}; 661};
630 662
663struct task_struct *rq_curr(struct rq *rq)
664{
665 return rq->curr;
666}
667
631static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 668static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
632 669
633static inline 670static inline
@@ -666,6 +703,13 @@ inline void update_rq_clock(struct rq *rq)
666 rq->clock = sched_clock_cpu(cpu_of(rq)); 703 rq->clock = sched_clock_cpu(cpu_of(rq));
667} 704}
668 705
706#ifndef CONFIG_SMP
707int task_is_current(struct task_struct *task)
708{
709 return task_rq(task)->curr == task;
710}
711#endif
712
669/* 713/*
670 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 714 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
671 */ 715 */
@@ -807,7 +851,11 @@ late_initcall(sched_init_debug);
807 * Number of tasks to iterate in a single balance run. 851 * Number of tasks to iterate in a single balance run.
808 * Limited because this is done with IRQs disabled. 852 * Limited because this is done with IRQs disabled.
809 */ 853 */
854#ifndef CONFIG_PREEMPT
810const_debug unsigned int sysctl_sched_nr_migrate = 32; 855const_debug unsigned int sysctl_sched_nr_migrate = 32;
856#else
857const_debug unsigned int sysctl_sched_nr_migrate = 8;
858#endif
811 859
812/* 860/*
813 * ratelimit for updating the group shares. 861 * ratelimit for updating the group shares.
@@ -858,11 +906,25 @@ static inline u64 global_rt_runtime(void)
858 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 906 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
859} 907}
860 908
909/*
910 * We really don't want to do anything complex within switch_to()
911 * on PREEMPT_RT - this check enforces this.
912 */
913#ifdef prepare_arch_switch
914# ifdef CONFIG_PREEMPT_RT
915# error FIXME
916# else
917# define _finish_arch_switch finish_arch_switch
918# endif
919#endif
920
861#ifndef prepare_arch_switch 921#ifndef prepare_arch_switch
862# define prepare_arch_switch(next) do { } while (0) 922# define prepare_arch_switch(next) do { } while (0)
863#endif 923#endif
864#ifndef finish_arch_switch 924#ifndef finish_arch_switch
865# define finish_arch_switch(prev) do { } while (0) 925# define _finish_arch_switch(prev) do { } while (0)
926#else
927# define _finish_arch_switch finish_arch_switch
866#endif 928#endif
867 929
868static inline int task_current(struct rq *rq, struct task_struct *p) 930static inline int task_current(struct rq *rq, struct task_struct *p)
@@ -870,18 +932,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
870 return rq->curr == p; 932 return rq->curr == p;
871} 933}
872 934
873#ifndef __ARCH_WANT_UNLOCKED_CTXSW
874static inline int task_running(struct rq *rq, struct task_struct *p) 935static inline int task_running(struct rq *rq, struct task_struct *p)
875{ 936{
937#ifdef CONFIG_SMP
938 return p->oncpu;
939#else
876 return task_current(rq, p); 940 return task_current(rq, p);
941#endif
877} 942}
878 943
944#ifndef __ARCH_WANT_UNLOCKED_CTXSW
879static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 945static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
880{ 946{
947#ifdef CONFIG_SMP
948 /*
949 * We can optimise this out completely for !SMP, because the
950 * SMP rebalancing from interrupt is the only thing that cares
951 * here.
952 */
953 next->oncpu = 1;
954#endif
881} 955}
882 956
883static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 957static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
884{ 958{
959#ifdef CONFIG_SMP
960 /*
961 * After ->oncpu is cleared, the task can be moved to a different CPU.
962 * We must ensure this doesn't happen until the switch is completely
963 * finished.
964 */
965 smp_wmb();
966 prev->oncpu = 0;
967#endif
885#ifdef CONFIG_DEBUG_SPINLOCK 968#ifdef CONFIG_DEBUG_SPINLOCK
886 /* this is a valid case when another task releases the spinlock */ 969 /* this is a valid case when another task releases the spinlock */
887 rq->lock.owner = current; 970 rq->lock.owner = current;
@@ -893,18 +976,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
893 */ 976 */
894 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 977 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
895 978
896 raw_spin_unlock_irq(&rq->lock); 979 raw_spin_unlock(&rq->lock);
897} 980}
898 981
899#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 982#else /* __ARCH_WANT_UNLOCKED_CTXSW */
900static inline int task_running(struct rq *rq, struct task_struct *p)
901{
902#ifdef CONFIG_SMP
903 return p->oncpu;
904#else
905 return task_current(rq, p);
906#endif
907}
908 983
909static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 984static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
910{ 985{
@@ -934,23 +1009,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
934 smp_wmb(); 1009 smp_wmb();
935 prev->oncpu = 0; 1010 prev->oncpu = 0;
936#endif 1011#endif
937#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1012#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
938 local_irq_enable(); 1013 local_irq_disable();
939#endif 1014#endif
940} 1015}
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 1016#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 1017
943/* 1018/*
1019 * Check whether the task is waking, we use this to synchronize against
1020 * ttwu() so that task_cpu() reports a stable number.
1021 *
1022 * We need to make an exception for PF_STARTING tasks because the fork
1023 * path might require task_rq_lock() to work, eg. it can call
1024 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
1025 */
1026static inline int task_is_waking(struct task_struct *p)
1027{
1028 return unlikely((p->state & TASK_WAKING) && !(p->flags & PF_STARTING));
1029}
1030
1031/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 1032 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called interrupts disabled. 1033 * Must be called interrupts disabled.
946 */ 1034 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 1035static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 1036 __acquires(rq->lock)
949{ 1037{
1038 struct rq *rq;
1039
950 for (;;) { 1040 for (;;) {
951 struct rq *rq = task_rq(p); 1041 while (task_is_waking(p))
1042 cpu_relax();
1043 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 1044 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 1045 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 1046 return rq;
955 raw_spin_unlock(&rq->lock); 1047 raw_spin_unlock(&rq->lock);
956 } 1048 }
@@ -967,10 +1059,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 1059 struct rq *rq;
968 1060
969 for (;;) { 1061 for (;;) {
1062 while (task_is_waking(p))
1063 cpu_relax();
970 local_irq_save(*flags); 1064 local_irq_save(*flags);
971 rq = task_rq(p); 1065 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 1066 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 1067 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 1068 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 1069 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 1070 }
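The two locking loops above implement the same stabilisation pattern: spin while the task is marked waking, lock the runqueue the task currently points to, then re-check both the runqueue pointer and the waking flag before trusting the lock, retrying otherwise (the PF_STARTING fork-path exception is the only extra wrinkle). Below is a rough userspace analogue of that pattern, with pthread mutexes standing in for rq->lock and C11 atomics for the task fields; every name in it is invented for illustration.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

struct runq {
        pthread_mutex_t lock;
};

struct task {
        _Atomic(struct runq *) rq;      /* queue the task currently lives on */
        atomic_uint state;              /* WAKING_BIT set while a wakeup is in flight */
};

#define WAKING_BIT 0x1u

static bool task_is_waking(struct task *t)
{
        return atomic_load(&t->state) & WAKING_BIT;
}

/* Returns with the lock of the task's (now stable) queue held. */
static struct runq *task_queue_lock(struct task *t)
{
        struct runq *rq;

        for (;;) {
                while (task_is_waking(t))
                        sched_yield();          /* cpu_relax() stand-in */

                rq = atomic_load(&t->rq);
                pthread_mutex_lock(&rq->lock);

                /*
                 * The task may have been migrated or flagged as waking
                 * between the load and the lock: verify, else retry.
                 */
                if (rq == atomic_load(&t->rq) && !task_is_waking(t))
                        return rq;

                pthread_mutex_unlock(&rq->lock);
        }
}

int main(void)
{
        struct runq q = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct task t = { .rq = &q, .state = 0 };
        struct runq *locked = task_queue_lock(&t);

        /* ... operate on the task while its queue is pinned ... */
        pthread_mutex_unlock(&locked->lock);
        return 0;
}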
@@ -1147,6 +1241,7 @@ static void init_rq_hrtick(struct rq *rq)
1147 1241
1148 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1242 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1149 rq->hrtick_timer.function = hrtick; 1243 rq->hrtick_timer.function = hrtick;
1244 rq->hrtick_timer.irqsafe = 1;
1150} 1245}
1151#else /* CONFIG_SCHED_HRTICK */ 1246#else /* CONFIG_SCHED_HRTICK */
1152static inline void hrtick_clear(struct rq *rq) 1247static inline void hrtick_clear(struct rq *rq)
@@ -1222,7 +1317,7 @@ void wake_up_idle_cpu(int cpu)
1222{ 1317{
1223 struct rq *rq = cpu_rq(cpu); 1318 struct rq *rq = cpu_rq(cpu);
1224 1319
1225 if (cpu == smp_processor_id()) 1320 if (cpu == raw_smp_processor_id())
1226 return; 1321 return;
1227 1322
1228 /* 1323 /*
@@ -1390,7 +1485,8 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1485 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1486};
1392 1487
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); 1488static void activate_task(struct rq *rq, struct task_struct *p, int wakeup,
1489 bool head);
1394 1490
1395/* 1491/*
1396 * runqueue iterator, to support SMP load-balancing between different 1492 * runqueue iterator, to support SMP load-balancing between different
@@ -1883,13 +1979,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1979 *avg += diff >> 3;
1884} 1980}
1885 1981
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1982static void
1983enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1984{
1888 if (wakeup) 1985 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1986 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1987
1891 sched_info_queued(p); 1988 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1989 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1990 p->se.on_rq = 1;
1894} 1991}
1895 1992
@@ -1934,6 +2031,8 @@ static inline int normal_prio(struct task_struct *p)
1934 prio = MAX_RT_PRIO-1 - p->rt_priority; 2031 prio = MAX_RT_PRIO-1 - p->rt_priority;
1935 else 2032 else
1936 prio = __normal_prio(p); 2033 prio = __normal_prio(p);
2034
2035// trace_special_pid(p->pid, PRIO(p), __PRIO(prio));
1937 return prio; 2036 return prio;
1938} 2037}
1939 2038
@@ -1960,12 +2059,13 @@ static int effective_prio(struct task_struct *p)
1960/* 2059/*
1961 * activate_task - move a task to the runqueue. 2060 * activate_task - move a task to the runqueue.
1962 */ 2061 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 2062static void
2063activate_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1964{ 2064{
1965 if (task_contributes_to_load(p)) 2065 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--; 2066 rq->nr_uninterruptible--;
1967 2067
1968 enqueue_task(rq, p, wakeup); 2068 enqueue_task(rq, p, wakeup, head);
1969 inc_nr_running(rq); 2069 inc_nr_running(rq);
1970} 2070}
1971 2071
@@ -2034,13 +2134,20 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2034 2134
2035void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2135void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2036{ 2136{
2037#ifdef CONFIG_SCHED_DEBUG 2137#if defined(CONFIG_SCHED_DEBUG)
2038 /* 2138 /*
2039 * We should never call set_task_cpu() on a blocked task, 2139 * We should never call set_task_cpu() on a blocked task,
2040 * ttwu() will sort out the placement. 2140 * ttwu() will sort out the placement.
2041 */ 2141 */
2042 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2142 if (p->state != TASK_RUNNING &&
2043 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2143 !(p->state & TASK_WAKING) &&
2144 !(p->state & TASK_RUNNING_MUTEX) &&
2145 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)) {
2146 printk(KERN_ERR "%d %s %lx %lx\n", p->pid, p->comm,
2147 (unsigned long) p->state,
2148 (unsigned long) preempt_count());
2149 WARN_ON(1);
2150 }
2044#endif 2151#endif
2045 2152
2046 trace_sched_migrate_task(p, new_cpu); 2153 trace_sched_migrate_task(p, new_cpu);
@@ -2219,7 +2326,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2219 * yield - it could be a while. 2326 * yield - it could be a while.
2220 */ 2327 */
2221 if (unlikely(on_rq)) { 2328 if (unlikely(on_rq)) {
2222 schedule_timeout_uninterruptible(1); 2329 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2330
2331 set_current_state(TASK_UNINTERRUPTIBLE);
2332 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2223 continue; 2333 continue;
2224 } 2334 }
2225 2335
@@ -2365,7 +2475,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2365 * returns failure only if the task is already active. 2475 * returns failure only if the task is already active.
2366 */ 2476 */
2367static int try_to_wake_up(struct task_struct *p, unsigned int state, 2477static int try_to_wake_up(struct task_struct *p, unsigned int state,
2368 int wake_flags) 2478 int wake_flags, int mutex)
2369{ 2479{
2370 int cpu, orig_cpu, this_cpu, success = 0; 2480 int cpu, orig_cpu, this_cpu, success = 0;
2371 unsigned long flags; 2481 unsigned long flags;
@@ -2395,12 +2505,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2395 /* 2505 /*
2396 * In order to handle concurrent wakeups and release the rq->lock 2506 * In order to handle concurrent wakeups and release the rq->lock
2397 * we put the task in TASK_WAKING state. 2507 * we put the task in TASK_WAKING state.
2398 *
2399 * First fix up the nr_uninterruptible count:
2400 */ 2508 */
2401 if (task_contributes_to_load(p)) 2509 p->state |= TASK_WAKING;
2402 rq->nr_uninterruptible--;
2403 p->state = TASK_WAKING;
2404 2510
2405 if (p->sched_class->task_waking) 2511 if (p->sched_class->task_waking)
2406 p->sched_class->task_waking(rq, p); 2512 p->sched_class->task_waking(rq, p);
@@ -2408,14 +2514,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2408 __task_rq_unlock(rq); 2514 __task_rq_unlock(rq);
2409 2515
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2516 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2411 if (cpu != orig_cpu) 2517 if (cpu != orig_cpu) {
2518 /*
2519 * Since we migrate the task without holding any rq->lock,
2520 * we need to be careful with task_rq_lock(), since that
2521 * might end up locking an invalid rq.
2522 */
2412 set_task_cpu(p, cpu); 2523 set_task_cpu(p, cpu);
2524 }
2413 2525
2414 rq = __task_rq_lock(p); 2526 rq = cpu_rq(cpu);
2527 raw_spin_lock(&rq->lock);
2415 update_rq_clock(rq); 2528 update_rq_clock(rq);
2416 2529
2417 WARN_ON(p->state != TASK_WAKING); 2530 /*
2418 cpu = task_cpu(p); 2531 * We migrated the task without holding either rq->lock, however
2532 * since the task is not on the task list itself, nobody else
2533 * will try and migrate the task, hence the rq should match the
2534 * cpu we just moved it to.
2535 */
2536 WARN_ON(task_cpu(p) != cpu);
2537 WARN_ON(!(p->state & TASK_WAKING));
2419 2538
2420#ifdef CONFIG_SCHEDSTATS 2539#ifdef CONFIG_SCHEDSTATS
2421 schedstat_inc(rq, ttwu_count); 2540 schedstat_inc(rq, ttwu_count);
@@ -2443,7 +2562,7 @@ out_activate:
2443 schedstat_inc(p, se.nr_wakeups_local); 2562 schedstat_inc(p, se.nr_wakeups_local);
2444 else 2563 else
2445 schedstat_inc(p, se.nr_wakeups_remote); 2564 schedstat_inc(p, se.nr_wakeups_remote);
2446 activate_task(rq, p, 1); 2565 activate_task(rq, p, 1, false);
2447 success = 1; 2566 success = 1;
2448 2567
2449 /* 2568 /*
@@ -2466,7 +2585,20 @@ out_running:
2466 trace_sched_wakeup(rq, p, success); 2585 trace_sched_wakeup(rq, p, success);
2467 check_preempt_curr(rq, p, wake_flags); 2586 check_preempt_curr(rq, p, wake_flags);
2468 2587
2469 p->state = TASK_RUNNING; 2588 /*
2589 * For a mutex wakeup we or TASK_RUNNING_MUTEX to the task
2590 * state to preserve the original state, so a real wakeup
2591 * still can see the (UN)INTERRUPTIBLE bits in the state check
2592 * above. We don't have to worry about the | TASK_RUNNING_MUTEX
2593 * here. The waiter is serialized by the mutex lock and nobody
2594 * else can fiddle with p->state as we hold rq lock.
2595 */
2596 p->state &= ~TASK_WAKING;
2597 if (mutex)
2598 p->state |= TASK_RUNNING_MUTEX;
2599 else
2600 p->state = TASK_RUNNING;
2601
2470#ifdef CONFIG_SMP 2602#ifdef CONFIG_SMP
2471 if (p->sched_class->task_woken) 2603 if (p->sched_class->task_woken)
2472 p->sched_class->task_woken(rq, p); 2604 p->sched_class->task_woken(rq, p);
@@ -2502,13 +2634,31 @@ out:
2502 */ 2634 */
2503int wake_up_process(struct task_struct *p) 2635int wake_up_process(struct task_struct *p)
2504{ 2636{
2505 return try_to_wake_up(p, TASK_ALL, 0); 2637 return try_to_wake_up(p, TASK_ALL, 0, 0);
2506} 2638}
2507EXPORT_SYMBOL(wake_up_process); 2639EXPORT_SYMBOL(wake_up_process);
2508 2640
2641int wake_up_process_sync(struct task_struct * p)
2642{
2643 return try_to_wake_up(p, TASK_ALL, 1, 0);
2644}
2645EXPORT_SYMBOL(wake_up_process_sync);
2646
2647int wake_up_process_mutex(struct task_struct * p)
2648{
2649 return try_to_wake_up(p, TASK_ALL, 0, 1);
2650}
2651EXPORT_SYMBOL(wake_up_process_mutex);
2652
2653int wake_up_process_mutex_sync(struct task_struct * p)
2654{
2655 return try_to_wake_up(p, TASK_ALL, 1, 1);
2656}
2657EXPORT_SYMBOL(wake_up_process_mutex_sync);
2658
2509int wake_up_state(struct task_struct *p, unsigned int state) 2659int wake_up_state(struct task_struct *p, unsigned int state)
2510{ 2660{
2511 return try_to_wake_up(p, state, 0); 2661 return try_to_wake_up(p, state, 0, 0);
2512} 2662}
2513 2663
2514/* 2664/*
@@ -2575,7 +2725,7 @@ static void __sched_fork(struct task_struct *p)
2575 */ 2725 */
2576void sched_fork(struct task_struct *p, int clone_flags) 2726void sched_fork(struct task_struct *p, int clone_flags)
2577{ 2727{
2578 int cpu = get_cpu(); 2728 int cpu;
2579 2729
2580 __sched_fork(p); 2730 __sched_fork(p);
2581 /* 2731 /*
@@ -2615,16 +2765,24 @@ void sched_fork(struct task_struct *p, int clone_flags)
2615 if (!rt_prio(p->prio)) 2765 if (!rt_prio(p->prio))
2616 p->sched_class = &fair_sched_class; 2766 p->sched_class = &fair_sched_class;
2617 2767
2768 /*
2769 * task_fork() and set_task_cpu() must be called with
2770 * preemption disabled
2771 */
2772 cpu = get_cpu();
2773
2618 if (p->sched_class->task_fork) 2774 if (p->sched_class->task_fork)
2619 p->sched_class->task_fork(p); 2775 p->sched_class->task_fork(p);
2620 2776
2621 set_task_cpu(p, cpu); 2777 set_task_cpu(p, cpu);
2622 2778
2779 put_cpu();
2780
2623#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2781#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2624 if (likely(sched_info_on())) 2782 if (likely(sched_info_on()))
2625 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2783 memset(&p->sched_info, 0, sizeof(p->sched_info));
2626#endif 2784#endif
2627#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2785#if defined(CONFIG_SMP)
2628 p->oncpu = 0; 2786 p->oncpu = 0;
2629#endif 2787#endif
2630#ifdef CONFIG_PREEMPT 2788#ifdef CONFIG_PREEMPT
@@ -2632,8 +2790,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
2632 task_thread_info(p)->preempt_count = 1; 2790 task_thread_info(p)->preempt_count = 1;
2633#endif 2791#endif
2634 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2792 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2635
2636 put_cpu();
2637} 2793}
2638 2794
2639/* 2795/*
@@ -2663,11 +2819,17 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2663 set_task_cpu(p, cpu); 2819 set_task_cpu(p, cpu);
2664#endif 2820#endif
2665 2821
2666 rq = task_rq_lock(p, &flags); 2822 /*
2823 * Since the task is not on the rq and we still have TASK_WAKING set
2824 * nobody else will migrate this task.
2825 */
2826 rq = cpu_rq(cpu);
2827 raw_spin_lock_irqsave(&rq->lock, flags);
2828
2667 BUG_ON(p->state != TASK_WAKING); 2829 BUG_ON(p->state != TASK_WAKING);
2668 p->state = TASK_RUNNING; 2830 p->state = TASK_RUNNING;
2669 update_rq_clock(rq); 2831 update_rq_clock(rq);
2670 activate_task(rq, p, 0); 2832 activate_task(rq, p, 0, false);
2671 trace_sched_wakeup_new(rq, p, 1); 2833 trace_sched_wakeup_new(rq, p, 1);
2672 check_preempt_curr(rq, p, WF_FORK); 2834 check_preempt_curr(rq, p, WF_FORK);
2673#ifdef CONFIG_SMP 2835#ifdef CONFIG_SMP
@@ -2707,8 +2869,17 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2707 struct preempt_notifier *notifier; 2869 struct preempt_notifier *notifier;
2708 struct hlist_node *node; 2870 struct hlist_node *node;
2709 2871
2872 if (hlist_empty(&curr->preempt_notifiers))
2873 return;
2874
2875 /*
2876 * The KVM sched in notifier expects to be called with
2877 * interrupts enabled.
2878 */
2879 local_irq_enable();
2710 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2880 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2711 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2881 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2882 local_irq_disable();
2712} 2883}
2713 2884
2714static void 2885static void
@@ -2793,13 +2964,17 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2793 * Manfred Spraul <manfred@colorfullife.com> 2964 * Manfred Spraul <manfred@colorfullife.com>
2794 */ 2965 */
2795 prev_state = prev->state; 2966 prev_state = prev->state;
2796 finish_arch_switch(prev); 2967 _finish_arch_switch(prev);
2797 perf_event_task_sched_in(current, cpu_of(rq)); 2968 perf_event_task_sched_in(current, cpu_of(rq));
2798 finish_lock_switch(rq, prev); 2969 finish_lock_switch(rq, prev);
2799 2970
2800 fire_sched_in_preempt_notifiers(current); 2971 fire_sched_in_preempt_notifiers(current);
2972 /*
2973 * Delay the final freeing of the mm or task, so that we don't have
2974 * to do complex work from within the scheduler:
2975 */
2801 if (mm) 2976 if (mm)
2802 mmdrop(mm); 2977 mmdrop_delayed(mm);
2803 if (unlikely(prev_state == TASK_DEAD)) { 2978 if (unlikely(prev_state == TASK_DEAD)) {
2804 /* 2979 /*
2805 * Remove function-return probe instances associated with this 2980 * Remove function-return probe instances associated with this
@@ -2853,8 +3028,10 @@ static inline void post_schedule(struct rq *rq)
2853asmlinkage void schedule_tail(struct task_struct *prev) 3028asmlinkage void schedule_tail(struct task_struct *prev)
2854 __releases(rq->lock) 3029 __releases(rq->lock)
2855{ 3030{
2856 struct rq *rq = this_rq(); 3031 struct rq *rq;
2857 3032
3033 preempt_disable();
3034 rq = this_rq();
2858 finish_task_switch(rq, prev); 3035 finish_task_switch(rq, prev);
2859 3036
2860 /* 3037 /*
@@ -2863,9 +3040,14 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2863 */ 3040 */
2864 post_schedule(rq); 3041 post_schedule(rq);
2865 3042
3043 __preempt_enable_no_resched();
3044 local_irq_enable();
3045
2866#ifdef __ARCH_WANT_UNLOCKED_CTXSW 3046#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2867 /* In this case, finish_task_switch does not reenable preemption */ 3047 /* In this case, finish_task_switch does not reenable preemption */
2868 preempt_enable(); 3048 preempt_enable();
3049#else
3050 preempt_check_resched();
2869#endif 3051#endif
2870 if (current->set_child_tid) 3052 if (current->set_child_tid)
2871 put_user(task_pid_vnr(current), current->set_child_tid); 3053 put_user(task_pid_vnr(current), current->set_child_tid);
@@ -2913,6 +3095,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
2913 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 3095 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2914#endif 3096#endif
2915 3097
3098#ifdef CURRENT_PTR
3099 barrier();
3100 *current_ptr = next;
3101 *current_ti_ptr = next->thread_info;
3102#endif
2916 /* Here we just switch the register state and the stack. */ 3103 /* Here we just switch the register state and the stack. */
2917 switch_to(prev, next, prev); 3104 switch_to(prev, next, prev);
2918 3105
@@ -2959,6 +3146,11 @@ unsigned long nr_uninterruptible(void)
2959 return sum; 3146 return sum;
2960} 3147}
2961 3148
3149unsigned long nr_uninterruptible_cpu(int cpu)
3150{
3151 return cpu_rq(cpu)->nr_uninterruptible;
3152}
3153
2962unsigned long long nr_context_switches(void) 3154unsigned long long nr_context_switches(void)
2963{ 3155{
2964 int i; 3156 int i;
@@ -2977,6 +3169,13 @@ unsigned long nr_iowait(void)
2977 for_each_possible_cpu(i) 3169 for_each_possible_cpu(i)
2978 sum += atomic_read(&cpu_rq(i)->nr_iowait); 3170 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2979 3171
3172 /*
3173 * Since we read the counters lockless, it might be slightly
3174 * inaccurate. Do not allow it to go below zero though:
3175 */
3176 if (unlikely((long)sum < 0))
3177 sum = 0;
3178
2980 return sum; 3179 return sum;
2981} 3180}
2982 3181
@@ -3199,7 +3398,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3199{ 3398{
3200 deactivate_task(src_rq, p, 0); 3399 deactivate_task(src_rq, p, 0);
3201 set_task_cpu(p, this_cpu); 3400 set_task_cpu(p, this_cpu);
3202 activate_task(this_rq, p, 0); 3401 activate_task(this_rq, p, 0, false);
3203 check_preempt_curr(this_rq, p, 0); 3402 check_preempt_curr(this_rq, p, 0);
3204} 3403}
3205 3404
@@ -3295,6 +3494,10 @@ next:
3295 */ 3494 */
3296 if (idle == CPU_NEWLY_IDLE) 3495 if (idle == CPU_NEWLY_IDLE)
3297 goto out; 3496 goto out;
3497
3498 if (raw_spin_is_contended(&this_rq->lock) ||
3499 raw_spin_is_contended(&busiest->lock))
3500 goto out;
3298#endif 3501#endif
3299 3502
3300 /* 3503 /*
@@ -3351,6 +3554,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3351 */ 3554 */
3352 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3555 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3353 break; 3556 break;
3557
3558 if (raw_spin_is_contended(&this_rq->lock) ||
3559 raw_spin_is_contended(&busiest->lock))
3560 break;
3354#endif 3561#endif
3355 } while (class && max_load_move > total_load_moved); 3562 } while (class && max_load_move > total_load_moved);
3356 3563
@@ -4867,7 +5074,7 @@ out:
4867 */ 5074 */
4868static void run_rebalance_domains(struct softirq_action *h) 5075static void run_rebalance_domains(struct softirq_action *h)
4869{ 5076{
4870 int this_cpu = smp_processor_id(); 5077 int this_cpu = raw_smp_processor_id();
4871 struct rq *this_rq = cpu_rq(this_cpu); 5078 struct rq *this_rq = cpu_rq(this_cpu);
4872 enum cpu_idle_type idle = this_rq->idle_at_tick ? 5079 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4873 CPU_IDLE : CPU_NOT_IDLE; 5080 CPU_IDLE : CPU_NOT_IDLE;
@@ -5141,9 +5348,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
5141 5348
5142 /* Add system time to cpustat. */ 5349 /* Add system time to cpustat. */
5143 tmp = cputime_to_cputime64(cputime); 5350 tmp = cputime_to_cputime64(cputime);
5144 if (hardirq_count() - hardirq_offset) 5351 if ((hardirq_count() - hardirq_offset) ||
5352 (p->extra_flags & PFE_HARDIRQ))
5145 cpustat->irq = cputime64_add(cpustat->irq, tmp); 5353 cpustat->irq = cputime64_add(cpustat->irq, tmp);
5146 else if (softirq_count()) 5354 else if (softirq_count() || (p->extra_flags & PFE_SOFTIRQ))
5147 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 5355 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
5148 else 5356 else
5149 cpustat->system = cputime64_add(cpustat->system, tmp); 5357 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -5324,10 +5532,13 @@ void scheduler_tick(void)
5324 5532
5325 sched_clock_tick(); 5533 sched_clock_tick();
5326 5534
5535 BUG_ON(!irqs_disabled());
5536
5327 raw_spin_lock(&rq->lock); 5537 raw_spin_lock(&rq->lock);
5328 update_rq_clock(rq); 5538 update_rq_clock(rq);
5329 update_cpu_load(rq); 5539 update_cpu_load(rq);
5330 curr->sched_class->task_tick(rq, curr, 0); 5540 if (curr != rq->idle && curr->se.on_rq)
5541 curr->sched_class->task_tick(rq, curr, 0);
5331 raw_spin_unlock(&rq->lock); 5542 raw_spin_unlock(&rq->lock);
5332 5543
5333 perf_event_task_tick(curr, cpu); 5544 perf_event_task_tick(curr, cpu);
@@ -5348,6 +5559,19 @@ notrace unsigned long get_parent_ip(unsigned long addr)
5348 return addr; 5559 return addr;
5349} 5560}
5350 5561
5562#ifdef CONFIG_DEBUG_PREEMPT
5563void notrace preempt_enable_no_resched(void)
5564{
5565 barrier();
5566 dec_preempt_count();
5567
5568 WARN_ONCE(!preempt_count(),
5569 KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n",
5570 current->comm, current->pid);
5571}
5572EXPORT_SYMBOL(preempt_enable_no_resched);
5573#endif
5574
5351#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 5575#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5352 defined(CONFIG_PREEMPT_TRACER)) 5576 defined(CONFIG_PREEMPT_TRACER))
5353 5577
@@ -5404,8 +5628,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
5404{ 5628{
5405 struct pt_regs *regs = get_irq_regs(); 5629 struct pt_regs *regs = get_irq_regs();
5406 5630
5407 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 5631 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n",
5408 prev->comm, prev->pid, preempt_count()); 5632 prev->comm, preempt_count(), prev->pid, smp_processor_id());
5409 5633
5410 debug_show_held_locks(prev); 5634 debug_show_held_locks(prev);
5411 print_modules(); 5635 print_modules();
@@ -5423,12 +5647,14 @@ static noinline void __schedule_bug(struct task_struct *prev)
5423 */ 5647 */
5424static inline void schedule_debug(struct task_struct *prev) 5648static inline void schedule_debug(struct task_struct *prev)
5425{ 5649{
5650// WARN_ON(system_state == SYSTEM_BOOTING);
5651
5426 /* 5652 /*
5427 * Test if we are atomic. Since do_exit() needs to call into 5653 * Test if we are atomic. Since do_exit() needs to call into
5428 * schedule() atomically, we ignore that path for now. 5654 * schedule() atomically, we ignore that path for now.
5429 * Otherwise, whine if we are scheduling when we should not be. 5655 * Otherwise, whine if we are scheduling when we should not be.
5430 */ 5656 */
5431 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 5657 if (unlikely(in_atomic() && !prev->exit_state))
5432 __schedule_bug(prev); 5658 __schedule_bug(prev);
5433 5659
5434 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 5660 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -5499,15 +5725,13 @@ pick_next_task(struct rq *rq)
5499/* 5725/*
5500 * schedule() is the main scheduler function. 5726 * schedule() is the main scheduler function.
5501 */ 5727 */
5502asmlinkage void __sched schedule(void) 5728asmlinkage void __sched __schedule(void)
5503{ 5729{
5504 struct task_struct *prev, *next; 5730 struct task_struct *prev, *next;
5505 unsigned long *switch_count; 5731 unsigned long *switch_count;
5506 struct rq *rq; 5732 struct rq *rq;
5507 int cpu; 5733 int cpu;
5508 5734
5509need_resched:
5510 preempt_disable();
5511 cpu = smp_processor_id(); 5735 cpu = smp_processor_id();
5512 rq = cpu_rq(cpu); 5736 rq = cpu_rq(cpu);
5513 rcu_sched_qs(cpu); 5737 rcu_sched_qs(cpu);
@@ -5515,10 +5739,11 @@ need_resched:
5515 switch_count = &prev->nivcsw; 5739 switch_count = &prev->nivcsw;
5516 5740
5517 release_kernel_lock(prev); 5741 release_kernel_lock(prev);
5518need_resched_nonpreemptible:
5519 5742
5520 schedule_debug(prev); 5743 schedule_debug(prev);
5521 5744
5745 preempt_disable();
5746
5522 if (sched_feat(HRTICK)) 5747 if (sched_feat(HRTICK))
5523 hrtick_clear(rq); 5748 hrtick_clear(rq);
5524 5749
@@ -5526,7 +5751,8 @@ need_resched_nonpreemptible:
5526 update_rq_clock(rq); 5751 update_rq_clock(rq);
5527 clear_tsk_need_resched(prev); 5752 clear_tsk_need_resched(prev);
5528 5753
5529 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 5754 if (!(prev->state & TASK_RUNNING_MUTEX) && prev->state &&
5755 !(preempt_count() & PREEMPT_ACTIVE)) {
5530 if (unlikely(signal_pending_state(prev->state, prev))) 5756 if (unlikely(signal_pending_state(prev->state, prev)))
5531 prev->state = TASK_RUNNING; 5757 prev->state = TASK_RUNNING;
5532 else 5758 else
@@ -5557,24 +5783,29 @@ need_resched_nonpreemptible:
5557 */ 5783 */
5558 cpu = smp_processor_id(); 5784 cpu = smp_processor_id();
5559 rq = cpu_rq(cpu); 5785 rq = cpu_rq(cpu);
5560 } else 5786 __preempt_enable_no_resched();
5561 raw_spin_unlock_irq(&rq->lock); 5787 } else {
5788 __preempt_enable_no_resched();
5789 raw_spin_unlock(&rq->lock);
5790 }
5562 5791
5563 post_schedule(rq); 5792 post_schedule(rq);
5564 5793
5565 if (unlikely(reacquire_kernel_lock(current) < 0)) { 5794 reacquire_kernel_lock(current);
5566 prev = rq->curr; 5795}
5567 switch_count = &prev->nivcsw;
5568 goto need_resched_nonpreemptible;
5569 }
5570 5796
5571 preempt_enable_no_resched(); 5797asmlinkage void __sched schedule(void)
5798{
5799need_resched:
5800 local_irq_disable();
5801 __schedule();
5802 local_irq_enable();
5572 if (need_resched()) 5803 if (need_resched())
5573 goto need_resched; 5804 goto need_resched;
5574} 5805}
5575EXPORT_SYMBOL(schedule); 5806EXPORT_SYMBOL(schedule);
5576 5807
5577#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 5808#if defined(CONFIG_MUTEX_SPIN_ON_OWNER) && !defined(CONFIG_PREEMPT_RT)
5578/* 5809/*
5579 * Look out! "owner" is an entirely speculative pointer 5810 * Look out! "owner" is an entirely speculative pointer
5580 * access and not reliable. 5811 * access and not reliable.
@@ -5636,6 +5867,35 @@ out:
5636#endif 5867#endif
5637 5868
5638#ifdef CONFIG_PREEMPT 5869#ifdef CONFIG_PREEMPT
5870
5871/*
5872 * Global flag to turn preemption off on a CONFIG_PREEMPT kernel:
5873 */
5874int kernel_preemption = 1;
5875
5876static int __init preempt_setup (char *str)
5877{
5878 if (!strncmp(str, "off", 3)) {
5879 if (kernel_preemption) {
5880 printk(KERN_INFO "turning off kernel preemption!\n");
5881 kernel_preemption = 0;
5882 }
5883 return 1;
5884 }
5885 if (!strncmp(str, "on", 2)) {
5886 if (!kernel_preemption) {
5887 printk(KERN_INFO "turning on kernel preemption!\n");
5888 kernel_preemption = 1;
5889 }
5890 return 1;
5891 }
5892 get_option(&str, &kernel_preemption);
5893
5894 return 1;
5895}
5896
5897__setup("preempt=", preempt_setup);
5898
5639/* 5899/*
5640 * this is the entry point to schedule() from in-kernel preemption 5900 * this is the entry point to schedule() from in-kernel preemption
5641 * off of preempt_enable. Kernel preemptions off return from interrupt 5901 * off of preempt_enable. Kernel preemptions off return from interrupt
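preempt_setup() above wires a "preempt=" boot parameter to the new kernel_preemption flag: "off" and "on" toggle it, anything else is parsed as a number via get_option(). A compressed userspace sketch of that parsing, with strtol() standing in for the kernel-only get_option() and the messages trimmed; an illustration, not the patch's code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int kernel_preemption = 1;

/* Accepts "off", "on", or a plain number, like the boot option above. */
static int preempt_setup(const char *str)
{
        if (!strncmp(str, "off", 3))
                kernel_preemption = 0;
        else if (!strncmp(str, "on", 2))
                kernel_preemption = 1;
        else
                kernel_preemption = (int)strtol(str, NULL, 0);

        printf("kernel preemption: %d\n", kernel_preemption);
        return 1;
}

int main(int argc, char **argv)
{
        preempt_setup(argc > 1 ? argv[1] : "on");
        return 0;
}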
@@ -5644,7 +5904,11 @@ out:
5644asmlinkage void __sched preempt_schedule(void) 5904asmlinkage void __sched preempt_schedule(void)
5645{ 5905{
5646 struct thread_info *ti = current_thread_info(); 5906 struct thread_info *ti = current_thread_info();
5907 struct task_struct *task = current;
5908 int saved_lock_depth;
5647 5909
5910 if (!kernel_preemption)
5911 return;
5648 /* 5912 /*
5649 * If there is a non-zero preempt_count or interrupts are disabled, 5913 * If there is a non-zero preempt_count or interrupts are disabled,
5650 * we do not want to preempt the current task. Just return.. 5914 * we do not want to preempt the current task. Just return..
@@ -5653,10 +5917,23 @@ asmlinkage void __sched preempt_schedule(void)
5653 return; 5917 return;
5654 5918
5655 do { 5919 do {
5920 local_irq_disable();
5656 add_preempt_count(PREEMPT_ACTIVE); 5921 add_preempt_count(PREEMPT_ACTIVE);
5657 schedule(); 5922
5923 /*
5924 * We keep the big kernel semaphore locked, but we
5925 * clear ->lock_depth so that schedule() doesn't
5926 * auto-release the semaphore:
5927 */
5928 saved_lock_depth = task->lock_depth;
5929 task->lock_depth = -1;
5930 __schedule();
5931 task->lock_depth = saved_lock_depth;
5932
5658 sub_preempt_count(PREEMPT_ACTIVE); 5933 sub_preempt_count(PREEMPT_ACTIVE);
5659 5934
5935 local_irq_enable();
5936
5660 /* 5937 /*
5661 * Check again in case we missed a preemption opportunity 5938 * Check again in case we missed a preemption opportunity
5662 * between schedule and now. 5939 * between schedule and now.
@@ -5667,23 +5944,40 @@ asmlinkage void __sched preempt_schedule(void)
5667EXPORT_SYMBOL(preempt_schedule); 5944EXPORT_SYMBOL(preempt_schedule);
5668 5945
5669/* 5946/*
5670 * this is the entry point to schedule() from kernel preemption 5947 * this is the entry point for the IRQ return path. Called with
5671 * off of irq context. 5948 * interrupts disabled. To avoid infinite irq-entry recursion problems
5672 * Note, that this is called and return with irqs disabled. This will 5949 * with fast-paced IRQ sources we do all of this carefully to never
5673 * protect us against recursive calling from irq. 5950 * enable interrupts again.
5674 */ 5951 */
5675asmlinkage void __sched preempt_schedule_irq(void) 5952asmlinkage void __sched preempt_schedule_irq(void)
5676{ 5953{
5677 struct thread_info *ti = current_thread_info(); 5954 struct thread_info *ti = current_thread_info();
5955 struct task_struct *task = current;
5956 int saved_lock_depth;
5678 5957
5679 /* Catch callers which need to be fixed */ 5958 if (!kernel_preemption)
5680 BUG_ON(ti->preempt_count || !irqs_disabled()); 5959 return;
5960 /*
5961 * If there is a non-zero preempt_count then just return.
5962 * (interrupts are disabled)
5963 */
5964 if (unlikely(ti->preempt_count))
5965 return;
5681 5966
5682 do { 5967 do {
5683 add_preempt_count(PREEMPT_ACTIVE);
5684 local_irq_enable();
5685 schedule();
5686 local_irq_disable(); 5968 local_irq_disable();
5969 add_preempt_count(PREEMPT_ACTIVE);
5970
5971 /*
5972 * We keep the big kernel semaphore locked, but we
5973 * clear ->lock_depth so that schedule() doesn't
5974 * auto-release the semaphore:
5975 */
5976 saved_lock_depth = task->lock_depth;
5977 task->lock_depth = -1;
5978 __schedule();
5979
5980 task->lock_depth = saved_lock_depth;
5687 sub_preempt_count(PREEMPT_ACTIVE); 5981 sub_preempt_count(PREEMPT_ACTIVE);
5688 5982
5689 /* 5983 /*
@@ -5699,7 +5993,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
5699int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 5993int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5700 void *key) 5994 void *key)
5701{ 5995{
5702 return try_to_wake_up(curr->private, mode, wake_flags); 5996 return try_to_wake_up(curr->private, mode, wake_flags, 0);
5703} 5997}
5704EXPORT_SYMBOL(default_wake_function); 5998EXPORT_SYMBOL(default_wake_function);
5705 5999
@@ -5742,7 +6036,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode,
5742 unsigned long flags; 6036 unsigned long flags;
5743 6037
5744 spin_lock_irqsave(&q->lock, flags); 6038 spin_lock_irqsave(&q->lock, flags);
5745 __wake_up_common(q, mode, nr_exclusive, 0, key); 6039 __wake_up_common(q, mode, nr_exclusive, 1, key);
5746 spin_unlock_irqrestore(&q->lock, flags); 6040 spin_unlock_irqrestore(&q->lock, flags);
5747} 6041}
5748EXPORT_SYMBOL(__wake_up); 6042EXPORT_SYMBOL(__wake_up);
@@ -5822,7 +6116,7 @@ void complete(struct completion *x)
5822 6116
5823 spin_lock_irqsave(&x->wait.lock, flags); 6117 spin_lock_irqsave(&x->wait.lock, flags);
5824 x->done++; 6118 x->done++;
5825 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 6119 __wake_up_common(&x->wait, TASK_NORMAL, 1, 1, NULL);
5826 spin_unlock_irqrestore(&x->wait.lock, flags); 6120 spin_unlock_irqrestore(&x->wait.lock, flags);
5827} 6121}
5828EXPORT_SYMBOL(complete); 6122EXPORT_SYMBOL(complete);
@@ -5842,7 +6136,7 @@ void complete_all(struct completion *x)
5842 6136
5843 spin_lock_irqsave(&x->wait.lock, flags); 6137 spin_lock_irqsave(&x->wait.lock, flags);
5844 x->done += UINT_MAX/2; 6138 x->done += UINT_MAX/2;
5845 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 6139 __wake_up_common(&x->wait, TASK_NORMAL, 0, 1, NULL);
5846 spin_unlock_irqrestore(&x->wait.lock, flags); 6140 spin_unlock_irqrestore(&x->wait.lock, flags);
5847} 6141}
5848EXPORT_SYMBOL(complete_all); 6142EXPORT_SYMBOL(complete_all);
@@ -6058,19 +6352,19 @@ long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
6058} 6352}
6059EXPORT_SYMBOL(sleep_on_timeout); 6353EXPORT_SYMBOL(sleep_on_timeout);
6060 6354
6061#ifdef CONFIG_RT_MUTEXES
6062
6063/* 6355/*
6064 * rt_mutex_setprio - set the current priority of a task 6356 * task_setprio - set the current priority of a task
6065 * @p: task 6357 * @p: task
6066 * @prio: prio value (kernel-internal form) 6358 * @prio: prio value (kernel-internal form)
6067 * 6359 *
6068 * This function changes the 'effective' priority of a task. It does 6360 * This function changes the 'effective' priority of a task. It does
6069 * not touch ->normal_prio like __setscheduler(). 6361 * not touch ->normal_prio like __setscheduler().
6070 * 6362 *
6071 * Used by the rt_mutex code to implement priority inheritance logic. 6363 * Used by the rt_mutex code to implement priority inheritance logic
6364 * and by rcupreempt-boost to boost priorities of tasks sleeping
6365 * with rcu locks.
6072 */ 6366 */
6073void rt_mutex_setprio(struct task_struct *p, int prio) 6367void task_setprio(struct task_struct *p, int prio)
6074{ 6368{
6075 unsigned long flags; 6369 unsigned long flags;
6076 int oldprio, on_rq, running; 6370 int oldprio, on_rq, running;
@@ -6080,6 +6374,25 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6080 BUG_ON(prio < 0 || prio > MAX_PRIO); 6374 BUG_ON(prio < 0 || prio > MAX_PRIO);
6081 6375
6082 rq = task_rq_lock(p, &flags); 6376 rq = task_rq_lock(p, &flags);
6377
6378 /*
6379 * Idle task boosting is a nono in general. There is one
6380 * exception, when NOHZ is active:
6381 *
6382 * The idle task calls get_next_timer_interrupt() and holds
6383 * the timer wheel base->lock on the CPU and another CPU wants
6384 * to access the timer (probably to cancel it). We can safely
6385 * ignore the boosting request, as the idle CPU runs this code
6386 * with interrupts disabled and will complete the lock
6387 * protected section without being interrupted. So there is no
6388 * real need to boost.
6389 */
6390 if (unlikely(p == rq->idle)) {
6391 WARN_ON(p != rq->curr);
6392 WARN_ON(p->pi_blocked_on);
6393 goto out_unlock;
6394 }
6395
6083 update_rq_clock(rq); 6396 update_rq_clock(rq);
6084 6397
6085 oldprio = p->prio; 6398 oldprio = p->prio;
@@ -6098,18 +6411,20 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6098 6411
6099 p->prio = prio; 6412 p->prio = prio;
6100 6413
6414 trace_sched_task_setprio(rq, p, oldprio);
6415
6101 if (running) 6416 if (running)
6102 p->sched_class->set_curr_task(rq); 6417 p->sched_class->set_curr_task(rq);
6103 if (on_rq) { 6418 if (on_rq) {
6104 enqueue_task(rq, p, 0); 6419 enqueue_task(rq, p, 0, oldprio < prio);
6105 6420
6106 check_class_changed(rq, p, prev_class, oldprio, running); 6421 check_class_changed(rq, p, prev_class, oldprio, running);
6107 } 6422 }
6423
6424out_unlock:
6108 task_rq_unlock(rq, &flags); 6425 task_rq_unlock(rq, &flags);
6109} 6426}
6110 6427
6111#endif
6112
6113void set_user_nice(struct task_struct *p, long nice) 6428void set_user_nice(struct task_struct *p, long nice)
6114{ 6429{
6115 int old_prio, delta, on_rq; 6430 int old_prio, delta, on_rq;
@@ -6145,7 +6460,7 @@ void set_user_nice(struct task_struct *p, long nice)
6145 delta = p->prio - old_prio; 6460 delta = p->prio - old_prio;
6146 6461
6147 if (on_rq) { 6462 if (on_rq) {
6148 enqueue_task(rq, p, 0); 6463 enqueue_task(rq, p, 0, false);
6149 /* 6464 /*
6150 * If the task increased its priority or is running and 6465 * If the task increased its priority or is running and
6151 * lowered its priority, then reschedule its CPU: 6466 * lowered its priority, then reschedule its CPU:
@@ -6423,7 +6738,25 @@ recheck:
6423 if (running) 6738 if (running)
6424 p->sched_class->set_curr_task(rq); 6739 p->sched_class->set_curr_task(rq);
6425 if (on_rq) { 6740 if (on_rq) {
6426 activate_task(rq, p, 0); 6741 /*
6742 * Workaround to make prio ceiling work as expected:
6743 *
6744 * Queue task to head when task is running and task is
6745 * lowering its priority. This works around the non-
6746 * availability of a sched_setprio syscall which was
6747 * tinkered into the posix spec to make prio ceiling
6748 * work correctly.
6749 *
6750 * This workaround violates the posix scheduling
6751 * semantics of tail queueing in the case that the
6752 * priority was changed by anything else than
6753 * sched_setprio, but there is no other breakage
6754 * lurking than some specification fetishists going
6755 * berserk on me.
6756 *
6757 * Fixing this in mainline needs more thoughts.
6758 */
6759 activate_task(rq, p, 0, running && oldprio < p->prio);
6427 6760
6428 check_class_changed(rq, p, prev_class, oldprio, running); 6761 check_class_changed(rq, p, prev_class, oldprio, running);
6429 } 6762 }
@@ -6759,9 +7092,9 @@ SYSCALL_DEFINE0(sched_yield)
6759 __release(rq->lock); 7092 __release(rq->lock);
6760 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 7093 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6761 do_raw_spin_unlock(&rq->lock); 7094 do_raw_spin_unlock(&rq->lock);
6762 preempt_enable_no_resched(); 7095 local_irq_enable();
6763 7096
6764 schedule(); 7097 preempt_enable_and_schedule();
6765 7098
6766 return 0; 7099 return 0;
6767} 7100}
@@ -6773,9 +7106,18 @@ static inline int should_resched(void)
6773 7106
6774static void __cond_resched(void) 7107static void __cond_resched(void)
6775{ 7108{
6776 add_preempt_count(PREEMPT_ACTIVE); 7109 do {
6777 schedule(); 7110 add_preempt_count(PREEMPT_ACTIVE);
6778 sub_preempt_count(PREEMPT_ACTIVE); 7111 schedule();
7112 sub_preempt_count(PREEMPT_ACTIVE);
7113
7114 /*
7115 * Check again in case we missed a preemption opportunity
7116 * between schedule and now.
7117 */
7118 barrier();
7119
7120 } while (need_resched());
6779} 7121}
6780 7122
6781int __sched _cond_resched(void) 7123int __sched _cond_resched(void)
@@ -6816,10 +7158,16 @@ int __cond_resched_lock(spinlock_t *lock)
6816} 7158}
6817EXPORT_SYMBOL(__cond_resched_lock); 7159EXPORT_SYMBOL(__cond_resched_lock);
6818 7160
7161/*
7162 * Voluntarily preempt a process context that has softirqs disabled:
7163 */
6819int __sched __cond_resched_softirq(void) 7164int __sched __cond_resched_softirq(void)
6820{ 7165{
6821 BUG_ON(!in_softirq()); 7166#ifndef CONFIG_PREEMPT_SOFTIRQS
6822 7167 WARN_ON_ONCE(!in_softirq());
7168 if (!in_softirq())
7169 return 0;
7170#endif
6823 if (should_resched()) { 7171 if (should_resched()) {
6824 local_bh_enable(); 7172 local_bh_enable();
6825 __cond_resched(); 7173 __cond_resched();
@@ -6830,17 +7178,75 @@ int __sched __cond_resched_softirq(void)
6830} 7178}
6831EXPORT_SYMBOL(__cond_resched_softirq); 7179EXPORT_SYMBOL(__cond_resched_softirq);
6832 7180
7181/*
7182 * Voluntarily preempt a softirq context (possible with softirq threading):
7183 */
7184int __sched cond_resched_softirq_context(void)
7185{
7186 WARN_ON_ONCE(!in_softirq() && !(current->extra_flags & PFE_SOFTIRQ));
7187
7188 if (softirq_need_resched() && system_state == SYSTEM_RUNNING) {
7189 raw_local_irq_disable();
7190 _local_bh_enable();
7191 raw_local_irq_enable();
7192 __cond_resched();
7193 local_bh_disable();
7194 return 1;
7195 }
7196 return 0;
7197}
7198EXPORT_SYMBOL(cond_resched_softirq_context);
7199
7200#ifdef CONFIG_PREEMPT_VOLUNTARY
7201int voluntary_preemption = 1;
7202EXPORT_SYMBOL(voluntary_preemption);
7203
7204static int __init voluntary_preempt_setup (char *str)
7205{
7206 if (!strncmp(str, "off", 3))
7207 voluntary_preemption = 0;
7208 else
7209 get_option(&str, &voluntary_preemption);
7210 if (!voluntary_preemption)
7211 printk("turning off voluntary preemption!\n");
7212
7213 return 1;
7214}
7215
7216__setup("voluntary-preempt=", voluntary_preempt_setup);
7217
7218#endif
7219
6833/** 7220/**
6834 * yield - yield the current processor to other threads. 7221 * yield - yield the current processor to other threads.
6835 * 7222 *
6836 * This is a shortcut for kernel-space yielding - it marks the 7223 * This is a shortcut for kernel-space yielding - it marks the
6837 * thread runnable and calls sys_sched_yield(). 7224 * thread runnable and calls sys_sched_yield().
6838 */ 7225 */
6839void __sched yield(void) 7226void __sched __yield(void)
6840{ 7227{
6841 set_current_state(TASK_RUNNING); 7228 set_current_state(TASK_RUNNING);
6842 sys_sched_yield(); 7229 sys_sched_yield();
6843} 7230}
7231
7232void __sched yield(void)
7233{
7234 static int once = 1;
7235
7236 /*
7237 * it's a bug to rely on yield() with RT priorities. We print
7238 * the first occurrence after bootup ... this will still give
7239 * us an idea about the scope of the problem, without spamming
7240 * the syslog:
7241 */
7242 if (once && rt_task(current)) {
7243 once = 0;
7244 printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n",
7245 current->comm, current->pid);
7246 dump_stack();
7247 }
7248 __yield();
7249}
6844EXPORT_SYMBOL(yield); 7250EXPORT_SYMBOL(yield);
6845 7251
6846/* 7252/*
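The rewritten yield() above keeps the old behaviour in __yield() but reports the first real-time caller, since a SCHED_FIFO/SCHED_RR task spinning on yield() is almost always a design bug. A userspace sketch of the same warn-once guard, using sched_getscheduler() in place of the kernel-internal rt_task(); names and the message format are illustrative.

#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static void checked_yield(void)
{
        static int once = 1;
        int policy = sched_getscheduler(0);

        /* Complain only about the first occurrence, then yield as usual. */
        if (once && (policy == SCHED_FIFO || policy == SCHED_RR)) {
                once = 0;
                fprintf(stderr, "BUG: pid %d RT task yield()-ing!\n",
                        (int)getpid());
        }
        sched_yield();
}

int main(void)
{
        checked_yield();
        return 0;
}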
@@ -7004,6 +7410,7 @@ void sched_show_task(struct task_struct *p)
7004void show_state_filter(unsigned long state_filter) 7410void show_state_filter(unsigned long state_filter)
7005{ 7411{
7006 struct task_struct *g, *p; 7412 struct task_struct *g, *p;
7413 int do_unlock = 1;
7007 7414
7008#if BITS_PER_LONG == 32 7415#if BITS_PER_LONG == 32
7009 printk(KERN_INFO 7416 printk(KERN_INFO
@@ -7012,7 +7419,16 @@ void show_state_filter(unsigned long state_filter)
7012 printk(KERN_INFO 7419 printk(KERN_INFO
7013 " task PC stack pid father\n"); 7420 " task PC stack pid father\n");
7014#endif 7421#endif
7422#ifdef CONFIG_PREEMPT_RT
7423 if (!read_trylock(&tasklist_lock)) {
7424 printk("hm, tasklist_lock write-locked.\n");
7425 printk("ignoring ...\n");
7426 do_unlock = 0;
7427 }
7428#else
7015 read_lock(&tasklist_lock); 7429 read_lock(&tasklist_lock);
7430#endif
7431
7016 do_each_thread(g, p) { 7432 do_each_thread(g, p) {
7017 /* 7433 /*
7018 * reset the NMI-timeout, listing all files on a slow 7434 * reset the NMI-timeout, listing all files on a slow
@@ -7028,7 +7444,8 @@ void show_state_filter(unsigned long state_filter)
7028#ifdef CONFIG_SCHED_DEBUG 7444#ifdef CONFIG_SCHED_DEBUG
7029 sysrq_sched_debug_show(); 7445 sysrq_sched_debug_show();
7030#endif 7446#endif
7031 read_unlock(&tasklist_lock); 7447 if (do_unlock)
7448 read_unlock(&tasklist_lock);
7032 /* 7449 /*
7033 * Only show locks if all tasks are dumped: 7450 * Only show locks if all tasks are dumped:
7034 */ 7451 */
@@ -7064,17 +7481,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
7064 __set_task_cpu(idle, cpu); 7481 __set_task_cpu(idle, cpu);
7065 7482
7066 rq->curr = rq->idle = idle; 7483 rq->curr = rq->idle = idle;
7067#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 7484#if defined(CONFIG_SMP)
7068 idle->oncpu = 1; 7485 idle->oncpu = 1;
7069#endif 7486#endif
7070 raw_spin_unlock_irqrestore(&rq->lock, flags); 7487 raw_spin_unlock_irqrestore(&rq->lock, flags);
7071 7488
7072 /* Set the preempt count _outside_ the spinlocks! */ 7489 /* Set the preempt count _outside_ the spinlocks! */
7073#if defined(CONFIG_PREEMPT)
7074 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
7075#else
7076 task_thread_info(idle)->preempt_count = 0; 7490 task_thread_info(idle)->preempt_count = 0;
7077#endif 7491
7078 /* 7492 /*
7079 * The idle tasks have their own, simple scheduling class: 7493 * The idle tasks have their own, simple scheduling class:
7080 */ 7494 */
@@ -7172,27 +7586,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7172 struct rq *rq; 7586 struct rq *rq;
7173 int ret = 0; 7587 int ret = 0;
7174 7588
7175 /*
7176 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7177 * the ->cpus_allowed mask from under waking tasks, which would be
7178 * possible when we change rq->lock in ttwu(), so synchronize against
7179 * TASK_WAKING to avoid that.
7180 *
7181 * Make an exception for freshly cloned tasks, since cpuset namespaces
7182 * might move the task about, we have to validate the target in
7183 * wake_up_new_task() anyway since the cpu might have gone away.
7184 */
7185again:
7186 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7187 cpu_relax();
7188
7189 rq = task_rq_lock(p, &flags); 7589 rq = task_rq_lock(p, &flags);
7190 7590
7191 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7192 task_rq_unlock(rq, &flags);
7193 goto again;
7194 }
7195
7196 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 7591 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7197 ret = -EINVAL; 7592 ret = -EINVAL;
7198 goto out; 7593 goto out;
@@ -7248,11 +7643,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7248static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 7643static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7249{ 7644{
7250 struct rq *rq_dest, *rq_src; 7645 struct rq *rq_dest, *rq_src;
7646 unsigned long flags;
7251 int ret = 0; 7647 int ret = 0;
7252 7648
7253 if (unlikely(!cpu_active(dest_cpu))) 7649 if (unlikely(!cpu_active(dest_cpu)))
7254 return ret; 7650 return ret;
7255 7651
7652 /*
7653 * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock)
7654 * disabling interrupts - which on PREEMPT_RT does not do:
7655 */
7656 local_irq_save(flags);
7657
7256 rq_src = cpu_rq(src_cpu); 7658 rq_src = cpu_rq(src_cpu);
7257 rq_dest = cpu_rq(dest_cpu); 7659 rq_dest = cpu_rq(dest_cpu);
7258 7660
@@ -7271,13 +7673,15 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7271 if (p->se.on_rq) { 7673 if (p->se.on_rq) {
7272 deactivate_task(rq_src, p, 0); 7674 deactivate_task(rq_src, p, 0);
7273 set_task_cpu(p, dest_cpu); 7675 set_task_cpu(p, dest_cpu);
7274 activate_task(rq_dest, p, 0); 7676 activate_task(rq_dest, p, 0, false);
7275 check_preempt_curr(rq_dest, p, 0); 7677 check_preempt_curr(rq_dest, p, 0);
7276 } 7678 }
7277done: 7679done:
7278 ret = 1; 7680 ret = 1;
7279fail: 7681fail:
7280 double_rq_unlock(rq_src, rq_dest); 7682 double_rq_unlock(rq_src, rq_dest);
7683 local_irq_restore(flags);
7684
7281 return ret; 7685 return ret;
7282} 7686}
7283 7687
@@ -7437,7 +7841,7 @@ void sched_idle_next(void)
7437 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7841 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7438 7842
7439 update_rq_clock(rq); 7843 update_rq_clock(rq);
7440 activate_task(rq, p, 0); 7844 activate_task(rq, p, 0, false);
7441 7845
7442 raw_spin_unlock_irqrestore(&rq->lock, flags); 7846 raw_spin_unlock_irqrestore(&rq->lock, flags);
7443} 7847}
@@ -7454,7 +7858,11 @@ void idle_task_exit(void)
7454 7858
7455 if (mm != &init_mm) 7859 if (mm != &init_mm)
7456 switch_mm(mm, &init_mm, current); 7860 switch_mm(mm, &init_mm, current);
7861#ifdef CONFIG_PREEMPT_RT
7862 mmdrop_delayed(mm);
7863#else
7457 mmdrop(mm); 7864 mmdrop(mm);
7865#endif
7458} 7866}
7459 7867
7460/* called under rq->lock with disabled interrupts */ 7868/* called under rq->lock with disabled interrupts */
@@ -9699,6 +10107,9 @@ void __init sched_init(void)
9699 atomic_inc(&init_mm.mm_count); 10107 atomic_inc(&init_mm.mm_count);
9700 enter_lazy_tlb(&init_mm, current); 10108 enter_lazy_tlb(&init_mm, current);
9701 10109
10110#ifdef CONFIG_PREEMPT_RT
10111 printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n");
10112#endif
9702 /* 10113 /*
9703 * Make us the idle thread. Technically, schedule() should not be 10114 * Make us the idle thread. Technically, schedule() should not be
9704 * called from this thread, however somewhere below it might be, 10115 * called from this thread, however somewhere below it might be,
@@ -9731,10 +10142,14 @@ void __init sched_init(void)
9731 scheduler_running = 1; 10142 scheduler_running = 1;
9732} 10143}
9733 10144
9734#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 10145#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
9735static inline int preempt_count_equals(int preempt_offset) 10146static inline int preempt_count_equals(int preempt_offset)
9736{ 10147{
9737 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 10148 int nested = (preempt_count() & ~PREEMPT_ACTIVE);
10149
10150#ifndef CONFIG_PREEMPT_RT
10151 nested += rcu_preempt_depth();
10152#endif
9738 10153
9739 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 10154 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9740} 10155}
@@ -9755,7 +10170,8 @@ void __might_sleep(char *file, int line, int preempt_offset)
9755 "BUG: sleeping function called from invalid context at %s:%d\n", 10170 "BUG: sleeping function called from invalid context at %s:%d\n",
9756 file, line); 10171 file, line);
9757 printk(KERN_ERR 10172 printk(KERN_ERR
9758 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 10173 "pcnt: %x %d in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
10174 preempt_count(), preempt_offset,
9759 in_atomic(), irqs_disabled(), 10175 in_atomic(), irqs_disabled(),
9760 current->pid, current->comm); 10176 current->pid, current->comm);
9761 10177
@@ -9779,7 +10195,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
9779 deactivate_task(rq, p, 0); 10195 deactivate_task(rq, p, 0);
9780 __setscheduler(rq, p, SCHED_NORMAL, 0); 10196 __setscheduler(rq, p, SCHED_NORMAL, 0);
9781 if (on_rq) { 10197 if (on_rq) {
9782 activate_task(rq, p, 0); 10198 activate_task(rq, p, 0, false);
9783 resched_task(rq->curr); 10199 resched_task(rq->curr);
9784 } 10200 }
9785} 10201}
@@ -10155,7 +10571,7 @@ void sched_move_task(struct task_struct *tsk)
10155 if (unlikely(running)) 10571 if (unlikely(running))
10156 tsk->sched_class->set_curr_task(rq); 10572 tsk->sched_class->set_curr_task(rq);
10157 if (on_rq) 10573 if (on_rq)
10158 enqueue_task(rq, tsk, 0); 10574 enqueue_task(rq, tsk, 0, false);
10159 10575
10160 task_rq_unlock(rq, &flags); 10576 task_rq_unlock(rq, &flags);
10161} 10577}