Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  686
1 files changed, 551 insertions, 135 deletions

diff --git a/kernel/sched.c b/kernel/sched.c
index da19c1e05a5a..21c1cf2e27aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4,6 +4,7 @@
  * Kernel scheduler and related syscalls
  *
  * Copyright (C) 1991-2002  Linus Torvalds
+ * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *
  * 1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
  *             make semaphores SMP safe
@@ -16,6 +17,7 @@
  *             by Davide Libenzi, preemptible kernel bits by Robert Love.
  * 2003-09-03  Interactivity tuning by Con Kolivas.
  * 2004-04-02  Scheduler domains code by Nick Piggin
+ * 2004-10-13  Real-Time Preemption support by Ingo Molnar
  * 2007-04-15  Work begun on replacing all interactivity tuning with a
  *             fair scheduling design by Con Kolivas.
  * 2007-05-05  Load balancing (smp-nice) and other improvements
@@ -61,6 +63,7 @@
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
+#include <linux/kallsyms.h>
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
@@ -106,6 +109,20 @@
 #define NICE_0_LOAD SCHED_LOAD_SCALE
 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
 
+#if (BITS_PER_LONG < 64)
+#define JIFFIES_TO_NS64(TIME) \
+        ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ)))
+
+#define NS64_TO_JIFFIES(TIME) \
+        ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \
+        (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME)))
+#else /* BITS_PER_LONG < 64 */
+
+#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME)
+#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME)
+
+#endif /* BITS_PER_LONG < 64 */
+
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
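The JIFFIES_TO_NS64()/NS64_TO_JIFFIES() helpers added above exist so that 64-bit nanosecond values can be converted on 32-bit builds without overflowing the plain unsigned-long NS_TO_JIFFIES() helper: the high and low 32-bit halves are converted separately and summed. A small user-space sketch of the same arithmetic (HZ, the 32-bit helper and the demo values are assumptions for illustration, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define HZ 1000u                          /* assumed tick rate for the demo */
    #define NSEC_PER_JIFFY (1000000000u / HZ)

    /* stand-in for NS_TO_JIFFIES() with a 32-bit unsigned long */
    static uint32_t ns_to_jiffies32(uint32_t ns)
    {
        return ns / NSEC_PER_JIFFY;
    }

    /* mirrors NS64_TO_JIFFIES() for BITS_PER_LONG == 32: convert the high
     * and low halves separately so the 32-bit helper never overflows */
    static uint64_t ns64_to_jiffies(uint64_t ns)
    {
        return (ns >> 32) * (1 + ns_to_jiffies32(UINT32_MAX))
               + ns_to_jiffies32((uint32_t)ns);
    }

    int main(void)
    {
        uint64_t ns = (uint64_t)5000 * NSEC_PER_JIFFY;  /* JIFFIES_TO_NS64(5000) */

        printf("%llu ns ~= %llu jiffies\n",
               (unsigned long long)ns, (unsigned long long)ns64_to_jiffies(ns));
        return 0;
    }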
@@ -131,6 +148,9 @@ static inline int task_has_rt_policy(struct task_struct *p)
         return rt_policy(p->policy);
 }
 
+#define TASK_PREEMPTS_CURR(p, rq) \
+        ((p)->prio < (rq)->curr->prio)
+
 /*
  * This is the priority-queue data structure of the RT scheduling class:
  */
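TASK_PREEMPTS_CURR() reads naturally once you recall that the kernel's ->prio scale is inverted: a numerically lower value means a more important task (0..99 for real-time priorities, 100..139 for normal tasks, with nice 0 mapping to 120). A trivial stand-alone illustration of the comparison (the struct and values are made up for the demo):

    #include <stdio.h>

    struct demo_task { int prio; };

    /* same test as TASK_PREEMPTS_CURR(): the lower ->prio value wins the CPU */
    static int task_preempts_curr(const struct demo_task *p,
                                  const struct demo_task *curr)
    {
        return p->prio < curr->prio;
    }

    int main(void)
    {
        struct demo_task rt_task = { .prio = 10 };   /* real-time priority   */
        struct demo_task nice0   = { .prio = 120 };  /* ordinary nice-0 task */

        printf("RT task preempts nice-0 task: %d\n",
               task_preempts_curr(&rt_task, &nice0));
        return 0;
    }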
@@ -182,6 +202,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 
         hrtimer_init(&rt_b->rt_period_timer,
                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+        rt_b->rt_period_timer.irqsafe = 1;
         rt_b->rt_period_timer.function = sched_rt_period_timer;
 }
 
@@ -389,6 +410,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 struct cfs_rq {
         struct load_weight load;
         unsigned long nr_running;
+        unsigned long nr_enqueued;
 
         u64 exec_clock;
         u64 min_vruntime;
@@ -466,6 +488,7 @@ struct rt_rq {
         int overloaded;
         struct plist_head pushable_tasks;
 #endif
+        unsigned long rt_nr_uninterruptible;
         int rt_throttled;
         u64 rt_time;
         u64 rt_runtime;
@@ -561,6 +584,8 @@ struct rq {
          */
         unsigned long nr_uninterruptible;
 
+        unsigned long switch_timestamp;
+        unsigned long slice_avg;
         struct task_struct *curr, *idle;
         unsigned long next_balance;
         struct mm_struct *prev_mm;
@@ -625,9 +650,21 @@ struct rq {
 
         /* BKL stats */
         unsigned int bkl_count;
+
+        /* RT-overload stats: */
+        unsigned long rto_schedule;
+        unsigned long rto_schedule_tail;
+        unsigned long rto_wakeup;
+        unsigned long rto_pulled;
+        unsigned long rto_pushed;
 #endif
 };
 
+struct task_struct *rq_curr(struct rq *rq)
+{
+        return rq->curr;
+}
+
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 static inline
@@ -666,6 +703,13 @@ inline void update_rq_clock(struct rq *rq)
         rq->clock = sched_clock_cpu(cpu_of(rq));
 }
 
+#ifndef CONFIG_SMP
+int task_is_current(struct task_struct *task)
+{
+        return task_rq(task)->curr == task;
+}
+#endif
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -807,7 +851,11 @@ late_initcall(sched_init_debug);
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
+#ifndef CONFIG_PREEMPT
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#else
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#endif
 
 /*
  * ratelimit for updating the group shares.
@@ -858,11 +906,25 @@ static inline u64 global_rt_runtime(void)
         return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+/*
+ * We really dont want to do anything complex within switch_to()
+ * on PREEMPT_RT - this check enforces this.
+ */
+#ifdef prepare_arch_switch
+# ifdef CONFIG_PREEMPT_RT
+# error FIXME
+# else
+# define _finish_arch_switch finish_arch_switch
+# endif
+#endif
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next) do { } while (0)
 #endif
 #ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
+# define _finish_arch_switch(prev) do { } while (0)
+#else
+# define _finish_arch_switch finish_arch_switch
 #endif
 
 static inline int task_current(struct rq *rq, struct task_struct *p)
@@ -870,18 +932,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
         return rq->curr == p;
 }
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+        return p->oncpu;
+#else
         return task_current(rq, p);
+#endif
 }
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
+#ifdef CONFIG_SMP
+        /*
+         * We can optimise this out completely for !SMP, because the
+         * SMP rebalancing from interrupt is the only thing that cares
+         * here.
+         */
+        next->oncpu = 1;
+#endif
 }
 
 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
+#ifdef CONFIG_SMP
+        /*
+         * After ->oncpu is cleared, the task can be moved to a different CPU.
+         * We must ensure this doesn't happen until the switch is completely
+         * finished.
+         */
+        smp_wmb();
+        prev->oncpu = 0;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
         /* this is a valid case when another task releases the spinlock */
         rq->lock.owner = current;
@@ -893,18 +976,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
          */
         spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 
-        raw_spin_unlock_irq(&rq->lock);
+        raw_spin_unlock(&rq->lock);
 }
 
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-        return p->oncpu;
-#else
-        return task_current(rq, p);
-#endif
-}
 
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
@@ -934,23 +1009,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
         smp_wmb();
         prev->oncpu = 0;
 #endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-        local_irq_enable();
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+        local_irq_disable();
 #endif
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
+ * Check whether the task is waking, we use this to synchronize against
+ * ttwu() so that task_cpu() reports a stable number.
+ *
+ * We need to make an exception for PF_STARTING tasks because the fork
+ * path might require task_rq_lock() to work, eg. it can call
+ * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ */
+static inline int task_is_waking(struct task_struct *p)
+{
+        return unlikely((p->state & TASK_WAKING) && !(p->flags & PF_STARTING));
+}
+
+/*
  * __task_rq_lock - lock the runqueue a given task resides on.
  * Must be called interrupts disabled.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
         __acquires(rq->lock)
 {
+        struct rq *rq;
+
         for (;;) {
-                struct rq *rq = task_rq(p);
+                while (task_is_waking(p))
+                        cpu_relax();
+                rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p)))
+                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                         return rq;
                 raw_spin_unlock(&rq->lock);
         }
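The new __task_rq_lock() above follows a classic lock-and-revalidate pattern: spin while the task is marked waking, lock the runqueue the task currently appears to be on, then re-check that neither the runqueue nor the waking state changed before the lock was taken. A user-space sketch of just that control flow, with hypothetical names standing in for the kernel types (an illustration of the pattern, not kernel code):

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>

    struct runqueue {
        pthread_mutex_t lock;
    };

    struct task {
        _Atomic int waking;              /* analogue of the TASK_WAKING bit */
        struct runqueue *_Atomic rq;     /* analogue of task_rq(p)          */
    };

    static struct runqueue *task_rq_lock_sketch(struct task *t)
    {
        struct runqueue *rq;

        for (;;) {
            while (atomic_load(&t->waking))
                sched_yield();                   /* cpu_relax() stand-in */
            rq = atomic_load(&t->rq);
            pthread_mutex_lock(&rq->lock);
            if (rq == atomic_load(&t->rq) && !atomic_load(&t->waking))
                return rq;                       /* stable: lock is held */
            pthread_mutex_unlock(&rq->lock);     /* raced: retry         */
        }
    }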
@@ -967,10 +1059,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
         struct rq *rq;
 
         for (;;) {
+                while (task_is_waking(p))
+                        cpu_relax();
                 local_irq_save(*flags);
                 rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p)))
+                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                         return rq;
                 raw_spin_unlock_irqrestore(&rq->lock, *flags);
         }
@@ -1147,6 +1241,7 @@ static void init_rq_hrtick(struct rq *rq)
 
         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rq->hrtick_timer.function = hrtick;
+        rq->hrtick_timer.irqsafe = 1;
 }
 #else /* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
@@ -1222,7 +1317,7 @@ void wake_up_idle_cpu(int cpu)
 {
         struct rq *rq = cpu_rq(cpu);
 
-        if (cpu == smp_processor_id())
+        if (cpu == raw_smp_processor_id())
                 return;
 
         /*
@@ -1390,7 +1485,8 @@ static const u32 prio_to_wmult[40] = {
  /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup,
+                          bool head);
 
 /*
  * runqueue iterator, to support SMP load-balancing between different
@@ -1883,13 +1979,14 @@ static void update_avg(u64 *avg, u64 sample)
         *avg += diff >> 3;
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
         if (wakeup)
                 p->se.start_runtime = p->se.sum_exec_runtime;
 
         sched_info_queued(p);
-        p->sched_class->enqueue_task(rq, p, wakeup);
+        p->sched_class->enqueue_task(rq, p, wakeup, head);
         p->se.on_rq = 1;
 }
 
@@ -1934,6 +2031,8 @@ static inline int normal_prio(struct task_struct *p)
                 prio = MAX_RT_PRIO-1 - p->rt_priority;
         else
                 prio = __normal_prio(p);
+
+//      trace_special_pid(p->pid, PRIO(p), __PRIO(prio));
         return prio;
 }
 
@@ -1960,12 +2059,13 @@ static int effective_prio(struct task_struct *p)
 /*
  * activate_task - move a task to the runqueue.
  */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+activate_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
         if (task_contributes_to_load(p))
                 rq->nr_uninterruptible--;
 
-        enqueue_task(rq, p, wakeup);
+        enqueue_task(rq, p, wakeup, head);
         inc_nr_running(rq);
 }
 
@@ -2034,13 +2134,20 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
-#ifdef CONFIG_SCHED_DEBUG
+#if defined(CONFIG_SCHED_DEBUG)
         /*
          * We should never call set_task_cpu() on a blocked task,
          * ttwu() will sort out the placement.
          */
-        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-                        !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+        if (p->state != TASK_RUNNING &&
+            !(p->state & TASK_WAKING) &&
+            !(p->state & TASK_RUNNING_MUTEX) &&
+            !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)) {
+                printk(KERN_ERR "%d %s %lx %lx\n", p->pid, p->comm,
+                        (unsigned long) p->state,
+                        (unsigned long) preempt_count());
+                WARN_ON(1);
+        }
 #endif
 
         trace_sched_migrate_task(p, new_cpu);
@@ -2219,7 +2326,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                  * yield - it could be a while.
                  */
                 if (unlikely(on_rq)) {
-                        schedule_timeout_uninterruptible(1);
+                        ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+                        set_current_state(TASK_UNINTERRUPTIBLE);
+                        schedule_hrtimeout(&to, HRTIMER_MODE_REL);
                         continue;
                 }
 
@@ -2365,7 +2475,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
  * returns failure only if the task is already active.
  */
 static int try_to_wake_up(struct task_struct *p, unsigned int state,
-                          int wake_flags)
+                          int wake_flags, int mutex)
 {
         int cpu, orig_cpu, this_cpu, success = 0;
         unsigned long flags;
@@ -2395,12 +2505,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         /*
          * In order to handle concurrent wakeups and release the rq->lock
          * we put the task in TASK_WAKING state.
-         *
-         * First fix up the nr_uninterruptible count:
          */
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible--;
-        p->state = TASK_WAKING;
+        p->state |= TASK_WAKING;
 
         if (p->sched_class->task_waking)
                 p->sched_class->task_waking(rq, p);
@@ -2408,14 +2514,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         __task_rq_unlock(rq);
 
         cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-        if (cpu != orig_cpu)
+        if (cpu != orig_cpu) {
+                /*
+                 * Since we migrate the task without holding any rq->lock,
+                 * we need to be careful with task_rq_lock(), since that
+                 * might end up locking an invalid rq.
+                 */
                 set_task_cpu(p, cpu);
+        }
 
-        rq = __task_rq_lock(p);
+        rq = cpu_rq(cpu);
+        raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
 
-        WARN_ON(p->state != TASK_WAKING);
-        cpu = task_cpu(p);
+        /*
+         * We migrated the task without holding either rq->lock, however
+         * since the task is not on the task list itself, nobody else
+         * will try and migrate the task, hence the rq should match the
+         * cpu we just moved it to.
+         */
+        WARN_ON(task_cpu(p) != cpu);
+        WARN_ON(!(p->state & TASK_WAKING));
 
 #ifdef CONFIG_SCHEDSTATS
         schedstat_inc(rq, ttwu_count);
@@ -2443,7 +2562,7 @@ out_activate:
                 schedstat_inc(p, se.nr_wakeups_local);
         else
                 schedstat_inc(p, se.nr_wakeups_remote);
-        activate_task(rq, p, 1);
+        activate_task(rq, p, 1, false);
         success = 1;
 
         /*
@@ -2466,7 +2585,20 @@ out_running:
         trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, wake_flags);
 
-        p->state = TASK_RUNNING;
+        /*
+         * For a mutex wakeup we or TASK_RUNNING_MUTEX to the task
+         * state to preserve the original state, so a real wakeup
+         * still can see the (UN)INTERRUPTIBLE bits in the state check
+         * above. We dont have to worry about the | TASK_RUNNING_MUTEX
+         * here. The waiter is serialized by the mutex lock and nobody
+         * else can fiddle with p->state as we hold rq lock.
+         */
+        p->state &= ~TASK_WAKING;
+        if (mutex)
+                p->state |= TASK_RUNNING_MUTEX;
+        else
+                p->state = TASK_RUNNING;
+
 #ifdef CONFIG_SMP
         if (p->sched_class->task_woken)
                 p->sched_class->task_woken(rq, p);
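The comment added above is the heart of the mutex-wakeup change: a mutex wakeup ORs TASK_RUNNING_MUTEX into ->state so the original sleep bits stay visible, while a normal wakeup still assigns TASK_RUNNING and clears everything. A tiny demo of that difference (the bit values here are arbitrary stand-ins, not the kernel's):

    #include <stdio.h>

    #define TASK_INTERRUPTIBLE   0x01
    #define TASK_WAKING          0x80
    #define TASK_RUNNING_MUTEX   0x40
    #define TASK_RUNNING         0x00

    int main(void)
    {
        unsigned int state = TASK_INTERRUPTIBLE | TASK_WAKING;

        state &= ~TASK_WAKING;           /* wakeup is past the waking phase  */
        state |= TASK_RUNNING_MUTEX;     /* mutex wakeup keeps INTERRUPTIBLE */
        printf("mutex wakeup:  0x%02x (sleep bit still visible)\n", state);

        state = TASK_RUNNING;            /* normal wakeup clobbers the state */
        printf("normal wakeup: 0x%02x\n", state);
        return 0;
    }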
@@ -2502,13 +2634,31 @@ out:
  */
 int wake_up_process(struct task_struct *p)
 {
-        return try_to_wake_up(p, TASK_ALL, 0);
+        return try_to_wake_up(p, TASK_ALL, 0, 0);
 }
 EXPORT_SYMBOL(wake_up_process);
 
+int wake_up_process_sync(struct task_struct * p)
+{
+        return try_to_wake_up(p, TASK_ALL, 1, 0);
+}
+EXPORT_SYMBOL(wake_up_process_sync);
+
+int wake_up_process_mutex(struct task_struct * p)
+{
+        return try_to_wake_up(p, TASK_ALL, 0, 1);
+}
+EXPORT_SYMBOL(wake_up_process_mutex);
+
+int wake_up_process_mutex_sync(struct task_struct * p)
+{
+        return try_to_wake_up(p, TASK_ALL, 1, 1);
+}
+EXPORT_SYMBOL(wake_up_process_mutex_sync);
+
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
-        return try_to_wake_up(p, state, 0);
+        return try_to_wake_up(p, state, 0, 0);
 }
 
 /*
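Read together, the new wrappers simply spell out the two extra try_to_wake_up() arguments (wake_flags, mutex) added in this patch:

    wake_up_process(p)            -> try_to_wake_up(p, TASK_ALL, 0, 0)
    wake_up_process_sync(p)       -> try_to_wake_up(p, TASK_ALL, 1, 0)
    wake_up_process_mutex(p)      -> try_to_wake_up(p, TASK_ALL, 0, 1)
    wake_up_process_mutex_sync(p) -> try_to_wake_up(p, TASK_ALL, 1, 1)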
@@ -2575,7 +2725,7 @@ static void __sched_fork(struct task_struct *p)
  */
 void sched_fork(struct task_struct *p, int clone_flags)
 {
-        int cpu = get_cpu();
+        int cpu;
 
         __sched_fork(p);
         /*
@@ -2615,16 +2765,24 @@ void sched_fork(struct task_struct *p, int clone_flags)
         if (!rt_prio(p->prio))
                 p->sched_class = &fair_sched_class;
 
+        /*
+         * task_fork() and set_task_cpu() must be called with
+         * preemption disabled
+         */
+        cpu = get_cpu();
+
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
 
         set_task_cpu(p, cpu);
 
+        put_cpu();
+
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         if (likely(sched_info_on()))
                 memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
         p->oncpu = 0;
 #endif
 #ifdef CONFIG_PREEMPT
@@ -2632,8 +2790,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
         task_thread_info(p)->preempt_count = 1;
 #endif
         plist_node_init(&p->pushable_tasks, MAX_PRIO);
-
-        put_cpu();
 }
 
 /*
@@ -2663,11 +2819,17 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         set_task_cpu(p, cpu);
 #endif
 
-        rq = task_rq_lock(p, &flags);
+        /*
+         * Since the task is not on the rq and we still have TASK_WAKING set
+         * nobody else will migrate this task.
+         */
+        rq = cpu_rq(cpu);
+        raw_spin_lock_irqsave(&rq->lock, flags);
+
         BUG_ON(p->state != TASK_WAKING);
         p->state = TASK_RUNNING;
         update_rq_clock(rq);
-        activate_task(rq, p, 0);
+        activate_task(rq, p, 0, false);
         trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -2707,8 +2869,17 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
         struct preempt_notifier *notifier;
         struct hlist_node *node;
 
+        if (hlist_empty(&curr->preempt_notifiers))
+                return;
+
+        /*
+         * The KVM sched in notifier expects to be called with
+         * interrupts enabled.
+         */
+        local_irq_enable();
         hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
+        local_irq_disable();
 }
 
 static void
@@ -2793,13 +2964,17 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
          * Manfred Spraul <manfred@colorfullife.com>
          */
         prev_state = prev->state;
-        finish_arch_switch(prev);
+        _finish_arch_switch(prev);
         perf_event_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
 
         fire_sched_in_preempt_notifiers(current);
+        /*
+         * Delay the final freeing of the mm or task, so that we dont have
+         * to do complex work from within the scheduler:
+         */
         if (mm)
-                mmdrop(mm);
+                mmdrop_delayed(mm);
         if (unlikely(prev_state == TASK_DEAD)) {
                 /*
                  * Remove function-return probe instances associated with this
@@ -2853,8 +3028,10 @@ static inline void post_schedule(struct rq *rq)
 asmlinkage void schedule_tail(struct task_struct *prev)
         __releases(rq->lock)
 {
-        struct rq *rq = this_rq();
+        struct rq *rq;
 
+        preempt_disable();
+        rq = this_rq();
         finish_task_switch(rq, prev);
 
         /*
@@ -2863,9 +3040,14 @@ asmlinkage void schedule_tail(struct task_struct *prev)
          */
         post_schedule(rq);
 
+        __preempt_enable_no_resched();
+        local_irq_enable();
+
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
         /* In this case, finish_task_switch does not reenable preemption */
         preempt_enable();
+#else
+        preempt_check_resched();
 #endif
         if (current->set_child_tid)
                 put_user(task_pid_vnr(current), current->set_child_tid);
@@ -2913,6 +3095,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #endif
 
+#ifdef CURRENT_PTR
+        barrier();
+        *current_ptr = next;
+        *current_ti_ptr = next->thread_info;
+#endif
         /* Here we just switch the register state and the stack. */
         switch_to(prev, next, prev);
 
@@ -2959,6 +3146,11 @@ unsigned long nr_uninterruptible(void)
         return sum;
 }
 
+unsigned long nr_uninterruptible_cpu(int cpu)
+{
+        return cpu_rq(cpu)->nr_uninterruptible;
+}
+
 unsigned long long nr_context_switches(void)
 {
         int i;
@@ -2977,6 +3169,13 @@ unsigned long nr_iowait(void)
         for_each_possible_cpu(i)
                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
 
+        /*
+         * Since we read the counters lockless, it might be slightly
+         * inaccurate. Do not allow it to go below zero though:
+         */
+        if (unlikely((long)sum < 0))
+                sum = 0;
+
         return sum;
 }
 
@@ -3199,7 +3398,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 {
         deactivate_task(src_rq, p, 0);
         set_task_cpu(p, this_cpu);
-        activate_task(this_rq, p, 0);
+        activate_task(this_rq, p, 0, false);
         check_preempt_curr(this_rq, p, 0);
 }
 
@@ -3295,6 +3494,10 @@ next:
          */
         if (idle == CPU_NEWLY_IDLE)
                 goto out;
+
+        if (raw_spin_is_contended(&this_rq->lock) ||
+            raw_spin_is_contended(&busiest->lock))
+                goto out;
 #endif
 
         /*
@@ -3351,6 +3554,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  */
                 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                         break;
+
+                if (raw_spin_is_contended(&this_rq->lock) ||
+                    raw_spin_is_contended(&busiest->lock))
+                        break;
 #endif
         } while (class && max_load_move > total_load_moved);
 
@@ -4867,7 +5074,7 @@ out:
  */
 static void run_rebalance_domains(struct softirq_action *h)
 {
-        int this_cpu = smp_processor_id();
+        int this_cpu = raw_smp_processor_id();
         struct rq *this_rq = cpu_rq(this_cpu);
         enum cpu_idle_type idle = this_rq->idle_at_tick ?
                                                 CPU_IDLE : CPU_NOT_IDLE;
@@ -5141,9 +5348,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 
         /* Add system time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
-        if (hardirq_count() - hardirq_offset)
+        if ((hardirq_count() - hardirq_offset) ||
+            (p->extra_flags & PFE_HARDIRQ))
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
-        else if (softirq_count())
+        else if (softirq_count() || (p->extra_flags & PFE_SOFTIRQ))
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -5324,10 +5532,13 @@ void scheduler_tick(void)
 
         sched_clock_tick();
 
+        BUG_ON(!irqs_disabled());
+
         raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
         update_cpu_load(rq);
-        curr->sched_class->task_tick(rq, curr, 0);
+        if (curr != rq->idle && curr->se.on_rq)
+                curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
 
         perf_event_task_tick(curr, cpu);
@@ -5348,6 +5559,19 @@ notrace unsigned long get_parent_ip(unsigned long addr)
                 return addr;
 }
 
+#ifdef CONFIG_DEBUG_PREEMPT
+void notrace preempt_enable_no_resched(void)
+{
+        barrier();
+        dec_preempt_count();
+
+        WARN_ONCE(!preempt_count(),
+                KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n",
+                current->comm, current->pid);
+}
+EXPORT_SYMBOL(preempt_enable_no_resched);
+#endif
+
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_PREEMPT_TRACER))
 
@@ -5404,8 +5628,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
 {
         struct pt_regs *regs = get_irq_regs();
 
-        printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
-                prev->comm, prev->pid, preempt_count());
+        printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n",
+                prev->comm, preempt_count(), prev->pid, smp_processor_id());
 
         debug_show_held_locks(prev);
         print_modules();
@@ -5423,12 +5647,14 @@ static noinline void __schedule_bug(struct task_struct *prev)
  */
 static inline void schedule_debug(struct task_struct *prev)
 {
+//      WARN_ON(system_state == SYSTEM_BOOTING);
+
         /*
          * Test if we are atomic. Since do_exit() needs to call into
          * schedule() atomically, we ignore that path for now.
          * Otherwise, whine if we are scheduling when we should not be.
         */
-        if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+        if (unlikely(in_atomic() && !prev->exit_state))
                 __schedule_bug(prev);
 
         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -5499,15 +5725,13 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
 {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
 
-need_resched:
-        preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_sched_qs(cpu);
@@ -5515,10 +5739,11 @@ need_resched:
         switch_count = &prev->nivcsw;
 
         release_kernel_lock(prev);
-need_resched_nonpreemptible:
 
         schedule_debug(prev);
 
+        preempt_disable();
+
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
 
@@ -5526,7 +5751,8 @@ need_resched_nonpreemptible:
         update_rq_clock(rq);
         clear_tsk_need_resched(prev);
 
-        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+        if (!(prev->state & TASK_RUNNING_MUTEX) && prev->state &&
+            !(preempt_count() & PREEMPT_ACTIVE)) {
                 if (unlikely(signal_pending_state(prev->state, prev)))
                         prev->state = TASK_RUNNING;
                 else
@@ -5557,24 +5783,29 @@
                  */
                 cpu = smp_processor_id();
                 rq = cpu_rq(cpu);
-        } else
-                raw_spin_unlock_irq(&rq->lock);
+                __preempt_enable_no_resched();
+        } else {
+                __preempt_enable_no_resched();
+                raw_spin_unlock(&rq->lock);
+        }
 
         post_schedule(rq);
 
-        if (unlikely(reacquire_kernel_lock(current) < 0)) {
-                prev = rq->curr;
-                switch_count = &prev->nivcsw;
-                goto need_resched_nonpreemptible;
-        }
+        reacquire_kernel_lock(current);
+}
 
-        preempt_enable_no_resched();
+asmlinkage void __sched schedule(void)
+{
+need_resched:
+        local_irq_disable();
+        __schedule();
+        local_irq_enable();
         if (need_resched())
                 goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
 
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+#if defined(CONFIG_MUTEX_SPIN_ON_OWNER) && !defined(CONFIG_PREEMPT_RT)
 /*
  * Look out! "owner" is an entirely speculative pointer
  * access and not reliable.
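The restructuring above splits the old schedule() into __schedule(), which now runs with interrupts disabled throughout, and a thin schedule() wrapper that disables interrupts, calls the core, re-enables them and loops while a reschedule is still pending. In miniature, the wrapper's shape is (names here are local stand-ins, not kernel API):

    static int resched_pending_flag;

    static void irq_disable(void) { /* local_irq_disable() would go here */ }
    static void irq_enable(void)  { /* local_irq_enable() would go here  */ }

    static void core_schedule(void)       /* plays the role of __schedule() */
    {
        /* pick the next task and context-switch ... */
        resched_pending_flag = 0;
    }

    static void schedule_sketch(void)     /* plays the role of schedule() */
    {
        do {
            irq_disable();
            core_schedule();
            irq_enable();
        } while (resched_pending_flag);
    }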
@@ -5636,6 +5867,35 @@ out:
 #endif
 
 #ifdef CONFIG_PREEMPT
+
+/*
+ * Global flag to turn preemption off on a CONFIG_PREEMPT kernel:
+ */
+int kernel_preemption = 1;
+
+static int __init preempt_setup (char *str)
+{
+        if (!strncmp(str, "off", 3)) {
+                if (kernel_preemption) {
+                        printk(KERN_INFO "turning off kernel preemption!\n");
+                        kernel_preemption = 0;
+                }
+                return 1;
+        }
+        if (!strncmp(str, "on", 2)) {
+                if (!kernel_preemption) {
+                        printk(KERN_INFO "turning on kernel preemption!\n");
+                        kernel_preemption = 1;
+                }
+                return 1;
+        }
+        get_option(&str, &kernel_preemption);
+
+        return 1;
+}
+
+__setup("preempt=", preempt_setup);
+
 /*
  * this is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable. Kernel preemptions off return from interrupt
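The preempt_setup() parser registered above gives CONFIG_PREEMPT kernels a boot-time switch; on the kernel command line it would be used as, for example:

    preempt=off    # boot with in-kernel preemption disabled
    preempt=on     # keep the default CONFIG_PREEMPT behaviour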
@@ -5644,7 +5904,11 @@ out: | |||
5644 | asmlinkage void __sched preempt_schedule(void) | 5904 | asmlinkage void __sched preempt_schedule(void) |
5645 | { | 5905 | { |
5646 | struct thread_info *ti = current_thread_info(); | 5906 | struct thread_info *ti = current_thread_info(); |
5907 | struct task_struct *task = current; | ||
5908 | int saved_lock_depth; | ||
5647 | 5909 | ||
5910 | if (!kernel_preemption) | ||
5911 | return; | ||
5648 | /* | 5912 | /* |
5649 | * If there is a non-zero preempt_count or interrupts are disabled, | 5913 | * If there is a non-zero preempt_count or interrupts are disabled, |
5650 | * we do not want to preempt the current task. Just return.. | 5914 | * we do not want to preempt the current task. Just return.. |
@@ -5653,10 +5917,23 @@ asmlinkage void __sched preempt_schedule(void) | |||
5653 | return; | 5917 | return; |
5654 | 5918 | ||
5655 | do { | 5919 | do { |
5920 | local_irq_disable(); | ||
5656 | add_preempt_count(PREEMPT_ACTIVE); | 5921 | add_preempt_count(PREEMPT_ACTIVE); |
5657 | schedule(); | 5922 | |
5923 | /* | ||
5924 | * We keep the big kernel semaphore locked, but we | ||
5925 | * clear ->lock_depth so that schedule() doesnt | ||
5926 | * auto-release the semaphore: | ||
5927 | */ | ||
5928 | saved_lock_depth = task->lock_depth; | ||
5929 | task->lock_depth = -1; | ||
5930 | __schedule(); | ||
5931 | task->lock_depth = saved_lock_depth; | ||
5932 | |||
5658 | sub_preempt_count(PREEMPT_ACTIVE); | 5933 | sub_preempt_count(PREEMPT_ACTIVE); |
5659 | 5934 | ||
5935 | local_irq_enable(); | ||
5936 | |||
5660 | /* | 5937 | /* |
5661 | * Check again in case we missed a preemption opportunity | 5938 | * Check again in case we missed a preemption opportunity |
5662 | * between schedule and now. | 5939 | * between schedule and now. |
@@ -5667,23 +5944,40 @@ asmlinkage void __sched preempt_schedule(void) | |||
5667 | EXPORT_SYMBOL(preempt_schedule); | 5944 | EXPORT_SYMBOL(preempt_schedule); |
5668 | 5945 | ||
5669 | /* | 5946 | /* |
5670 | * this is the entry point to schedule() from kernel preemption | 5947 | * this is is the entry point for the IRQ return path. Called with |
5671 | * off of irq context. | 5948 | * interrupts disabled. To avoid infinite irq-entry recursion problems |
5672 | * Note, that this is called and return with irqs disabled. This will | 5949 | * with fast-paced IRQ sources we do all of this carefully to never |
5673 | * protect us against recursive calling from irq. | 5950 | * enable interrupts again. |
5674 | */ | 5951 | */ |
5675 | asmlinkage void __sched preempt_schedule_irq(void) | 5952 | asmlinkage void __sched preempt_schedule_irq(void) |
5676 | { | 5953 | { |
5677 | struct thread_info *ti = current_thread_info(); | 5954 | struct thread_info *ti = current_thread_info(); |
5955 | struct task_struct *task = current; | ||
5956 | int saved_lock_depth; | ||
5678 | 5957 | ||
5679 | /* Catch callers which need to be fixed */ | 5958 | if (!kernel_preemption) |
5680 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 5959 | return; |
5960 | /* | ||
5961 | * If there is a non-zero preempt_count then just return. | ||
5962 | * (interrupts are disabled) | ||
5963 | */ | ||
5964 | if (unlikely(ti->preempt_count)) | ||
5965 | return; | ||
5681 | 5966 | ||
5682 | do { | 5967 | do { |
5683 | add_preempt_count(PREEMPT_ACTIVE); | ||
5684 | local_irq_enable(); | ||
5685 | schedule(); | ||
5686 | local_irq_disable(); | 5968 | local_irq_disable(); |
5969 | add_preempt_count(PREEMPT_ACTIVE); | ||
5970 | |||
5971 | /* | ||
5972 | * We keep the big kernel semaphore locked, but we | ||
5973 | * clear ->lock_depth so that schedule() doesnt | ||
5974 | * auto-release the semaphore: | ||
5975 | */ | ||
5976 | saved_lock_depth = task->lock_depth; | ||
5977 | task->lock_depth = -1; | ||
5978 | __schedule(); | ||
5979 | |||
5980 | task->lock_depth = saved_lock_depth; | ||
5687 | sub_preempt_count(PREEMPT_ACTIVE); | 5981 | sub_preempt_count(PREEMPT_ACTIVE); |
5688 | 5982 | ||
5689 | /* | 5983 | /* |
@@ -5699,7 +5993,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
5699 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, | 5993 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
5700 | void *key) | 5994 | void *key) |
5701 | { | 5995 | { |
5702 | return try_to_wake_up(curr->private, mode, wake_flags); | 5996 | return try_to_wake_up(curr->private, mode, wake_flags, 0); |
5703 | } | 5997 | } |
5704 | EXPORT_SYMBOL(default_wake_function); | 5998 | EXPORT_SYMBOL(default_wake_function); |
5705 | 5999 | ||
@@ -5742,7 +6036,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode, | |||
5742 | unsigned long flags; | 6036 | unsigned long flags; |
5743 | 6037 | ||
5744 | spin_lock_irqsave(&q->lock, flags); | 6038 | spin_lock_irqsave(&q->lock, flags); |
5745 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 6039 | __wake_up_common(q, mode, nr_exclusive, 1, key); |
5746 | spin_unlock_irqrestore(&q->lock, flags); | 6040 | spin_unlock_irqrestore(&q->lock, flags); |
5747 | } | 6041 | } |
5748 | EXPORT_SYMBOL(__wake_up); | 6042 | EXPORT_SYMBOL(__wake_up); |
@@ -5822,7 +6116,7 @@ void complete(struct completion *x) | |||
5822 | 6116 | ||
5823 | spin_lock_irqsave(&x->wait.lock, flags); | 6117 | spin_lock_irqsave(&x->wait.lock, flags); |
5824 | x->done++; | 6118 | x->done++; |
5825 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); | 6119 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 1, NULL); |
5826 | spin_unlock_irqrestore(&x->wait.lock, flags); | 6120 | spin_unlock_irqrestore(&x->wait.lock, flags); |
5827 | } | 6121 | } |
5828 | EXPORT_SYMBOL(complete); | 6122 | EXPORT_SYMBOL(complete); |
@@ -5842,7 +6136,7 @@ void complete_all(struct completion *x) | |||
5842 | 6136 | ||
5843 | spin_lock_irqsave(&x->wait.lock, flags); | 6137 | spin_lock_irqsave(&x->wait.lock, flags); |
5844 | x->done += UINT_MAX/2; | 6138 | x->done += UINT_MAX/2; |
5845 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); | 6139 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 1, NULL); |
5846 | spin_unlock_irqrestore(&x->wait.lock, flags); | 6140 | spin_unlock_irqrestore(&x->wait.lock, flags); |
5847 | } | 6141 | } |
5848 | EXPORT_SYMBOL(complete_all); | 6142 | EXPORT_SYMBOL(complete_all); |
@@ -6058,19 +6352,19 @@ long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
6058 | } | 6352 | } |
6059 | EXPORT_SYMBOL(sleep_on_timeout); | 6353 | EXPORT_SYMBOL(sleep_on_timeout); |
6060 | 6354 | ||
6061 | #ifdef CONFIG_RT_MUTEXES | ||
6062 | |||
6063 | /* | 6355 | /* |
6064 | * rt_mutex_setprio - set the current priority of a task | 6356 | * task_setprio - set the current priority of a task |
6065 | * @p: task | 6357 | * @p: task |
6066 | * @prio: prio value (kernel-internal form) | 6358 | * @prio: prio value (kernel-internal form) |
6067 | * | 6359 | * |
6068 | * This function changes the 'effective' priority of a task. It does | 6360 | * This function changes the 'effective' priority of a task. It does |
6069 | * not touch ->normal_prio like __setscheduler(). | 6361 | * not touch ->normal_prio like __setscheduler(). |
6070 | * | 6362 | * |
6071 | * Used by the rt_mutex code to implement priority inheritance logic. | 6363 | * Used by the rt_mutex code to implement priority inheritance logic |
6364 | * and by rcupreempt-boost to boost priorities of tasks sleeping | ||
6365 | * with rcu locks. | ||
6072 | */ | 6366 | */ |
6073 | void rt_mutex_setprio(struct task_struct *p, int prio) | 6367 | void task_setprio(struct task_struct *p, int prio) |
6074 | { | 6368 | { |
6075 | unsigned long flags; | 6369 | unsigned long flags; |
6076 | int oldprio, on_rq, running; | 6370 | int oldprio, on_rq, running; |
@@ -6080,6 +6374,25 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6080 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 6374 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
6081 | 6375 | ||
6082 | rq = task_rq_lock(p, &flags); | 6376 | rq = task_rq_lock(p, &flags); |
6377 | |||
6378 | /* | ||
6379 | * Idle task boosting is a nono in general. There is one | ||
6380 | * exception, when NOHZ is active: | ||
6381 | * | ||
6382 | * The idle task calls get_next_timer_interrupt() and holds | ||
6383 | * the timer wheel base->lock on the CPU and another CPU wants | ||
6384 | * to access the timer (probably to cancel it). We can safely | ||
6385 | * ignore the boosting request, as the idle CPU runs this code | ||
6386 | * with interrupts disabled and will complete the lock | ||
6387 | * protected section without being interrupted. So there is no | ||
6388 | * real need to boost. | ||
6389 | */ | ||
6390 | if (unlikely(p == rq->idle)) { | ||
6391 | WARN_ON(p != rq->curr); | ||
6392 | WARN_ON(p->pi_blocked_on); | ||
6393 | goto out_unlock; | ||
6394 | } | ||
6395 | |||
6083 | update_rq_clock(rq); | 6396 | update_rq_clock(rq); |
6084 | 6397 | ||
6085 | oldprio = p->prio; | 6398 | oldprio = p->prio; |
@@ -6098,18 +6411,20 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6098 | 6411 | ||
6099 | p->prio = prio; | 6412 | p->prio = prio; |
6100 | 6413 | ||
6414 | trace_sched_task_setprio(rq, p, oldprio); | ||
6415 | |||
6101 | if (running) | 6416 | if (running) |
6102 | p->sched_class->set_curr_task(rq); | 6417 | p->sched_class->set_curr_task(rq); |
6103 | if (on_rq) { | 6418 | if (on_rq) { |
6104 | enqueue_task(rq, p, 0); | 6419 | enqueue_task(rq, p, 0, oldprio < prio); |
6105 | 6420 | ||
6106 | check_class_changed(rq, p, prev_class, oldprio, running); | 6421 | check_class_changed(rq, p, prev_class, oldprio, running); |
6107 | } | 6422 | } |
6423 | |||
6424 | out_unlock: | ||
6108 | task_rq_unlock(rq, &flags); | 6425 | task_rq_unlock(rq, &flags); |
6109 | } | 6426 | } |
6110 | 6427 | ||
6111 | #endif | ||
6112 | |||
6113 | void set_user_nice(struct task_struct *p, long nice) | 6428 | void set_user_nice(struct task_struct *p, long nice) |
6114 | { | 6429 | { |
6115 | int old_prio, delta, on_rq; | 6430 | int old_prio, delta, on_rq; |
@@ -6145,7 +6460,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
6145 | delta = p->prio - old_prio; | 6460 | delta = p->prio - old_prio; |
6146 | 6461 | ||
6147 | if (on_rq) { | 6462 | if (on_rq) { |
6148 | enqueue_task(rq, p, 0); | 6463 | enqueue_task(rq, p, 0, false); |
6149 | /* | 6464 | /* |
6150 | * If the task increased its priority or is running and | 6465 | * If the task increased its priority or is running and |
6151 | * lowered its priority, then reschedule its CPU: | 6466 | * lowered its priority, then reschedule its CPU: |
@@ -6423,7 +6738,25 @@ recheck: | |||
6423 | if (running) | 6738 | if (running) |
6424 | p->sched_class->set_curr_task(rq); | 6739 | p->sched_class->set_curr_task(rq); |
6425 | if (on_rq) { | 6740 | if (on_rq) { |
6426 | activate_task(rq, p, 0); | 6741 | /* |
6742 | * Workaround to make prio ceiling work as expected: | ||
6743 | * | ||
6744 | * Queue task to head when task is running and task is | ||
6745 | * lowering its priority. This works around the non- | ||
6746 | * availability of a sched_setprio syscall which was | ||
6747 | * tinkered into the posix spec to make prio ceiling | ||
6748 | * work correctly. | ||
6749 | * | ||
6750 | * This workaround violates the posix scheduling | ||
6751 | * semantics of tail queueing in the case that the | ||
6752 | * priority was changed by anything else than | ||
6753 | * sched_setprio, but there is no other breakage | ||
6754 | * lurking than some specification fetishists going | ||
6755 | * berserk on me. | ||
6756 | * | ||
6757 | * Fixing this in mainline needs more thoughts. | ||
6758 | */ | ||
6759 | activate_task(rq, p, 0, running && oldprio < p->prio); | ||
6427 | 6760 | ||
6428 | check_class_changed(rq, p, prev_class, oldprio, running); | 6761 | check_class_changed(rq, p, prev_class, oldprio, running); |
6429 | } | 6762 | } |
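
The workaround comment above refers to the POSIX priority-ceiling protocol (PTHREAD_PRIO_PROTECT), where unlocking a ceiling mutex drops a running task back to its base priority - exactly the "running task lowering its priority" case that gets head-queued. A hedged user-space example of that protocol, standard pthreads only and not code from this patch (needs CAP_SYS_NICE to actually run SCHED_FIFO; compile with -pthread):

        #include <pthread.h>
        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                pthread_mutexattr_t attr;
                pthread_mutex_t ceiling_mutex;
                struct sched_param sp = { .sched_priority = 10 };

                /* Run at a modest real-time priority. */
                if (sched_setscheduler(0, SCHED_FIFO, &sp))
                        perror("sched_setscheduler");

                pthread_mutexattr_init(&attr);
                pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_PROTECT);
                pthread_mutexattr_setprioceiling(&attr, 30);   /* ceiling above base prio */
                pthread_mutex_init(&ceiling_mutex, &attr);

                pthread_mutex_lock(&ceiling_mutex);
                /* While the lock is held, the thread runs at priority 30. */
                pthread_mutex_unlock(&ceiling_mutex);
                /* On unlock the priority drops back to 10: a running task
                 * lowering its priority, which the scheduler hunk above
                 * queues to the head of its priority list. */

                pthread_mutex_destroy(&ceiling_mutex);
                pthread_mutexattr_destroy(&attr);
                return 0;
        }
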
@@ -6759,9 +7092,9 @@ SYSCALL_DEFINE0(sched_yield) | |||
6759 | __release(rq->lock); | 7092 | __release(rq->lock); |
6760 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 7093 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
6761 | do_raw_spin_unlock(&rq->lock); | 7094 | do_raw_spin_unlock(&rq->lock); |
6762 | preempt_enable_no_resched(); | 7095 | local_irq_enable(); |
6763 | 7096 | ||
6764 | schedule(); | 7097 | preempt_enable_and_schedule(); |
6765 | 7098 | ||
6766 | return 0; | 7099 | return 0; |
6767 | } | 7100 | } |
@@ -6773,9 +7106,18 @@ static inline int should_resched(void) | |||
6773 | 7106 | ||
6774 | static void __cond_resched(void) | 7107 | static void __cond_resched(void) |
6775 | { | 7108 | { |
6776 | add_preempt_count(PREEMPT_ACTIVE); | 7109 | do { |
6777 | schedule(); | 7110 | add_preempt_count(PREEMPT_ACTIVE); |
6778 | sub_preempt_count(PREEMPT_ACTIVE); | 7111 | schedule(); |
7112 | sub_preempt_count(PREEMPT_ACTIVE); | ||
7113 | |||
7114 | /* | ||
7115 | * Check again in case we missed a preemption opportunity | ||
7116 | * between schedule and now. | ||
7117 | */ | ||
7118 | barrier(); | ||
7119 | |||
7120 | } while (need_resched()); | ||
6779 | } | 7121 | } |
6780 | 7122 | ||
6781 | int __sched _cond_resched(void) | 7123 | int __sched _cond_resched(void) |
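
The new loop keeps rescheduling while need_resched() is still set, with a compiler barrier so the flag really is re-read after schedule() returns. A minimal, user-space-style sketch of that re-check pattern, with invented names standing in for the kernel primitives:

        #include <stdbool.h>
        #include <stdio.h>

        #define barrier() __asm__ __volatile__("" ::: "memory")

        static bool resched_pending = true;        /* stands in for need_resched() */

        static void do_reschedule(void)
        {
                /* stand-in for schedule(); services and clears the request */
                resched_pending = false;
                puts("rescheduled");
        }

        static void cond_resched_like(void)
        {
                do {
                        do_reschedule();
                        /*
                         * Without the barrier the compiler could keep the first
                         * read of resched_pending and miss a request raised
                         * while we were "in" do_reschedule().
                         */
                        barrier();
                } while (resched_pending);
        }

        int main(void)
        {
                cond_resched_like();
                return 0;
        }
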
@@ -6816,10 +7158,16 @@ int __cond_resched_lock(spinlock_t *lock) | |||
6816 | } | 7158 | } |
6817 | EXPORT_SYMBOL(__cond_resched_lock); | 7159 | EXPORT_SYMBOL(__cond_resched_lock); |
6818 | 7160 | ||
7161 | /* | ||
7162 | * Voluntarily preempt a process context that has softirqs disabled: | ||
7163 | */ | ||
6819 | int __sched __cond_resched_softirq(void) | 7164 | int __sched __cond_resched_softirq(void) |
6820 | { | 7165 | { |
6821 | BUG_ON(!in_softirq()); | 7166 | #ifndef CONFIG_PREEMPT_SOFTIRQS |
6822 | 7167 | WARN_ON_ONCE(!in_softirq()); | |
7168 | if (!in_softirq()) | ||
7169 | return 0; | ||
7170 | #endif | ||
6823 | if (should_resched()) { | 7171 | if (should_resched()) { |
6824 | local_bh_enable(); | 7172 | local_bh_enable(); |
6825 | __cond_resched(); | 7173 | __cond_resched(); |
@@ -6830,17 +7178,75 @@ int __sched __cond_resched_softirq(void) | |||
6830 | } | 7178 | } |
6831 | EXPORT_SYMBOL(__cond_resched_softirq); | 7179 | EXPORT_SYMBOL(__cond_resched_softirq); |
6832 | 7180 | ||
7181 | /* | ||
7182 | * Voluntarily preempt a softirq context (possible with softirq threading): | ||
7183 | */ | ||
7184 | int __sched cond_resched_softirq_context(void) | ||
7185 | { | ||
7186 | WARN_ON_ONCE(!in_softirq() && !(current->extra_flags & PFE_SOFTIRQ)); | ||
7187 | |||
7188 | if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { | ||
7189 | raw_local_irq_disable(); | ||
7190 | _local_bh_enable(); | ||
7191 | raw_local_irq_enable(); | ||
7192 | __cond_resched(); | ||
7193 | local_bh_disable(); | ||
7194 | return 1; | ||
7195 | } | ||
7196 | return 0; | ||
7197 | } | ||
7198 | EXPORT_SYMBOL(cond_resched_softirq_context); | ||
7199 | |||
7200 | #ifdef CONFIG_PREEMPT_VOLUNTARY | ||
7201 | int voluntary_preemption = 1; | ||
7202 | EXPORT_SYMBOL(voluntary_preemption); | ||
7203 | |||
7204 | static int __init voluntary_preempt_setup (char *str) | ||
7205 | { | ||
7206 | if (!strncmp(str, "off", 3)) | ||
7207 | voluntary_preemption = 0; | ||
7208 | else | ||
7209 | get_option(&str, &voluntary_preemption); | ||
7210 | if (!voluntary_preemption) | ||
7211 | printk("turning off voluntary preemption!\n"); | ||
7212 | |||
7213 | return 1; | ||
7214 | } | ||
7215 | |||
7216 | __setup("voluntary-preempt=", voluntary_preempt_setup); | ||
7217 | |||
7218 | #endif | ||
7219 | |||
6833 | /** | 7220 | /** |
6834 | * yield - yield the current processor to other threads. | 7221 | * yield - yield the current processor to other threads. |
6835 | * | 7222 | * |
6836 | * This is a shortcut for kernel-space yielding - it marks the | 7223 | * This is a shortcut for kernel-space yielding - it marks the |
6837 | * thread runnable and calls sys_sched_yield(). | 7224 | * thread runnable and calls sys_sched_yield(). |
6838 | */ | 7225 | */ |
6839 | void __sched yield(void) | 7226 | void __sched __yield(void) |
6840 | { | 7227 | { |
6841 | set_current_state(TASK_RUNNING); | 7228 | set_current_state(TASK_RUNNING); |
6842 | sys_sched_yield(); | 7229 | sys_sched_yield(); |
6843 | } | 7230 | } |
7231 | |||
7232 | void __sched yield(void) | ||
7233 | { | ||
7234 | static int once = 1; | ||
7235 | |||
7236 | /* | ||
7237 | * it's a bug to rely on yield() with RT priorities. We print | ||
7238 | * the first occurrence after bootup ... this will still give | ||
7239 | * us an idea about the scope of the problem, without spamming | ||
7240 | * the syslog: | ||
7241 | */ | ||
7242 | if (once && rt_task(current)) { | ||
7243 | once = 0; | ||
7244 | printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", | ||
7245 | current->comm, current->pid); | ||
7246 | dump_stack(); | ||
7247 | } | ||
7248 | __yield(); | ||
7249 | } | ||
6844 | EXPORT_SYMBOL(yield); | 7250 | EXPORT_SYMBOL(yield); |
6845 | 7251 | ||
6846 | /* | 7252 | /* |
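
The yield() wrapper above now warns once when a real-time task calls it: under SCHED_FIFO/SCHED_RR, yield() only rotates among runnable tasks of the same priority and never lets lower-priority work run, so using it as a wait primitive is a bug. A hedged illustration of the anti-pattern being flagged (not code from this patch; SCHED_FIFO needs CAP_SYS_NICE):

        #include <sched.h>
        #include <stdbool.h>
        #include <stdio.h>

        static volatile bool done;     /* supposedly set by someone else, eventually */

        int main(void)
        {
                struct sched_param sp = { .sched_priority = 50 };
                unsigned long spins = 0;

                if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
                        perror("sched_setscheduler");
                        return 1;
                }

                /*
                 * Anti-pattern: a "polite" busy wait.  sched_yield() never
                 * yields to lower-priority tasks, so whoever is meant to set
                 * 'done' may never run.  A condition variable, futex or pipe
                 * read would block properly.  Bounded here only so the demo
                 * terminates.
                 */
                while (!done && spins < 1000000UL) {
                        sched_yield();
                        spins++;
                }
                printf("gave up after %lu yields\n", spins);
                return 0;
        }
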
@@ -7004,6 +7410,7 @@ void sched_show_task(struct task_struct *p) | |||
7004 | void show_state_filter(unsigned long state_filter) | 7410 | void show_state_filter(unsigned long state_filter) |
7005 | { | 7411 | { |
7006 | struct task_struct *g, *p; | 7412 | struct task_struct *g, *p; |
7413 | int do_unlock = 1; | ||
7007 | 7414 | ||
7008 | #if BITS_PER_LONG == 32 | 7415 | #if BITS_PER_LONG == 32 |
7009 | printk(KERN_INFO | 7416 | printk(KERN_INFO |
@@ -7012,7 +7419,16 @@ void show_state_filter(unsigned long state_filter) | |||
7012 | printk(KERN_INFO | 7419 | printk(KERN_INFO |
7013 | " task PC stack pid father\n"); | 7420 | " task PC stack pid father\n"); |
7014 | #endif | 7421 | #endif |
7422 | #ifdef CONFIG_PREEMPT_RT | ||
7423 | if (!read_trylock(&tasklist_lock)) { | ||
7424 | printk("hm, tasklist_lock write-locked.\n"); | ||
7425 | printk("ignoring ...\n"); | ||
7426 | do_unlock = 0; | ||
7427 | } | ||
7428 | #else | ||
7015 | read_lock(&tasklist_lock); | 7429 | read_lock(&tasklist_lock); |
7430 | #endif | ||
7431 | |||
7016 | do_each_thread(g, p) { | 7432 | do_each_thread(g, p) { |
7017 | /* | 7433 | /* |
7018 | * reset the NMI-timeout, listing all files on a slow | 7434 | * reset the NMI-timeout, listing all files on a slow |
@@ -7028,7 +7444,8 @@ void show_state_filter(unsigned long state_filter) | |||
7028 | #ifdef CONFIG_SCHED_DEBUG | 7444 | #ifdef CONFIG_SCHED_DEBUG |
7029 | sysrq_sched_debug_show(); | 7445 | sysrq_sched_debug_show(); |
7030 | #endif | 7446 | #endif |
7031 | read_unlock(&tasklist_lock); | 7447 | if (do_unlock) |
7448 | read_unlock(&tasklist_lock); | ||
7032 | /* | 7449 | /* |
7033 | * Only show locks if all tasks are dumped: | 7450 | * Only show locks if all tasks are dumped: |
7034 | */ | 7451 | */ |
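
Under CONFIG_PREEMPT_RT the dump path above uses read_trylock() so a SysRq task listing cannot block on a write-held tasklist_lock; if the trylock fails it dumps anyway and skips the unlock. The same trylock-and-degrade pattern, sketched with a POSIX rwlock purely for illustration (compile with -pthread):

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>

        static pthread_rwlock_t task_list_lock = PTHREAD_RWLOCK_INITIALIZER;

        static void dump_tasks(void)
        {
                bool locked = (pthread_rwlock_tryrdlock(&task_list_lock) == 0);

                if (!locked)
                        puts("hm, list is write-locked - dumping without the lock");

                /* ... walk and print the list here (may race if !locked) ... */
                puts("dumping tasks");

                if (locked)
                        pthread_rwlock_unlock(&task_list_lock);
        }

        int main(void)
        {
                dump_tasks();
                return 0;
        }
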
@@ -7064,17 +7481,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
7064 | __set_task_cpu(idle, cpu); | 7481 | __set_task_cpu(idle, cpu); |
7065 | 7482 | ||
7066 | rq->curr = rq->idle = idle; | 7483 | rq->curr = rq->idle = idle; |
7067 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 7484 | #if defined(CONFIG_SMP) |
7068 | idle->oncpu = 1; | 7485 | idle->oncpu = 1; |
7069 | #endif | 7486 | #endif |
7070 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 7487 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
7071 | 7488 | ||
7072 | /* Set the preempt count _outside_ the spinlocks! */ | 7489 | /* Set the preempt count _outside_ the spinlocks! */ |
7073 | #if defined(CONFIG_PREEMPT) | ||
7074 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
7075 | #else | ||
7076 | task_thread_info(idle)->preempt_count = 0; | 7490 | task_thread_info(idle)->preempt_count = 0; |
7077 | #endif | 7491 | |
7078 | /* | 7492 | /* |
7079 | * The idle tasks have their own, simple scheduling class: | 7493 | * The idle tasks have their own, simple scheduling class: |
7080 | */ | 7494 | */ |
@@ -7172,27 +7586,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
7172 | struct rq *rq; | 7586 | struct rq *rq; |
7173 | int ret = 0; | 7587 | int ret = 0; |
7174 | 7588 | ||
7175 | /* | ||
7176 | * Since we rely on wake-ups to migrate sleeping tasks, don't change | ||
7177 | * the ->cpus_allowed mask from under waking tasks, which would be | ||
7178 | * possible when we change rq->lock in ttwu(), so synchronize against | ||
7179 | * TASK_WAKING to avoid that. | ||
7180 | * | ||
7181 | * Make an exception for freshly cloned tasks, since cpuset namespaces | ||
7182 | * might move the task about, we have to validate the target in | ||
7183 | * wake_up_new_task() anyway since the cpu might have gone away. | ||
7184 | */ | ||
7185 | again: | ||
7186 | while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) | ||
7187 | cpu_relax(); | ||
7188 | |||
7189 | rq = task_rq_lock(p, &flags); | 7589 | rq = task_rq_lock(p, &flags); |
7190 | 7590 | ||
7191 | if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { | ||
7192 | task_rq_unlock(rq, &flags); | ||
7193 | goto again; | ||
7194 | } | ||
7195 | |||
7196 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 7591 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
7197 | ret = -EINVAL; | 7592 | ret = -EINVAL; |
7198 | goto out; | 7593 | goto out; |
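
Even with the TASK_WAKING dance removed, set_cpus_allowed_ptr() above still rejects masks that do not intersect cpu_active_mask. Its user-space counterpart, sched_setaffinity(), behaves the same way for a mask with no usable CPU; a hedged example:

        /* User-space counterpart of the affinity check kept in the hunk above:
         * sched_setaffinity() rejects a mask with no usable CPU (EINVAL), much
         * as set_cpus_allowed_ptr() returns -EINVAL when the mask misses
         * cpu_active_mask.  Illustration only. */
        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                cpu_set_t mask;

                CPU_ZERO(&mask);
                CPU_SET(0, &mask);                 /* pin this process to CPU 0 */
                if (sched_setaffinity(0, sizeof(mask), &mask)) {
                        perror("sched_setaffinity");
                        return 1;
                }

                CPU_ZERO(&mask);                   /* empty mask: no runnable CPU */
                if (sched_setaffinity(0, sizeof(mask), &mask))
                        perror("empty mask rejected as expected");

                return 0;
        }
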
@@ -7248,11 +7643,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | |||
7248 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 7643 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
7249 | { | 7644 | { |
7250 | struct rq *rq_dest, *rq_src; | 7645 | struct rq *rq_dest, *rq_src; |
7646 | unsigned long flags; | ||
7251 | int ret = 0; | 7647 | int ret = 0; |
7252 | 7648 | ||
7253 | if (unlikely(!cpu_active(dest_cpu))) | 7649 | if (unlikely(!cpu_active(dest_cpu))) |
7254 | return ret; | 7650 | return ret; |
7255 | 7651 | ||
7652 | /* | ||
7653 | * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock) | ||
7654 | * disabling interrupts - which it does not do on PREEMPT_RT: | ||
7655 | */ | ||
7656 | local_irq_save(flags); | ||
7657 | |||
7256 | rq_src = cpu_rq(src_cpu); | 7658 | rq_src = cpu_rq(src_cpu); |
7257 | rq_dest = cpu_rq(dest_cpu); | 7659 | rq_dest = cpu_rq(dest_cpu); |
7258 | 7660 | ||
@@ -7271,13 +7673,15 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
7271 | if (p->se.on_rq) { | 7673 | if (p->se.on_rq) { |
7272 | deactivate_task(rq_src, p, 0); | 7674 | deactivate_task(rq_src, p, 0); |
7273 | set_task_cpu(p, dest_cpu); | 7675 | set_task_cpu(p, dest_cpu); |
7274 | activate_task(rq_dest, p, 0); | 7676 | activate_task(rq_dest, p, 0, false); |
7275 | check_preempt_curr(rq_dest, p, 0); | 7677 | check_preempt_curr(rq_dest, p, 0); |
7276 | } | 7678 | } |
7277 | done: | 7679 | done: |
7278 | ret = 1; | 7680 | ret = 1; |
7279 | fail: | 7681 | fail: |
7280 | double_rq_unlock(rq_src, rq_dest); | 7682 | double_rq_unlock(rq_src, rq_dest); |
7683 | local_irq_restore(flags); | ||
7684 | |||
7281 | return ret; | 7685 | return ret; |
7282 | } | 7686 | } |
7283 | 7687 | ||
@@ -7437,7 +7841,7 @@ void sched_idle_next(void) | |||
7437 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | 7841 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
7438 | 7842 | ||
7439 | update_rq_clock(rq); | 7843 | update_rq_clock(rq); |
7440 | activate_task(rq, p, 0); | 7844 | activate_task(rq, p, 0, false); |
7441 | 7845 | ||
7442 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 7846 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
7443 | } | 7847 | } |
@@ -7454,7 +7858,11 @@ void idle_task_exit(void) | |||
7454 | 7858 | ||
7455 | if (mm != &init_mm) | 7859 | if (mm != &init_mm) |
7456 | switch_mm(mm, &init_mm, current); | 7860 | switch_mm(mm, &init_mm, current); |
7861 | #ifdef CONFIG_PREEMPT_RT | ||
7862 | mmdrop_delayed(mm); | ||
7863 | #else | ||
7457 | mmdrop(mm); | 7864 | mmdrop(mm); |
7865 | #endif | ||
7458 | } | 7866 | } |
7459 | 7867 | ||
7460 | /* called under rq->lock with disabled interrupts */ | 7868 | /* called under rq->lock with disabled interrupts */ |
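
On PREEMPT_RT the idle-exit path above calls mmdrop_delayed() instead of mmdrop(); its implementation is not part of this hunk, but the presumed idea is to hand the mm to a context that may block instead of releasing it right here. A rough user-space sketch of that deferred-release pattern, with all names invented (compile with -pthread):

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        static void *deferred;
        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

        static void drop_delayed(void *obj)        /* cheap, non-blocking caller side */
        {
                pthread_mutex_lock(&lock);
                deferred = obj;
                pthread_cond_signal(&cond);
                pthread_mutex_unlock(&lock);
        }

        static void *reaper(void *unused)          /* schedulable context does the work */
        {
                (void)unused;
                pthread_mutex_lock(&lock);
                while (!deferred)
                        pthread_cond_wait(&cond, &lock);
                free(deferred);                    /* the potentially expensive part */
                pthread_mutex_unlock(&lock);
                puts("deferred object released");
                return NULL;
        }

        int main(void)
        {
                pthread_t t;

                pthread_create(&t, NULL, reaper, NULL);
                drop_delayed(malloc(4096));
                pthread_join(t, NULL);
                return 0;
        }
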
@@ -9699,6 +10107,9 @@ void __init sched_init(void) | |||
9699 | atomic_inc(&init_mm.mm_count); | 10107 | atomic_inc(&init_mm.mm_count); |
9700 | enter_lazy_tlb(&init_mm, current); | 10108 | enter_lazy_tlb(&init_mm, current); |
9701 | 10109 | ||
10110 | #ifdef CONFIG_PREEMPT_RT | ||
10111 | printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n"); | ||
10112 | #endif | ||
9702 | /* | 10113 | /* |
9703 | * Make us the idle thread. Technically, schedule() should not be | 10114 | * Make us the idle thread. Technically, schedule() should not be |
9704 | * called from this thread, however somewhere below it might be, | 10115 | * called from this thread, however somewhere below it might be, |
@@ -9731,10 +10142,14 @@ void __init sched_init(void) | |||
9731 | scheduler_running = 1; | 10142 | scheduler_running = 1; |
9732 | } | 10143 | } |
9733 | 10144 | ||
9734 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 10145 | #if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) |
9735 | static inline int preempt_count_equals(int preempt_offset) | 10146 | static inline int preempt_count_equals(int preempt_offset) |
9736 | { | 10147 | { |
9737 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 10148 | int nested = (preempt_count() & ~PREEMPT_ACTIVE); |
10149 | |||
10150 | #ifndef CONFIG_PREEMPT_RT | ||
10151 | nested += rcu_preempt_depth(); | ||
10152 | #endif | ||
9738 | 10153 | ||
9739 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 10154 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); |
9740 | } | 10155 | } |
@@ -9755,7 +10170,8 @@ void __might_sleep(char *file, int line, int preempt_offset) | |||
9755 | "BUG: sleeping function called from invalid context at %s:%d\n", | 10170 | "BUG: sleeping function called from invalid context at %s:%d\n", |
9756 | file, line); | 10171 | file, line); |
9757 | printk(KERN_ERR | 10172 | printk(KERN_ERR |
9758 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", | 10173 | "pcnt: %x %d in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
10174 | preempt_count(), preempt_offset, | ||
9759 | in_atomic(), irqs_disabled(), | 10175 | in_atomic(), irqs_disabled(), |
9760 | current->pid, current->comm); | 10176 | current->pid, current->comm); |
9761 | 10177 | ||
@@ -9779,7 +10195,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
9779 | deactivate_task(rq, p, 0); | 10195 | deactivate_task(rq, p, 0); |
9780 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 10196 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
9781 | if (on_rq) { | 10197 | if (on_rq) { |
9782 | activate_task(rq, p, 0); | 10198 | activate_task(rq, p, 0, false); |
9783 | resched_task(rq->curr); | 10199 | resched_task(rq->curr); |
9784 | } | 10200 | } |
9785 | } | 10201 | } |
@@ -10155,7 +10571,7 @@ void sched_move_task(struct task_struct *tsk) | |||
10155 | if (unlikely(running)) | 10571 | if (unlikely(running)) |
10156 | tsk->sched_class->set_curr_task(rq); | 10572 | tsk->sched_class->set_curr_task(rq); |
10157 | if (on_rq) | 10573 | if (on_rq) |
10158 | enqueue_task(rq, tsk, 0); | 10574 | enqueue_task(rq, tsk, 0, false); |
10159 | 10575 | ||
10160 | task_rq_unlock(rq, &flags); | 10576 | task_rq_unlock(rq, &flags); |
10161 | } | 10577 | } |