5 files changed, 303 insertions, 322 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 72bb9483d949..b0afd8db1396 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -318,15 +318,19 @@ static inline int cpu_of(struct rq *rq)
 }
 /*
- * Per-runqueue clock, as finegrained as the platform can give us:
+ * Update the per-runqueue clock, as fine-grained as the platform can give
+ * us, but without assuming monotonicity, etc.:
 */
-static unsigned long long __rq_clock(struct rq *rq)
+static void __update_rq_clock(struct rq *rq)
 {
        u64 prev_raw = rq->prev_clock_raw;
        u64 now = sched_clock();
        s64 delta = now - prev_raw;
        u64 clock = rq->clock;
+#ifdef CONFIG_SCHED_DEBUG
+        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+#endif
        /*
         * Protect against sched_clock() occasionally going backwards:
         */
@@ -349,18 +353,12 @@ static unsigned long long __rq_clock(struct rq *rq)
        rq->prev_clock_raw = now;
        rq->clock = clock;
-        return clock;
 }
-static inline unsigned long long rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
 {
-        int this_cpu = smp_processor_id();
+        if (likely(smp_processor_id() == cpu_of(rq)))
+                __update_rq_clock(rq);
-        if (this_cpu == cpu_of(rq))
-                return __rq_clock(rq);
-        return rq->clock;
 }
 /*
@@ -386,9 +384,12 @@ unsigned long long cpu_clock(int cpu)
 {
        unsigned long long now;
        unsigned long flags;
+        struct rq *rq;
        local_irq_save(flags);
-        now = rq_clock(cpu_rq(cpu));
+        rq = cpu_rq(cpu);
+        update_rq_clock(rq);
+        now = rq->clock;
        local_irq_restore(flags);
        return now;
@@ -637,6 +638,11 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
 #define WMULT_SHIFT     32
+/*
+ * Shift right and round:
+ */
+#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                struct load_weight *lw)
@@ -644,18 +650,17 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
        u64 tmp;
        if (unlikely(!lw->inv_weight))
-                lw->inv_weight = WMULT_CONST / lw->weight;
+                lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
        tmp = (u64)delta_exec * weight;
        /*
         * Check whether we'd overflow the 64-bit multiplication:
         */
-        if (unlikely(tmp > WMULT_CONST)) {
+        if (unlikely(tmp > WMULT_CONST))
-                tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
+                tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-                                >> (WMULT_SHIFT/2);
+                        WMULT_SHIFT/2);
-        } else {
+        else
-                tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
+                tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT);
-        }
        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
@@ -703,11 +708,14 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec)
 * the relative distance between them is ~25%.)
 */
 static const int prio_to_weight[40] = {
-/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
+ /* -20 */     88761,     71755,     56483,     46273,     36291,
-/* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
+ /* -15 */     29154,     23254,     18705,     14949,     11916,
-/*   0 */  NICE_0_LOAD /* 1024 */,
+ /* -10 */      9548,      7620,      6100,      4904,      3906,
-/*   1 */          819,   655,   524,   419,   336,   268,   215,   172,   137,
+ /*  -5 */      3121,      2501,      1991,      1586,      1277,
-/*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
+ /*   0 */      1024,       820,       655,       526,       423,
+ /*   5 */       335,       272,       215,       172,       137,
+ /*  10 */       110,        87,        70,        56,        45,
+ /*  15 */        36,        29,        23,        18,        15,
 };
 /*
@@ -718,14 +726,14 @@ static const int prio_to_weight[40] = {
 * into multiplications:
 */
 static const u32 prio_to_wmult[40] = {
-/* -20 */     48356,     60446,     75558,     94446,    118058,
+ /* -20 */     48388,     59856,     76040,     92818,    118348,
-/* -15 */    147573,    184467,    230589,    288233,    360285,
+ /* -15 */    147320,    184698,    229616,    287308,    360437,
-/* -10 */    450347,    562979,    703746,    879575,   1099582,
+ /* -10 */    449829,    563644,    704093,    875809,   1099582,
-/*  -5 */   1374389,   1717986,   2147483,   2684354,   3355443,
+ /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
-/*   0 */   4194304,   5244160,   6557201,   8196502,  10250518,
+ /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
-/*   5 */  12782640,  16025997,  19976592,  24970740,  31350126,
+ /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
-/*  10 */  39045157,  49367440,  61356675,  76695844,  95443717,
+ /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
-/*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+ /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
@@ -745,8 +753,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                      unsigned long max_nr_move, unsigned long max_load_move,
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned, unsigned long *load_moved,
-                      int this_best_prio, int best_prio, int best_prio_seen,
+                      int *this_best_prio, struct rq_iterator *iterator);
-                      struct rq_iterator *iterator);
 #include "sched_stats.h"
 #include "sched_rt.c"
@@ -782,14 +789,14 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls)
 * This function is called /before/ updating rq->ls.load
 * and when switching tasks.
 */
-static void update_curr_load(struct rq *rq, u64 now)
+static void update_curr_load(struct rq *rq)
 {
        struct load_stat *ls = &rq->ls;
        u64 start;
        start = ls->load_update_start;
-        ls->load_update_start = now;
+        ls->load_update_start = rq->clock;
-        ls->delta_stat += now - start;
+        ls->delta_stat += rq->clock - start;
        /*
         * Stagger updates to ls->delta_fair. Very frequent updates
         * can be expensive.
@@ -798,30 +805,28 @@ static void update_curr_load(struct rq *rq, u64 now)
                __update_curr_load(rq, ls);
 }
-static inline void
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
-inc_load(struct rq *rq, const struct task_struct *p, u64 now)
 {
-        update_curr_load(rq, now);
+        update_curr_load(rq);
        update_load_add(&rq->ls.load, p->se.load.weight);
 }
-static inline void
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
-dec_load(struct rq *rq, const struct task_struct *p, u64 now)
 {
-        update_curr_load(rq, now);
+        update_curr_load(rq);
        update_load_sub(&rq->ls.load, p->se.load.weight);
 }
-static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
        rq->nr_running++;
-        inc_load(rq, p, now);
+        inc_load(rq, p);
 }
-static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
        rq->nr_running--;
-        dec_load(rq, p, now);
+        dec_load(rq, p);
 }
 static void set_load_weight(struct task_struct *p)
@@ -848,18 +853,16 @@ static void set_load_weight(struct task_struct *p)
        p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 }
-static void
+static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
-enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 {
        sched_info_queued(p);
-        p->sched_class->enqueue_task(rq, p, wakeup, now);
+        p->sched_class->enqueue_task(rq, p, wakeup);
        p->se.on_rq = 1;
 }
-static void
+static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
-dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 {
-        p->sched_class->dequeue_task(rq, p, sleep, now);
+        p->sched_class->dequeue_task(rq, p, sleep);
        p->se.on_rq = 0;
 }
@@ -914,13 +917,11 @@ static int effective_prio(struct task_struct *p)
 */
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
-        u64 now = rq_clock(rq);
        if (p->state == TASK_UNINTERRUPTIBLE)
                rq->nr_uninterruptible--;
-        enqueue_task(rq, p, wakeup, now);
+        enqueue_task(rq, p, wakeup);
-        inc_nr_running(p, rq, now);
+        inc_nr_running(p, rq);
 }
 /*
@@ -928,13 +929,13 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 */
 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
 {
-        u64 now = rq_clock(rq);
+        update_rq_clock(rq);
        if (p->state == TASK_UNINTERRUPTIBLE)
                rq->nr_uninterruptible--;
-        enqueue_task(rq, p, 0, now);
+        enqueue_task(rq, p, 0);
-        inc_nr_running(p, rq, now);
+        inc_nr_running(p, rq);
 }
 /*
@@ -942,13 +943,11 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
 */
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 {
-        u64 now = rq_clock(rq);
        if (p->state == TASK_UNINTERRUPTIBLE)
                rq->nr_uninterruptible++;
-        dequeue_task(rq, p, sleep, now);
+        dequeue_task(rq, p, sleep);
-        dec_nr_running(p, rq, now);
+        dec_nr_running(p, rq);
 }
 /**
@@ -1516,6 +1515,7 @@ out_set_cpu:
 out_activate:
 #endif /* CONFIG_SMP */
+        update_rq_clock(rq);
        activate_task(rq, p, 1);
        /*
         * Sync wakeups (i.e. those types of wakeups where the waker
@@ -1647,12 +1647,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
        unsigned long flags;
        struct rq *rq;
        int this_cpu;
-        u64 now;
        rq = task_rq_lock(p, &flags);
        BUG_ON(p->state != TASK_RUNNING);
        this_cpu = smp_processor_id(); /* parent's CPU */
-        now = rq_clock(rq);
+        update_rq_clock(rq);
        p->prio = effective_prio(p);
@@ -1666,8 +1665,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 * Let the scheduling class do new task startup
                 * management (if any):
                 */
-                p->sched_class->task_new(rq, p, now);
+                p->sched_class->task_new(rq, p);
-                inc_nr_running(p, rq, now);
+                inc_nr_running(p, rq);
        }
        check_preempt_curr(rq, p);
        task_rq_unlock(rq, &flags);
@@ -1954,7 +1953,6 @@ static void update_cpu_load(struct rq *this_rq)
        unsigned long total_load = this_rq->ls.load.weight;
        unsigned long this_load =  total_load;
        struct load_stat *ls = &this_rq->ls;
-        u64 now = __rq_clock(this_rq);
        int i, scale;
        this_rq->nr_load_updates++;
@@ -1962,7 +1960,7 @@ static void update_cpu_load(struct rq *this_rq)
                goto do_avg;
        /* Update delta_fair/delta_exec fields first */
-        update_curr_load(this_rq, now);
+        update_curr_load(this_rq);
        fair_delta64 = ls->delta_fair + 1;
        ls->delta_fair = 0;
@@ -1970,8 +1968,8 @@ static void update_cpu_load(struct rq *this_rq)
        exec_delta64 = ls->delta_exec + 1;
        ls->delta_exec = 0;
-        sample_interval64 = now - ls->load_update_last;
+        sample_interval64 = this_rq->clock - ls->load_update_last;
-        ls->load_update_last = now;
+        ls->load_update_last = this_rq->clock;
        if ((s64)sample_interval64 < (s64)TICK_NSEC)
                sample_interval64 = TICK_NSEC;
@@ -2026,6 +2024,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
                        spin_lock(&rq1->lock);
                }
        }
+        update_rq_clock(rq1);
+        update_rq_clock(rq2);
 }
 /*
@@ -2166,8 +2166,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                      unsigned long max_nr_move, unsigned long max_load_move,
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned, unsigned long *load_moved,
-                      int this_best_prio, int best_prio, int best_prio_seen,
+                      int *this_best_prio, struct rq_iterator *iterator)
-                      struct rq_iterator *iterator)
 {
        int pulled = 0, pinned = 0, skip_for_load;
        struct task_struct *p;
@@ -2192,12 +2191,8 @@ next:
         */
        skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
                                                         SCHED_LOAD_SCALE_FUZZ;
-        if (skip_for_load && p->prio < this_best_prio)
+        if ((skip_for_load && p->prio >= *this_best_prio) ||
-                skip_for_load = !best_prio_seen && p->prio == best_prio;
-        if (skip_for_load ||
            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
-                best_prio_seen |= p->prio == best_prio;
                p = iterator->next(iterator->arg);
                goto next;
        }
@@ -2211,8 +2206,8 @@ next:
         * and the prescribed amount of weighted load.
         */
        if (pulled < max_nr_move && rem_load_move > 0) {
-                if (p->prio < this_best_prio)
+                if (p->prio < *this_best_prio)
-                        this_best_prio = p->prio;
+                        *this_best_prio = p->prio;
                p = iterator->next(iterator->arg);
                goto next;
        }
@@ -2231,32 +2226,52 @@ out:
 }
 /*
- * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * move_tasks tries to move up to max_load_move weighted load from busiest to
- * load from busiest to this_rq, as part of a balancing operation within
+ * this_rq, as part of a balancing operation within domain "sd".
- * "domain". Returns the number of tasks moved.
+ * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                      unsigned long max_nr_move, unsigned long max_load_move,
+                      unsigned long max_load_move,
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned)
 {
        struct sched_class *class = sched_class_highest;
-        unsigned long load_moved, total_nr_moved = 0, nr_moved;
+        unsigned long total_load_moved = 0;
-        long rem_load_move = max_load_move;
+        int this_best_prio = this_rq->curr->prio;
        do {
-                nr_moved = class->load_balance(this_rq, this_cpu, busiest,
+                total_load_moved +=
-                                max_nr_move, (unsigned long)rem_load_move,
+                        class->load_balance(this_rq, this_cpu, busiest,
-                                sd, idle, all_pinned, &load_moved);
+                                ULONG_MAX, max_load_move - total_load_moved,
-                total_nr_moved += nr_moved;
+                                sd, idle, all_pinned, &this_best_prio);
-                max_nr_move -= nr_moved;
-                rem_load_move -= load_moved;
                class = class->next;
-        } while (class && max_nr_move && rem_load_move > 0);
+        } while (class && max_load_move > total_load_moved);
-        return total_nr_moved;
+        return total_load_moved > 0;
+}
+/*
+ * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * part of active balancing operations within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                         struct sched_domain *sd, enum cpu_idle_type idle)
+{
+        struct sched_class *class;
+        int this_best_prio = MAX_PRIO;
+        for (class = sched_class_highest; class; class = class->next)
+                if (class->load_balance(this_rq, this_cpu, busiest,
+                                        1, ULONG_MAX, sd, idle, NULL,
+                                        &this_best_prio))
+                        return 1;
+        return 0;
 }
 /*
@@ -2588,11 +2603,6 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 */
 #define MAX_PINNED_INTERVAL     512
-static inline unsigned long minus_1_or_zero(unsigned long n)
-{
-        return n > 0 ? n - 1 : 0;
-}
 /*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
@@ -2601,7 +2611,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
                        int *balance)
 {
-        int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
        unsigned long imbalance;
        struct rq *busiest;
@@ -2642,18 +2652,17 @@ redo:
        schedstat_add(sd, lb_imbalance[idle], imbalance);
-        nr_moved = 0;
+        ld_moved = 0;
        if (busiest->nr_running > 1) {
                /*
                 * Attempt to move tasks. If find_busiest_group has found
                 * an imbalance but busiest->nr_running <= 1, the group is
-                 * still unbalanced. nr_moved simply stays zero, so it is
+                 * still unbalanced. ld_moved simply stays zero, so it is
                 * correctly treated as an imbalance.
                 */
                local_irq_save(flags);
                double_rq_lock(this_rq, busiest);
-                nr_moved = move_tasks(this_rq, this_cpu, busiest,
+                ld_moved = move_tasks(this_rq, this_cpu, busiest,
-                                      minus_1_or_zero(busiest->nr_running),
                                      imbalance, sd, idle, &all_pinned);
                double_rq_unlock(this_rq, busiest);
                local_irq_restore(flags);
@@ -2661,7 +2670,7 @@ redo:
                /*
                 * some other cpu did the load balance for us.
                 */
-                if (nr_moved && this_cpu != smp_processor_id())
+                if (ld_moved && this_cpu != smp_processor_id())
                        resched_cpu(this_cpu);
                /* All tasks on this runqueue were pinned by CPU affinity */
@@ -2673,7 +2682,7 @@ redo:
                }
        }
-        if (!nr_moved) {
+        if (!ld_moved) {
                schedstat_inc(sd, lb_failed[idle]);
                sd->nr_balance_failed++;
@@ -2722,10 +2731,10 @@ redo:
                        sd->balance_interval *= 2;
        }
-        if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+        if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                return -1;
-        return nr_moved;
+        return ld_moved;
 out_balanced:
        schedstat_inc(sd, lb_balanced[idle]);
@@ -2757,7 +2766,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
        struct sched_group *group;
        struct rq *busiest = NULL;
        unsigned long imbalance;
-        int nr_moved = 0;
+        int ld_moved = 0;
        int sd_idle = 0;
        int all_pinned = 0;
        cpumask_t cpus = CPU_MASK_ALL;
@@ -2792,12 +2801,13 @@ redo:
        schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
-        nr_moved = 0;
+        ld_moved = 0;
        if (busiest->nr_running > 1) {
                /* Attempt to move tasks */
                double_lock_balance(this_rq, busiest);
-                nr_moved = move_tasks(this_rq, this_cpu, busiest,
+                /* this_rq->clock is already updated */
-                                        minus_1_or_zero(busiest->nr_running),
+                update_rq_clock(busiest);
+                ld_moved = move_tasks(this_rq, this_cpu, busiest,
                                        imbalance, sd, CPU_NEWLY_IDLE,
                                        &all_pinned);
                spin_unlock(&busiest->lock);
@@ -2809,7 +2819,7 @@ redo:
                }
        }
-        if (!nr_moved) {
+        if (!ld_moved) {
                schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
@@ -2817,7 +2827,7 @@ redo:
        } else
                sd->nr_balance_failed = 0;
-        return nr_moved;
+        return ld_moved;
 out_balanced:
        schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
@@ -2894,6 +2904,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
        /* move a task from busiest_rq to target_rq */
        double_lock_balance(busiest_rq, target_rq);
+        update_rq_clock(busiest_rq);
+        update_rq_clock(target_rq);
        /* Search for an sd spanning us and the target CPU. */
        for_each_domain(target_cpu, sd) {
@@ -2905,8 +2917,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
        if (likely(sd)) {
                schedstat_inc(sd, alb_cnt);
-                if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+                if (move_one_task(target_rq, target_cpu, busiest_rq,
-                               ULONG_MAX, sd, CPU_IDLE, NULL))
+                                  sd, CPU_IDLE))
                        schedstat_inc(sd, alb_pushed);
                else
                        schedstat_inc(sd, alb_failed);
@@ -3175,8 +3187,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                      unsigned long max_nr_move, unsigned long max_load_move,
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned, unsigned long *load_moved,
-                      int this_best_prio, int best_prio, int best_prio_seen,
+                      int *this_best_prio, struct rq_iterator *iterator)
-                      struct rq_iterator *iterator)
 {
        *load_moved = 0;
@@ -3202,7 +3213,8 @@ unsigned long long task_sched_runtime(struct task_struct *p)
        rq = task_rq_lock(p, &flags);
        ns = p->se.sum_exec_runtime;
        if (rq->curr == p) {
-                delta_exec = rq_clock(rq) - p->se.exec_start;
+                update_rq_clock(rq);
+                delta_exec = rq->clock - p->se.exec_start;
                if ((s64)delta_exec > 0)
                        ns += delta_exec;
        }
@@ -3298,9 +3310,10 @@ void scheduler_tick(void)
        struct task_struct *curr = rq->curr;
        spin_lock(&rq->lock);
+        __update_rq_clock(rq);
+        update_cpu_load(rq);
        if (curr != rq->idle) /* FIXME: needed? */
                curr->sched_class->task_tick(rq, curr);
-        update_cpu_load(rq);
        spin_unlock(&rq->lock);
 #ifdef CONFIG_SMP
@@ -3382,7 +3395,7 @@ static inline void schedule_debug(struct task_struct *prev)
 * Pick up the highest-prio task:
 */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
        struct sched_class *class;
        struct task_struct *p;
@@ -3392,14 +3405,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
         * the fair class we can call that function directly:
         */
        if (likely(rq->nr_running == rq->cfs.nr_running)) {
-                p = fair_sched_class.pick_next_task(rq, now);
+                p = fair_sched_class.pick_next_task(rq);
                if (likely(p))
                        return p;
        }
        class = sched_class_highest;
        for ( ; ; ) {
-                p = class->pick_next_task(rq, now);
+                p = class->pick_next_task(rq);
                if (p)
                        return p;
                /*
@@ -3418,7 +3431,6 @@ asmlinkage void __sched schedule(void)
        struct task_struct *prev, *next;
        long *switch_count;
        struct rq *rq;
-        u64 now;
        int cpu;
 need_resched:
@@ -3436,6 +3448,7 @@ need_resched_nonpreemptible:
        spin_lock_irq(&rq->lock);
        clear_tsk_need_resched(prev);
+        __update_rq_clock(rq);
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
@@ -3450,9 +3463,8 @@ need_resched_nonpreemptible:
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
-        now = __rq_clock(rq);
+        prev->sched_class->put_prev_task(rq, prev);
-        prev->sched_class->put_prev_task(rq, prev, now);
+        next = pick_next_task(rq, prev);
-        next = pick_next_task(rq, prev, now);
        sched_info_switch(prev, next);
@@ -3895,17 +3907,16 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        unsigned long flags;
        int oldprio, on_rq;
        struct rq *rq;
-        u64 now;
        BUG_ON(prio < 0 || prio > MAX_PRIO);
        rq = task_rq_lock(p, &flags);
-        now = rq_clock(rq);
+        update_rq_clock(rq);
        oldprio = p->prio;
        on_rq = p->se.on_rq;
        if (on_rq)
-                dequeue_task(rq, p, 0, now);
+                dequeue_task(rq, p, 0);
        if (rt_prio(prio))
                p->sched_class = &rt_sched_class;
@@ -3915,7 +3926,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        p->prio = prio;
        if (on_rq) {
-                enqueue_task(rq, p, 0, now);
+                enqueue_task(rq, p, 0);
                /*
                 * Reschedule if we are currently running on this runqueue and
                 * our priority decreased, or if we are not currently running on
@@ -3938,7 +3949,6 @@ void set_user_nice(struct task_struct *p, long nice)
        int old_prio, delta, on_rq;
        unsigned long flags;
        struct rq *rq;
-        u64 now;
        if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
                return;
@@ -3947,7 +3957,7 @@ void set_user_nice(struct task_struct *p, long nice)
         * the task might be in the middle of scheduling on another CPU.
         */
        rq = task_rq_lock(p, &flags);
-        now = rq_clock(rq);
+        update_rq_clock(rq);
        /*
         * The RT priorities are set via sched_setscheduler(), but we still
         * allow the 'normal' nice value to be set - but as expected
@@ -3960,8 +3970,8 @@ void set_user_nice(struct task_struct *p, long nice)
        }
        on_rq = p->se.on_rq;
        if (on_rq) {
-                dequeue_task(rq, p, 0, now);
+                dequeue_task(rq, p, 0);
-                dec_load(rq, p, now);
+                dec_load(rq, p);
        }
        p->static_prio = NICE_TO_PRIO(nice);
@@ -3971,8 +3981,8 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
        if (on_rq) {
-                enqueue_task(rq, p, 0, now);
+                enqueue_task(rq, p, 0);
-                inc_load(rq, p, now);
+                inc_load(rq, p);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -4208,6 +4218,7 @@ recheck:
                spin_unlock_irqrestore(&p->pi_lock, flags);
                goto recheck;
        }
+        update_rq_clock(rq);
        on_rq = p->se.on_rq;
        if (on_rq)
                deactivate_task(rq, p, 0);
@@ -4463,10 +4474,8 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
 out_unlock:
        read_unlock(&tasklist_lock);
        mutex_unlock(&sched_hotcpu_mutex);
-        if (retval)
-                return retval;
-        return 0;
+        return retval;
 }
 /**
@@ -4966,6 +4975,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
        on_rq = p->se.on_rq;
        if (on_rq)
                deactivate_task(rq_src, p, 0);
        set_task_cpu(p, dest_cpu);
        if (on_rq) {
                activate_task(rq_dest, p, 0);
@@ -5198,7 +5208,8 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
        for ( ; ; ) {
                if (!rq->nr_running)
                        break;
-                next = pick_next_task(rq, rq->curr, rq_clock(rq));
+                update_rq_clock(rq);
+                next = pick_next_task(rq, rq->curr);
                if (!next)
                        break;
                migrate_dead(dead_cpu, next);
@@ -5210,12 +5221,19 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
 static struct ctl_table sd_ctl_dir[] = {
-        {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, },
+        {
+                .procname       = "sched_domain",
+                .mode           = 0755,
+        },
        {0,},
 };
 static struct ctl_table sd_ctl_root[] = {
-        {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, },
+        {
+                .procname       = "kernel",
+                .mode           = 0755,
+                .child          = sd_ctl_dir,
+        },
        {0,},
 };
@@ -5231,11 +5249,10 @@ static struct ctl_table *sd_alloc_ctl_entry(int n)
 }
 static void
-set_table_entry(struct ctl_table *entry, int ctl_name,
+set_table_entry(struct ctl_table *entry,
                const char *procname, void *data, int maxlen,
                mode_t mode, proc_handler *proc_handler)
 {
-        entry->ctl_name = ctl_name;
        entry->procname = procname;
        entry->data = data;
        entry->maxlen = maxlen;
@@ -5248,28 +5265,28 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
        struct ctl_table *table = sd_alloc_ctl_entry(14);
-        set_table_entry(&table[0], 1, "min_interval", &sd->min_interval,
+        set_table_entry(&table[0], "min_interval", &sd->min_interval,
                sizeof(long), 0644, proc_doulongvec_minmax);
-        set_table_entry(&table[1], 2, "max_interval", &sd->max_interval,
+        set_table_entry(&table[1], "max_interval", &sd->max_interval,
                sizeof(long), 0644, proc_doulongvec_minmax);
-        set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx,
+        set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
                sizeof(int), 0644, proc_dointvec_minmax);
-        set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx,
+        set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
                sizeof(int), 0644, proc_dointvec_minmax);
-        set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx,
+        set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
                sizeof(int), 0644, proc_dointvec_minmax);
-        set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx,
+        set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
                sizeof(int), 0644, proc_dointvec_minmax);
-        set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx,
+        set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
                sizeof(int), 0644, proc_dointvec_minmax);
-        set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor,
+        set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
                sizeof(int), 0644, proc_dointvec_minmax);
-        set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct,
+        set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
                sizeof(int), 0644, proc_dointvec_minmax);
-        set_table_entry(&table[10], 11, "cache_nice_tries",
+        set_table_entry(&table[10], "cache_nice_tries",
                &sd->cache_nice_tries,
                sizeof(int), 0644, proc_dointvec_minmax);
-        set_table_entry(&table[12], 13, "flags", &sd->flags,
+        set_table_entry(&table[12], "flags", &sd->flags,
                sizeof(int), 0644, proc_dointvec_minmax);
        return table;
@@ -5289,7 +5306,6 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
        i = 0;
        for_each_domain(cpu, sd) {
                snprintf(buf, 32, "domain%d", i);
-                entry->ctl_name = i + 1;
                entry->procname = kstrdup(buf, GFP_KERNEL);
                entry->mode = 0755;
                entry->child = sd_alloc_ctl_domain_table(sd);
@@ -5310,7 +5326,6 @@ static void init_sched_domain_sysctl(void)
        for (i = 0; i < cpu_num; i++, entry++) {
                snprintf(buf, 32, "cpu%d", i);
-                entry->ctl_name = i + 1;
                entry->procname = kstrdup(buf, GFP_KERNEL);
                entry->mode = 0755;
                entry->child = sd_alloc_ctl_cpu_table(i);
@@ -5379,6 +5394,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                rq->migration_thread = NULL;
                /* Idle task back to normal (off runqueue, low prio) */
                rq = task_rq_lock(rq->idle, &flags);
+                update_rq_clock(rq);
                deactivate_task(rq, rq->idle, 0);
                rq->idle->static_prio = MAX_PRIO;
                __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
@@ -6616,12 +6632,13 @@ void normalize_rt_tasks(void)
                        goto out_unlock;
 #endif
+                update_rq_clock(rq);
                on_rq = p->se.on_rq;
                if (on_rq)
-                        deactivate_task(task_rq(p), p, 0);
+                        deactivate_task(rq, p, 0);
                __setscheduler(rq, p, SCHED_NORMAL, 0);
                if (on_rq) {
-                        activate_task(task_rq(p), p, 0);
+                        activate_task(rq, p, 0);
                        resched_task(rq->curr);
                }
 #ifdef CONFIG_SMP
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8421b9399e10..3da32156394e 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -29,7 +29,7 @@
 } while (0)
 static void
-print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
+print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
        if (rq->curr == p)
                SEQ_printf(m, "R");
@@ -56,7 +56,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
 #endif
 }
-static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
+static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 {
        struct task_struct *g, *p;
@@ -77,7 +77,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
                if (!p->se.on_rq || task_cpu(p) != rq_cpu)
                        continue;
-                print_task(m, rq, p, now);
+                print_task(m, rq, p);
        } while_each_thread(g, p);
        read_unlock_irq(&tasklist_lock);
@@ -106,7 +106,7 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                (long long)wait_runtime_rq_sum);
 }
-void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
        SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
@@ -124,7 +124,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
        print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
 }
-static void print_cpu(struct seq_file *m, int cpu, u64 now)
+static void print_cpu(struct seq_file *m, int cpu)
 {
        struct rq *rq = &per_cpu(runqueues, cpu);
@@ -166,9 +166,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
        P(cpu_load[4]);
 #undef P
-        print_cfs_stats(m, cpu, now);
+        print_cfs_stats(m, cpu);
-        print_rq(m, rq, cpu, now);
+        print_rq(m, rq, cpu);
 }
 static int sched_debug_show(struct seq_file *m, void *v)
@@ -184,7 +184,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
        SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
        for_each_online_cpu(cpu)
-                print_cpu(m, cpu, now);
+                print_cpu(m, cpu);
        SEQ_printf(m, "\n");
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6f579ff5a9bc..e91db32cadfd 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -222,21 +222,25 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity)
 {
        u64 tmp;
+        if (likely(curr->load.weight == NICE_0_LOAD))
+                return granularity;
        /*
-         * Negative nice levels get the same granularity as nice-0:
+         * Positive nice level tasks get linearly finer granularity:
         */
-        if (likely(curr->load.weight >= NICE_0_LOAD))
+        if (likely(curr->load.weight < NICE_0_LOAD)) {
-                return granularity;
+                tmp = curr->load.weight * (u64)granularity;
+                return (long) (tmp >> NICE_0_SHIFT);
+        }
        /*
-         * Positive nice level tasks get linearly finer
+         * Negative nice level tasks get inverse-weight scaled
         * granularity:
         */
-        tmp = curr->load.weight * (u64)granularity;
+        tmp = curr->load.inv_weight * (u64)granularity;
        /*
         * It will always fit into 'long':
         */
-        return (long) (tmp >> NICE_0_SHIFT);
+        return (long) (tmp >> WMULT_SHIFT);
 }
 static inline void
@@ -281,26 +285,25 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
 * are not in our scheduling class.
 */
 static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
+__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-        unsigned long delta, delta_exec, delta_fair;
+        unsigned long delta, delta_exec, delta_fair, delta_mine;
-        long delta_mine;
        struct load_weight *lw = &cfs_rq->load;
        unsigned long load = lw->weight;
-        if (unlikely(!load))
-                return;
        delta_exec = curr->delta_exec;
        schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
        curr->sum_exec_runtime += delta_exec;
        cfs_rq->exec_clock += delta_exec;
+        if (unlikely(!load))
+                return;
        delta_fair = calc_delta_fair(delta_exec, lw);
        delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
-        if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) {
+        if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) {
                delta = calc_delta_mine(cfs_rq->sleeper_bonus,
                                        curr->load.weight, lw);
                if (unlikely(delta > cfs_rq->sleeper_bonus))
@@ -321,7 +324,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
        add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
 }
-static void update_curr(struct cfs_rq *cfs_rq, u64 now)
+static void update_curr(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *curr = cfs_rq_curr(cfs_rq);
        unsigned long delta_exec;
@@ -334,22 +337,22 @@ static void update_curr(struct cfs_rq *cfs_rq, u64 now)
         * since the last time we changed load (this cannot
         * overflow on 32 bits):
         */
-        delta_exec = (unsigned long)(now - curr->exec_start);
+        delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start);
        curr->delta_exec += delta_exec;
        if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
-                __update_curr(cfs_rq, curr, now);
+                __update_curr(cfs_rq, curr);
                curr->delta_exec = 0;
        }
-        curr->exec_start = now;
+        curr->exec_start = rq_of(cfs_rq)->clock;
 }
 static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        se->wait_start_fair = cfs_rq->fair_clock;
-        schedstat_set(se->wait_start, now);
+        schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
 }
 /*
@@ -377,8 +380,7 @@ calc_weighted(unsigned long delta, unsigned long weight, int shift)
 /*
 * Task is being enqueued - update stats:
 */
-static void
+static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
        s64 key;
@@ -387,7 +389,7 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
         * a dequeue/enqueue event is a NOP)
         */
        if (se != cfs_rq_curr(cfs_rq))
-                update_stats_wait_start(cfs_rq, se, now);
+                update_stats_wait_start(cfs_rq, se);
        /*
         * Update the key:
         */
@@ -407,7 +409,8 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
                                        (WMULT_SHIFT - NICE_0_SHIFT);
                } else {
                        tmp = se->wait_runtime;
-                        key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
+                        key -= (tmp * se->load.inv_weight) >>
+                                        (WMULT_SHIFT - NICE_0_SHIFT);
                }
        }
@@ -418,11 +421,12 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 * Note: must be called with a freshly updated rq->fair_clock.
 */
 static inline void
-__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
+__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        unsigned long delta_fair = se->delta_fair_run;
-        schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start));
+        schedstat_set(se->wait_max, max(se->wait_max,
+                        rq_of(cfs_rq)->clock - se->wait_start));
        if (unlikely(se->load.weight != NICE_0_LOAD))
                delta_fair = calc_weighted(delta_fair, se->load.weight,
@@ -432,7 +436,7 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 }
 static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        unsigned long delta_fair;
@@ -442,7 +446,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
        se->delta_fair_run += delta_fair;
        if (unlikely(abs(se->delta_fair_run) >=
                                sysctl_sched_stat_granularity)) {
-                __update_stats_wait_end(cfs_rq, se, now);
+                __update_stats_wait_end(cfs_rq, se);
                se->delta_fair_run = 0;
        }
@@ -451,34 +455,34 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 }
 static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        update_curr(cfs_rq, now);
+        update_curr(cfs_rq);
        /*
         * Mark the end of the wait period if dequeueing a
         * waiting task:
         */
        if (se != cfs_rq_curr(cfs_rq))
-                update_stats_wait_end(cfs_rq, se, now);
+                update_stats_wait_end(cfs_rq, se);
 }
 /*
 * We are picking a new current task - update its stats:
 */
 static inline void
-update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
+update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        /*
         * We are starting a new run period:
         */
-        se->exec_start = now;
+        se->exec_start = rq_of(cfs_rq)->clock;
 }
 /*
 * We are descheduling a task - update its stats:
 */
 static inline void
-update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
+update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        se->exec_start = 0;
 }
@@ -487,8 +491,7 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 * Scheduling class queueing methods:
 */
-static void
+static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
        unsigned long load = cfs_rq->load.weight, delta_fair;
        long prev_runtime;
@@ -522,8 +525,7 @@ __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
        schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 }
-static void
+static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
        struct task_struct *tsk = task_of(se);
        unsigned long delta_fair;
@@ -538,7 +540,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
        se->delta_fair_sleep += delta_fair;
        if (unlikely(abs(se->delta_fair_sleep) >=
                                sysctl_sched_stat_granularity)) {
-                __enqueue_sleeper(cfs_rq, se, now);
+                __enqueue_sleeper(cfs_rq, se);
                se->delta_fair_sleep = 0;
        }
@@ -546,7 +548,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 #ifdef CONFIG_SCHEDSTATS
        if (se->sleep_start) {
-                u64 delta = now - se->sleep_start;
+                u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
                if ((s64)delta < 0)
                        delta = 0;
@@ -558,7 +560,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
                se->sum_sleep_runtime += delta;
        }
        if (se->block_start) {
-                u64 delta = now - se->block_start;
+                u64 delta = rq_of(cfs_rq)->clock - se->block_start;
                if ((s64)delta < 0)
                        delta = 0;
@@ -573,26 +575,24 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 }
 static void
-enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
-               int wakeup, u64 now)
 {
        /*
         * Update the fair clock.
         */
-        update_curr(cfs_rq, now);
+        update_curr(cfs_rq);
        if (wakeup)
-                enqueue_sleeper(cfs_rq, se, now);
+                enqueue_sleeper(cfs_rq, se);
-        update_stats_enqueue(cfs_rq, se, now);
+        update_stats_enqueue(cfs_rq, se);
        __enqueue_entity(cfs_rq, se);
 }
 static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
-               int sleep, u64 now)
 {
-        update_stats_dequeue(cfs_rq, se, now);
+        update_stats_dequeue(cfs_rq, se);
        if (sleep) {
                se->sleep_start_fair = cfs_rq->fair_clock;
 #ifdef CONFIG_SCHEDSTATS
@@ -600,9 +600,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                        struct task_struct *tsk = task_of(se);
                        if (tsk->state & TASK_INTERRUPTIBLE)
-                                se->sleep_start = now;
+                                se->sleep_start = rq_of(cfs_rq)->clock;
                        if (tsk->state & TASK_UNINTERRUPTIBLE)
-                                se->block_start = now;
+                                se->block_start = rq_of(cfs_rq)->clock;
                }
                cfs_rq->wait_runtime -= se->wait_runtime;
 #endif
@@ -629,7 +629,7 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
 }
 static inline void
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        /*
         * Any task has to be enqueued before it get to execute on
@@ -638,49 +638,46 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
         * done a put_prev_task_fair() shortly before this, which
         * updated rq->fair_clock - used by update_stats_wait_end())
         */
-        update_stats_wait_end(cfs_rq, se, now);
+        update_stats_wait_end(cfs_rq, se);
-        update_stats_curr_start(cfs_rq, se, now);
+        update_stats_curr_start(cfs_rq, se);
        set_cfs_rq_curr(cfs_rq, se);
 }
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
+static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *se = __pick_next_entity(cfs_rq);
-        set_next_entity(cfs_rq, se, now);
+        set_next_entity(cfs_rq, se);
        return se;
 }
-static void
+static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
-put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
 {
        /*
         * If still on the runqueue then deactivate_task()
         * was not called and update_curr() has to be done:
         */
        if (prev->on_rq)
-                update_curr(cfs_rq, now);
+                update_curr(cfs_rq);
-        update_stats_curr_end(cfs_rq, prev, now);
+        update_stats_curr_end(cfs_rq, prev);
        if (prev->on_rq)
-                update_stats_wait_start(cfs_rq, prev, now);
+                update_stats_wait_start(cfs_rq, prev);
        set_cfs_rq_curr(cfs_rq, NULL);
 }
 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-        struct rq *rq = rq_of(cfs_rq);
        struct sched_entity *next;
-        u64 now = __rq_clock(rq);
        /*
         * Dequeue and enqueue the task to update its
         * position within the tree:
         */
-        dequeue_entity(cfs_rq, curr, 0, now);
+        dequeue_entity(cfs_rq, curr, 0);
-        enqueue_entity(cfs_rq, curr, 0, now);
+        enqueue_entity(cfs_rq, curr, 0);
        /*
         * Reschedule if another task tops the current one.
@@ -785,8 +782,7 @@ static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
-static void
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
@@ -795,7 +791,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
                if (se->on_rq)
                        break;
                cfs_rq = cfs_rq_of(se);
-                enqueue_entity(cfs_rq, se, wakeup, now);
+                enqueue_entity(cfs_rq, se, wakeup);
        }
 }
@@ -804,15 +800,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
-static void
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
-dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
-                dequeue_entity(cfs_rq, se, sleep, now);
+                dequeue_entity(cfs_rq, se, sleep);
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight)
                        break;
@@ -825,14 +820,14 @@ dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 static void yield_task_fair(struct rq *rq, struct task_struct *p)
 {
        struct cfs_rq *cfs_rq = task_cfs_rq(p);
-        u64 now = __rq_clock(rq);
+        __update_rq_clock(rq);
        /*
         * Dequeue and enqueue the task to update its
         * position within the tree:
         */
-        dequeue_entity(cfs_rq, &p->se, 0, now);
+        dequeue_entity(cfs_rq, &p->se, 0);
-        enqueue_entity(cfs_rq, &p->se, 0, now);
+        enqueue_entity(cfs_rq, &p->se, 0);
 }
 /*
@@ -845,7 +840,8 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
        unsigned long gran;
        if (unlikely(rt_prio(p->prio))) {
-                update_curr(cfs_rq, rq_clock(rq));
+                update_rq_clock(rq);
+                update_curr(cfs_rq);
                resched_task(curr);
                return;
        }
@@ -861,7 +857,7 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
                __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
 }
-static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
+static struct task_struct *pick_next_task_fair(struct rq *rq)
 {
        struct cfs_rq *cfs_rq = &rq->cfs;
        struct sched_entity *se;
@@ -870,7 +866,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
                return NULL;
        do {
-                se = pick_next_entity(cfs_rq, now);
+                se = pick_next_entity(cfs_rq);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
@@ -880,14 +876,14 @@ static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
 /*
 * Account for a descheduled task:
 */
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 {
        struct sched_entity *se = &prev->se;
        struct cfs_rq *cfs_rq;
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
-                put_prev_entity(cfs_rq, se, now);
+                put_prev_entity(cfs_rq, se);
        }
 }
@@ -930,6 +926,7 @@ static struct task_struct *load_balance_next_fair(void *arg)
        return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *curr;
@@ -943,12 +940,13 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
        return p->prio;
 }
+#endif
-static int
+static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                        unsigned long max_nr_move, unsigned long max_load_move,
+                  unsigned long max_nr_move, unsigned long max_load_move,
-                        struct sched_domain *sd, enum cpu_idle_type idle,
+                  struct sched_domain *sd, enum cpu_idle_type idle,
-                        int *all_pinned, unsigned long *total_load_moved)
+                  int *all_pinned, int *this_best_prio)
 {
        struct cfs_rq *busy_cfs_rq;
        unsigned long load_moved, total_nr_moved = 0, nr_moved;
@@ -959,10 +957,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        cfs_rq_iterator.next = load_balance_next_fair;
        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
                struct cfs_rq *this_cfs_rq;
                long imbalance;
                unsigned long maxload;
-                int this_best_prio, best_prio, best_prio_seen = 0;
                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
@@ -976,27 +974,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                imbalance /= 2;
                maxload = min(rem_load_move, imbalance);
-                this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-                best_prio = cfs_rq_best_prio(busy_cfs_rq);
+#else
+#define maxload rem_load_move
-                /*
+#endif
-                 * Enable handling of the case where there is more than one task
-                 * with the best priority. If the current running task is one
-                 * of those with prio==best_prio we know it won't be moved
-                 * and therefore it's safe to override the skip (based on load)
-                 * of any task we find with that prio.
-                 */
-                if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
-                        best_prio_seen = 1;
                /* pass busy_cfs_rq argument into
                 * load_balance_[start|next]_fair iterators
                 */
                cfs_rq_iterator.arg = busy_cfs_rq;
                nr_moved = balance_tasks(this_rq, this_cpu, busiest,
                                max_nr_move, maxload, sd, idle, all_pinned,
-                                &load_moved, this_best_prio, best_prio,
+                                &load_moved, this_best_prio, &cfs_rq_iterator);
-                                best_prio_seen, &cfs_rq_iterator);
                total_nr_moved += nr_moved;
                max_nr_move -= nr_moved;
@@ -1006,9 +994,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                        break;
        }
-        *total_load_moved = max_load_move - rem_load_move;
+        return max_load_move - rem_load_move;
-        return total_nr_moved;
 }
 /*
@@ -1032,14 +1018,14 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
 * monopolize the CPU. Note: the parent runqueue is locked,
 * the child is not running yet.
 */
-static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now)
+static void task_new_fair(struct rq *rq, struct task_struct *p)
 {
        struct cfs_rq *cfs_rq = task_cfs_rq(p);
        struct sched_entity *se = &p->se;
        sched_info_queued(p);
-        update_stats_enqueue(cfs_rq, se, now);
+        update_stats_enqueue(cfs_rq, se);
        /*
         * Child runs first: we let it run before the parent
         * until it reschedules once. We set up the key so that
@@ -1072,15 +1058,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now)
 */
 static void set_curr_task_fair(struct rq *rq)
 {
-        struct task_struct *curr = rq->curr;
+        struct sched_entity *se = &rq->curr->se;
-        struct sched_entity *se = &curr->se;
-        u64 now = rq_clock(rq);
-        struct cfs_rq *cfs_rq;
-        for_each_sched_entity(se) {
+        for_each_sched_entity(se)
-                cfs_rq = cfs_rq_of(se);
+                set_next_entity(cfs_rq_of(se), se);
-                set_next_entity(cfs_rq, se, now);
-        }
 }
 #else
 static void set_curr_task_fair(struct rq *rq)
@@ -1109,12 +1090,11 @@ struct sched_class fair_sched_class __read_mostly = {
 };
 #ifdef CONFIG_SCHED_DEBUG
-void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
+static void print_cfs_stats(struct seq_file *m, int cpu)
 {
-        struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq;
-        for_each_leaf_cfs_rq(rq, cfs_rq)
+        for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
-                print_cfs_rq(m, cpu, cfs_rq, now);
+                print_cfs_rq(m, cpu, cfs_rq);
 }
 #endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 41841e741c4a..3503fb2d9f96 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -13,7 +13,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
        resched_task(rq->idle);
 }
-static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
+static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
        schedstat_inc(rq, sched_goidle);
@@ -25,7 +25,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
 * message if some code attempts to do it:
 */
 static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
 {
        spin_unlock_irq(&rq->lock);
        printk(KERN_ERR "bad: scheduling from the idle thread!\n");
@@ -33,15 +33,15 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
        spin_lock_irq(&rq->lock);
 }
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 {
 }
-static int
+static unsigned long
 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
                        unsigned long max_nr_move, unsigned long max_load_move,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                        int *all_pinned, unsigned long *total_load_moved)
+                        int *all_pinned, int *this_best_prio)
 {
        return 0;
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 002fcf8d3f64..dcdcad632fd9 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -7,7 +7,7 @@
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
-static inline void update_curr_rt(struct rq *rq, u64 now)
+static inline void update_curr_rt(struct rq *rq)
 {
        struct task_struct *curr = rq->curr;
        u64 delta_exec;
@@ -15,18 +15,17 @@ static inline void update_curr_rt(struct rq *rq, u64 now)
        if (!task_has_rt_policy(curr))
                return;
-        delta_exec = now - curr->se.exec_start;
+        delta_exec = rq->clock - curr->se.exec_start;
        if (unlikely((s64)delta_exec < 0))
                delta_exec = 0;
        schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
        curr->se.sum_exec_runtime += delta_exec;
-        curr->se.exec_start = now;
+        curr->se.exec_start = rq->clock;
 }
-static void
+static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
-enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 {
        struct rt_prio_array *array = &rq->rt.active;
@@ -37,12 +36,11 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 /*
 * Adding/removing a task to/from a priority array:
 */
-static void
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
-dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 {
        struct rt_prio_array *array = &rq->rt.active;
-        update_curr_rt(rq, now);
+        update_curr_rt(rq);
        list_del(&p->run_list);
        if (list_empty(array->queue + p->prio))
@@ -75,7 +73,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
                resched_task(rq->curr);
 }
-static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
+static struct task_struct *pick_next_task_rt(struct rq *rq)
 {
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
@@ -89,14 +87,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
        queue = array->queue + idx;
        next = list_entry(queue->next, struct task_struct, run_list);
-        next->se.exec_start = now;
+        next->se.exec_start = rq->clock;
        return next;
 }
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
-        update_curr_rt(rq, now);
+        update_curr_rt(rq);
        p->se.exec_start = 0;
 }
@@ -172,28 +170,15 @@ static struct task_struct *load_balance_next_rt(void *arg)
        return p;
 }
-static int
+static unsigned long
 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
                        unsigned long max_nr_move, unsigned long max_load_move,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                        int *all_pinned, unsigned long *load_moved)
+                        int *all_pinned, int *this_best_prio)
 {
-        int this_best_prio, best_prio, best_prio_seen = 0;
        int nr_moved;
        struct rq_iterator rt_rq_iterator;
+        unsigned long load_moved;
-        best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
-        this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
-        /*
-         * Enable handling of the case where there is more than one task
-         * with the best priority.   If the current running task is one
-         * of those with prio==best_prio we know it won't be moved
-         * and therefore it's safe to override the skip (based on load)
-         * of any task we find with that prio.
-         */
-        if (busiest->curr->prio == best_prio)
-                best_prio_seen = 1;
        rt_rq_iterator.start = load_balance_start_rt;
        rt_rq_iterator.next = load_balance_next_rt;
@@ -203,11 +188,10 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
        rt_rq_iterator.arg = busiest;
        nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
-                        max_load_move, sd, idle, all_pinned, load_moved,
+                        max_load_move, sd, idle, all_pinned, &load_moved,
-                        this_best_prio, best_prio, best_prio_seen,
+                        this_best_prio, &rt_rq_iterator);
-                        &rt_rq_iterator);
-        return nr_moved;
+        return load_moved;
 }
 static void task_tick_rt(struct rq *rq, struct task_struct *p)