Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt      |  15
-rw-r--r--  kernel/exit.c               |   2
-rw-r--r--  kernel/module.c             |  15
-rw-r--r--  kernel/power/Kconfig        |   2
-rw-r--r--  kernel/power/snapshot.c     |  41
-rw-r--r--  kernel/relay.c              |   5
-rw-r--r--  kernel/sched.c              |  58
-rw-r--r--  kernel/sched_debug.c        |   1
-rw-r--r--  kernel/sched_fair.c         | 277
-rw-r--r--  kernel/time/ntp.c           |  23
-rw-r--r--  kernel/time/tick-sched.c    |   2
-rw-r--r--  kernel/time/timekeeping.c   |   6
12 files changed, 297 insertions(+), 150 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0669b70fa6a3..9fdba03dc1fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,8 +52,23 @@ config PREEMPT
 
 endchoice
 
+config PREEMPT_RCU
+	bool "Preemptible RCU"
+	depends on PREEMPT
+	default n
+	help
+	  This option reduces the latency of the kernel by making certain
+	  RCU sections preemptible. Normally RCU code is non-preemptible, if
+	  this option is selected then read-only RCU sections become
+	  preemptible. This helps latency, but may expose bugs due to
+	  now-naive assumptions about each RCU read-side critical section
+	  remaining on a given CPU through its execution.
+
+	  Say N if you are unsure.
+
 config RCU_TRACE
 	bool "Enable tracing for RCU - currently stats in debugfs"
+	depends on PREEMPT_RCU
 	select DEBUG_FS
 	default y
 	help
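
Illustration (not part of the patch): the PREEMPT_RCU help text above talks about RCU read-side critical sections. A minimal sketch of such a section using the standard rcu_read_lock()/rcu_read_unlock()/rcu_dereference() API; the pointer and helper names are hypothetical:

	struct foo *p;

	rcu_read_lock();
	p = rcu_dereference(global_foo);	/* global_foo: hypothetical RCU-protected pointer */
	if (p)
		do_something_with(p);		/* hypothetical helper; must not block */
	rcu_read_unlock();

With PREEMPT_RCU=y this region may be preempted, so code inside it can no longer assume it stays on one CPU for its whole execution, which is exactly the class of bug the help text warns about.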
diff --git a/kernel/exit.c b/kernel/exit.c
index cd20bf07e9e3..53872bf993fa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1378,7 +1378,7 @@ unlock_sig:
 	if (!retval && infop)
 		retval = put_user(0, &infop->si_errno);
 	if (!retval && infop)
-		retval = put_user(why, &infop->si_code);
+		retval = put_user((short)why, &infop->si_code);
 	if (!retval && infop)
 		retval = put_user(exit_code, &infop->si_status);
 	if (!retval && infop)
diff --git a/kernel/module.c b/kernel/module.c
index be4807fb90e4..5d437bffd8dc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2178,10 +2178,20 @@ sys_init_module(void __user *umod,
 		wake_up(&module_wq);
 		return ret;
 	}
+	if (ret > 0) {
+		printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
+				    "it should follow 0/-E convention\n"
+		       KERN_WARNING "%s: loading module anyway...\n",
+		       __func__, mod->name, ret,
+		       __func__);
+		dump_stack();
+	}
 
-	/* Now it's a first class citizen! */
-	mutex_lock(&module_mutex);
+	/* Now it's a first class citizen! Wake up anyone waiting for it. */
 	mod->state = MODULE_STATE_LIVE;
+	wake_up(&module_wq);
+
+	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
 	unwind_remove_table(mod->unwind_info, 1);
@@ -2190,7 +2200,6 @@ sys_init_module(void __user *umod,
 	mod->init_size = 0;
 	mod->init_text_size = 0;
 	mutex_unlock(&module_mutex);
-	wake_up(&module_wq);
 
 	return 0;
 }
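
Illustration (not part of the patch): the warning added above fires when a module's init function returns a positive value; the "0/-E convention" means returning 0 on success or a negative errno on failure. A minimal sketch of a conforming init function (hypothetical module, names invented for illustration):

	static int __init example_init(void)
	{
		if (!example_setup_resources())	/* hypothetical helper */
			return -ENOMEM;		/* failure: negative errno */
		return 0;			/* success: exactly zero */
	}
	module_init(example_init);

The hunk also moves wake_up(&module_wq) to immediately after the module is marked MODULE_STATE_LIVE, rather than after the init sections have been freed under module_mutex.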
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 79833170bb9c..6233f3b4ae66 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -190,7 +190,7 @@ config APM_EMULATION
 	  notification of APM "events" (e.g. battery status change).
 
 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/pm.txt> and the
+	  and more information, read <file:Documentation/power/pm.txt> and the
 	  Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72a020cabb4c..5f91a07c4eac 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
  * of @bm->cur_zone_bm are updated.
  */
 
-static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 				void **addr, unsigned int *bit_nr)
 {
 	struct zone_bitmap *zone_bm;
@@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 		while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
 			zone_bm = zone_bm->next;
 
-			BUG_ON(!zone_bm);
+			if (!zone_bm)
+				return -EFAULT;
 		}
 		bm->cur.zone_bm = zone_bm;
 	}
@@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 	pfn -= bb->start_pfn;
 	*bit_nr = pfn % BM_BITS_PER_CHUNK;
 	*addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+	return 0;
 }
 
 static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	set_bit(bit, addr);
 }
 
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+{
+	void *addr;
+	unsigned int bit;
+	int error;
+
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	if (!error)
+		set_bit(bit, addr);
+	return error;
+}
+
 static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	clear_bit(bit, addr);
 }
 
501 519
@@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	return test_bit(bit, addr);
 }
 
@@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 				region->end_pfn << PAGE_SHIFT);
 
 		for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
-			if (pfn_valid(pfn))
-				memory_bm_set_bit(bm, pfn);
+			if (pfn_valid(pfn)) {
+				/*
+				 * It is safe to ignore the result of
+				 * mem_bm_set_bit_check() here, since we won't
+				 * touch the PFNs for which the error is
+				 * returned anyway.
+				 */
+				mem_bm_set_bit_check(bm, pfn);
+			}
 	}
 }
 
diff --git a/kernel/relay.c b/kernel/relay.c
index d080b9d161a7..4c035a8a248c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1066,7 +1066,7 @@ static int subbuf_splice_actor(struct file *in,
 			       unsigned int flags,
 			       int *nonpad_ret)
 {
-	unsigned int pidx, poff, total_len, subbuf_pages, ret;
+	unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret;
 	struct rchan_buf *rbuf = in->private_data;
 	unsigned int subbuf_size = rbuf->chan->subbuf_size;
 	uint64_t pos = (uint64_t) *ppos;
@@ -1097,8 +1097,9 @@ static int subbuf_splice_actor(struct file *in,
 	subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
 	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
 	poff = read_start & ~PAGE_MASK;
+	nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
 
-	for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
+	for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
 		unsigned int this_len, this_end, private;
 		unsigned int cur_pos = read_start + total_len;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 52b98675acb2..3f7c5eb254e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -301,7 +301,7 @@ struct cfs_rq {
 	/* 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr;
+	struct sched_entity *curr, *next;
 
 	unsigned long nr_spread_over;
 
@@ -1084,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	u64 tmp;
 
 	if (unlikely(!lw->inv_weight))
-		lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+		lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
 
 	tmp = (u64)delta_exec * weight;
 	/*
@@ -1108,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
+	lw->inv_weight = 0;
 }
 
 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 {
 	lw->weight -= dec;
+	lw->inv_weight = 0;
 }
 
 /*
@@ -1394,6 +1396,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
 	s64 delta;
 
+	/*
+	 * Buddy candidates are cache hot:
+	 */
+	if (&p->se == cfs_rq_of(&p->se)->next)
+		return 1;
+
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
@@ -1853,10 +1861,11 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_remote);
 	update_rq_clock(rq);
 	activate_task(rq, p, 1);
-	check_preempt_curr(rq, p);
 	success = 1;
 
 out_running:
+	check_preempt_curr(rq, p);
+
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -1890,6 +1899,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
+	p->se.last_wakeup = 0;
+	p->se.avg_overlap = 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start = 0;
@@ -4268,11 +4279,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
 
 	if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
@@ -4281,10 +4291,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	p->prio = prio;
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		enqueue_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4581,19 +4590,17 @@ recheck:
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		deactivate_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
 
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		activate_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -5881,7 +5888,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_unlock_irq(&rq->lock);
 		break;
 
-	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
+	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
@@ -7617,11 +7625,10 @@ void sched_move_task(struct task_struct *tsk)
 	running = task_current(rq, tsk);
 	on_rq = tsk->se.on_rq;
 
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, tsk, 0);
-		if (unlikely(running))
-			tsk->sched_class->put_prev_task(rq, tsk);
-	}
+	if (unlikely(running))
+		tsk->sched_class->put_prev_task(rq, tsk);
 
 	set_task_rq(tsk, task_cpu(tsk));
 
@@ -7630,11 +7637,10 @@ void sched_move_task(struct task_struct *tsk)
 		tsk->sched_class->moved_group(tsk);
 #endif
 
-	if (on_rq) {
-		if (unlikely(running))
-			tsk->sched_class->set_curr_task(rq);
+	if (unlikely(running))
+		tsk->sched_class->set_curr_task(rq);
+	if (on_rq)
 		enqueue_task(rq, tsk, 0);
-	}
 
 	task_rq_unlock(rq, &flags);
 }
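
Illustration (not part of the patch): calc_delta_mine() scales delta_exec by weight/lw->weight using a cached fixed-point reciprocal (lw->inv_weight), which is why update_load_add()/update_load_sub() above now zero inv_weight whenever the weight changes; the reciprocal is recomputed lazily on the next call. Roughly, with WMULT_CONST on the order of 2^32, the arithmetic behaves like the simplified sketch below (names invented; the kernel additionally guards against 64-bit overflow at this point):

	#define WMULT_SHIFT	32

	/* ~2^32 / weight, which the kernel caches in lw->inv_weight */
	static unsigned long inv_weight_of(unsigned long weight)
	{
		return ((1ULL << WMULT_SHIFT) - weight / 2) / (weight + 1);
	}

	/* delta_exec * weight / queue_weight, done as multiply + shift */
	static unsigned long long scale_delta(unsigned long long delta_exec,
					      unsigned long weight,
					      unsigned long queue_weight)
	{
		return (delta_exec * weight * inv_weight_of(queue_weight))
			>> WMULT_SHIFT;
	}

The hunk at old line 1087 changes only how that reciprocal is derived; the multiply-and-shift path itself is unchanged.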
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4a..ef358ba07683 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.exec_start);
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
+	PN(se.avg_overlap);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e2a530515619..b85cac4b5e25 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost)
+	if (leftmost) {
 		cfs_rq->rb_leftmost = &se->run_node;
+		/*
+		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
+		 * value tracking the leftmost vruntime in the tree.
+		 */
+		cfs_rq->min_vruntime =
+			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
+	}
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (cfs_rq->rb_leftmost == &se->run_node)
-		cfs_rq->rb_leftmost = rb_next(&se->run_node);
+	if (cfs_rq->rb_leftmost == &se->run_node) {
+		struct rb_node *next_node;
+		struct sched_entity *next;
+
+		next_node = rb_next(&se->run_node);
+		cfs_rq->rb_leftmost = next_node;
+
+		if (next_node) {
+			next = rb_entry(next_node,
+					struct sched_entity, run_node);
+			cfs_rq->min_vruntime =
+				max_vruntime(cfs_rq->min_vruntime,
+					     next->vruntime);
+		}
+	}
+
+	if (cfs_rq->next == se)
+		cfs_rq->next = NULL;
 
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
@@ -260,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 slice = __sched_period(cfs_rq->nr_running);
-
-	slice *= se->load.weight;
-	do_div(slice, cfs_rq->load.weight);
-
-	return slice;
+	return calc_delta_mine(__sched_period(cfs_rq->nr_running),
+			       se->load.weight, &cfs_rq->load);
 }
 
 /*
@@ -303,7 +322,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 			  unsigned long delta_exec)
 {
 	unsigned long delta_exec_weighted;
-	u64 vruntime;
 
 	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
 
@@ -315,19 +333,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 							&curr->load);
 	}
 	curr->vruntime += delta_exec_weighted;
-
-	/*
-	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-	 * value tracking the leftmost vruntime in the tree.
-	 */
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(curr->vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = curr->vruntime;
-
-	cfs_rq->min_vruntime =
-		max_vruntime(cfs_rq->min_vruntime, vruntime);
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
333static void update_curr(struct cfs_rq *cfs_rq) 338static void update_curr(struct cfs_rq *cfs_rq)
@@ -493,7 +498,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime;
 
-	vruntime = cfs_rq->min_vruntime;
+	if (first_fair(cfs_rq)) {
+		vruntime = min_vruntime(cfs_rq->min_vruntime,
+				__pick_next_entity(cfs_rq)->vruntime);
+	} else
+		vruntime = cfs_rq->min_vruntime;
 
 	if (sched_feat(TREE_AVG)) {
 		struct sched_entity *last = __pick_last_entity(cfs_rq);
@@ -515,8 +524,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			vruntime -= calc_delta_fair(sysctl_sched_latency,
+						    &cfs_rq->load);
+		}
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -545,6 +556,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	account_entity_enqueue(cfs_rq, se);
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (!se->last_wakeup)
+		return;
+
+	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+	se->last_wakeup = 0;
+}
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -555,6 +581,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
+		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -616,12 +643,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	s64 diff, gran;
+
+	if (!cfs_rq->next)
+		return se;
+
+	diff = cfs_rq->next->vruntime - se->vruntime;
+	if (diff < 0)
+		return se;
+
+	gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+	if (diff > gran)
+		return se;
+
+	return cfs_rq->next;
+}
+
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = NULL;
 
 	if (first_fair(cfs_rq)) {
 		se = __pick_next_entity(cfs_rq);
+		se = pick_next(cfs_rq, se);
 		set_next_entity(cfs_rq, se);
 	}
 
@@ -949,96 +996,121 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SMP
-static int select_task_rq_fair(struct task_struct *p, int sync)
+
+static const struct sched_class fair_sched_class;
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+	    int idx, unsigned long load, unsigned long this_load,
+	    unsigned int imbalance)
 {
-	int cpu, this_cpu;
-	struct rq *rq;
-	struct sched_domain *sd, *this_sd = NULL;
-	int new_cpu;
+	struct task_struct *curr = this_rq->curr;
+	unsigned long tl = this_load;
+	unsigned long tl_per_task;
+
+	if (!(this_sd->flags & SD_WAKE_AFFINE))
+		return 0;
+
+	/*
+	 * If the currently running task will sleep within
+	 * a reasonable amount of time then attract this newly
+	 * woken task:
+	 */
+	if (sync && curr->sched_class == &fair_sched_class) {
+		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+		    p->se.avg_overlap < sysctl_sched_migration_cost)
+			return 1;
+	}
+
+	schedstat_inc(p, se.nr_wakeups_affine_attempts);
+	tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync)
+		tl -= current->se.load.weight;
+
+	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+			100*(tl + p->se.load.weight) <= imbalance*load) {
+		/*
+		 * This domain has SD_WAKE_AFFINE and
+		 * p is cache cold in this domain, and
+		 * there is no bad imbalance.
+		 */
+		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(p, se.nr_wakeups_affine);
 
-	cpu = task_cpu(p);
-	rq = task_rq(p);
-	this_cpu = smp_processor_id();
-	new_cpu = cpu;
+		return 1;
+	}
+	return 0;
+}
 
-	if (cpu == this_cpu)
-		goto out_set_cpu;
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+	struct sched_domain *sd, *this_sd = NULL;
+	int prev_cpu, this_cpu, new_cpu;
+	unsigned long load, this_load;
+	struct rq *rq, *this_rq;
+	unsigned int imbalance;
+	int idx;
+
+	prev_cpu = task_cpu(p);
+	rq = task_rq(p);
+	this_cpu = smp_processor_id();
+	this_rq = cpu_rq(this_cpu);
+	new_cpu = prev_cpu;
 
+	/*
+	 * 'this_sd' is the first domain that both
+	 * this_cpu and prev_cpu are present in:
+	 */
 	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(cpu, sd->span)) {
+		if (cpu_isset(prev_cpu, sd->span)) {
 			this_sd = sd;
 			break;
 		}
 	}
 
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-		goto out_set_cpu;
+		goto out;
 
 	/*
 	 * Check for affine wakeup and passive balancing possibilities.
 	 */
-	if (this_sd) {
-		int idx = this_sd->wake_idx;
-		unsigned int imbalance;
-		unsigned long load, this_load;
-
-		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-		load = source_load(cpu, idx);
-		this_load = target_load(this_cpu, idx);
-
-		new_cpu = this_cpu; /* Wake to this CPU if we can */
-
-		if (this_sd->flags & SD_WAKE_AFFINE) {
-			unsigned long tl = this_load;
-			unsigned long tl_per_task;
-
-			/*
-			 * Attract cache-cold tasks on sync wakeups:
-			 */
-			if (sync && !task_hot(p, rq->clock, this_sd))
-				goto out_set_cpu;
-
-			schedstat_inc(p, se.nr_wakeups_affine_attempts);
-			tl_per_task = cpu_avg_load_per_task(this_cpu);
-
-			/*
-			 * If sync wakeup then subtract the (maximum possible)
-			 * effect of the currently running task from the load
-			 * of the current CPU:
-			 */
-			if (sync)
-				tl -= current->se.load.weight;
-
-			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= tl_per_task) ||
-			       100*(tl + p->se.load.weight) <= imbalance*load) {
-				/*
-				 * This domain has SD_WAKE_AFFINE and
-				 * p is cache cold in this domain, and
-				 * there is no bad imbalance.
-				 */
-				schedstat_inc(this_sd, ttwu_move_affine);
-				schedstat_inc(p, se.nr_wakeups_affine);
-				goto out_set_cpu;
-			}
-		}
+	if (!this_sd)
+		goto out;
 
-		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
-		 */
-		if (this_sd->flags & SD_WAKE_BALANCE) {
-			if (imbalance*this_load <= 100*load) {
-				schedstat_inc(this_sd, ttwu_move_balance);
-				schedstat_inc(p, se.nr_wakeups_passive);
-				goto out_set_cpu;
-			}
+	idx = this_sd->wake_idx;
+
+	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+	load = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
+
+	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+			load, this_load, imbalance))
+		return this_cpu;
+
+	if (prev_cpu == this_cpu)
+		goto out;
+
+	/*
+	 * Start passive balancing when half the imbalance_pct
+	 * limit is reached.
+	 */
+	if (this_sd->flags & SD_WAKE_BALANCE) {
+		if (imbalance*this_load <= 100*load) {
+			schedstat_inc(this_sd, ttwu_move_balance);
+			schedstat_inc(p, se.nr_wakeups_passive);
+			return this_cpu;
 		}
 	}
 
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
+out:
 	return wake_idle(new_cpu, p);
 }
 #endif /* CONFIG_SMP */
@@ -1060,6 +1132,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		resched_task(curr);
 		return;
 	}
+
+	se->last_wakeup = se->sum_exec_runtime;
+	if (unlikely(se == pse))
+		return;
+
+	cfs_rq_of(pse)->next = pse;
+
 	/*
 	 * Batch tasks do not preempt (their preemption is driven by
 	 * the tick):
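
Illustration (not part of the patch): two of the additions above interact. update_avg() maintains se->avg_overlap as an exponential moving average (weight 1/8, the >> 3) of how much runtime a task accumulates between waking another task (last_wakeup is stamped in check_preempt_wakeup()) and going to sleep itself, and wake_affine() then pulls a sync wakeup onto this CPU when both tasks' avg_overlap is below sysctl_sched_migration_cost (500000 ns above), i.e. when waker and wakee barely overlap. A small standalone program showing how quickly that average tracks recent run lengths (same arithmetic as update_avg(); sample values are arbitrary):

	#include <stdio.h>

	/* Same arithmetic as the kernel's update_avg(): avg += (sample - avg)/8,
	 * relying on arithmetic right shift of negative differences, as the
	 * kernel does with s64. */
	static void update_avg(long long *avg, long long sample)
	{
		*avg += (sample - *avg) >> 3;
	}

	int main(void)
	{
		long long avg = 0;
		long long samples[] = { 800000, 800000, 800000, 50000, 50000 };
		int i;

		for (i = 0; i < 5; i++) {
			update_avg(&avg, samples[i]);
			printf("sample %lld ns -> avg_overlap %lld ns\n",
			       samples[i], avg);
		}
		return 0;
	}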
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c88b5910e7ab..5fd9b9469770 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
 long time_freq;		/* frequency offset (scaled ppm)*/
 static long time_reftime;	/* time at last adjustment (s) */
 long time_adjust;
+static long ntp_tick_adj;
 
 static void ntp_update_frequency(void)
 {
 	u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
 				<< TICK_LENGTH_SHIFT;
-	second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+	second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
 	second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
 
 	tick_length_base = second_length;
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc)
 		freq_adj = shift_right(freq_adj, time_constant * 2 +
 				       (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
 		if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+			u64 utemp64;
 			temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
 			if (time_offset < 0) {
-				temp64 = -temp64;
-				do_div(temp64, mtemp);
-				freq_adj -= temp64;
+				utemp64 = -temp64;
+				do_div(utemp64, mtemp);
+				freq_adj -= utemp64;
 			} else {
-				do_div(temp64, mtemp);
-				freq_adj += temp64;
+				utemp64 = temp64;
+				do_div(utemp64, mtemp);
+				freq_adj += utemp64;
 			}
 		}
 		freq_adj += time_freq;
355 freq_adj += time_freq; 358 freq_adj += time_freq;
@@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	notify_cmos_timer();
 	return(result);
 }
+
+static int __init ntp_tick_adj_setup(char *str)
+{
+	ntp_tick_adj = simple_strtol(str, NULL, 0);
+	return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
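
Usage note (not part of the patch): the new ntp_tick_adj boot parameter replaces the compile-time CLOCK_TICK_ADJUST term in ntp_update_frequency(); it is parsed with simple_strtol() with base 0, so it accepts a signed decimal, octal, or 0x-prefixed value, shifted by TICK_LENGTH_SHIFT just as CLOCK_TICK_ADJUST was. For example (value arbitrary, for illustration only), booting with

	ntp_tick_adj=500

on the kernel command line adds that constant to the computed tick length each time ntp_update_frequency() runs.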
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2968298f8f36..686da821d376 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -640,7 +640,7 @@ void tick_cancel_sched_timer(int cpu)
 
 	if (ts->sched_timer.base)
 		hrtimer_cancel(&ts->sched_timer);
-	ts->tick_stopped = 0;
+
 	ts->nohz_mode = NOHZ_MODE_INACTIVE;
 }
 #endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1af9fb050fe2..671af612b768 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -187,8 +187,7 @@ static void change_clocksource(void)
 
 	clock->error = 0;
 	clock->xtime_nsec = 0;
-	clocksource_calculate_interval(clock,
-		(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
 
 	tick_clock_notify();
 
@@ -245,8 +244,7 @@ void __init timekeeping_init(void)
 	ntp_clear();
 
 	clock = clocksource_get_next();
-	clocksource_calculate_interval(clock,
-		(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
 	clock->cycle_last = clocksource_read(clock);
 
 	xtime.tv_sec = sec;