22 files changed, 507 insertions, 277 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 972f8e61d36a..59cedfb040e7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,10 +243,11 @@ static inline int open_arg(int flags, int mask)
 static int audit_match_perm(struct audit_context *ctx, int mask)
 {
+        unsigned n;
        if (unlikely(!ctx))
                return 0;
-        unsigned n = ctx->major;
+        n = ctx->major;
        switch (audit_classify_syscall(ctx->arch, n)) {
        case 0: /* native */
                if ((mask & AUDIT_PERM_WRITE) &&
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..f227bc172690 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
+ *  2008 Rework of the scheduler domains and CPU hotplug handling
+ *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(callback_mutex);
-/* This is ugly, but preserves the userspace API for existing cpuset
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead */
+ * silently switch it to mount "cgroup" instead
+ */
 static int cpuset_get_sb(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 }
 /*
- * Helper routine for rebuild_sched_domains().
+ * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
        return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 }
 /*
- * rebuild_sched_domains()
+ * generate_sched_domains()
- *
+ *
- * This routine will be called to rebuild the scheduler's dynamic
+ * This function builds a partial partition of the system's CPUs.
- * sched domains:
+ * A 'partial partition' is a set of non-overlapping subsets whose
- * - if the flag 'sched_load_balance' of any cpuset with non-empty
+ * union is a subset of that set.
- *   'cpus' changes,
+ * The output of this function needs to be passed to kernel/sched.c
- * - or if the 'cpus' allowed changes in any cpuset which has that
+ * partition_sched_domains() routine, which will rebuild the scheduler's
- *   flag enabled,
+ * load balancing domains (sched domains) as specified by that partial
- * - or if the 'sched_relax_domain_level' of any cpuset which has
+ * partition.
- *   that flag enabled and with non-empty 'cpus' changes,
- * - or if any cpuset with non-empty 'cpus' is removed,
- * - or if a cpu gets offlined.
- *
- * This routine builds a partial partition of the systems CPUs
- * (the set of non-overlappping cpumask_t's in the array 'part'
- * below), and passes that partial partition to the kernel/sched.c
- * partition_sched_domains() routine, which will rebuild the
- * schedulers load balancing domains (sched domains) as specified
- * by that partial partition.  A 'partial partition' is a set of
- * non-overlapping subsets whose union is a subset of that set.
 *
 * See "What is sched_load_balance" in Documentation/cpusets.txt
 * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
- * Call with cgroup_mutex held.  May take callback_mutex during
+ * Must be called with cgroup_lock held.
- * call due to the kfifo_alloc() and kmalloc() calls.  May nest
- * a call to the get_online_cpus()/put_online_cpus() pair.
- * Must not be called holding callback_mutex, because we must not
- * call get_online_cpus() while holding callback_mutex.  Elsewhere
- * the kernel nests callback_mutex inside get_online_cpus() calls.
- * So the reverse nesting would risk an ABBA deadlock.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 *      element of the partition (one sched domain) to be passed to
 *      partition_sched_domains().
 */
+static int generate_sched_domains(cpumask_t **domains,
-void rebuild_sched_domains(void)
+                        struct sched_domain_attr **attributes)
 {
-        LIST_HEAD(q);           /* queue of cpusets to be scanned*/
+        LIST_HEAD(q);           /* queue of cpusets to be scanned */
        struct cpuset *cp;      /* scans q */
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
        int ndoms;              /* number of sched domains in result */
        int nslot;              /* next empty doms[] cpumask_t slot */
-        csa = NULL;
+        ndoms = 0;
        doms = NULL;
        dattr = NULL;
+        csa = NULL;
        /* Special case for the 99% of systems with one, full, sched domain */
        if (is_sched_load_balance(&top_cpuset)) {
-                ndoms = 1;
                doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
                if (!doms)
-                        goto rebuild;
+                        goto done;
                dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
                *doms = top_cpuset.cpus_allowed;
-                goto rebuild;
+                ndoms = 1;
+                goto done;
        }
        csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
                }
        }
-        /* Convert <csn, csa> to <ndoms, doms> */
+        /*
+         * Now we know how many domains to create.
+         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+         */
        doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
-        if (!doms)
+        if (!doms) {
-                goto rebuild;
+                ndoms = 0;
+                goto done;
+        }
+        /*
+         * The rest of the code, including the scheduler, can deal with
+         * dattr==NULL case. No need to abort if alloc fails.
+         */
        dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
        for (nslot = 0, i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
+                cpumask_t *dp;
                int apn = a->pn;
-                if (apn >= 0) {
+                if (apn < 0) {
-                        cpumask_t *dp = doms + nslot;
+                        /* Skip completed partitions */
+                        continue;
-                        if (nslot == ndoms) {
+                }
-                                static int warnings = 10;
-                                if (warnings) {
+                dp = doms + nslot;
-                                        printk(KERN_WARNING
-                                         "rebuild_sched_domains confused:"
+                if (nslot == ndoms) {
-                                          " nslot %d, ndoms %d, csn %d, i %d,"
+                        static int warnings = 10;
-                                          " apn %d\n",
+                        if (warnings) {
-                                          nslot, ndoms, csn, i, apn);
+                                printk(KERN_WARNING
-                                        warnings--;
+                                 "rebuild_sched_domains confused:"
-                                }
+                                  " nslot %d, ndoms %d, csn %d, i %d,"
-                                continue;
+                                  " apn %d\n",
+                                  nslot, ndoms, csn, i, apn);
+                                warnings--;
                        }
+                        continue;
+                }
-                        cpus_clear(*dp);
+                cpus_clear(*dp);
-                        if (dattr)
+                if (dattr)
-                                *(dattr + nslot) = SD_ATTR_INIT;
+                        *(dattr + nslot) = SD_ATTR_INIT;
-                        for (j = i; j < csn; j++) {
+                for (j = i; j < csn; j++) {
-                                struct cpuset *b = csa[j];
+                        struct cpuset *b = csa[j];
-                                if (apn == b->pn) {
+                        if (apn == b->pn) {
-                                        cpus_or(*dp, *dp, b->cpus_allowed);
+                                cpus_or(*dp, *dp, b->cpus_allowed);
-                                        b->pn = -1;
+                                if (dattr)
-                                        if (dattr)
+                                        update_domain_attr_tree(dattr + nslot, b);
-                                                update_domain_attr_tree(dattr
-                                                                   + nslot, b);
+                                /* Done with this partition */
-                                }
+                                b->pn = -1;
                        }
-                        nslot++;
                }
+                nslot++;
        }
        BUG_ON(nslot != ndoms);
-rebuild:
+done:
-        /* Have scheduler rebuild sched domains */
+        kfree(csa);
+        *domains    = doms;
+        *attributes = dattr;
+        return ndoms;
+}
+/*
+ * Rebuild scheduler domains.
+ *
+ * Call with neither cgroup_mutex held nor within get_online_cpus().
+ * Takes both cgroup_mutex and get_online_cpus().
+ *
+ * Cannot be directly called from cpuset code handling changes
+ * to the cpuset pseudo-filesystem, because it cannot be called
+ * from code that already holds cgroup_mutex.
+ */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+        struct sched_domain_attr *attr;
+        cpumask_t *doms;
+        int ndoms;
        get_online_cpus();
-        partition_sched_domains(ndoms, doms, dattr);
+        /* Generate domain masks and attrs */
+        cgroup_lock();
+        ndoms = generate_sched_domains(&doms, &attr);
+        cgroup_unlock();
+        /* Have scheduler rebuild the domains */
+        partition_sched_domains(ndoms, doms, attr);
        put_online_cpus();
+}
-done:
+static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-        kfree(csa);
-        /* Don't kfree(doms) -- partition_sched_domains() does that. */
+/*
-        /* Don't kfree(dattr) -- partition_sched_domains() does that. */
+ * Rebuild scheduler domains, asynchronously via workqueue.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * The rebuild_sched_domains() and partition_sched_domains()
+ * routines must nest cgroup_lock() inside get_online_cpus(),
+ * but such cpuset changes as these must nest that locking the
+ * other way, holding cgroup_lock() for much of the code.
+ *
+ * So in order to avoid an ABBA deadlock, the cpuset code handling
+ * these user changes delegates the actual sched domain rebuilding
+ * to a separate workqueue thread, which ends up processing the
+ * above do_rebuild_sched_domains() function.
+ */
+static void async_rebuild_sched_domains(void)
+{
+        schedule_work(&rebuild_sched_domains_work);
+}
+/*
+ * Accomplishes the same scheduler domain rebuild as the above
+ * async_rebuild_sched_domains(), however it directly calls the
+ * rebuild routine synchronously rather than calling it via an
+ * asynchronous work thread.
+ *
+ * This can only be called from code that is not holding
+ * cgroup_mutex (not nested in a cgroup_lock() call.)
+ */
+void rebuild_sched_domains(void)
+{
+        do_rebuild_sched_domains(NULL);
 }
 /**
@@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
                return retval;
        if (is_load_balanced)
-                rebuild_sched_domains();
+                async_rebuild_sched_domains();
        return 0;
 }
@@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
                if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
-                        rebuild_sched_domains();
+                        async_rebuild_sched_domains();
        }
        return 0;
@@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
        mutex_unlock(&callback_mutex);
        if (cpus_nonempty && balance_flag_changed)
-                rebuild_sched_domains();
+                async_rebuild_sched_domains();
        return 0;
 }
@@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
        default:
                BUG();
        }
+        /* Unreachable but makes gcc happy */
+        return 0;
 }
 static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
        default:
                BUG();
        }
+        /* Unreachable but makes gcc happy */
+        return 0;
 }
@@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create(
 }
 /*
- * Locking note on the strange update_flag() call below:
- *
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains().  The get_online_cpus()
+ * will call async_rebuild_sched_domains().
- * call in rebuild_sched_domains() must not be made while holding
- * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
- * get_online_cpus() calls.  So the reverse nesting would risk an
- * ABBA deadlock.
 */
 static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 struct cgroup_subsys cpuset_subsys = {
        .name = "cpuset",
        .create = cpuset_create,
-        .destroy  = cpuset_destroy,
+        .destroy = cpuset_destroy,
        .can_attach = cpuset_can_attach,
        .attach = cpuset_attach,
        .populate = cpuset_populate,
@@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 }
 /*
- * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1903,35 +1972,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 }
 /*
- * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug event.
- *
- * Since there are two callers of this routine, one for CPU hotplug
- * events and one for memory node hotplug events, we could have coded
- * two separate routines here.  We code it as a single common routine
- * in order to minimize text size.
- */
-static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
-{
-        cgroup_lock();
-        top_cpuset.cpus_allowed = cpu_online_map;
-        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-        scan_for_empty_cpusets(&top_cpuset);
-        /*
-         * Scheduler destroys domains on hotplug events.
-         * Rebuild them based on the current settings.
-         */
-        if (rebuild_sd)
-                rebuild_sched_domains();
-        cgroup_unlock();
-}
-/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no affect) on systems that are actively using CPU hotplug
@@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_online_map on each CPU hotplug (cpuhp) event.
+ *
+ * Called within get_online_cpus().  Needs to call cgroup_lock()
+ * before calling generate_sched_domains().
 */
+static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
                                unsigned long phase, void *unused_cpu)
 {
+        struct sched_domain_attr *attr;
+        cpumask_t *doms;
+        int ndoms;
        switch (phase) {
-        case CPU_UP_CANCELED:
-        case CPU_UP_CANCELED_FROZEN:
-        case CPU_DOWN_FAILED:
-        case CPU_DOWN_FAILED_FROZEN:
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-                common_cpu_mem_hotplug_unplug(1);
                break;
        default:
                return NOTIFY_DONE;
        }
+        cgroup_lock();
+        top_cpuset.cpus_allowed = cpu_online_map;
+        scan_for_empty_cpusets(&top_cpuset);
+        ndoms = generate_sched_domains(&doms, &attr);
+        cgroup_unlock();
+        /* Have scheduler rebuild the domains */
+        partition_sched_domains(ndoms, doms, attr);
        return NOTIFY_OK;
 }
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after you change
+ * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * node_states[N_HIGH_MEMORY].
+ * See also the previous routine cpuset_track_online_cpus().
- * See also the previous routine cpuset_handle_cpuhp().
 */
 void cpuset_track_online_nodes(void)
 {
-        common_cpu_mem_hotplug_unplug(0);
+        cgroup_lock();
+        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+        scan_for_empty_cpusets(&top_cpuset);
+        cgroup_unlock();
 }
 #endif
@@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void)
        top_cpuset.cpus_allowed = cpu_online_map;
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-        hotcpu_notifier(cpuset_handle_cpuhp, 0);
+        hotcpu_notifier(cpuset_track_online_cpus, 0);
 }
 /**
diff --git a/kernel/exit.c b/kernel/exit.c
index 38ec40630149..16395644a98f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,9 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
                 * We won't ever get here for the group leader, since it
                 * will have been the last reference on the signal_struct.
                 */
-                sig->utime = cputime_add(sig->utime, tsk->utime);
+                sig->utime = cputime_add(sig->utime, task_utime(tsk));
-                sig->stime = cputime_add(sig->stime, tsk->stime);
+                sig->stime = cputime_add(sig->stime, task_stime(tsk));
-                sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+                sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
                sig->min_flt += tsk->min_flt;
                sig->maj_flt += tsk->maj_flt;
                sig->nvcsw += tsk->nvcsw;
@@ -831,26 +831,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 * the child reaper process (ie "init") in our pid
 * space.
 */
+static struct task_struct *find_new_reaper(struct task_struct *father)
+{
+        struct pid_namespace *pid_ns = task_active_pid_ns(father);
+        struct task_struct *thread;
+        thread = father;
+        while_each_thread(father, thread) {
+                if (thread->flags & PF_EXITING)
+                        continue;
+                if (unlikely(pid_ns->child_reaper == father))
+                        pid_ns->child_reaper = thread;
+                return thread;
+        }
+        if (unlikely(pid_ns->child_reaper == father)) {
+                write_unlock_irq(&tasklist_lock);
+                if (unlikely(pid_ns == &init_pid_ns))
+                        panic("Attempted to kill init!");
+                zap_pid_ns_processes(pid_ns);
+                write_lock_irq(&tasklist_lock);
+                /*
+                 * We can not clear ->child_reaper or leave it alone.
+                 * There may be stealth EXIT_DEAD tasks on ->children,
+                 * forget_original_parent() must move them somewhere.
+                 */
+                pid_ns->child_reaper = init_pid_ns.child_reaper;
+        }
+        return pid_ns->child_reaper;
+}
 static void forget_original_parent(struct task_struct *father)
 {
-        struct task_struct *p, *n, *reaper = father;
+        struct task_struct *p, *n, *reaper;
        LIST_HEAD(ptrace_dead);
        write_lock_irq(&tasklist_lock);
+        reaper = find_new_reaper(father);
        /*
         * First clean up ptrace if we were using it.
         */
        ptrace_exit(father, &ptrace_dead);
-        do {
-                reaper = next_thread(reaper);
-                if (reaper == father) {
-                        reaper = task_child_reaper(father);
-                        break;
-                }
-        } while (reaper->flags & PF_EXITING);
        list_for_each_entry_safe(p, n, &father->children, sibling) {
                p->real_parent = reaper;
                if (p->parent == father) {
@@ -918,8 +942,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
        /* mt-exec, de_thread() is waiting for us */
        if (thread_group_leader(tsk) &&
-            tsk->signal->notify_count < 0 &&
+            tsk->signal->group_exit_task &&
-            tsk->signal->group_exit_task)
+            tsk->signal->notify_count < 0)
                wake_up_process(tsk->signal->group_exit_task);
        write_unlock_irq(&tasklist_lock);
@@ -959,39 +983,6 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif
-static inline void exit_child_reaper(struct task_struct *tsk)
-{
-        if (likely(tsk->group_leader != task_child_reaper(tsk)))
-                return;
-        if (tsk->nsproxy->pid_ns == &init_pid_ns)
-                panic("Attempted to kill init!");
-        /*
-         * @tsk is the last thread in the 'cgroup-init' and is exiting.
-         * Terminate all remaining processes in the namespace and reap them
-         * before exiting @tsk.
-         *
-         * Note that @tsk (last thread of cgroup-init) may not necessarily
-         * be the child-reaper (i.e main thread of cgroup-init) of the
-         * namespace i.e the child_reaper may have already exited.
-         *
-         * Even after a child_reaper exits, we let it inherit orphaned children,
-         * because, pid_ns->child_reaper remains valid as long as there is
-         * at least one living sub-thread in the cgroup init.
-         * This living sub-thread of the cgroup-init will be notified when
-         * a child inherited by the 'child-reaper' exits (do_notify_parent()
-         * uses __group_send_sig_info()). Further, when reaping child processes,
-         * do_wait() iterates over children of all living sub threads.
-         * i.e even though 'child_reaper' thread is listed as the parent of the
-         * orphaned children, any living sub-thread in the cgroup-init can
-         * perform the role of the child_reaper.
-         */
-        zap_pid_ns_processes(tsk->nsproxy->pid_ns);
-}
 NORET_TYPE void do_exit(long code)
 {
        struct task_struct *tsk = current;
@@ -1051,7 +1042,6 @@ NORET_TYPE void do_exit(long code)
        }
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
-                exit_child_reaper(tsk);
                hrtimer_cancel(&tsk->signal->real_timer);
                exit_itimers(tsk->signal);
        }
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3bfb1877a003..dbda475b13bd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -875,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
        if (!entry)
                return 0;
-        entry->class = this;
-        entry->distance = distance;
        if (!save_trace(&entry->trace))
                return 0;
+        entry->class = this;
+        entry->distance = distance;
        /*
         * Since we never remove from the dependency list, the list can
         * be walked lockless by other CPUs, it's only allocation
@@ -3029,7 +3029,7 @@ found_it:
        stats = get_lock_stats(hlock_class(hlock));
        if (point < ARRAY_SIZE(stats->contention_point))
-                stats->contention_point[i]++;
+                stats->contention_point[point]++;
        if (lock->cpu != smp_processor_id())
                stats->bounces[bounce_contended + !!hlock->read]++;
        put_lock_stats(stats);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 4b194d34d77f..20dbcbf9c7dd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -472,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
 {
        unsigned long rem;
+        nr += 5; /* for display rounding */
        rem = do_div(nr, 1000); /* XXX: do_div_signed */
-        snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10);
+        snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
 }
 static void seq_time(struct seq_file *m, s64 time)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ea567b78d1aa..fab8ea86fac3 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -179,9 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
                rc = sys_wait4(-1, NULL, __WALL, NULL);
        } while (rc != -ECHILD);
-        /* Child reaper for the pid namespace is going away */
-        pid_ns->child_reaper = NULL;
        acct_exit_ns(pid_ns);
        return;
 }
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index da9c2dda6a4e..dfdec524d1b7 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -43,7 +43,7 @@
 #include <linux/uaccess.h>
 /*
- * locking rule: all changes to target_value or requirements or notifiers lists
+ * locking rule: all changes to requirements or notifiers lists
 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
 * held, taken with _irqsave.  One lock to rule them all
 */
@@ -66,7 +66,7 @@ struct pm_qos_object {
        struct miscdevice pm_qos_power_miscdev;
        char *name;
        s32 default_value;
-        s32 target_value;
+        atomic_t target_value;
        s32 (*comparitor)(s32, s32);
 };
@@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
        .notifiers = &cpu_dma_lat_notifier,
        .name = "cpu_dma_latency",
        .default_value = 2000 * USEC_PER_SEC,
-        .target_value = 2000 * USEC_PER_SEC,
+        .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
        .comparitor = min_compare
 };
@@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = {
        .notifiers = &network_lat_notifier,
        .name = "network_latency",
        .default_value = 2000 * USEC_PER_SEC,
-        .target_value = 2000 * USEC_PER_SEC,
+        .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
        .comparitor = min_compare
 };
@@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = {
        .notifiers = &network_throughput_notifier,
        .name = "network_throughput",
        .default_value = 0,
-        .target_value = 0,
+        .target_value = ATOMIC_INIT(0),
        .comparitor = max_compare
 };
@@ -150,11 +150,11 @@ static void update_target(int target)
                extreme_value = pm_qos_array[target]->comparitor(
                                extreme_value, node->value);
        }
-        if (pm_qos_array[target]->target_value != extreme_value) {
+        if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
                call_notifier = 1;
-                pm_qos_array[target]->target_value = extreme_value;
+                atomic_set(&pm_qos_array[target]->target_value, extreme_value);
                pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
-                        pm_qos_array[target]->target_value);
+                        atomic_read(&pm_qos_array[target]->target_value));
        }
        spin_unlock_irqrestore(&pm_qos_lock, flags);
@@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor)
 */
 int pm_qos_requirement(int pm_qos_class)
 {
-        int ret_val;
+        return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
-        unsigned long flags;
-        spin_lock_irqsave(&pm_qos_lock, flags);
-        ret_val = pm_qos_array[pm_qos_class]->target_value;
-        spin_unlock_irqrestore(&pm_qos_lock, flags);
-        return ret_val;
 }
 EXPORT_SYMBOL_GPL(pm_qos_requirement);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f011e0870b52..bbd85c60f741 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -21,6 +21,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <linux/ftrace.h>
 #include "power.h"
@@ -255,7 +256,7 @@ static int create_image(int platform_mode)
 int hibernation_snapshot(int platform_mode)
 {
-        int error;
+        int error, ftrace_save;
        /* Free memory before shutting down devices. */
        error = swsusp_shrink_memory();
@@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode)
                goto Close;
        suspend_console();
+        ftrace_save = __ftrace_enabled_save();
        error = device_suspend(PMSG_FREEZE);
        if (error)
                goto Recover_platform;
@@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode)
 Resume_devices:
        device_resume(in_suspend ?
                (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+        __ftrace_enabled_restore(ftrace_save);
        resume_console();
 Close:
        platform_end(platform_mode);
@@ -366,10 +369,11 @@ static int resume_target_kernel(void)
 int hibernation_restore(int platform_mode)
 {
-        int error;
+        int error, ftrace_save;
        pm_prepare_console();
        suspend_console();
+        ftrace_save = __ftrace_enabled_save();
        error = device_suspend(PMSG_QUIESCE);
        if (error)
                goto Finish;
@@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode)
        platform_restore_cleanup(platform_mode);
        device_resume(PMSG_RECOVER);
 Finish:
+        __ftrace_enabled_restore(ftrace_save);
        resume_console();
        pm_restore_console();
        return error;
@@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode)
 int hibernation_platform_enter(void)
 {
-        int error;
+        int error, ftrace_save;
        if (!hibernation_ops)
                return -ENOSYS;
@@ -411,6 +416,7 @@ int hibernation_platform_enter(void)
                goto Close;
        suspend_console();
+        ftrace_save = __ftrace_enabled_save();
        error = device_suspend(PMSG_HIBERNATE);
        if (error) {
                if (hibernation_ops->recover)
@@ -445,6 +451,7 @@ int hibernation_platform_enter(void)
        hibernation_ops->finish();
 Resume_devices:
        device_resume(PMSG_RESTORE);
+        __ftrace_enabled_restore(ftrace_save);
        resume_console();
 Close:
        hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0b7476f5d2a6..540b16b68565 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,6 +21,7 @@
 #include <linux/freezer.h>
 #include <linux/vmstat.h>
 #include <linux/syscalls.h>
+#include <linux/ftrace.h>
 #include "power.h"
@@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state)
 */
 int suspend_devices_and_enter(suspend_state_t state)
 {
-        int error;
+        int error, ftrace_save;
        if (!suspend_ops)
                return -ENOSYS;
@@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state)
                        goto Close;
        }
        suspend_console();
+        ftrace_save = __ftrace_enabled_save();
        suspend_test_start();
        error = device_suspend(PMSG_SUSPEND);
        if (error) {
@@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state)
        suspend_test_start();
        device_resume(PMSG_RESUME);
        suspend_test_finish("resume devices");
+        __ftrace_enabled_restore(ftrace_save);
        resume_console();
 Close:
        if (suspend_ops->end)
diff --git a/kernel/resource.c b/kernel/resource.c
index f5b518eabefe..03d796c1b2e9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new,
 EXPORT_SYMBOL(allocate_resource);
-/**
+/*
- * insert_resource - Inserts a resource in the resource tree
+ * Insert a resource into the resource tree. If successful, return NULL,
- * @parent: parent of the new resource
+ * otherwise return the conflicting resource (compare to __request_resource())
- * @new: new resource to insert
- *
- * Returns 0 on success, -EBUSY if the resource can't be inserted.
- *
- * This function is equivalent to request_resource when no conflict
- * happens. If a conflict happens, and the conflicting resources
- * entirely fit within the range of the new resource, then the new
- * resource is inserted and the conflicting resources become children of
- * the new resource.
 */
-int insert_resource(struct resource *parent, struct resource *new)
+static struct resource * __insert_resource(struct resource *parent, struct resource *new)
 {
-        int result;
        struct resource *first, *next;
-        write_lock(&resource_lock);
        for (;; parent = first) {
-                result = 0;
                first = __request_resource(parent, new);
                if (!first)
-                        goto out;
+                        return first;
-                result = -EBUSY;
                if (first == parent)
-                        goto out;
+                        return first;
                if ((first->start > new->start) || (first->end < new->end))
                        break;
@@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new)
        for (next = first; ; next = next->sibling) {
                /* Partial overlap? Bad, and unfixable */
                if (next->start < new->start || next->end > new->end)
-                        goto out;
+                        return next;
                if (!next->sibling)
                        break;
                if (next->sibling->start > new->end)
                        break;
        }
-        result = 0;
        new->parent = parent;
        new->sibling = next->sibling;
        new->child = first;
@@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new)
                        next = next->sibling;
                next->sibling = new;
        }
+        return NULL;
+}
- out:
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ *
+ * This function is equivalent to request_resource when no conflict
+ * happens. If a conflict happens, and the conflicting resources
+ * entirely fit within the range of the new resource, then the new
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+        struct resource *clash;
+        /* __insert_resource() is called with resource_lock held for writing */
+        write_lock(&resource_lock);
+        clash = __insert_resource(parent, new);
+        write_unlock(&resource_lock);
+        /* Callers only learn whether a conflict existed, not which one */
+        if (clash)
+                return -EBUSY;
+        return 0;
+}
+/**
+ * insert_resource_expand_to_fit - Insert a resource into the resource tree
+ * @root: root resource descriptor
+ * @new: new resource to insert
+ *
+ * Insert a resource into the resource tree, possibly expanding it in order
+ * to make it encompass any conflicting resources.
+ *
+ * Nothing is done if @new already has a parent. If the reported conflict
+ * is @root itself, the insertion is abandoned without expanding @new; the
+ * function returns no status, so the caller is not told about this.
+ */
+void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
+{
+        if (new->parent)
+                return;
+        write_lock(&resource_lock);
+        for (;;) {
+                struct resource *conflict;
+                conflict = __insert_resource(root, new);
+                if (!conflict)
+                        break;
+                /* Growing @new cannot resolve a conflict with the root */
+                if (conflict == root)
+                        break;
+                /* Ok, expand resource to cover the conflict, then try again .. */
+                if (conflict->start < new->start)
+                        new->start = conflict->start;
+                if (conflict->end > new->end)
+                        new->end = conflict->end;
+                printk(KERN_INFO "Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+        }
         write_unlock(&resource_lock);
-        return result;
 }
 /**
diff --git a/kernel/sched.c b/kernel/sched.c
index 9a1ddb84e26d..cc1f81b50b82 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4179,6 +4179,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 }
 /*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+/* CONFIG_VIRT_CPU_ACCOUNTING: report the accumulated p->utime unmodified. */
+cputime_t task_utime(struct task_struct *p)
+{
+        return p->utime;
+}
+/* CONFIG_VIRT_CPU_ACCOUNTING: report the accumulated p->stime unmodified. */
+cputime_t task_stime(struct task_struct *p)
+{
+        return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+        clock_t user_ticks = cputime_to_clock_t(p->utime);
+        clock_t all_ticks = user_ticks + cputime_to_clock_t(p->stime);
+        /*
+         * Start from the runtime accumulated by CFS and scale it by the
+         * utime / (utime + stime) ratio of the sampled tick counters.
+         * With no ticks sampled yet the runtime is used unscaled.
+         */
+        u64 scaled = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+        if (all_ticks != 0) {
+                scaled *= user_ticks;
+                do_div(scaled, all_ticks);
+        }
+        user_ticks = (clock_t)scaled;
+        /* Never report less than what was reported before */
+        p->prev_utime = max(p->prev_utime, clock_t_to_cputime(user_ticks));
+        return p->prev_utime;
+}
+cputime_t task_stime(struct task_struct *p)
+{
+        /*
+         * System time is whatever remains of the CFS runtime once the
+         * user time reported by task_utime() has been subtracted, so
+         * that the sum seen by userspace grows monotonically - apps
+         * rely on that.
+         */
+        clock_t sys_ticks = nsec_to_clock_t(p->se.sum_exec_runtime) -
+                        cputime_to_clock_t(task_utime(p));
+        /* A negative remainder leaves the previously reported value alone */
+        if (sys_ticks < 0)
+                return p->prev_stime;
+        p->prev_stime = max(p->prev_stime, clock_t_to_cputime(sys_ticks));
+        return p->prev_stime;
+}
+#endif
+/* Report the task's p->gtime counter unmodified. */
+inline cputime_t task_gtime(struct task_struct *p)
+{
+        return p->gtime;
+}
+/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
@@ -7637,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 * and partition_sched_domains() will fallback to the single partition
 * 'fallback_doms', it also forces the domains to be rebuilt.
 *
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
 * Call with hotplug lock held
 */
 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
                             struct sched_domain_attr *dattr_new)
 {
-        int i, j;
+        int i, j, n;
        mutex_lock(&sched_domains_mutex);
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
-        if (doms_new == NULL)
+        n = doms_new ? ndoms_new : 0;
-                ndoms_new = 0;
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
-                for (j = 0; j < ndoms_new; j++) {
+                for (j = 0; j < n; j++) {
                        if (cpus_equal(doms_cur[i], doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
@@ -7667,7 +7729,6 @@ match1:
        if (doms_new == NULL) {
                ndoms_cur = 0;
-                ndoms_new = 1;
                doms_new = &fallback_doms;
                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
                dattr_new = NULL;
@@ -7704,8 +7765,13 @@ match2:
 int arch_reinit_sched_domains(void)
 {
        get_online_cpus();
+        /* Destroy domains first to force the rebuild */
+        partition_sched_domains(0, NULL, NULL);
        rebuild_sched_domains();
        put_online_cpus();
        return 0;
 }
@@ -7789,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb,
        case CPU_ONLINE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-                partition_sched_domains(0, NULL, NULL);
+                partition_sched_domains(1, NULL, NULL);
                return NOTIFY_OK;
        default:
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 998ba54b4543..552310798dad 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
+        if (rt_rq->rt_nr_running)
+                resched_task(rq_of_rt_rq(rt_rq)->curr);
 }
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -438,9 +440,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 {
        u64 runtime = sched_rt_runtime(rt_rq);
-        if (runtime == RUNTIME_INF)
-                return 0;
        if (rt_rq->rt_throttled)
                return rt_rq_throttled(rt_rq);
@@ -491,9 +490,11 @@ static void update_curr_rt(struct rq *rq)
                rt_rq = rt_rq_of_se(rt_se);
                spin_lock(&rt_rq->rt_runtime_lock);
-                rt_rq->rt_time += delta_exec;
+                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
-                if (sched_rt_runtime_exceeded(rt_rq))
+                        rt_rq->rt_time += delta_exec;
-                        resched_task(curr);
+                        if (sched_rt_runtime_exceeded(rt_rq))
+                                resched_task(curr);
+                }
                spin_unlock(&rt_rq->rt_runtime_lock);
        }
 }
diff --git a/kernel/smp.c b/kernel/smp.c
index 782e2b93e465..f362a8553777 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -210,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 {
        struct call_single_data d;
        unsigned long flags;
-        /* prevent preemption and reschedule on another processor */
+        /* prevent preemption and reschedule on another processor,
+           as well as CPU removal */
        int me = get_cpu();
+        int err = 0;
        /* Can deadlock when called with interrupts disabled */
        WARN_ON(irqs_disabled());
@@ -220,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
                local_irq_save(flags);
                func(info);
                local_irq_restore(flags);
-        } else {
+        } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
                struct call_single_data *data = NULL;
                if (!wait) {
@@ -236,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
                data->func = func;
                data->info = info;
                generic_exec_single(cpu, data);
+        } else {
+                err = -ENXIO;   /* CPU not online */
        }
        put_cpu();
-        return 0;
+        return err;
 }
 EXPORT_SYMBOL(smp_call_function_single);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b75b492fbfcf..cb838ee93a82 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -233,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
        do_each_thread(g, t) {
                if (!--max_count)
                        goto unlock;
-                if (t->state & TASK_UNINTERRUPTIBLE)
+                /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+                if (t->state == TASK_UNINTERRUPTIBLE)
                        check_hung_task(t, now);
        } while_each_thread(g, t);
 unlock:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fe4713347275..50ec0886fa3d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -159,6 +159,7 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
 static struct ctl_table root_table[];
 static struct ctl_table_root sysctl_table_root;
 static struct ctl_table_header root_table_header = {
+        .count = 1,
        .ctl_table = root_table,
        .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
        .root = &sysctl_table_root,
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a1971..1876b526c778 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -177,7 +177,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 /*
 * Noop handler when we shut down an event device
 */
-static void clockevents_handle_noop(struct clock_event_device *dev)
+void clockevents_handle_noop(struct clock_event_device *dev)
 {
 }
@@ -199,7 +199,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
         * released list and do a notify add later.
         */
        if (old) {
-                old->event_handler = clockevents_handle_noop;
                clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
                list_del(&old->list);
                list_add(&old->list, &clockevents_released);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..1ad46f3df6e7 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy)
        if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
                fail = update_persistent_clock(now);
-        next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
+        next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
        if (next.tv_nsec <= 0)
                next.tv_nsec += NSEC_PER_SEC;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 31463d370b94..2f5a38294bf9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
 */
 static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 {
+        ktime_t next;
        tick_do_periodic_broadcast();
        /*
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
        /*
         * Setup the next period for devices, which do not have
-         * periodic mode:
+         * periodic mode. We read dev->next_event first and add to it
+         * when the event already expired. clockevents_program_event()
+         * sets dev->next_event only when the event is really
+         * programmed to the device.
         */
-        for (;;) {
+        for (next = dev->next_event; ;) {
-                ktime_t next = ktime_add(dev->next_event, tick_period);
+                next = ktime_add(next, tick_period);
                if (!clockevents_program_event(dev, next, ktime_get()))
                        return;
@@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
        struct clock_event_device *bc, *dev;
        struct tick_device *td;
        unsigned long flags, *reason = why;
-        int cpu;
+        int cpu, bc_stopped;
        spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -223,6 +228,8 @@ static void tick_do_broadcast_on_off(void *why)
        if (!tick_device_is_functional(dev))
                goto out;
+        bc_stopped = cpus_empty(tick_broadcast_mask);
        switch (*reason) {
        case CLOCK_EVT_NOTIFY_BROADCAST_ON:
        case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
@@ -245,9 +252,10 @@ static void tick_do_broadcast_on_off(void *why)
                break;
        }
-        if (cpus_empty(tick_broadcast_mask))
+        if (cpus_empty(tick_broadcast_mask)) {
-                clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+                if (!bc_stopped)
-        else {
+                        clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+        } else if (bc_stopped) {
                if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
                        tick_broadcast_start_periodic(bc);
                else
@@ -364,16 +372,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
 static int tick_broadcast_set_event(ktime_t expires, int force)
 {
        struct clock_event_device *bc = tick_broadcast_device.evtdev;
-        ktime_t now = ktime_get();
-        int res;
+        return tick_dev_program_event(bc, expires, force);
-        for(;;) {
-                res = clockevents_program_event(bc, expires, now);
-                if (!res || !force)
-                        return res;
-                now = ktime_get();
-                expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
-        }
 }
 int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -491,14 +491,52 @@ static void tick_broadcast_clear_oneshot(int cpu)
        cpu_clear(cpu, tick_broadcast_oneshot_mask);
 }
+/*
+ * Store @expires as the next event time in the event device of every CPU
+ * in @mask. CPUs whose tick device has no event device are skipped.
+ */
+static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
+{
+        int cpu;
+        for_each_cpu_mask_nr(cpu, *mask) {
+                struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+                if (!td->evtdev)
+                        continue;
+                td->evtdev->next_event = expires;
+        }
+}
 /**
 * tick_broadcast_setup_oneshot - setup the broadcast device
 */
 void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
-        bc->event_handler = tick_handle_oneshot_broadcast;
+        /* Set it up only once ! */
-        clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+        if (bc->event_handler != tick_handle_oneshot_broadcast) {
-        bc->next_event.tv64 = KTIME_MAX;
+                int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+                int cpu = smp_processor_id();
+                cpumask_t mask;
+                bc->event_handler = tick_handle_oneshot_broadcast;
+                clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+                /* Take the do_timer update */
+                tick_do_timer_cpu = cpu;
+                /*
+                 * We must be careful here. There might be other CPUs
+                 * waiting for periodic broadcast. We need to set the
+                 * oneshot_mask bits for those and program the
+                 * broadcast device to fire.
+                 */
+                mask = tick_broadcast_mask;
+                cpu_clear(cpu, mask);
+                cpus_or(tick_broadcast_oneshot_mask,
+                        tick_broadcast_oneshot_mask, mask);
+                if (was_periodic && !cpus_empty(mask)) {
+                        tick_broadcast_init_next_event(&mask, tick_next_period);
+                        tick_broadcast_set_event(tick_next_period, 1);
+                } else
+                        bc->next_event.tv64 = KTIME_MAX;
+        }
 }
 /*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 80c4336f4188..c4777193d567 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -161,6 +161,7 @@ static void tick_setup_device(struct tick_device *td,
        } else {
                handler = td->evtdev->event_handler;
                next_event = td->evtdev->next_event;
+                td->evtdev->event_handler = clockevents_handle_noop;
        }
        td->evtdev = newdev;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd4..0ffc2918ea6f 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -17,6 +17,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
 extern void tick_setup_oneshot(struct clock_event_device *newdev,
                               void (*handler)(struct clock_event_device *),
                               ktime_t nextevt);
+extern int tick_dev_program_event(struct clock_event_device *dev,
+                                  ktime_t expires, int force);
 extern int tick_program_event(ktime_t expires, int force);
 extern void tick_oneshot_notify(void);
 extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b66..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,24 +23,56 @@
 #include "tick-internal.h"
 /**
- * tick_program_event
+ * tick_program_event internal worker function
 */
-int tick_program_event(ktime_t expires, int force)
+int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
+                           int force)
 {
-        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
         ktime_t now = ktime_get();
+        int i;
+        /*
+         * Without @force the result of the first programming attempt is
+         * returned as is. With @force a failed attempt is retried with
+         * an expiry of min_delta_ns from now until it succeeds.
+         */
-        while (1) {
+        for (i = 0;;) {
                 int ret = clockevents_program_event(dev, expires, now);
                 if (!ret || !force)
                         return ret;
+                /*
+                 * Programming the device failed three times in a row.
+                 * Increase min_delta_ns by 50% (or start from 5000 nsec
+                 * if it was zero), emit a warning and restart the retry
+                 * count.
+                 */
+                if (++i > 2) {
+                        /* Increase the min. delta and try again */
+                        if (!dev->min_delta_ns)
+                                dev->min_delta_ns = 5000;
+                        else
+                                dev->min_delta_ns += dev->min_delta_ns >> 1;
+                        /* Report the value that was actually stored */
+                        printk(KERN_WARNING
+                               "CE: %s increasing min_delta_ns to %lu nsec\n",
+                               dev->name ? dev->name : "?",
+                               dev->min_delta_ns);
+                        i = 0;
+                }
                 now = ktime_get();
-                expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
+                expires = ktime_add_ns(now, dev->min_delta_ns);
         }
 }
 /**
+ * tick_program_event
+ */
+int tick_program_event(ktime_t expires, int force)
+{
+        /* Hand the event device found via __get_cpu_var() to the worker */
+        return tick_dev_program_event(__get_cpu_var(tick_cpu_device).evtdev,
+                                      expires, force);
+}
+/**
 * tick_resume_onshot - resume oneshot mode
 */
 void tick_resume_oneshot(void)
@@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
 {
        newdev->event_handler = handler;
        clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
-        clockevents_program_event(newdev, next_event, ktime_get());
+        tick_dev_program_event(newdev, next_event, 1);
 }
 /**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7a46bde78c66..a87b0468568b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -162,6 +162,8 @@ void tick_nohz_stop_idle(int cpu)
                ts->idle_lastupdate = now;
                ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
                ts->idle_active = 0;
+                sched_clock_idle_wakeup_event(0);
        }
 }
@@ -177,6 +179,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
        }
        ts->idle_entrytime = now;
        ts->idle_active = 1;
+        sched_clock_idle_sleep_event();
        return now;
 }