| author | Paul Mundt <lethal@linux-sh.org> | 2010-12-21 22:56:10 -0500 |
|---|---|---|
| committer | Paul Mundt <lethal@linux-sh.org> | 2010-12-21 22:56:10 -0500 |
| commit | 7ccbefe07ea0a3570e44d1ec13a307552ee4dadd | |
| tree | ba0299694a9f3940f289b6a29cadab853906e3d2 /kernel | |
| parent | 623eb15647fc35c5a8cd38985d5958240eb072c1 | |
| parent | 90a8a73c06cc32b609a880d48449d7083327e11a | |
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/debug/kdb/kdb_main.c | 21 |
| -rw-r--r-- | kernel/exit.c | 9 |
| -rw-r--r-- | kernel/fork.c | 1 |
| -rw-r--r-- | kernel/hw_breakpoint.c | 3 |
| -rw-r--r-- | kernel/irq/proc.c | 2 |
| -rw-r--r-- | kernel/irq_work.c | 4 |
| -rw-r--r-- | kernel/kallsyms.c | 2 |
| -rw-r--r-- | kernel/module.c | 12 |
| -rw-r--r-- | kernel/perf_event.c | 130 |
| -rw-r--r-- | kernel/posix-cpu-timers.c | 12 |
| -rw-r--r-- | kernel/power/hibernate.c | 22 |
| -rw-r--r-- | kernel/power/suspend.c | 5 |
| -rw-r--r-- | kernel/power/swap.c | 55 |
| -rw-r--r-- | kernel/power/user.c | 4 |
| -rw-r--r-- | kernel/printk.c | 4 |
| -rw-r--r-- | kernel/resource.c | 104 |
| -rw-r--r-- | kernel/sched.c | 287 |
| -rw-r--r-- | kernel/sched_fair.c | 8 |
| -rw-r--r-- | kernel/timer.c | 8 |
| -rw-r--r-- | kernel/trace/trace.c | 29 |
| -rw-r--r-- | kernel/workqueue.c | 7 |
21 files changed, 499 insertions, 230 deletions
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 37755d621924..a6e729766821 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
| @@ -82,7 +82,7 @@ static kdbtab_t kdb_base_commands[50]; | |||
| 82 | #define for_each_kdbcmd(cmd, num) \ | 82 | #define for_each_kdbcmd(cmd, num) \ |
| 83 | for ((cmd) = kdb_base_commands, (num) = 0; \ | 83 | for ((cmd) = kdb_base_commands, (num) = 0; \ |
| 84 | num < kdb_max_commands; \ | 84 | num < kdb_max_commands; \ |
| 85 | num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) | 85 | num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++) |
| 86 | 86 | ||
| 87 | typedef struct _kdbmsg { | 87 | typedef struct _kdbmsg { |
| 88 | int km_diag; /* kdb diagnostic */ | 88 | int km_diag; /* kdb diagnostic */ |
| @@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) | |||
| 646 | } | 646 | } |
| 647 | if (!s->usable) | 647 | if (!s->usable) |
| 648 | return KDB_NOTIMP; | 648 | return KDB_NOTIMP; |
| 649 | s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); | 649 | s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); |
| 650 | if (!s->command) { | 650 | if (!s->command) { |
| 651 | kdb_printf("Could not allocate new kdb_defcmd table for %s\n", | 651 | kdb_printf("Could not allocate new kdb_defcmd table for %s\n", |
| 652 | cmdstr); | 652 | cmdstr); |
| @@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv) | |||
| 2361 | */ | 2361 | */ |
| 2362 | static int kdb_ll(int argc, const char **argv) | 2362 | static int kdb_ll(int argc, const char **argv) |
| 2363 | { | 2363 | { |
| 2364 | int diag; | 2364 | int diag = 0; |
| 2365 | unsigned long addr; | 2365 | unsigned long addr; |
| 2366 | long offset = 0; | 2366 | long offset = 0; |
| 2367 | unsigned long va; | 2367 | unsigned long va; |
| @@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv) | |||
| 2400 | char buf[80]; | 2400 | char buf[80]; |
| 2401 | 2401 | ||
| 2402 | if (KDB_FLAG(CMD_INTERRUPT)) | 2402 | if (KDB_FLAG(CMD_INTERRUPT)) |
| 2403 | return 0; | 2403 | goto out; |
| 2404 | 2404 | ||
| 2405 | sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); | 2405 | sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); |
| 2406 | diag = kdb_parse(buf); | 2406 | diag = kdb_parse(buf); |
| 2407 | if (diag) | 2407 | if (diag) |
| 2408 | return diag; | 2408 | goto out; |
| 2409 | 2409 | ||
| 2410 | addr = va + linkoffset; | 2410 | addr = va + linkoffset; |
| 2411 | if (kdb_getword(&va, addr, sizeof(va))) | 2411 | if (kdb_getword(&va, addr, sizeof(va))) |
| 2412 | return 0; | 2412 | goto out; |
| 2413 | } | 2413 | } |
| 2414 | kfree(command); | ||
| 2415 | 2414 | ||
| 2416 | return 0; | 2415 | out: |
| 2416 | kfree(command); | ||
| 2417 | return diag; | ||
| 2417 | } | 2418 | } |
| 2418 | 2419 | ||
| 2419 | static int kdb_kgdb(int argc, const char **argv) | 2420 | static int kdb_kgdb(int argc, const char **argv) |
| @@ -2739,13 +2740,13 @@ int kdb_register_repeat(char *cmd, | |||
| 2739 | } | 2740 | } |
| 2740 | if (kdb_commands) { | 2741 | if (kdb_commands) { |
| 2741 | memcpy(new, kdb_commands, | 2742 | memcpy(new, kdb_commands, |
| 2742 | kdb_max_commands * sizeof(*new)); | 2743 | (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); |
| 2743 | kfree(kdb_commands); | 2744 | kfree(kdb_commands); |
| 2744 | } | 2745 | } |
| 2745 | memset(new + kdb_max_commands, 0, | 2746 | memset(new + kdb_max_commands, 0, |
| 2746 | kdb_command_extend * sizeof(*new)); | 2747 | kdb_command_extend * sizeof(*new)); |
| 2747 | kdb_commands = new; | 2748 | kdb_commands = new; |
| 2748 | kp = kdb_commands + kdb_max_commands; | 2749 | kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; |
| 2749 | kdb_max_commands += kdb_command_extend; | 2750 | kdb_max_commands += kdb_command_extend; |
| 2750 | } | 2751 | } |
| 2751 | 2752 | ||
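
The for_each_kdbcmd() fix above hinges on the comma operator's left-to-right evaluation in the loop's third clause: the old ordering compared num before incrementing it, so the walk switched from the static command table to the dynamically allocated one a slot too late. A minimal user-space sketch of the same pattern (the table names and sizes here are illustrative, not kdb's real values):

```c
#include <stdio.h>

#define BASE_MAX 3				/* stands in for KDB_BASE_CMD_MAX */

static int base[BASE_MAX] = { 1, 2, 3 };	/* built-in table */
static int extra[2]       = { 4, 5 };		/* dynamically extended table */

int main(void)
{
	int *cmd, num, max = BASE_MAX + 2;	/* stands in for kdb_max_commands */

	/*
	 * Fixed ordering: bump num first, then decide whether cmd should
	 * jump to the extension table or simply advance.  With the old
	 * "test ? ... : cmd++, num++" ordering, cmd steps one element past
	 * the end of base[] before the switch to extra[] happens.
	 */
	for (cmd = base, num = 0; num < max;
	     num++, num == BASE_MAX ? cmd = extra : cmd++)
		printf("%d\n", *cmd);		/* prints 1 2 3 4 5 */

	return 0;
}
```

The memcpy size and kp offset changes in kdb_register_repeat() address a related detail visible in the hunk: kdb_commands only holds the slots beyond KDB_BASE_CMD_MAX, so copies and indexing into it must subtract that base.
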
diff --git a/kernel/exit.c b/kernel/exit.c
index 21aa7b3001fb..676149a4ac5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
| @@ -914,6 +914,15 @@ NORET_TYPE void do_exit(long code) | |||
| 914 | if (unlikely(!tsk->pid)) | 914 | if (unlikely(!tsk->pid)) |
| 915 | panic("Attempted to kill the idle task!"); | 915 | panic("Attempted to kill the idle task!"); |
| 916 | 916 | ||
| 917 | /* | ||
| 918 | * If do_exit is called because this processes oopsed, it's possible | ||
| 919 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before | ||
| 920 | * continuing. Amongst other possible reasons, this is to prevent | ||
| 921 | * mm_release()->clear_child_tid() from writing to a user-controlled | ||
| 922 | * kernel address. | ||
| 923 | */ | ||
| 924 | set_fs(USER_DS); | ||
| 925 | |||
| 917 | tracehook_report_exit(&code); | 926 | tracehook_report_exit(&code); |
| 918 | 927 | ||
| 919 | validate_creds_for_do_exit(tsk); | 928 | validate_creds_for_do_exit(tsk); |
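
The set_fs(USER_DS) added to do_exit() guards against the usual address-limit save/restore idiom being cut short by an oops: code that temporarily widens the limit is expected to restore it, but a crash in the middle leaves KERNEL_DS in force for the dying task, and the exit path then performs user accesses (such as clear_child_tid) with no checking. A sketch of that idiom for context; get_fs()/set_fs() and the segment constants are the real API of this era, while the callee and buffer are made up:

```c
/* illustrative only: blob_read_user() and kbuf are hypothetical */
static int read_blob_from_kernel(void *kbuf, size_t len)
{
	mm_segment_t old_fs = get_fs();
	int err;

	set_fs(KERNEL_DS);			/* uaccess helpers now accept kernel pointers */
	err = blob_read_user(kbuf, len);	/* helper written to take __user pointers */
	set_fs(old_fs);				/* never runs if the helper oopses */

	return err;
}
```
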
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5991b7..5447dc7defa9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -273,6 +273,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 273 | 273 | ||
| 274 | setup_thread_stack(tsk, orig); | 274 | setup_thread_stack(tsk, orig); |
| 275 | clear_user_return_notifier(tsk); | 275 | clear_user_return_notifier(tsk); |
| 276 | clear_tsk_need_resched(tsk); | ||
| 276 | stackend = end_of_stack(tsk); | 277 | stackend = end_of_stack(tsk); |
| 277 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | 278 | *stackend = STACK_END_MAGIC; /* for overflow detection */ |
| 278 | 279 | ||
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2c9120f0afca..e5325825aeb6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
| @@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = { | |||
| 620 | .read = hw_breakpoint_pmu_read, | 620 | .read = hw_breakpoint_pmu_read, |
| 621 | }; | 621 | }; |
| 622 | 622 | ||
| 623 | static int __init init_hw_breakpoint(void) | 623 | int __init init_hw_breakpoint(void) |
| 624 | { | 624 | { |
| 625 | unsigned int **task_bp_pinned; | 625 | unsigned int **task_bp_pinned; |
| 626 | int cpu, err_cpu; | 626 | int cpu, err_cpu; |
| @@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void) | |||
| 655 | 655 | ||
| 656 | return -ENOMEM; | 656 | return -ENOMEM; |
| 657 | } | 657 | } |
| 658 | core_initcall(init_hw_breakpoint); | ||
| 659 | 658 | ||
| 660 | 659 | ||
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 01b1d3a88983..6c8a2a9f8a7b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
| @@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) | |||
| 214 | 214 | ||
| 215 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) | 215 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) |
| 216 | { | 216 | { |
| 217 | return single_open(file, irq_spurious_proc_show, NULL); | 217 | return single_open(file, irq_spurious_proc_show, PDE(inode)->data); |
| 218 | } | 218 | } |
| 219 | 219 | ||
| 220 | static const struct file_operations irq_spurious_proc_fops = { | 220 | static const struct file_operations irq_spurious_proc_fops = { |
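
The one-line irq/proc.c change matters because single_open() stores its third argument in the seq_file's private field, which is where the show callback expects to find its context; passing NULL left irq_spurious_proc_show() with no way to know which interrupt it was reporting on. The general shape of the pattern, with illustrative names:

```c
static int foo_proc_show(struct seq_file *m, void *v)
{
	struct foo_state *st = m->private;	/* whatever was handed to single_open() */

	seq_printf(m, "count %u\n", st->count);
	return 0;
}

static int foo_proc_open(struct inode *inode, struct file *file)
{
	/* PDE(inode)->data is the pointer registered when the proc entry was created */
	return single_open(file, foo_proc_show, PDE(inode)->data);
}
```
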
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f16763ff8481..90f881904bb1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
| @@ -145,7 +145,9 @@ void irq_work_run(void) | |||
| 145 | * Clear the BUSY bit and return to the free state if | 145 | * Clear the BUSY bit and return to the free state if |
| 146 | * no-one else claimed it meanwhile. | 146 | * no-one else claimed it meanwhile. |
| 147 | */ | 147 | */ |
| 148 | cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); | 148 | (void)cmpxchg(&entry->next, |
| 149 | next_flags(NULL, IRQ_WORK_BUSY), | ||
| 150 | NULL); | ||
| 149 | } | 151 | } |
| 150 | } | 152 | } |
| 151 | EXPORT_SYMBOL_GPL(irq_work_run); | 153 | EXPORT_SYMBOL_GPL(irq_work_run); |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index a8db2570f99a..6f6d091b5757 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
| @@ -546,7 +546,7 @@ static const struct file_operations kallsyms_operations = { | |||
| 546 | 546 | ||
| 547 | static int __init kallsyms_init(void) | 547 | static int __init kallsyms_init(void) |
| 548 | { | 548 | { |
| 549 | proc_create("kallsyms", 0400, NULL, &kallsyms_operations); | 549 | proc_create("kallsyms", 0444, NULL, &kallsyms_operations); |
| 550 | return 0; | 550 | return 0; |
| 551 | } | 551 | } |
| 552 | device_initcall(kallsyms_init); | 552 | device_initcall(kallsyms_init); |
diff --git a/kernel/module.c b/kernel/module.c
index 437a74a7524a..d190664f25ff 100644
--- a/kernel/module.c
+++ b/kernel/module.c
| @@ -2326,6 +2326,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
| 2326 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * | 2326 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * |
| 2327 | mod->num_trace_events, GFP_KERNEL); | 2327 | mod->num_trace_events, GFP_KERNEL); |
| 2328 | #endif | 2328 | #endif |
| 2329 | #ifdef CONFIG_TRACING | ||
| 2330 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | ||
| 2331 | sizeof(*mod->trace_bprintk_fmt_start), | ||
| 2332 | &mod->num_trace_bprintk_fmt); | ||
| 2333 | /* | ||
| 2334 | * This section contains pointers to allocated objects in the trace | ||
| 2335 | * code and not scanning it leads to false positives. | ||
| 2336 | */ | ||
| 2337 | kmemleak_scan_area(mod->trace_bprintk_fmt_start, | ||
| 2338 | sizeof(*mod->trace_bprintk_fmt_start) * | ||
| 2339 | mod->num_trace_bprintk_fmt, GFP_KERNEL); | ||
| 2340 | #endif | ||
| 2329 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD | 2341 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD |
| 2330 | /* sechdrs[0].sh_size is always zero */ | 2342 | /* sechdrs[0].sh_size is always zero */ |
| 2331 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", | 2343 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index cb6c0d2af68f..2870feee81dd 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/kernel_stat.h> | 31 | #include <linux/kernel_stat.h> |
| 32 | #include <linux/perf_event.h> | 32 | #include <linux/perf_event.h> |
| 33 | #include <linux/ftrace_event.h> | 33 | #include <linux/ftrace_event.h> |
| 34 | #include <linux/hw_breakpoint.h> | ||
| 34 | 35 | ||
| 35 | #include <asm/irq_regs.h> | 36 | #include <asm/irq_regs.h> |
| 36 | 37 | ||
| @@ -1286,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
| 1286 | { | 1287 | { |
| 1287 | int ctxn; | 1288 | int ctxn; |
| 1288 | 1289 | ||
| 1289 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | ||
| 1290 | |||
| 1291 | for_each_task_context_nr(ctxn) | 1290 | for_each_task_context_nr(ctxn) |
| 1292 | perf_event_context_sched_out(task, ctxn, next); | 1291 | perf_event_context_sched_out(task, ctxn, next); |
| 1293 | } | 1292 | } |
| @@ -1621,8 +1620,12 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
| 1621 | { | 1620 | { |
| 1622 | raw_spin_lock(&ctx->lock); | 1621 | raw_spin_lock(&ctx->lock); |
| 1623 | 1622 | ||
| 1624 | /* Rotate the first entry last of non-pinned groups */ | 1623 | /* |
| 1625 | list_rotate_left(&ctx->flexible_groups); | 1624 | * Rotate the first entry last of non-pinned groups. Rotation might be |
| 1625 | * disabled by the inheritance code. | ||
| 1626 | */ | ||
| 1627 | if (!ctx->rotate_disable) | ||
| 1628 | list_rotate_left(&ctx->flexible_groups); | ||
| 1626 | 1629 | ||
| 1627 | raw_spin_unlock(&ctx->lock); | 1630 | raw_spin_unlock(&ctx->lock); |
| 1628 | } | 1631 | } |
| @@ -2234,11 +2237,6 @@ int perf_event_release_kernel(struct perf_event *event) | |||
| 2234 | raw_spin_unlock_irq(&ctx->lock); | 2237 | raw_spin_unlock_irq(&ctx->lock); |
| 2235 | mutex_unlock(&ctx->mutex); | 2238 | mutex_unlock(&ctx->mutex); |
| 2236 | 2239 | ||
| 2237 | mutex_lock(&event->owner->perf_event_mutex); | ||
| 2238 | list_del_init(&event->owner_entry); | ||
| 2239 | mutex_unlock(&event->owner->perf_event_mutex); | ||
| 2240 | put_task_struct(event->owner); | ||
| 2241 | |||
| 2242 | free_event(event); | 2240 | free_event(event); |
| 2243 | 2241 | ||
| 2244 | return 0; | 2242 | return 0; |
| @@ -2251,9 +2249,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); | |||
| 2251 | static int perf_release(struct inode *inode, struct file *file) | 2249 | static int perf_release(struct inode *inode, struct file *file) |
| 2252 | { | 2250 | { |
| 2253 | struct perf_event *event = file->private_data; | 2251 | struct perf_event *event = file->private_data; |
| 2252 | struct task_struct *owner; | ||
| 2254 | 2253 | ||
| 2255 | file->private_data = NULL; | 2254 | file->private_data = NULL; |
| 2256 | 2255 | ||
| 2256 | rcu_read_lock(); | ||
| 2257 | owner = ACCESS_ONCE(event->owner); | ||
| 2258 | /* | ||
| 2259 | * Matches the smp_wmb() in perf_event_exit_task(). If we observe | ||
| 2260 | * !owner it means the list deletion is complete and we can indeed | ||
| 2261 | * free this event, otherwise we need to serialize on | ||
| 2262 | * owner->perf_event_mutex. | ||
| 2263 | */ | ||
| 2264 | smp_read_barrier_depends(); | ||
| 2265 | if (owner) { | ||
| 2266 | /* | ||
| 2267 | * Since delayed_put_task_struct() also drops the last | ||
| 2268 | * task reference we can safely take a new reference | ||
| 2269 | * while holding the rcu_read_lock(). | ||
| 2270 | */ | ||
| 2271 | get_task_struct(owner); | ||
| 2272 | } | ||
| 2273 | rcu_read_unlock(); | ||
| 2274 | |||
| 2275 | if (owner) { | ||
| 2276 | mutex_lock(&owner->perf_event_mutex); | ||
| 2277 | /* | ||
| 2278 | * We have to re-check the event->owner field, if it is cleared | ||
| 2279 | * we raced with perf_event_exit_task(), acquiring the mutex | ||
| 2280 | * ensured they're done, and we can proceed with freeing the | ||
| 2281 | * event. | ||
| 2282 | */ | ||
| 2283 | if (event->owner) | ||
| 2284 | list_del_init(&event->owner_entry); | ||
| 2285 | mutex_unlock(&owner->perf_event_mutex); | ||
| 2286 | put_task_struct(owner); | ||
| 2287 | } | ||
| 2288 | |||
| 2257 | return perf_event_release_kernel(event); | 2289 | return perf_event_release_kernel(event); |
| 2258 | } | 2290 | } |
| 2259 | 2291 | ||
| @@ -3792,6 +3824,8 @@ static void perf_event_task_event(struct perf_task_event *task_event) | |||
| 3792 | rcu_read_lock(); | 3824 | rcu_read_lock(); |
| 3793 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 3825 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 3794 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 3826 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 3827 | if (cpuctx->active_pmu != pmu) | ||
| 3828 | goto next; | ||
| 3795 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3829 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
| 3796 | 3830 | ||
| 3797 | ctx = task_event->task_ctx; | 3831 | ctx = task_event->task_ctx; |
| @@ -3927,6 +3961,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
| 3927 | rcu_read_lock(); | 3961 | rcu_read_lock(); |
| 3928 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 3962 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 3929 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 3963 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 3964 | if (cpuctx->active_pmu != pmu) | ||
| 3965 | goto next; | ||
| 3930 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 3966 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
| 3931 | 3967 | ||
| 3932 | ctxn = pmu->task_ctx_nr; | 3968 | ctxn = pmu->task_ctx_nr; |
| @@ -4112,6 +4148,8 @@ got_name: | |||
| 4112 | rcu_read_lock(); | 4148 | rcu_read_lock(); |
| 4113 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4149 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 4114 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4150 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 4151 | if (cpuctx->active_pmu != pmu) | ||
| 4152 | goto next; | ||
| 4115 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, | 4153 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
| 4116 | vma->vm_flags & VM_EXEC); | 4154 | vma->vm_flags & VM_EXEC); |
| 4117 | 4155 | ||
| @@ -4681,7 +4719,7 @@ static int perf_swevent_init(struct perf_event *event) | |||
| 4681 | break; | 4719 | break; |
| 4682 | } | 4720 | } |
| 4683 | 4721 | ||
| 4684 | if (event_id > PERF_COUNT_SW_MAX) | 4722 | if (event_id >= PERF_COUNT_SW_MAX) |
| 4685 | return -ENOENT; | 4723 | return -ENOENT; |
| 4686 | 4724 | ||
| 4687 | if (!event->parent) { | 4725 | if (!event->parent) { |
| @@ -5113,20 +5151,36 @@ static void *find_pmu_context(int ctxn) | |||
| 5113 | return NULL; | 5151 | return NULL; |
| 5114 | } | 5152 | } |
| 5115 | 5153 | ||
| 5116 | static void free_pmu_context(void * __percpu cpu_context) | 5154 | static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) |
| 5117 | { | 5155 | { |
| 5118 | struct pmu *pmu; | 5156 | int cpu; |
| 5157 | |||
| 5158 | for_each_possible_cpu(cpu) { | ||
| 5159 | struct perf_cpu_context *cpuctx; | ||
| 5160 | |||
| 5161 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
| 5162 | |||
| 5163 | if (cpuctx->active_pmu == old_pmu) | ||
| 5164 | cpuctx->active_pmu = pmu; | ||
| 5165 | } | ||
| 5166 | } | ||
| 5167 | |||
| 5168 | static void free_pmu_context(struct pmu *pmu) | ||
| 5169 | { | ||
| 5170 | struct pmu *i; | ||
| 5119 | 5171 | ||
| 5120 | mutex_lock(&pmus_lock); | 5172 | mutex_lock(&pmus_lock); |
| 5121 | /* | 5173 | /* |
| 5122 | * Like a real lame refcount. | 5174 | * Like a real lame refcount. |
| 5123 | */ | 5175 | */ |
| 5124 | list_for_each_entry(pmu, &pmus, entry) { | 5176 | list_for_each_entry(i, &pmus, entry) { |
| 5125 | if (pmu->pmu_cpu_context == cpu_context) | 5177 | if (i->pmu_cpu_context == pmu->pmu_cpu_context) { |
| 5178 | update_pmu_context(i, pmu); | ||
| 5126 | goto out; | 5179 | goto out; |
| 5180 | } | ||
| 5127 | } | 5181 | } |
| 5128 | 5182 | ||
| 5129 | free_percpu(cpu_context); | 5183 | free_percpu(pmu->pmu_cpu_context); |
| 5130 | out: | 5184 | out: |
| 5131 | mutex_unlock(&pmus_lock); | 5185 | mutex_unlock(&pmus_lock); |
| 5132 | } | 5186 | } |
| @@ -5158,6 +5212,7 @@ int perf_pmu_register(struct pmu *pmu) | |||
| 5158 | cpuctx->ctx.pmu = pmu; | 5212 | cpuctx->ctx.pmu = pmu; |
| 5159 | cpuctx->jiffies_interval = 1; | 5213 | cpuctx->jiffies_interval = 1; |
| 5160 | INIT_LIST_HEAD(&cpuctx->rotation_list); | 5214 | INIT_LIST_HEAD(&cpuctx->rotation_list); |
| 5215 | cpuctx->active_pmu = pmu; | ||
| 5161 | } | 5216 | } |
| 5162 | 5217 | ||
| 5163 | got_cpu_context: | 5218 | got_cpu_context: |
| @@ -5209,7 +5264,7 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
| 5209 | synchronize_rcu(); | 5264 | synchronize_rcu(); |
| 5210 | 5265 | ||
| 5211 | free_percpu(pmu->pmu_disable_count); | 5266 | free_percpu(pmu->pmu_disable_count); |
| 5212 | free_pmu_context(pmu->pmu_cpu_context); | 5267 | free_pmu_context(pmu); |
| 5213 | } | 5268 | } |
| 5214 | 5269 | ||
| 5215 | struct pmu *perf_init_event(struct perf_event *event) | 5270 | struct pmu *perf_init_event(struct perf_event *event) |
| @@ -5677,7 +5732,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5677 | mutex_unlock(&ctx->mutex); | 5732 | mutex_unlock(&ctx->mutex); |
| 5678 | 5733 | ||
| 5679 | event->owner = current; | 5734 | event->owner = current; |
| 5680 | get_task_struct(current); | 5735 | |
| 5681 | mutex_lock(¤t->perf_event_mutex); | 5736 | mutex_lock(¤t->perf_event_mutex); |
| 5682 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); | 5737 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); |
| 5683 | mutex_unlock(¤t->perf_event_mutex); | 5738 | mutex_unlock(¤t->perf_event_mutex); |
| @@ -5745,12 +5800,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 5745 | ++ctx->generation; | 5800 | ++ctx->generation; |
| 5746 | mutex_unlock(&ctx->mutex); | 5801 | mutex_unlock(&ctx->mutex); |
| 5747 | 5802 | ||
| 5748 | event->owner = current; | ||
| 5749 | get_task_struct(current); | ||
| 5750 | mutex_lock(¤t->perf_event_mutex); | ||
| 5751 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); | ||
| 5752 | mutex_unlock(¤t->perf_event_mutex); | ||
| 5753 | |||
| 5754 | return event; | 5803 | return event; |
| 5755 | 5804 | ||
| 5756 | err_free: | 5805 | err_free: |
| @@ -5901,8 +5950,24 @@ again: | |||
| 5901 | */ | 5950 | */ |
| 5902 | void perf_event_exit_task(struct task_struct *child) | 5951 | void perf_event_exit_task(struct task_struct *child) |
| 5903 | { | 5952 | { |
| 5953 | struct perf_event *event, *tmp; | ||
| 5904 | int ctxn; | 5954 | int ctxn; |
| 5905 | 5955 | ||
| 5956 | mutex_lock(&child->perf_event_mutex); | ||
| 5957 | list_for_each_entry_safe(event, tmp, &child->perf_event_list, | ||
| 5958 | owner_entry) { | ||
| 5959 | list_del_init(&event->owner_entry); | ||
| 5960 | |||
| 5961 | /* | ||
| 5962 | * Ensure the list deletion is visible before we clear | ||
| 5963 | * the owner, closes a race against perf_release() where | ||
| 5964 | * we need to serialize on the owner->perf_event_mutex. | ||
| 5965 | */ | ||
| 5966 | smp_wmb(); | ||
| 5967 | event->owner = NULL; | ||
| 5968 | } | ||
| 5969 | mutex_unlock(&child->perf_event_mutex); | ||
| 5970 | |||
| 5906 | for_each_task_context_nr(ctxn) | 5971 | for_each_task_context_nr(ctxn) |
| 5907 | perf_event_exit_task_context(child, ctxn); | 5972 | perf_event_exit_task_context(child, ctxn); |
| 5908 | } | 5973 | } |
| @@ -6122,6 +6187,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 6122 | struct perf_event *event; | 6187 | struct perf_event *event; |
| 6123 | struct task_struct *parent = current; | 6188 | struct task_struct *parent = current; |
| 6124 | int inherited_all = 1; | 6189 | int inherited_all = 1; |
| 6190 | unsigned long flags; | ||
| 6125 | int ret = 0; | 6191 | int ret = 0; |
| 6126 | 6192 | ||
| 6127 | child->perf_event_ctxp[ctxn] = NULL; | 6193 | child->perf_event_ctxp[ctxn] = NULL; |
| @@ -6162,6 +6228,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 6162 | break; | 6228 | break; |
| 6163 | } | 6229 | } |
| 6164 | 6230 | ||
| 6231 | /* | ||
| 6232 | * We can't hold ctx->lock when iterating the ->flexible_group list due | ||
| 6233 | * to allocations, but we need to prevent rotation because | ||
| 6234 | * rotate_ctx() will change the list from interrupt context. | ||
| 6235 | */ | ||
| 6236 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
| 6237 | parent_ctx->rotate_disable = 1; | ||
| 6238 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
| 6239 | |||
| 6165 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 6240 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
| 6166 | ret = inherit_task_group(event, parent, parent_ctx, | 6241 | ret = inherit_task_group(event, parent, parent_ctx, |
| 6167 | child, ctxn, &inherited_all); | 6242 | child, ctxn, &inherited_all); |
| @@ -6169,6 +6244,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 6169 | break; | 6244 | break; |
| 6170 | } | 6245 | } |
| 6171 | 6246 | ||
| 6247 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
| 6248 | parent_ctx->rotate_disable = 0; | ||
| 6249 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
| 6250 | |||
| 6172 | child_ctx = child->perf_event_ctxp[ctxn]; | 6251 | child_ctx = child->perf_event_ctxp[ctxn]; |
| 6173 | 6252 | ||
| 6174 | if (child_ctx && inherited_all) { | 6253 | if (child_ctx && inherited_all) { |
| @@ -6321,6 +6400,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
| 6321 | 6400 | ||
| 6322 | void __init perf_event_init(void) | 6401 | void __init perf_event_init(void) |
| 6323 | { | 6402 | { |
| 6403 | int ret; | ||
| 6404 | |||
| 6324 | perf_event_init_all_cpus(); | 6405 | perf_event_init_all_cpus(); |
| 6325 | init_srcu_struct(&pmus_srcu); | 6406 | init_srcu_struct(&pmus_srcu); |
| 6326 | perf_pmu_register(&perf_swevent); | 6407 | perf_pmu_register(&perf_swevent); |
| @@ -6328,4 +6409,7 @@ void __init perf_event_init(void) | |||
| 6328 | perf_pmu_register(&perf_task_clock); | 6409 | perf_pmu_register(&perf_task_clock); |
| 6329 | perf_tp_register(); | 6410 | perf_tp_register(); |
| 6330 | perf_cpu_notifier(perf_cpu_notify); | 6411 | perf_cpu_notifier(perf_cpu_notify); |
| 6412 | |||
| 6413 | ret = init_hw_breakpoint(); | ||
| 6414 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); | ||
| 6331 | } | 6415 | } |
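
Taken together, the perf_release() and perf_event_exit_task() hunks replace the old unconditional owner bookkeeping with a lockless handshake: the exiting task unlinks each event under its perf_event_mutex and only then clears event->owner behind a write barrier, while the release path samples the owner under RCU, pins it with get_task_struct(), and re-checks event->owner after taking the mutex. Condensed from the hunks above (barrier-pairing comments and unrelated work omitted):

```c
/* exit side, perf_event_exit_task(child) */
mutex_lock(&child->perf_event_mutex);
list_for_each_entry_safe(event, tmp, &child->perf_event_list, owner_entry) {
	list_del_init(&event->owner_entry);
	smp_wmb();				/* unlink must be visible before ->owner is cleared */
	event->owner = NULL;
}
mutex_unlock(&child->perf_event_mutex);

/* release side, perf_release(event) */
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
if (owner)
	get_task_struct(owner);			/* safe: the last task reference is dropped via RCU */
rcu_read_unlock();

if (owner) {
	mutex_lock(&owner->perf_event_mutex);
	if (event->owner)			/* a racing exit may already have unlinked it */
		list_del_init(&event->owner_entry);
	mutex_unlock(&owner->perf_event_mutex);
	put_task_struct(owner);
}
```
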
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba5879..05bb7173850e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
| @@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock) | |||
| 37 | if (pid == 0) | 37 | if (pid == 0) |
| 38 | return 0; | 38 | return 0; |
| 39 | 39 | ||
| 40 | read_lock(&tasklist_lock); | 40 | rcu_read_lock(); |
| 41 | p = find_task_by_vpid(pid); | 41 | p = find_task_by_vpid(pid); |
| 42 | if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? | 42 | if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? |
| 43 | same_thread_group(p, current) : thread_group_leader(p))) { | 43 | same_thread_group(p, current) : has_group_leader_pid(p))) { |
| 44 | error = -EINVAL; | 44 | error = -EINVAL; |
| 45 | } | 45 | } |
| 46 | read_unlock(&tasklist_lock); | 46 | rcu_read_unlock(); |
| 47 | 47 | ||
| 48 | return error; | 48 | return error; |
| 49 | } | 49 | } |
| @@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
| 390 | 390 | ||
| 391 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); | 391 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); |
| 392 | 392 | ||
| 393 | read_lock(&tasklist_lock); | 393 | rcu_read_lock(); |
| 394 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { | 394 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { |
| 395 | if (pid == 0) { | 395 | if (pid == 0) { |
| 396 | p = current; | 396 | p = current; |
| @@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
| 404 | p = current->group_leader; | 404 | p = current->group_leader; |
| 405 | } else { | 405 | } else { |
| 406 | p = find_task_by_vpid(pid); | 406 | p = find_task_by_vpid(pid); |
| 407 | if (p && !thread_group_leader(p)) | 407 | if (p && !has_group_leader_pid(p)) |
| 408 | p = NULL; | 408 | p = NULL; |
| 409 | } | 409 | } |
| 410 | } | 410 | } |
| @@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
| 414 | } else { | 414 | } else { |
| 415 | ret = -EINVAL; | 415 | ret = -EINVAL; |
| 416 | } | 416 | } |
| 417 | read_unlock(&tasklist_lock); | 417 | rcu_read_unlock(); |
| 418 | 418 | ||
| 419 | return ret; | 419 | return ret; |
| 420 | } | 420 | } |
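
The tasklist_lock to rcu_read_lock() conversion works because find_task_by_vpid() only needs RCU protection, and the switch to has_group_leader_pid() keeps the leader check from depending on state that only tasklist_lock stabilizes. One thing the lighter locking does not give you is a task pointer that outlives the read-side section, so callers that need the task afterwards follow this general shape (a generic sketch, not the code above):

```c
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p)
		get_task_struct(p);	/* pin it before leaving the RCU section */
	rcu_read_unlock();

	if (!p)
		return -ESRCH;
	/* ... use p ... */
	put_task_struct(p);
```
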
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 657272e91d0a..048d0b514831 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
| @@ -327,7 +327,6 @@ static int create_image(int platform_mode) | |||
| 327 | int hibernation_snapshot(int platform_mode) | 327 | int hibernation_snapshot(int platform_mode) |
| 328 | { | 328 | { |
| 329 | int error; | 329 | int error; |
| 330 | gfp_t saved_mask; | ||
| 331 | 330 | ||
| 332 | error = platform_begin(platform_mode); | 331 | error = platform_begin(platform_mode); |
| 333 | if (error) | 332 | if (error) |
| @@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode) | |||
| 339 | goto Close; | 338 | goto Close; |
| 340 | 339 | ||
| 341 | suspend_console(); | 340 | suspend_console(); |
| 342 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 341 | pm_restrict_gfp_mask(); |
| 343 | error = dpm_suspend_start(PMSG_FREEZE); | 342 | error = dpm_suspend_start(PMSG_FREEZE); |
| 344 | if (error) | 343 | if (error) |
| 345 | goto Recover_platform; | 344 | goto Recover_platform; |
| @@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode) | |||
| 348 | goto Recover_platform; | 347 | goto Recover_platform; |
| 349 | 348 | ||
| 350 | error = create_image(platform_mode); | 349 | error = create_image(platform_mode); |
| 351 | /* Control returns here after successful restore */ | 350 | /* |
| 351 | * Control returns here (1) after the image has been created or the | ||
| 352 | * image creation has failed and (2) after a successful restore. | ||
| 353 | */ | ||
| 352 | 354 | ||
| 353 | Resume_devices: | 355 | Resume_devices: |
| 354 | /* We may need to release the preallocated image pages here. */ | 356 | /* We may need to release the preallocated image pages here. */ |
| @@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode) | |||
| 357 | 359 | ||
| 358 | dpm_resume_end(in_suspend ? | 360 | dpm_resume_end(in_suspend ? |
| 359 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 361 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); |
| 360 | set_gfp_allowed_mask(saved_mask); | 362 | |
| 363 | if (error || !in_suspend) | ||
| 364 | pm_restore_gfp_mask(); | ||
| 365 | |||
| 361 | resume_console(); | 366 | resume_console(); |
| 362 | Close: | 367 | Close: |
| 363 | platform_end(platform_mode); | 368 | platform_end(platform_mode); |
| @@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode) | |||
| 452 | int hibernation_restore(int platform_mode) | 457 | int hibernation_restore(int platform_mode) |
| 453 | { | 458 | { |
| 454 | int error; | 459 | int error; |
| 455 | gfp_t saved_mask; | ||
| 456 | 460 | ||
| 457 | pm_prepare_console(); | 461 | pm_prepare_console(); |
| 458 | suspend_console(); | 462 | suspend_console(); |
| 459 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 463 | pm_restrict_gfp_mask(); |
| 460 | error = dpm_suspend_start(PMSG_QUIESCE); | 464 | error = dpm_suspend_start(PMSG_QUIESCE); |
| 461 | if (!error) { | 465 | if (!error) { |
| 462 | error = resume_target_kernel(platform_mode); | 466 | error = resume_target_kernel(platform_mode); |
| 463 | dpm_resume_end(PMSG_RECOVER); | 467 | dpm_resume_end(PMSG_RECOVER); |
| 464 | } | 468 | } |
| 465 | set_gfp_allowed_mask(saved_mask); | 469 | pm_restore_gfp_mask(); |
| 466 | resume_console(); | 470 | resume_console(); |
| 467 | pm_restore_console(); | 471 | pm_restore_console(); |
| 468 | return error; | 472 | return error; |
| @@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode) | |||
| 476 | int hibernation_platform_enter(void) | 480 | int hibernation_platform_enter(void) |
| 477 | { | 481 | { |
| 478 | int error; | 482 | int error; |
| 479 | gfp_t saved_mask; | ||
| 480 | 483 | ||
| 481 | if (!hibernation_ops) | 484 | if (!hibernation_ops) |
| 482 | return -ENOSYS; | 485 | return -ENOSYS; |
| @@ -492,7 +495,6 @@ int hibernation_platform_enter(void) | |||
| 492 | 495 | ||
| 493 | entering_platform_hibernation = true; | 496 | entering_platform_hibernation = true; |
| 494 | suspend_console(); | 497 | suspend_console(); |
| 495 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
| 496 | error = dpm_suspend_start(PMSG_HIBERNATE); | 498 | error = dpm_suspend_start(PMSG_HIBERNATE); |
| 497 | if (error) { | 499 | if (error) { |
| 498 | if (hibernation_ops->recover) | 500 | if (hibernation_ops->recover) |
| @@ -536,7 +538,6 @@ int hibernation_platform_enter(void) | |||
| 536 | Resume_devices: | 538 | Resume_devices: |
| 537 | entering_platform_hibernation = false; | 539 | entering_platform_hibernation = false; |
| 538 | dpm_resume_end(PMSG_RESTORE); | 540 | dpm_resume_end(PMSG_RESTORE); |
| 539 | set_gfp_allowed_mask(saved_mask); | ||
| 540 | resume_console(); | 541 | resume_console(); |
| 541 | 542 | ||
| 542 | Close: | 543 | Close: |
| @@ -646,6 +647,7 @@ int hibernate(void) | |||
| 646 | swsusp_free(); | 647 | swsusp_free(); |
| 647 | if (!error) | 648 | if (!error) |
| 648 | power_down(); | 649 | power_down(); |
| 650 | pm_restore_gfp_mask(); | ||
| 649 | } else { | 651 | } else { |
| 650 | pr_debug("PM: Image restored successfully.\n"); | 652 | pr_debug("PM: Image restored successfully.\n"); |
| 651 | } | 653 | } |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..ecf770509d0d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
| @@ -197,7 +197,6 @@ static int suspend_enter(suspend_state_t state) | |||
| 197 | int suspend_devices_and_enter(suspend_state_t state) | 197 | int suspend_devices_and_enter(suspend_state_t state) |
| 198 | { | 198 | { |
| 199 | int error; | 199 | int error; |
| 200 | gfp_t saved_mask; | ||
| 201 | 200 | ||
| 202 | if (!suspend_ops) | 201 | if (!suspend_ops) |
| 203 | return -ENOSYS; | 202 | return -ENOSYS; |
| @@ -208,7 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 208 | goto Close; | 207 | goto Close; |
| 209 | } | 208 | } |
| 210 | suspend_console(); | 209 | suspend_console(); |
| 211 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 210 | pm_restrict_gfp_mask(); |
| 212 | suspend_test_start(); | 211 | suspend_test_start(); |
| 213 | error = dpm_suspend_start(PMSG_SUSPEND); | 212 | error = dpm_suspend_start(PMSG_SUSPEND); |
| 214 | if (error) { | 213 | if (error) { |
| @@ -225,7 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 225 | suspend_test_start(); | 224 | suspend_test_start(); |
| 226 | dpm_resume_end(PMSG_RESUME); | 225 | dpm_resume_end(PMSG_RESUME); |
| 227 | suspend_test_finish("resume devices"); | 226 | suspend_test_finish("resume devices"); |
| 228 | set_gfp_allowed_mask(saved_mask); | 227 | pm_restore_gfp_mask(); |
| 229 | resume_console(); | 228 | resume_console(); |
| 230 | Close: | 229 | Close: |
| 231 | if (suspend_ops->end) | 230 | if (suspend_ops->end) |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0e4a86ccf94..8c7e4832b9be 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
| @@ -6,6 +6,7 @@ | |||
| 6 | * | 6 | * |
| 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
| 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
| 9 | * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> | ||
| 9 | * | 10 | * |
| 10 | * This file is released under the GPLv2. | 11 | * This file is released under the GPLv2. |
| 11 | * | 12 | * |
| @@ -29,7 +30,7 @@ | |||
| 29 | 30 | ||
| 30 | #include "power.h" | 31 | #include "power.h" |
| 31 | 32 | ||
| 32 | #define HIBERNATE_SIG "LINHIB0001" | 33 | #define HIBERNATE_SIG "S1SUSPEND" |
| 33 | 34 | ||
| 34 | /* | 35 | /* |
| 35 | * The swap map is a data structure used for keeping track of each page | 36 | * The swap map is a data structure used for keeping track of each page |
| @@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 753 | { | 754 | { |
| 754 | unsigned int m; | 755 | unsigned int m; |
| 755 | int error = 0; | 756 | int error = 0; |
| 757 | struct bio *bio; | ||
| 756 | struct timeval start; | 758 | struct timeval start; |
| 757 | struct timeval stop; | 759 | struct timeval stop; |
| 758 | unsigned nr_pages; | 760 | unsigned nr_pages; |
| 759 | size_t off, unc_len, cmp_len; | 761 | size_t i, off, unc_len, cmp_len; |
| 760 | unsigned char *unc, *cmp, *page; | 762 | unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; |
| 761 | 763 | ||
| 762 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 764 | for (i = 0; i < LZO_CMP_PAGES; i++) { |
| 763 | if (!page) { | 765 | page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
| 764 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 766 | if (!page[i]) { |
| 765 | return -ENOMEM; | 767 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
| 768 | |||
| 769 | while (i) | ||
| 770 | free_page((unsigned long)page[--i]); | ||
| 771 | |||
| 772 | return -ENOMEM; | ||
| 773 | } | ||
| 766 | } | 774 | } |
| 767 | 775 | ||
| 768 | unc = vmalloc(LZO_UNC_SIZE); | 776 | unc = vmalloc(LZO_UNC_SIZE); |
| 769 | if (!unc) { | 777 | if (!unc) { |
| 770 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | 778 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); |
| 771 | free_page((unsigned long)page); | 779 | |
| 780 | for (i = 0; i < LZO_CMP_PAGES; i++) | ||
| 781 | free_page((unsigned long)page[i]); | ||
| 782 | |||
| 772 | return -ENOMEM; | 783 | return -ENOMEM; |
| 773 | } | 784 | } |
| 774 | 785 | ||
| 775 | cmp = vmalloc(LZO_CMP_SIZE); | 786 | cmp = vmalloc(LZO_CMP_SIZE); |
| 776 | if (!cmp) { | 787 | if (!cmp) { |
| 777 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | 788 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); |
| 789 | |||
| 778 | vfree(unc); | 790 | vfree(unc); |
| 779 | free_page((unsigned long)page); | 791 | for (i = 0; i < LZO_CMP_PAGES; i++) |
| 792 | free_page((unsigned long)page[i]); | ||
| 793 | |||
| 780 | return -ENOMEM; | 794 | return -ENOMEM; |
| 781 | } | 795 | } |
| 782 | 796 | ||
| @@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 787 | if (!m) | 801 | if (!m) |
| 788 | m = 1; | 802 | m = 1; |
| 789 | nr_pages = 0; | 803 | nr_pages = 0; |
| 804 | bio = NULL; | ||
| 790 | do_gettimeofday(&start); | 805 | do_gettimeofday(&start); |
| 791 | 806 | ||
| 792 | error = snapshot_write_next(snapshot); | 807 | error = snapshot_write_next(snapshot); |
| @@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 794 | goto out_finish; | 809 | goto out_finish; |
| 795 | 810 | ||
| 796 | for (;;) { | 811 | for (;;) { |
| 797 | error = swap_read_page(handle, page, NULL); /* sync */ | 812 | error = swap_read_page(handle, page[0], NULL); /* sync */ |
| 798 | if (error) | 813 | if (error) |
| 799 | break; | 814 | break; |
| 800 | 815 | ||
| 801 | cmp_len = *(size_t *)page; | 816 | cmp_len = *(size_t *)page[0]; |
| 802 | if (unlikely(!cmp_len || | 817 | if (unlikely(!cmp_len || |
| 803 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { | 818 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { |
| 804 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | 819 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); |
| @@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 806 | break; | 821 | break; |
| 807 | } | 822 | } |
| 808 | 823 | ||
| 809 | memcpy(cmp, page, PAGE_SIZE); | 824 | for (off = PAGE_SIZE, i = 1; |
| 810 | for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | 825 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { |
| 811 | error = swap_read_page(handle, page, NULL); /* sync */ | 826 | error = swap_read_page(handle, page[i], &bio); |
| 812 | if (error) | 827 | if (error) |
| 813 | goto out_finish; | 828 | goto out_finish; |
| 829 | } | ||
| 814 | 830 | ||
| 815 | memcpy(cmp + off, page, PAGE_SIZE); | 831 | error = hib_wait_on_bio_chain(&bio); /* need all data now */ |
| 832 | if (error) | ||
| 833 | goto out_finish; | ||
| 834 | |||
| 835 | for (off = 0, i = 0; | ||
| 836 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { | ||
| 837 | memcpy(cmp + off, page[i], PAGE_SIZE); | ||
| 816 | } | 838 | } |
| 817 | 839 | ||
| 818 | unc_len = LZO_UNC_SIZE; | 840 | unc_len = LZO_UNC_SIZE; |
| @@ -857,7 +879,8 @@ out_finish: | |||
| 857 | 879 | ||
| 858 | vfree(cmp); | 880 | vfree(cmp); |
| 859 | vfree(unc); | 881 | vfree(unc); |
| 860 | free_page((unsigned long)page); | 882 | for (i = 0; i < LZO_CMP_PAGES; i++) |
| 883 | free_page((unsigned long)page[i]); | ||
| 861 | 884 | ||
| 862 | return error; | 885 | return error; |
| 863 | } | 886 | } |
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..c36c3b9e8a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
| @@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
| 137 | free_all_swap_pages(data->swap); | 137 | free_all_swap_pages(data->swap); |
| 138 | if (data->frozen) | 138 | if (data->frozen) |
| 139 | thaw_processes(); | 139 | thaw_processes(); |
| 140 | pm_notifier_call_chain(data->mode == O_WRONLY ? | 140 | pm_notifier_call_chain(data->mode == O_RDONLY ? |
| 141 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 141 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
| 142 | atomic_inc(&snapshot_device_available); | 142 | atomic_inc(&snapshot_device_available); |
| 143 | 143 | ||
| @@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 263 | case SNAPSHOT_UNFREEZE: | 263 | case SNAPSHOT_UNFREEZE: |
| 264 | if (!data->frozen || data->ready) | 264 | if (!data->frozen || data->ready) |
| 265 | break; | 265 | break; |
| 266 | pm_restore_gfp_mask(); | ||
| 266 | thaw_processes(); | 267 | thaw_processes(); |
| 267 | usermodehelper_enable(); | 268 | usermodehelper_enable(); |
| 268 | data->frozen = 0; | 269 | data->frozen = 0; |
| @@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 275 | error = -EPERM; | 276 | error = -EPERM; |
| 276 | break; | 277 | break; |
| 277 | } | 278 | } |
| 279 | pm_restore_gfp_mask(); | ||
| 278 | error = hibernation_snapshot(data->platform_support); | 280 | error = hibernation_snapshot(data->platform_support); |
| 279 | if (!error) | 281 | if (!error) |
| 280 | error = put_user(in_suspend, (int __user *)arg); | 282 | error = put_user(in_suspend, (int __user *)arg); |
diff --git a/kernel/printk.c b/kernel/printk.c
index 9a2264fc42ca..a23315dc4498 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
| @@ -1082,13 +1082,15 @@ void printk_tick(void) | |||
| 1082 | 1082 | ||
| 1083 | int printk_needs_cpu(int cpu) | 1083 | int printk_needs_cpu(int cpu) |
| 1084 | { | 1084 | { |
| 1085 | if (unlikely(cpu_is_offline(cpu))) | ||
| 1086 | printk_tick(); | ||
| 1085 | return per_cpu(printk_pending, cpu); | 1087 | return per_cpu(printk_pending, cpu); |
| 1086 | } | 1088 | } |
| 1087 | 1089 | ||
| 1088 | void wake_up_klogd(void) | 1090 | void wake_up_klogd(void) |
| 1089 | { | 1091 | { |
| 1090 | if (waitqueue_active(&log_wait)) | 1092 | if (waitqueue_active(&log_wait)) |
| 1091 | __raw_get_cpu_var(printk_pending) = 1; | 1093 | this_cpu_write(printk_pending, 1); |
| 1092 | } | 1094 | } |
| 1093 | 1095 | ||
| 1094 | /** | 1096 | /** |
diff --git a/kernel/resource.c b/kernel/resource.c
index 9fad33efd0db..798e2fae2a06 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
| @@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource); | |||
| 40 | 40 | ||
| 41 | static DEFINE_RWLOCK(resource_lock); | 41 | static DEFINE_RWLOCK(resource_lock); |
| 42 | 42 | ||
| 43 | /* | ||
| 44 | * By default, we allocate free space bottom-up. The architecture can request | ||
| 45 | * top-down by clearing this flag. The user can override the architecture's | ||
| 46 | * choice with the "resource_alloc_from_bottom" kernel boot option, but that | ||
| 47 | * should only be a debugging tool. | ||
| 48 | */ | ||
| 49 | int resource_alloc_from_bottom = 1; | ||
| 50 | |||
| 51 | static __init int setup_alloc_from_bottom(char *s) | ||
| 52 | { | ||
| 53 | printk(KERN_INFO | ||
| 54 | "resource: allocating from bottom-up; please report a bug\n"); | ||
| 55 | resource_alloc_from_bottom = 1; | ||
| 56 | return 0; | ||
| 57 | } | ||
| 58 | early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); | ||
| 59 | |||
| 60 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 43 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
| 61 | { | 44 | { |
| 62 | struct resource *p = v; | 45 | struct resource *p = v; |
| @@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn) | |||
| 374 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | 357 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; |
| 375 | } | 358 | } |
| 376 | 359 | ||
| 360 | void __weak arch_remove_reservations(struct resource *avail) | ||
| 361 | { | ||
| 362 | } | ||
| 363 | |||
| 377 | static resource_size_t simple_align_resource(void *data, | 364 | static resource_size_t simple_align_resource(void *data, |
| 378 | const struct resource *avail, | 365 | const struct resource *avail, |
| 379 | resource_size_t size, | 366 | resource_size_t size, |
| @@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2) | |||
| 397 | } | 384 | } |
| 398 | 385 | ||
| 399 | /* | 386 | /* |
| 400 | * Find the resource before "child" in the sibling list of "root" children. | ||
| 401 | */ | ||
| 402 | static struct resource *find_sibling_prev(struct resource *root, struct resource *child) | ||
| 403 | { | ||
| 404 | struct resource *this; | ||
| 405 | |||
| 406 | for (this = root->child; this; this = this->sibling) | ||
| 407 | if (this->sibling == child) | ||
| 408 | return this; | ||
| 409 | |||
| 410 | return NULL; | ||
| 411 | } | ||
| 412 | |||
| 413 | /* | ||
| 414 | * Find empty slot in the resource tree given range and alignment. | 387 | * Find empty slot in the resource tree given range and alignment. |
| 415 | * This version allocates from the end of the root resource first. | ||
| 416 | */ | ||
| 417 | static int find_resource_from_top(struct resource *root, struct resource *new, | ||
| 418 | resource_size_t size, resource_size_t min, | ||
| 419 | resource_size_t max, resource_size_t align, | ||
| 420 | resource_size_t (*alignf)(void *, | ||
| 421 | const struct resource *, | ||
| 422 | resource_size_t, | ||
| 423 | resource_size_t), | ||
| 424 | void *alignf_data) | ||
| 425 | { | ||
| 426 | struct resource *this; | ||
| 427 | struct resource tmp, avail, alloc; | ||
| 428 | |||
| 429 | tmp.start = root->end; | ||
| 430 | tmp.end = root->end; | ||
| 431 | |||
| 432 | this = find_sibling_prev(root, NULL); | ||
| 433 | for (;;) { | ||
| 434 | if (this) { | ||
| 435 | if (this->end < root->end) | ||
| 436 | tmp.start = this->end + 1; | ||
| 437 | } else | ||
| 438 | tmp.start = root->start; | ||
| 439 | |||
| 440 | resource_clip(&tmp, min, max); | ||
| 441 | |||
| 442 | /* Check for overflow after ALIGN() */ | ||
| 443 | avail = *new; | ||
| 444 | avail.start = ALIGN(tmp.start, align); | ||
| 445 | avail.end = tmp.end; | ||
| 446 | if (avail.start >= tmp.start) { | ||
| 447 | alloc.start = alignf(alignf_data, &avail, size, align); | ||
| 448 | alloc.end = alloc.start + size - 1; | ||
| 449 | if (resource_contains(&avail, &alloc)) { | ||
| 450 | new->start = alloc.start; | ||
| 451 | new->end = alloc.end; | ||
| 452 | return 0; | ||
| 453 | } | ||
| 454 | } | ||
| 455 | |||
| 456 | if (!this || this->start == root->start) | ||
| 457 | break; | ||
| 458 | |||
| 459 | tmp.end = this->start - 1; | ||
| 460 | this = find_sibling_prev(root, this); | ||
| 461 | } | ||
| 462 | return -EBUSY; | ||
| 463 | } | ||
| 464 | |||
| 465 | /* | ||
| 466 | * Find empty slot in the resource tree given range and alignment. | ||
| 467 | * This version allocates from the beginning of the root resource first. | ||
| 468 | */ | 388 | */ |
| 469 | static int find_resource(struct resource *root, struct resource *new, | 389 | static int find_resource(struct resource *root, struct resource *new, |
| 470 | resource_size_t size, resource_size_t min, | 390 | resource_size_t size, resource_size_t min, |
| @@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 478 | struct resource *this = root->child; | 398 | struct resource *this = root->child; |
| 479 | struct resource tmp = *new, avail, alloc; | 399 | struct resource tmp = *new, avail, alloc; |
| 480 | 400 | ||
| 401 | tmp.flags = new->flags; | ||
| 481 | tmp.start = root->start; | 402 | tmp.start = root->start; |
| 482 | /* | 403 | /* |
| 483 | * Skip past an allocated resource that starts at 0, since the | 404 | * Skip past an allocated resource that starts at 0, since the assignment |
| 484 | * assignment of this->start - 1 to tmp->end below would cause an | 405 | * of this->start - 1 to tmp->end below would cause an underflow. |
| 485 | * underflow. | ||
| 486 | */ | 406 | */ |
| 487 | if (this && this->start == 0) { | 407 | if (this && this->start == 0) { |
| 488 | tmp.start = this->end + 1; | 408 | tmp.start = this->end + 1; |
| 489 | this = this->sibling; | 409 | this = this->sibling; |
| 490 | } | 410 | } |
| 491 | for (;;) { | 411 | for(;;) { |
| 492 | if (this) | 412 | if (this) |
| 493 | tmp.end = this->start - 1; | 413 | tmp.end = this->start - 1; |
| 494 | else | 414 | else |
| 495 | tmp.end = root->end; | 415 | tmp.end = root->end; |
| 496 | 416 | ||
| 497 | resource_clip(&tmp, min, max); | 417 | resource_clip(&tmp, min, max); |
| 418 | arch_remove_reservations(&tmp); | ||
| 498 | 419 | ||
| 499 | /* Check for overflow after ALIGN() */ | 420 | /* Check for overflow after ALIGN() */ |
| 500 | avail = *new; | 421 | avail = *new; |
| @@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 509 | return 0; | 430 | return 0; |
| 510 | } | 431 | } |
| 511 | } | 432 | } |
| 512 | |||
| 513 | if (!this) | 433 | if (!this) |
| 514 | break; | 434 | break; |
| 515 | |||
| 516 | tmp.start = this->end + 1; | 435 | tmp.start = this->end + 1; |
| 517 | this = this->sibling; | 436 | this = this->sibling; |
| 518 | } | 437 | } |
| @@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
| 545 | alignf = simple_align_resource; | 464 | alignf = simple_align_resource; |
| 546 | 465 | ||
| 547 | write_lock(&resource_lock); | 466 | write_lock(&resource_lock); |
| 548 | if (resource_alloc_from_bottom) | 467 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); |
| 549 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | ||
| 550 | else | ||
| 551 | err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); | ||
| 552 | if (err >= 0 && __request_resource(root, new)) | 468 | if (err >= 0 && __request_resource(root, new)) |
| 553 | err = -EBUSY; | 469 | err = -EBUSY; |
| 554 | write_unlock(&resource_lock); | 470 | write_unlock(&resource_lock); |
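
Besides removing the short-lived top-down allocator, the resource.c hunk adds arch_remove_reservations() as a __weak hook: the generic stub does nothing, and find_resource() calls it after resource_clip() so an architecture can trim reserved ranges out of the candidate window before alignment is checked. An illustrative strong override (the reserved-range bound is hypothetical):

```c
/* arch-side definition with the same signature replaces the __weak stub */
void arch_remove_reservations(struct resource *avail)
{
	/* keep allocations clear of a firmware-reserved low range */
	if (avail->start < arch_reserved_end)	/* hypothetical per-arch bound */
		avail->start = arch_reserved_end;
}
```
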
diff --git a/kernel/sched.c b/kernel/sched.c
index dc91a4d09ac3..297d1a0eedb0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -636,22 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 636 | 636 | ||
| 637 | #endif /* CONFIG_CGROUP_SCHED */ | 637 | #endif /* CONFIG_CGROUP_SCHED */ |
| 638 | 638 | ||
| 639 | static u64 irq_time_cpu(int cpu); | 639 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
| 640 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
| 641 | 640 | ||
| 642 | inline void update_rq_clock(struct rq *rq) | 641 | static void update_rq_clock(struct rq *rq) |
| 643 | { | 642 | { |
| 644 | if (!rq->skip_clock_update) { | 643 | s64 delta; |
| 645 | int cpu = cpu_of(rq); | ||
| 646 | u64 irq_time; | ||
| 647 | 644 | ||
| 648 | rq->clock = sched_clock_cpu(cpu); | 645 | if (rq->skip_clock_update) |
| 649 | irq_time = irq_time_cpu(cpu); | 646 | return; |
| 650 | if (rq->clock - irq_time > rq->clock_task) | ||
| 651 | rq->clock_task = rq->clock - irq_time; | ||
| 652 | 647 | ||
| 653 | sched_irq_time_avg_update(rq, irq_time); | 648 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
| 654 | } | 649 | rq->clock += delta; |
| 650 | update_rq_clock_task(rq, delta); | ||
| 655 | } | 651 | } |
| 656 | 652 | ||
| 657 | /* | 653 | /* |
| @@ -1924,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1924 | * They are read and saved off onto struct rq in update_rq_clock(). | 1920 | * They are read and saved off onto struct rq in update_rq_clock(). |
| 1925 | * This may result in other CPU reading this CPU's irq time and can | 1921 | * This may result in other CPU reading this CPU's irq time and can |
| 1926 | * race with irq/account_system_vtime on this CPU. We would either get old | 1922 | * race with irq/account_system_vtime on this CPU. We would either get old |
| 1927 | * or new value (or semi updated value on 32 bit) with a side effect of | 1923 | * or new value with a side effect of accounting a slice of irq time to wrong |
| 1928 | * accounting a slice of irq time to wrong task when irq is in progress | 1924 | * task when irq is in progress while we read rq->clock. That is a worthy |
| 1929 | * while we read rq->clock. That is a worthy compromise in place of having | 1925 | * compromise in place of having locks on each irq in account_system_time. |
| 1930 | * locks on each irq in account_system_time. | ||
| 1931 | */ | 1926 | */ |
| 1932 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | 1927 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); |
| 1933 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | 1928 | static DEFINE_PER_CPU(u64, cpu_softirq_time); |
| @@ -1945,19 +1940,58 @@ void disable_sched_clock_irqtime(void) | |||
| 1945 | sched_clock_irqtime = 0; | 1940 | sched_clock_irqtime = 0; |
| 1946 | } | 1941 | } |
| 1947 | 1942 | ||
| 1948 | static u64 irq_time_cpu(int cpu) | 1943 | #ifndef CONFIG_64BIT |
| 1944 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 1945 | |||
| 1946 | static inline void irq_time_write_begin(void) | ||
| 1949 | { | 1947 | { |
| 1950 | if (!sched_clock_irqtime) | 1948 | __this_cpu_inc(irq_time_seq.sequence); |
| 1951 | return 0; | 1949 | smp_wmb(); |
| 1950 | } | ||
| 1951 | |||
| 1952 | static inline void irq_time_write_end(void) | ||
| 1953 | { | ||
| 1954 | smp_wmb(); | ||
| 1955 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1956 | } | ||
| 1957 | |||
| 1958 | static inline u64 irq_time_read(int cpu) | ||
| 1959 | { | ||
| 1960 | u64 irq_time; | ||
| 1961 | unsigned seq; | ||
| 1952 | 1962 | ||
| 1963 | do { | ||
| 1964 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
| 1965 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
| 1966 | per_cpu(cpu_hardirq_time, cpu); | ||
| 1967 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
| 1968 | |||
| 1969 | return irq_time; | ||
| 1970 | } | ||
| 1971 | #else /* CONFIG_64BIT */ | ||
| 1972 | static inline void irq_time_write_begin(void) | ||
| 1973 | { | ||
| 1974 | } | ||
| 1975 | |||
| 1976 | static inline void irq_time_write_end(void) | ||
| 1977 | { | ||
| 1978 | } | ||
| 1979 | |||
| 1980 | static inline u64 irq_time_read(int cpu) | ||
| 1981 | { | ||
| 1953 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | 1982 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); |
| 1954 | } | 1983 | } |
| 1984 | #endif /* CONFIG_64BIT */ | ||
| 1955 | 1985 | ||
| 1986 | /* | ||
| 1987 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
| 1988 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
| 1989 | */ | ||
| 1956 | void account_system_vtime(struct task_struct *curr) | 1990 | void account_system_vtime(struct task_struct *curr) |
| 1957 | { | 1991 | { |
| 1958 | unsigned long flags; | 1992 | unsigned long flags; |
| 1993 | s64 delta; | ||
| 1959 | int cpu; | 1994 | int cpu; |
| 1960 | u64 now, delta; | ||
| 1961 | 1995 | ||
| 1962 | if (!sched_clock_irqtime) | 1996 | if (!sched_clock_irqtime) |
| 1963 | return; | 1997 | return; |
| @@ -1965,9 +1999,10 @@ void account_system_vtime(struct task_struct *curr) | |||
| 1965 | local_irq_save(flags); | 1999 | local_irq_save(flags); |
| 1966 | 2000 | ||
| 1967 | cpu = smp_processor_id(); | 2001 | cpu = smp_processor_id(); |
| 1968 | now = sched_clock_cpu(cpu); | 2002 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); |
| 1969 | delta = now - per_cpu(irq_start_time, cpu); | 2003 | __this_cpu_add(irq_start_time, delta); |
| 1970 | per_cpu(irq_start_time, cpu) = now; | 2004 | |
| 2005 | irq_time_write_begin(); | ||
| 1971 | /* | 2006 | /* |
| 1972 | * We do not account for softirq time from ksoftirqd here. | 2007 | * We do not account for softirq time from ksoftirqd here. |
| 1973 | * We want to continue accounting softirq time to ksoftirqd thread | 2008 | * We want to continue accounting softirq time to ksoftirqd thread |
| @@ -1975,33 +2010,55 @@ void account_system_vtime(struct task_struct *curr) | |||
| 1975 | * that do not consume any time, but still wants to run. | 2010 | * that do not consume any time, but still wants to run. |
| 1976 | */ | 2011 | */ |
| 1977 | if (hardirq_count()) | 2012 | if (hardirq_count()) |
| 1978 | per_cpu(cpu_hardirq_time, cpu) += delta; | 2013 | __this_cpu_add(cpu_hardirq_time, delta); |
| 1979 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 2014 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) |
| 1980 | per_cpu(cpu_softirq_time, cpu) += delta; | 2015 | __this_cpu_add(cpu_softirq_time, delta); |
| 1981 | 2016 | ||
| 2017 | irq_time_write_end(); | ||
| 1982 | local_irq_restore(flags); | 2018 | local_irq_restore(flags); |
| 1983 | } | 2019 | } |
| 1984 | EXPORT_SYMBOL_GPL(account_system_vtime); | 2020 | EXPORT_SYMBOL_GPL(account_system_vtime); |
| 1985 | 2021 | ||
| 1986 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | 2022 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
| 1987 | { | 2023 | { |
| 1988 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | 2024 | s64 irq_delta; |
| 1989 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | 2025 | |
| 1990 | rq->prev_irq_time = curr_irq_time; | 2026 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
| 1991 | sched_rt_avg_update(rq, delta_irq); | 2027 | |
| 1992 | } | 2028 | /* |
| 2029 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
| 2030 | * this case when a previous update_rq_clock() happened inside a | ||
| 2031 | * {soft,}irq region. | ||
| 2032 | * | ||
| 2033 | * When this happens, we stop ->clock_task and only update the | ||
| 2034 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
| 2035 | * update will consume the rest. This ensures ->clock_task is | ||
| 2036 | * monotonic. | ||
| 2037 | * | ||
| 2038 | * It does however cause some slight misattribution of {soft,}irq | ||
| 2039 | * time, a more accurate solution would be to update the irq_time using | ||
| 2040 | * the current rq->clock timestamp, except that would require using | ||
| 2041 | * atomic ops. | ||
| 2042 | */ | ||
| 2043 | if (irq_delta > delta) | ||
| 2044 | irq_delta = delta; | ||
| 2045 | |||
| 2046 | rq->prev_irq_time += irq_delta; | ||
| 2047 | delta -= irq_delta; | ||
| 2048 | rq->clock_task += delta; | ||
| 2049 | |||
| 2050 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
| 2051 | sched_rt_avg_update(rq, irq_delta); | ||
| 1993 | } | 2052 | } |
| 1994 | 2053 | ||
| 1995 | #else | 2054 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 1996 | 2055 | ||
| 1997 | static u64 irq_time_cpu(int cpu) | 2056 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
| 1998 | { | 2057 | { |
| 1999 | return 0; | 2058 | rq->clock_task += delta; |
| 2000 | } | 2059 | } |
| 2001 | 2060 | ||
| 2002 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | 2061 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 2003 | |||
| 2004 | #endif | ||
| 2005 | 2062 | ||
| 2006 | #include "sched_idletask.c" | 2063 | #include "sched_idletask.c" |
| 2007 | #include "sched_fair.c" | 2064 | #include "sched_fair.c" |
| @@ -2129,7 +2186,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 2129 | * A queue event has occurred, and we're going to schedule. In | 2186 | * A queue event has occurred, and we're going to schedule. In |
| 2130 | * this case, we can save a useless back to back clock update. | 2187 | * this case, we can save a useless back to back clock update. |
| 2131 | */ | 2188 | */ |
| 2132 | if (test_tsk_need_resched(rq->curr)) | 2189 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) |
| 2133 | rq->skip_clock_update = 1; | 2190 | rq->skip_clock_update = 1; |
| 2134 | } | 2191 | } |
| 2135 | 2192 | ||
| @@ -3119,6 +3176,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
| 3119 | return delta; | 3176 | return delta; |
| 3120 | } | 3177 | } |
| 3121 | 3178 | ||
| 3179 | static unsigned long | ||
| 3180 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
| 3181 | { | ||
| 3182 | load *= exp; | ||
| 3183 | load += active * (FIXED_1 - exp); | ||
| 3184 | load += 1UL << (FSHIFT - 1); | ||
| 3185 | return load >> FSHIFT; | ||
| 3186 | } | ||
| 3187 | |||
| 3122 | #ifdef CONFIG_NO_HZ | 3188 | #ifdef CONFIG_NO_HZ |
| 3123 | /* | 3189 | /* |
| 3124 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3190 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
| @@ -3148,6 +3214,128 @@ static long calc_load_fold_idle(void) | |||
| 3148 | 3214 | ||
| 3149 | return delta; | 3215 | return delta; |
| 3150 | } | 3216 | } |
| 3217 | |||
| 3218 | /** | ||
| 3219 | * fixed_power_int - compute: x^n, in O(log n) time | ||
| 3220 | * | ||
| 3221 | * @x: base of the power | ||
| 3222 | * @frac_bits: fractional bits of @x | ||
| 3223 | * @n: power to raise @x to. | ||
| 3224 | * | ||
| 3225 | * By exploiting the relation between the definition of the natural power | ||
| 3226 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
| 3227 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
| 3228 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
| 3229 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
| 3230 | * of course trivially computable in O(log_2 n), the length of our binary | ||
| 3231 | * vector. | ||
| 3232 | */ | ||
| 3233 | static unsigned long | ||
| 3234 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
| 3235 | { | ||
| 3236 | unsigned long result = 1UL << frac_bits; | ||
| 3237 | |||
| 3238 | if (n) for (;;) { | ||
| 3239 | if (n & 1) { | ||
| 3240 | result *= x; | ||
| 3241 | result += 1UL << (frac_bits - 1); | ||
| 3242 | result >>= frac_bits; | ||
| 3243 | } | ||
| 3244 | n >>= 1; | ||
| 3245 | if (!n) | ||
| 3246 | break; | ||
| 3247 | x *= x; | ||
| 3248 | x += 1UL << (frac_bits - 1); | ||
| 3249 | x >>= frac_bits; | ||
| 3250 | } | ||
| 3251 | |||
| 3252 | return result; | ||
| 3253 | } | ||
| 3254 | |||
| 3255 | /* | ||
| 3256 | * a1 = a0 * e + a * (1 - e) | ||
| 3257 | * | ||
| 3258 | * a2 = a1 * e + a * (1 - e) | ||
| 3259 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
| 3260 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
| 3261 | * | ||
| 3262 | * a3 = a2 * e + a * (1 - e) | ||
| 3263 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
| 3264 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
| 3265 | * | ||
| 3266 | * ... | ||
| 3267 | * | ||
| 3268 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
| 3269 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
| 3270 | * = a0 * e^n + a * (1 - e^n) | ||
| 3271 | * | ||
| 3272 | * [1] application of the geometric series: | ||
| 3273 | * | ||
| 3274 | * n 1 - x^(n+1) | ||
| 3275 | * S_n := \Sum x^i = ------------- | ||
| 3276 | * i=0 1 - x | ||
| 3277 | */ | ||
| 3278 | static unsigned long | ||
| 3279 | calc_load_n(unsigned long load, unsigned long exp, | ||
| 3280 | unsigned long active, unsigned int n) | ||
| 3281 | { | ||
| 3282 | |||
| 3283 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
| 3284 | } | ||
| 3285 | |||
| 3286 | /* | ||
| 3287 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
| 3288 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
| 3289 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
| 3290 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
| 3291 | * | ||
| 3292 | * Once we've updated the global active value, we need to apply the exponential | ||
| 3293 | * weights adjusted to the number of cycles missed. | ||
| 3294 | */ | ||
| 3295 | static void calc_global_nohz(unsigned long ticks) | ||
| 3296 | { | ||
| 3297 | long delta, active, n; | ||
| 3298 | |||
| 3299 | if (time_before(jiffies, calc_load_update)) | ||
| 3300 | return; | ||
| 3301 | |||
| 3302 | /* | ||
| 3303 | * If we crossed a calc_load_update boundary, make sure to fold | ||
| 3304 | * any pending idle changes, the respective CPUs might have | ||
| 3305 | * missed the tick driven calc_load_account_active() update | ||
| 3306 | * due to NO_HZ. | ||
| 3307 | */ | ||
| 3308 | delta = calc_load_fold_idle(); | ||
| 3309 | if (delta) | ||
| 3310 | atomic_long_add(delta, &calc_load_tasks); | ||
| 3311 | |||
| 3312 | /* | ||
| 3313 | * If we were idle for multiple load cycles, apply them. | ||
| 3314 | */ | ||
| 3315 | if (ticks >= LOAD_FREQ) { | ||
| 3316 | n = ticks / LOAD_FREQ; | ||
| 3317 | |||
| 3318 | active = atomic_long_read(&calc_load_tasks); | ||
| 3319 | active = active > 0 ? active * FIXED_1 : 0; | ||
| 3320 | |||
| 3321 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
| 3322 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
| 3323 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
| 3324 | |||
| 3325 | calc_load_update += n * LOAD_FREQ; | ||
| 3326 | } | ||
| 3327 | |||
| 3328 | /* | ||
| 3329 | * It's possible the remainder of the above division also crosses | ||
| 3330 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
| 3331 | * which comes after this will take care of that. | ||
| 3332 | * | ||
| 3333 | * Consider us being 11 ticks before a cycle completion, and us | ||
| 3334 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
| 3335 | * age us 4 cycles, and the test in calc_global_load() will | ||
| 3336 | * pick up the final one. | ||
| 3337 | */ | ||
| 3338 | } | ||
| 3151 | #else | 3339 | #else |
| 3152 | static void calc_load_account_idle(struct rq *this_rq) | 3340 | static void calc_load_account_idle(struct rq *this_rq) |
| 3153 | { | 3341 | { |
| @@ -3157,6 +3345,10 @@ static inline long calc_load_fold_idle(void) | |||
| 3157 | { | 3345 | { |
| 3158 | return 0; | 3346 | return 0; |
| 3159 | } | 3347 | } |
| 3348 | |||
| 3349 | static void calc_global_nohz(unsigned long ticks) | ||
| 3350 | { | ||
| 3351 | } | ||
| 3160 | #endif | 3352 | #endif |
| 3161 | 3353 | ||
| 3162 | /** | 3354 | /** |
| @@ -3174,24 +3366,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
| 3174 | loads[2] = (avenrun[2] + offset) << shift; | 3366 | loads[2] = (avenrun[2] + offset) << shift; |
| 3175 | } | 3367 | } |
| 3176 | 3368 | ||
| 3177 | static unsigned long | ||
| 3178 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
| 3179 | { | ||
| 3180 | load *= exp; | ||
| 3181 | load += active * (FIXED_1 - exp); | ||
| 3182 | return load >> FSHIFT; | ||
| 3183 | } | ||
| 3184 | |||
| 3185 | /* | 3369 | /* |
| 3186 | * calc_load - update the avenrun load estimates 10 ticks after the | 3370 | * calc_load - update the avenrun load estimates 10 ticks after the |
| 3187 | * CPUs have updated calc_load_tasks. | 3371 | * CPUs have updated calc_load_tasks. |
| 3188 | */ | 3372 | */ |
| 3189 | void calc_global_load(void) | 3373 | void calc_global_load(unsigned long ticks) |
| 3190 | { | 3374 | { |
| 3191 | unsigned long upd = calc_load_update + 10; | ||
| 3192 | long active; | 3375 | long active; |
| 3193 | 3376 | ||
| 3194 | if (time_before(jiffies, upd)) | 3377 | calc_global_nohz(ticks); |
| 3378 | |||
| 3379 | if (time_before(jiffies, calc_load_update + 10)) | ||
| 3195 | return; | 3380 | return; |
| 3196 | 3381 | ||
| 3197 | active = atomic_long_read(&calc_load_tasks); | 3382 | active = atomic_long_read(&calc_load_tasks); |
| @@ -3845,7 +4030,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
| 3845 | { | 4030 | { |
| 3846 | if (prev->se.on_rq) | 4031 | if (prev->se.on_rq) |
| 3847 | update_rq_clock(rq); | 4032 | update_rq_clock(rq); |
| 3848 | rq->skip_clock_update = 0; | ||
| 3849 | prev->sched_class->put_prev_task(rq, prev); | 4033 | prev->sched_class->put_prev_task(rq, prev); |
| 3850 | } | 4034 | } |
| 3851 | 4035 | ||
| @@ -3903,7 +4087,6 @@ need_resched_nonpreemptible: | |||
| 3903 | hrtick_clear(rq); | 4087 | hrtick_clear(rq); |
| 3904 | 4088 | ||
| 3905 | raw_spin_lock_irq(&rq->lock); | 4089 | raw_spin_lock_irq(&rq->lock); |
| 3906 | clear_tsk_need_resched(prev); | ||
| 3907 | 4090 | ||
| 3908 | switch_count = &prev->nivcsw; | 4091 | switch_count = &prev->nivcsw; |
| 3909 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 4092 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| @@ -3935,6 +4118,8 @@ need_resched_nonpreemptible: | |||
| 3935 | 4118 | ||
| 3936 | put_prev_task(rq, prev); | 4119 | put_prev_task(rq, prev); |
| 3937 | next = pick_next_task(rq); | 4120 | next = pick_next_task(rq); |
| 4121 | clear_tsk_need_resched(prev); | ||
| 4122 | rq->skip_clock_update = 0; | ||
| 3938 | 4123 | ||
| 3939 | if (likely(prev != next)) { | 4124 | if (likely(prev != next)) { |
| 3940 | sched_info_switch(prev, next); | 4125 | sched_info_switch(prev, next); |
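
The sched.c hunks above give the per-CPU irq-time counters a 64-bit-safe read side: on !CONFIG_64BIT the two u64 counters are published under a per-CPU seqcount, so a reader on another CPU never sees a torn or half-updated sum, while the companion update_rq_clock_task() hunk clamps irq_delta to the observed clock delta so rq->clock_task stays monotonic. Below is a minimal user-space sketch of that seqcount pattern, assuming single-writer semantics; the demo_* names and the __sync_synchronize() barriers are stand-ins for the kernel's seqcount_t and smp_wmb(), not the real API.

```c
#include <stdint.h>
#include <stdio.h>

static unsigned int demo_seq;           /* per-CPU irq_time_seq analogue    */
static uint64_t demo_hardirq_time;      /* cpu_hardirq_time analogue        */
static uint64_t demo_softirq_time;      /* cpu_softirq_time analogue        */

static void demo_write_begin(void)
{
	demo_seq++;                     /* __this_cpu_inc(irq_time_seq.sequence) */
	__sync_synchronize();           /* smp_wmb() stand-in                    */
}

static void demo_write_end(void)
{
	__sync_synchronize();
	demo_seq++;
}

static uint64_t demo_read(void)
{
	unsigned int seq;
	uint64_t sum;

	do {
		seq = demo_seq;                          /* read_seqcount_begin() */
		__sync_synchronize();
		sum = demo_softirq_time + demo_hardirq_time;
		__sync_synchronize();
	} while ((seq & 1) || seq != demo_seq);          /* read_seqcount_retry() */

	return sum;
}

int main(void)
{
	demo_write_begin();
	demo_hardirq_time += 1000;      /* __this_cpu_add(cpu_hardirq_time, delta) */
	demo_write_end();

	printf("irq time: %llu ns\n", (unsigned long long)demo_read());
	return 0;
}
```

The calc_load()/fixed_power_int()/calc_load_n() additions are plain fixed-point arithmetic and can be exercised outside the kernel. The sketch below reproduces that math stand-alone; FSHIFT and the EXP_* decay constants match include/linux/sched.h, while the scenario in main() is made up purely for illustration.

```c
#include <stdio.h>

#define FSHIFT	11			/* bits of fractional precision      */
#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed point                */
#define EXP_1	1884			/* FIXED_1 * exp(-5s/1min)           */
#define EXP_5	2014			/* FIXED_1 * exp(-5s/5min)           */
#define EXP_15	2037			/* FIXED_1 * exp(-5s/15min)          */

/* One decay step: load = load*e + active*(1-e), rounded to nearest. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

/* x^n in fixed point, O(log n) via binary exponentiation. */
static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
				     unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

/* n missed cycles folded into one step: a_n = a_0*e^n + a*(1 - e^n). */
static unsigned long calc_load_n(unsigned long load, unsigned long exp,
				 unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	unsigned long avenrun = 0;		/* 1-minute average, fixed point */
	unsigned long active = 2 * FIXED_1;	/* pretend 2 runnable tasks      */

	for (int i = 0; i < 10; i++)		/* ten regular 5 s samples...    */
		avenrun = calc_load(avenrun, EXP_1, active);

	/* ...then an idle stretch that skipped 12 sample periods (active == 0) */
	avenrun = calc_load_n(avenrun, EXP_1, 0, 12);

	printf("loadavg ~ %lu.%02lu\n", avenrun >> FSHIFT,
	       (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}
```

Note the `load += 1UL << (FSHIFT - 1)` term: the rewritten helper rounds to nearest instead of truncating, and calc_load_n() applies n missed cycles at once via the geometric-series identity derived in the comment block above, which is exactly what calc_global_nohz() needs after a long NO_HZ idle period.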
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 52ab113d8bb9..00ebd7686676 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -1758,10 +1758,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
| 1758 | set_task_cpu(p, this_cpu); | 1758 | set_task_cpu(p, this_cpu); |
| 1759 | activate_task(this_rq, p, 0); | 1759 | activate_task(this_rq, p, 0); |
| 1760 | check_preempt_curr(this_rq, p, 0); | 1760 | check_preempt_curr(this_rq, p, 0); |
| 1761 | |||
| 1762 | /* re-arm NEWIDLE balancing when moving tasks */ | ||
| 1763 | src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; | ||
| 1764 | this_rq->idle_stamp = 0; | ||
| 1765 | } | 1761 | } |
| 1766 | 1762 | ||
| 1767 | /* | 1763 | /* |
| @@ -3219,8 +3215,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3219 | interval = msecs_to_jiffies(sd->balance_interval); | 3215 | interval = msecs_to_jiffies(sd->balance_interval); |
| 3220 | if (time_after(next_balance, sd->last_balance + interval)) | 3216 | if (time_after(next_balance, sd->last_balance + interval)) |
| 3221 | next_balance = sd->last_balance + interval; | 3217 | next_balance = sd->last_balance + interval; |
| 3222 | if (pulled_task) | 3218 | if (pulled_task) { |
| 3219 | this_rq->idle_stamp = 0; | ||
| 3223 | break; | 3220 | break; |
| 3221 | } | ||
| 3224 | } | 3222 | } |
| 3225 | 3223 | ||
| 3226 | raw_spin_lock(&this_rq->lock); | 3224 | raw_spin_lock(&this_rq->lock); |
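
The sched_fair.c hunk stops pull_task() from unconditionally re-arming avg_idle and clearing idle_stamp on every migration; idle_stamp is now cleared only when idle_balance() actually pulled a task, so the wakeup path keeps an honest estimate of how long this CPU tends to stay idle. A rough self-contained model of that bookkeeping follows; the struct, constant, and helper names are illustrative, and only the shape of the EWMA update mirrors the scheduler.

```c
#include <stdio.h>

#define MIGRATION_COST_NS	500000ULL	/* ~sysctl_sched_migration_cost */

struct demo_rq {
	unsigned long long clock;	/* ns                                  */
	unsigned long long idle_stamp;	/* when the CPU went idle, 0 if busy   */
	unsigned long long avg_idle;	/* EWMA of recent idle period lengths  */
};

/* Wakeup path: fold the idle period that just ended into avg_idle. */
static void demo_update_avg_idle(struct demo_rq *rq)
{
	if (!rq->idle_stamp)
		return;

	unsigned long long delta = rq->clock - rq->idle_stamp;
	unsigned long long max = 2 * MIGRATION_COST_NS;

	if (delta > max)
		delta = max;

	long long diff = (long long)delta - (long long)rq->avg_idle;
	rq->avg_idle += diff / 8;		/* update_avg()-style smoothing */
	rq->idle_stamp = 0;
}

/* NEWIDLE balancing only pays off if idle periods outlast a migration. */
static int demo_worth_newidle_balance(const struct demo_rq *rq)
{
	return rq->avg_idle >= MIGRATION_COST_NS;
}

int main(void)
{
	struct demo_rq rq = { .avg_idle = MIGRATION_COST_NS };

	rq.idle_stamp = 1000000;	/* went idle at t = 1.0 ms */
	rq.clock = 1800000;		/* woke up at  t = 1.8 ms  */
	demo_update_avg_idle(&rq);

	printf("avg_idle=%lluns, balance=%d\n",
	       rq.avg_idle, demo_worth_newidle_balance(&rq));
	return 0;
}
```

Since idle_balance() skips NEWIDLE balancing entirely when avg_idle falls below the migration cost, force-setting avg_idle from pull_task() on every migration was clobbering that estimate for no benefit.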
diff --git a/kernel/timer.c b/kernel/timer.c index 68a9ae7679b7..353b9227c2ec 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now) | |||
| 1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
| 1253 | unsigned long expires; | 1253 | unsigned long expires; |
| 1254 | 1254 | ||
| 1255 | /* | ||
| 1256 | * Pretend that there is no timer pending if the cpu is offline. | ||
| 1257 | * Possible pending timers will be migrated later to an active cpu. | ||
| 1258 | */ | ||
| 1259 | if (cpu_is_offline(smp_processor_id())) | ||
| 1260 | return now + NEXT_TIMER_MAX_DELTA; | ||
| 1255 | spin_lock(&base->lock); | 1261 | spin_lock(&base->lock); |
| 1256 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1262 | if (time_before_eq(base->next_timer, base->timer_jiffies)) |
| 1257 | base->next_timer = __next_timer_interrupt(base); | 1263 | base->next_timer = __next_timer_interrupt(base); |
| @@ -1319,7 +1325,7 @@ void do_timer(unsigned long ticks) | |||
| 1319 | { | 1325 | { |
| 1320 | jiffies_64 += ticks; | 1326 | jiffies_64 += ticks; |
| 1321 | update_wall_time(); | 1327 | update_wall_time(); |
| 1322 | calc_global_load(); | 1328 | calc_global_load(ticks); |
| 1323 | } | 1329 | } |
| 1324 | 1330 | ||
| 1325 | #ifdef __ARCH_WANT_SYS_ALARM | 1331 | #ifdef __ARCH_WANT_SYS_ALARM |
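
Two small timer.c changes: an offline CPU now reports "nothing pending for NEXT_TIMER_MAX_DELTA jiffies" so the NO_HZ idle code does not consult a dying CPU's timer wheel, and do_timer() forwards its tick count to calc_global_load() so the load-average code above can distinguish a normal one-tick update from a multi-tick NO_HZ catch-up. A tiny self-contained model of that second contract, with demo_* names standing in for the kernel symbols and constants picked for HZ=250:

```c
#include <stdio.h>

#define DEMO_LOAD_FREQ	(5 * 250 + 1)	/* kernel LOAD_FREQ: (5*HZ+1), one sample per 5 s */

static unsigned long long demo_jiffies_64;

static void demo_calc_global_load(unsigned long ticks)
{
	/* In the kernel, calc_global_nohz() uses 'ticks' to age avenrun[]
	 * by ticks / LOAD_FREQ whole sample periods in a single call. */
	printf("ticks=%lu -> %lu missed load sample(s), jiffies=%llu\n",
	       ticks, ticks / DEMO_LOAD_FREQ, demo_jiffies_64);
}

static void demo_do_timer(unsigned long ticks)
{
	demo_jiffies_64 += ticks;	/* jiffies_64 += ticks;     */
	demo_calc_global_load(ticks);	/* calc_global_load(ticks); */
}

int main(void)
{
	demo_do_timer(1);	/* ordinary periodic tick                    */
	demo_do_timer(3200);	/* NO_HZ catch-up after ~12.8 s of idle time */
	return 0;
}
```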
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 042084157980..f8cf959bad45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -1283,6 +1283,8 @@ void trace_dump_stack(void) | |||
| 1283 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); | 1283 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); |
| 1284 | } | 1284 | } |
| 1285 | 1285 | ||
| 1286 | static DEFINE_PER_CPU(int, user_stack_count); | ||
| 1287 | |||
| 1286 | void | 1288 | void |
| 1287 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | 1289 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) |
| 1288 | { | 1290 | { |
| @@ -1301,6 +1303,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
| 1301 | if (unlikely(in_nmi())) | 1303 | if (unlikely(in_nmi())) |
| 1302 | return; | 1304 | return; |
| 1303 | 1305 | ||
| 1306 | /* | ||
| 1307 | * prevent recursion, since the user stack tracing may | ||
| 1308 | * trigger other kernel events. | ||
| 1309 | */ | ||
| 1310 | preempt_disable(); | ||
| 1311 | if (__this_cpu_read(user_stack_count)) | ||
| 1312 | goto out; | ||
| 1313 | |||
| 1314 | __this_cpu_inc(user_stack_count); | ||
| 1315 | |||
| 1316 | |||
| 1317 | |||
| 1304 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, | 1318 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, |
| 1305 | sizeof(*entry), flags, pc); | 1319 | sizeof(*entry), flags, pc); |
| 1306 | if (!event) | 1320 | if (!event) |
| @@ -1318,6 +1332,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
| 1318 | save_stack_trace_user(&trace); | 1332 | save_stack_trace_user(&trace); |
| 1319 | if (!filter_check_discard(call, entry, buffer, event)) | 1333 | if (!filter_check_discard(call, entry, buffer, event)) |
| 1320 | ring_buffer_unlock_commit(buffer, event); | 1334 | ring_buffer_unlock_commit(buffer, event); |
| 1335 | |||
| 1336 | __this_cpu_dec(user_stack_count); | ||
| 1337 | |||
| 1338 | out: | ||
| 1339 | preempt_enable(); | ||
| 1321 | } | 1340 | } |
| 1322 | 1341 | ||
| 1323 | #ifdef UNUSED | 1342 | #ifdef UNUSED |
| @@ -2319,11 +2338,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, | |||
| 2319 | return count; | 2338 | return count; |
| 2320 | } | 2339 | } |
| 2321 | 2340 | ||
| 2341 | static loff_t tracing_seek(struct file *file, loff_t offset, int origin) | ||
| 2342 | { | ||
| 2343 | if (file->f_mode & FMODE_READ) | ||
| 2344 | return seq_lseek(file, offset, origin); | ||
| 2345 | else | ||
| 2346 | return 0; | ||
| 2347 | } | ||
| 2348 | |||
| 2322 | static const struct file_operations tracing_fops = { | 2349 | static const struct file_operations tracing_fops = { |
| 2323 | .open = tracing_open, | 2350 | .open = tracing_open, |
| 2324 | .read = seq_read, | 2351 | .read = seq_read, |
| 2325 | .write = tracing_write_stub, | 2352 | .write = tracing_write_stub, |
| 2326 | .llseek = seq_lseek, | 2353 | .llseek = tracing_seek, |
| 2327 | .release = tracing_release, | 2354 | .release = tracing_release, |
| 2328 | }; | 2355 | }; |
| 2329 | 2356 | ||
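
The trace.c hunk guards ftrace_trace_userstack() against re-entry: walking the user stack can fault, the fault can generate another trace event, and without a guard that loops forever. The kernel uses preempt_disable() plus a per-CPU counter; the sketch below models the same idea in user space with thread-local storage standing in for the per-CPU variable (all demo_* names are illustrative).

```c
#include <stdio.h>

static __thread int demo_stack_count;   /* user_stack_count analogue (per CPU) */

static void demo_record_user_stack(void);

static void demo_event_hook(const char *what)
{
	/* Pretend this fires for a fault taken while copying the user stack;
	 * without the guard it would recurse into the recorder forever. */
	printf("nested event: %s\n", what);
	demo_record_user_stack();
}

static void demo_record_user_stack(void)
{
	if (demo_stack_count)           /* already recording on this CPU: bail */
		return;
	demo_stack_count++;             /* __this_cpu_inc(user_stack_count)    */

	demo_event_hook("fault while walking the user stack");

	demo_stack_count--;             /* __this_cpu_dec(user_stack_count)    */
}

int main(void)
{
	demo_record_user_stack();
	return 0;
}
```

The tracing_seek() addition is related hygiene: the "trace" file can be opened write-only (to clear the buffer), in which case no seq_file is attached to it, so llseek must not be routed to seq_lseek() in that mode.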
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 90db1bd1a978..e785b0f2aea5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -661,7 +661,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
| 661 | { | 661 | { |
| 662 | struct worker *worker = kthread_data(task); | 662 | struct worker *worker = kthread_data(task); |
| 663 | 663 | ||
| 664 | if (likely(!(worker->flags & WORKER_NOT_RUNNING))) | 664 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
| 665 | atomic_inc(get_gcwq_nr_running(cpu)); | 665 | atomic_inc(get_gcwq_nr_running(cpu)); |
| 666 | } | 666 | } |
| 667 | 667 | ||
| @@ -687,7 +687,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
| 687 | struct global_cwq *gcwq = get_gcwq(cpu); | 687 | struct global_cwq *gcwq = get_gcwq(cpu); |
| 688 | atomic_t *nr_running = get_gcwq_nr_running(cpu); | 688 | atomic_t *nr_running = get_gcwq_nr_running(cpu); |
| 689 | 689 | ||
| 690 | if (unlikely(worker->flags & WORKER_NOT_RUNNING)) | 690 | if (worker->flags & WORKER_NOT_RUNNING) |
| 691 | return NULL; | 691 | return NULL; |
| 692 | 692 | ||
| 693 | /* this can only happen on the local cpu */ | 693 | /* this can only happen on the local cpu */ |
| @@ -3692,7 +3692,8 @@ static int __init init_workqueues(void) | |||
| 3692 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | 3692 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); |
| 3693 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | 3693 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, |
| 3694 | WQ_UNBOUND_MAX_ACTIVE); | 3694 | WQ_UNBOUND_MAX_ACTIVE); |
| 3695 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); | 3695 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || |
| 3696 | !system_unbound_wq); | ||
| 3696 | return 0; | 3697 | return 0; |
| 3697 | } | 3698 | } |
| 3698 | early_initcall(init_workqueues); | 3699 | early_initcall(init_workqueues); |
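
For context on the two workqueue.c hook tweaks: wq_worker_waking_up() and wq_worker_sleeping() maintain the per-CPU count of concurrency-managed workers that are actually runnable, and the likely()/unlikely() hints were dropped because they did not match how often WORKER_NOT_RUNNING is really set. The final hunk simply extends the init-time BUG_ON to cover system_unbound_wq as well. Below is a rough model of what those two hooks keep in balance, with demo_* stand-ins for the gcwq's atomic nr_running and worker->flags.

```c
#include <stdio.h>

#define DEMO_WORKER_NOT_RUNNING	0x1	/* rebind/idle/... flags collapsed into one bit */

struct demo_worker {
	unsigned int flags;
};

static int demo_nr_running;		/* atomic_t per gcwq in the kernel */

/* Scheduler wakeup hook: the worker starts counting as running again. */
static void demo_worker_waking_up(struct demo_worker *w)
{
	if (!(w->flags & DEMO_WORKER_NOT_RUNNING))
		demo_nr_running++;	/* atomic_inc(get_gcwq_nr_running(cpu)) */
}

/* Scheduler sleep hook: if the last running worker blocks, another one
 * should be woken so the CPU keeps processing work items. */
static void demo_worker_sleeping(struct demo_worker *w)
{
	if (w->flags & DEMO_WORKER_NOT_RUNNING)
		return;
	if (--demo_nr_running == 0)
		printf("last running worker went to sleep: wake an idle one\n");
}

int main(void)
{
	struct demo_worker w = { .flags = 0 };

	demo_worker_waking_up(&w);
	demo_worker_sleeping(&w);
	return 0;
}
```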
