path: root/kernel
author    Paul Mackerras <paulus@samba.org>    2006-07-31 20:37:25 -0400
committer Paul Mackerras <paulus@samba.org>    2006-07-31 20:37:25 -0400
commit    57cad8084e0837e0f2c97da789ec9b3f36809be9 (patch)
tree      e9c790afb4286f78cb08d9664f58baa7e876fe55 /kernel
parent    cb18bd40030c879cd93fef02fd579f74dbab473d (diff)
parent    49b1e3ea19b1c95c2f012b8331ffb3b169e4c042 (diff)
Merge branch 'merge'
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile           |    2
-rw-r--r--  kernel/acct.c             |    4
-rw-r--r--  kernel/cpu.c              |   75
-rw-r--r--  kernel/cpuset.c           |   24
-rw-r--r--  kernel/delayacct.c        |  178
-rw-r--r--  kernel/exit.c             |   10
-rw-r--r--  kernel/fork.c             |   10
-rw-r--r--  kernel/futex.c            |  127
-rw-r--r--  kernel/futex_compat.c     |   34
-rw-r--r--  kernel/hrtimer.c          |    4
-rw-r--r--  kernel/irq/manage.c       |   28
-rw-r--r--  kernel/kallsyms.c         |    4
-rw-r--r--  kernel/kprobes.c          |    1
-rw-r--r--  kernel/kthread.c          |   24
-rw-r--r--  kernel/lockdep.c          |  136
-rw-r--r--  kernel/module.c           |   11
-rw-r--r--  kernel/panic.c            |    2
-rw-r--r--  kernel/power/pm.c         |   37
-rw-r--r--  kernel/power/snapshot.c   |   10
-rw-r--r--  kernel/power/swap.c       |   26
-rw-r--r--  kernel/printk.c           |    4
-rw-r--r--  kernel/rcupdate.c         |    4
-rw-r--r--  kernel/resource.c         |    2
-rw-r--r--  kernel/rtmutex-tester.c   |    1
-rw-r--r--  kernel/rtmutex.c          |    2
-rw-r--r--  kernel/sched.c            |  125
-rw-r--r--  kernel/softirq.c          |   24
-rw-r--r--  kernel/softlockup.c       |    4
-rw-r--r--  kernel/sys.c              |    2
-rw-r--r--  kernel/taskstats.c        |  564
-rw-r--r--  kernel/timer.c            |  113
-rw-r--r--  kernel/wait.c             |    8
-rw-r--r--  kernel/workqueue.c        |   58
33 files changed, 1310 insertions, 348 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 47dbcd570cd8..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index f18e0b8df3e1..2a7c933651c7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -488,7 +488,7 @@ static void do_acct_process(struct file *file)
         old_encode_dev(tty_devnum(current->signal->tty)) : 0;
         read_unlock(&tasklist_lock);
 
-        spin_lock(&current->sighand->siglock);
+        spin_lock_irq(&current->sighand->siglock);
         ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
         ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
         ac.ac_flag = pacct->ac_flag;
@@ -496,7 +496,7 @@ static void do_acct_process(struct file *file)
         ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
         ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
         ac.ac_exitcode = pacct->ac_exitcode;
-        spin_unlock(&current->sighand->siglock);
+        spin_unlock_irq(&current->sighand->siglock);
         ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
         ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
         ac.ac_swaps = encode_comp_t(0);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 70fbf2e83766..f230f9ae01c2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,56 +16,48 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DEFINE_MUTEX(cpucontrol); 19static DEFINE_MUTEX(cpu_add_remove_lock);
20static DEFINE_MUTEX(cpu_bitmask_lock);
20 21
21static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 23
23#ifdef CONFIG_HOTPLUG_CPU 24#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner;
25static int lock_cpu_hotplug_depth;
26 25
27static int __lock_cpu_hotplug(int interruptible) 26/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
28{ 27static struct task_struct *recursive;
29 int ret = 0; 28static int recursive_depth;
30
31 if (lock_cpu_hotplug_owner != current) {
32 if (interruptible)
33 ret = mutex_lock_interruptible(&cpucontrol);
34 else
35 mutex_lock(&cpucontrol);
36 }
37
38 /*
39 * Set only if we succeed in locking
40 */
41 if (!ret) {
42 lock_cpu_hotplug_depth++;
43 lock_cpu_hotplug_owner = current;
44 }
45
46 return ret;
47}
48 29
49void lock_cpu_hotplug(void) 30void lock_cpu_hotplug(void)
50{ 31{
51 __lock_cpu_hotplug(0); 32 struct task_struct *tsk = current;
33
34 if (tsk == recursive) {
35 static int warnings = 10;
36 if (warnings) {
37 printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
38 WARN_ON(1);
39 warnings--;
40 }
41 recursive_depth++;
42 return;
43 }
44 mutex_lock(&cpu_bitmask_lock);
45 recursive = tsk;
52} 46}
53EXPORT_SYMBOL_GPL(lock_cpu_hotplug); 47EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
54 48
55void unlock_cpu_hotplug(void) 49void unlock_cpu_hotplug(void)
56{ 50{
57 if (--lock_cpu_hotplug_depth == 0) { 51 WARN_ON(recursive != current);
58 lock_cpu_hotplug_owner = NULL; 52 if (recursive_depth) {
59 mutex_unlock(&cpucontrol); 53 recursive_depth--;
54 return;
60 } 55 }
56 mutex_unlock(&cpu_bitmask_lock);
57 recursive = NULL;
61} 58}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 59EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
63 60
64int lock_cpu_hotplug_interruptible(void)
65{
66 return __lock_cpu_hotplug(1);
67}
68EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
69#endif /* CONFIG_HOTPLUG_CPU */ 61#endif /* CONFIG_HOTPLUG_CPU */
70 62
71/* Need to know about CPUs going up/down? */ 63/* Need to know about CPUs going up/down? */
@@ -122,9 +114,7 @@ int cpu_down(unsigned int cpu)
122 struct task_struct *p; 114 struct task_struct *p;
123 cpumask_t old_allowed, tmp; 115 cpumask_t old_allowed, tmp;
124 116
125 if ((err = lock_cpu_hotplug_interruptible()) != 0) 117 mutex_lock(&cpu_add_remove_lock);
126 return err;
127
128 if (num_online_cpus() == 1) { 118 if (num_online_cpus() == 1) {
129 err = -EBUSY; 119 err = -EBUSY;
130 goto out; 120 goto out;
@@ -150,7 +140,10 @@ int cpu_down(unsigned int cpu)
150 cpu_clear(cpu, tmp); 140 cpu_clear(cpu, tmp);
151 set_cpus_allowed(current, tmp); 141 set_cpus_allowed(current, tmp);
152 142
143 mutex_lock(&cpu_bitmask_lock);
153 p = __stop_machine_run(take_cpu_down, NULL, cpu); 144 p = __stop_machine_run(take_cpu_down, NULL, cpu);
145 mutex_unlock(&cpu_bitmask_lock);
146
154 if (IS_ERR(p)) { 147 if (IS_ERR(p)) {
155 /* CPU didn't die: tell everyone. Can't complain. */ 148 /* CPU didn't die: tell everyone. Can't complain. */
156 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 149 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
@@ -187,7 +180,7 @@ out_thread:
187out_allowed: 180out_allowed:
188 set_cpus_allowed(current, old_allowed); 181 set_cpus_allowed(current, old_allowed);
189out: 182out:
190 unlock_cpu_hotplug(); 183 mutex_unlock(&cpu_add_remove_lock);
191 return err; 184 return err;
192} 185}
193#endif /*CONFIG_HOTPLUG_CPU*/ 186#endif /*CONFIG_HOTPLUG_CPU*/
@@ -197,9 +190,7 @@ int __devinit cpu_up(unsigned int cpu)
197 int ret; 190 int ret;
198 void *hcpu = (void *)(long)cpu; 191 void *hcpu = (void *)(long)cpu;
199 192
200 if ((ret = lock_cpu_hotplug_interruptible()) != 0) 193 mutex_lock(&cpu_add_remove_lock);
201 return ret;
202
203 if (cpu_online(cpu) || !cpu_present(cpu)) { 194 if (cpu_online(cpu) || !cpu_present(cpu)) {
204 ret = -EINVAL; 195 ret = -EINVAL;
205 goto out; 196 goto out;
@@ -214,7 +205,9 @@ int __devinit cpu_up(unsigned int cpu)
214 } 205 }
215 206
216 /* Arch-specific enabling code. */ 207 /* Arch-specific enabling code. */
208 mutex_lock(&cpu_bitmask_lock);
217 ret = __cpu_up(cpu); 209 ret = __cpu_up(cpu);
210 mutex_unlock(&cpu_bitmask_lock);
218 if (ret != 0) 211 if (ret != 0)
219 goto out_notify; 212 goto out_notify;
220 BUG_ON(!cpu_online(cpu)); 213 BUG_ON(!cpu_online(cpu));
@@ -227,6 +220,6 @@ out_notify:
227 blocking_notifier_call_chain(&cpu_chain, 220 blocking_notifier_call_chain(&cpu_chain,
228 CPU_UP_CANCELED, hcpu); 221 CPU_UP_CANCELED, hcpu);
229out: 222out:
230 unlock_cpu_hotplug(); 223 mutex_unlock(&cpu_add_remove_lock);
231 return ret; 224 return ret;
232} 225}
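The lock_cpu_hotplug()/unlock_cpu_hotplug() rewrite above tolerates the recursive callers it complains about by remembering which task currently holds cpu_bitmask_lock and counting re-entries instead of deadlocking. A minimal stand-alone sketch of that owner-tracking recursion guard, using pthreads and invented names (hotplug_lock/hotplug_unlock); this is an illustration of the pattern, not the kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t bitmask_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t owner;          /* thread currently holding the lock */
static int owner_valid;          /* owner is only meaningful while held */
static int recursion_depth;      /* re-entries by the owning thread */

static void hotplug_lock(void)
{
        /* Only the owning thread can match here, mirroring the kernel check. */
        if (owner_valid && pthread_equal(owner, pthread_self())) {
                fprintf(stderr, "warning: recursive hotplug lock\n");
                recursion_depth++;
                return;
        }
        pthread_mutex_lock(&bitmask_lock);
        owner = pthread_self();
        owner_valid = 1;
}

static void hotplug_unlock(void)
{
        if (recursion_depth) {          /* undo one level of nesting only */
                recursion_depth--;
                return;
        }
        owner_valid = 0;
        pthread_mutex_unlock(&bitmask_lock);
}

int main(void)
{
        hotplug_lock();
        hotplug_lock();      /* nested call is absorbed, not deadlocked */
        hotplug_unlock();
        hotplug_unlock();    /* outermost unlock releases the mutex */
        return 0;
}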
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c232dc077438..1a649f2bb9bb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -762,6 +762,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *
  * Call with manage_mutex held. May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
  */
 
 static void update_cpu_domains(struct cpuset *cur)
@@ -781,7 +783,7 @@ static void update_cpu_domains(struct cpuset *cur)
                 if (is_cpu_exclusive(c))
                         cpus_andnot(pspan, pspan, c->cpus_allowed);
         }
-        if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+        if (!is_cpu_exclusive(cur)) {
                 cpus_or(pspan, pspan, cur->cpus_allowed);
                 if (cpus_equal(pspan, cur->cpus_allowed))
                         return;
@@ -1917,6 +1919,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex. Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
 static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
         struct cpuset *cs = dentry->d_fsdata;
@@ -1936,11 +1949,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
                 mutex_unlock(&manage_mutex);
                 return -EBUSY;
         }
+        if (is_cpu_exclusive(cs)) {
+                int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+                if (retval < 0) {
+                        mutex_unlock(&manage_mutex);
+                        return retval;
+                }
+        }
         parent = cs->parent;
         mutex_lock(&callback_mutex);
         set_bit(CS_REMOVED, &cs->flags);
-        if (is_cpu_exclusive(cs))
-                update_cpu_domains(cs);
         list_del(&cs->sibling);  /* delete my sibling from parent->children */
         spin_lock(&cs->dentry->d_lock);
         d = dget(cs->dentry);
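The locking note in the cpuset hunk above is about classic ABBA deadlock avoidance: elsewhere the kernel takes callback_mutex inside lock_cpu_hotplug(), so cpuset_rmdir() must never take them in the opposite order. A small pthread sketch of why a fixed A-then-B acquisition order removes that deadlock; the lock names are hypothetical stand-ins, not the kernel's locks:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;  /* e.g. the hotplug lock */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;  /* e.g. callback_mutex */

/* Every path takes A before B, so two threads can never wait on each other. */
static void *worker(void *arg)
{
        pthread_mutex_lock(&lock_a);
        pthread_mutex_lock(&lock_b);
        /* ... critical section that nests B inside A ... */
        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return arg;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, worker, NULL);
        pthread_create(&t2, NULL, worker, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        puts("done: consistent A->B ordering, no ABBA deadlock possible");
        return 0;
}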
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 000000000000..57ca3730205d
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,178 @@
1/* delayacct.c - per-task delay accounting
2 *
3 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 */
15
16#include <linux/sched.h>
17#include <linux/slab.h>
18#include <linux/time.h>
19#include <linux/sysctl.h>
20#include <linux/delayacct.h>
21
22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
23kmem_cache_t *delayacct_cache;
24
25static int __init delayacct_setup_disable(char *str)
26{
27 delayacct_on = 0;
28 return 1;
29}
30__setup("nodelayacct", delayacct_setup_disable);
31
32void delayacct_init(void)
33{
34 delayacct_cache = kmem_cache_create("delayacct_cache",
35 sizeof(struct task_delay_info),
36 0,
37 SLAB_PANIC,
38 NULL, NULL);
39 delayacct_tsk_init(&init_task);
40}
41
42void __delayacct_tsk_init(struct task_struct *tsk)
43{
44 spin_lock_init(&tsk->delays_lock);
45 /* No need to acquire tsk->delays_lock for allocation here unless
46 __delayacct_tsk_init called after tsk is attached to tasklist
47 */
48 tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
49 if (tsk->delays)
50 spin_lock_init(&tsk->delays->lock);
51}
52
53void __delayacct_tsk_exit(struct task_struct *tsk)
54{
55 struct task_delay_info *delays = tsk->delays;
56 spin_lock(&tsk->delays_lock);
57 tsk->delays = NULL;
58 spin_unlock(&tsk->delays_lock);
59 kmem_cache_free(delayacct_cache, delays);
60}
61
62/*
63 * Start accounting for a delay statistic using
64 * its starting timestamp (@start)
65 */
66
67static inline void delayacct_start(struct timespec *start)
68{
69 do_posix_clock_monotonic_gettime(start);
70}
71
72/*
73 * Finish delay accounting for a statistic using
74 * its timestamps (@start, @end), accumalator (@total) and @count
75 */
76
77static void delayacct_end(struct timespec *start, struct timespec *end,
78 u64 *total, u32 *count)
79{
80 struct timespec ts;
81 s64 ns;
82
83 do_posix_clock_monotonic_gettime(end);
84 ts = timespec_sub(*end, *start);
85 ns = timespec_to_ns(&ts);
86 if (ns < 0)
87 return;
88
89 spin_lock(&current->delays->lock);
90 *total += ns;
91 (*count)++;
92 spin_unlock(&current->delays->lock);
93}
94
95void __delayacct_blkio_start(void)
96{
97 delayacct_start(&current->delays->blkio_start);
98}
99
100void __delayacct_blkio_end(void)
101{
102 if (current->delays->flags & DELAYACCT_PF_SWAPIN)
103 /* Swapin block I/O */
104 delayacct_end(&current->delays->blkio_start,
105 &current->delays->blkio_end,
106 &current->delays->swapin_delay,
107 &current->delays->swapin_count);
108 else /* Other block I/O */
109 delayacct_end(&current->delays->blkio_start,
110 &current->delays->blkio_end,
111 &current->delays->blkio_delay,
112 &current->delays->blkio_count);
113}
114
115int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
116{
117 s64 tmp;
118 struct timespec ts;
119 unsigned long t1,t2,t3;
120
121 spin_lock(&tsk->delays_lock);
122
123 /* Though tsk->delays accessed later, early exit avoids
124 * unnecessary returning of other data
125 */
126 if (!tsk->delays)
127 goto done;
128
129 tmp = (s64)d->cpu_run_real_total;
130 cputime_to_timespec(tsk->utime + tsk->stime, &ts);
131 tmp += timespec_to_ns(&ts);
132 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
133
134 /*
135 * No locking available for sched_info (and too expensive to add one)
136 * Mitigate by taking snapshot of values
137 */
138 t1 = tsk->sched_info.pcnt;
139 t2 = tsk->sched_info.run_delay;
140 t3 = tsk->sched_info.cpu_time;
141
142 d->cpu_count += t1;
143
144 jiffies_to_timespec(t2, &ts);
145 tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
146 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
147
148 tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
149 d->cpu_run_virtual_total =
150 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
151
152 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
153
154 spin_lock(&tsk->delays->lock);
155 tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
156 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
157 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
158 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
159 d->blkio_count += tsk->delays->blkio_count;
160 d->swapin_count += tsk->delays->swapin_count;
161 spin_unlock(&tsk->delays->lock);
162
163done:
164 spin_unlock(&tsk->delays_lock);
165 return 0;
166}
167
168__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
169{
170 __u64 ret;
171
172 spin_lock(&tsk->delays->lock);
173 ret = nsec_to_clock_t(tsk->delays->blkio_delay +
174 tsk->delays->swapin_delay);
175 spin_unlock(&tsk->delays->lock);
176 return ret;
177}
178
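The new delayacct.c above follows one simple pattern: stamp a monotonic start time, stamp an end time, and fold the nanosecond difference into a running total plus a sample count. A user-space sketch of that accumulate-delay idiom with clock_gettime(); the names delay_begin/delay_end and struct delay_stats are made up for illustration and are not the kernel interfaces:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct delay_stats {
        uint64_t total_ns;   /* accumulated delay */
        uint32_t count;      /* number of samples folded in */
};

static void delay_begin(struct timespec *start)
{
        clock_gettime(CLOCK_MONOTONIC, start);
}

static void delay_end(const struct timespec *start, struct delay_stats *st)
{
        struct timespec end;
        int64_t ns;

        clock_gettime(CLOCK_MONOTONIC, &end);
        ns = (int64_t)(end.tv_sec - start->tv_sec) * 1000000000LL
             + (end.tv_nsec - start->tv_nsec);
        if (ns < 0)             /* ignore a nonsensical (negative) sample */
                return;
        st->total_ns += (uint64_t)ns;
        st->count++;
}

int main(void)
{
        struct delay_stats io = { 0, 0 };
        struct timespec t;

        delay_begin(&t);
        /* ... the delay being measured would happen here ... */
        delay_end(&t, &io);
        printf("%u samples, %llu ns total\n", io.count,
               (unsigned long long)io.total_ns);
        return 0;
}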
diff --git a/kernel/exit.c b/kernel/exit.c
index 6664c084783d..dba194a8d416 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,8 @@
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
@@ -843,7 +845,9 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
         struct task_struct *tsk = current;
+        struct taskstats *tidstats;
         int group_dead;
+        unsigned int mycpu;
 
         profile_task_exit(tsk);
 
@@ -881,6 +885,8 @@ fastcall NORET_TYPE void do_exit(long code)
                         current->comm, current->pid,
                         preempt_count());
 
+        taskstats_exit_alloc(&tidstats, &mycpu);
+
         acct_update_integrals(tsk);
         if (tsk->mm) {
                 update_hiwater_rss(tsk->mm);
@@ -900,6 +906,10 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
         if (unlikely(tsk->audit_context))
                 audit_free(tsk);
+        taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
+        taskstats_exit_free(tidstats);
+        delayacct_tsk_exit(tsk);
+
         exit_mm(tsk);
 
         if (group_dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index 56e4e07e45f7..1b0f7b1e0881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -43,6 +43,8 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
+#include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -61,9 +63,7 @@ int max_threads; /* tunable limit on nr_threads */
 
 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
- __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
-
-EXPORT_SYMBOL(tasklist_lock);
+__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
 int nr_processes(void)
 {
@@ -820,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
         if (clone_flags & CLONE_THREAD) {
                 atomic_inc(&current->signal->count);
                 atomic_inc(&current->signal->live);
+                taskstats_tgid_alloc(current->signal);
                 return 0;
         }
         sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -864,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
         INIT_LIST_HEAD(&sig->cpu_timers[0]);
         INIT_LIST_HEAD(&sig->cpu_timers[1]);
         INIT_LIST_HEAD(&sig->cpu_timers[2]);
+        taskstats_tgid_init(sig);
 
         task_lock(current->group_leader);
         memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -885,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 void __cleanup_signal(struct signal_struct *sig)
 {
         exit_thread_group_keys(sig);
+        taskstats_tgid_free(sig);
         kmem_cache_free(signal_cachep, sig);
 }
 
@@ -1002,6 +1005,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                 goto bad_fork_cleanup_put_domain;
 
         p->did_exec = 0;
+        delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
         copy_flags(clone_flags, p);
         p->pid = pid;
         retval = -EFAULT;
diff --git a/kernel/futex.c b/kernel/futex.c
index 1dc98e4dd287..dda2049692a2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -415,15 +415,15 @@ out_unlock:
415 */ 415 */
416void exit_pi_state_list(struct task_struct *curr) 416void exit_pi_state_list(struct task_struct *curr)
417{ 417{
418 struct futex_hash_bucket *hb;
419 struct list_head *next, *head = &curr->pi_state_list; 418 struct list_head *next, *head = &curr->pi_state_list;
420 struct futex_pi_state *pi_state; 419 struct futex_pi_state *pi_state;
420 struct futex_hash_bucket *hb;
421 union futex_key key; 421 union futex_key key;
422 422
423 /* 423 /*
424 * We are a ZOMBIE and nobody can enqueue itself on 424 * We are a ZOMBIE and nobody can enqueue itself on
425 * pi_state_list anymore, but we have to be careful 425 * pi_state_list anymore, but we have to be careful
426 * versus waiters unqueueing themselfs 426 * versus waiters unqueueing themselves:
427 */ 427 */
428 spin_lock_irq(&curr->pi_lock); 428 spin_lock_irq(&curr->pi_lock);
429 while (!list_empty(head)) { 429 while (!list_empty(head)) {
@@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr)
431 next = head->next; 431 next = head->next;
432 pi_state = list_entry(next, struct futex_pi_state, list); 432 pi_state = list_entry(next, struct futex_pi_state, list);
433 key = pi_state->key; 433 key = pi_state->key;
434 hb = hash_futex(&key);
434 spin_unlock_irq(&curr->pi_lock); 435 spin_unlock_irq(&curr->pi_lock);
435 436
436 hb = hash_futex(&key);
437 spin_lock(&hb->lock); 437 spin_lock(&hb->lock);
438 438
439 spin_lock_irq(&curr->pi_lock); 439 spin_lock_irq(&curr->pi_lock);
440 /*
441 * We dropped the pi-lock, so re-check whether this
442 * task still owns the PI-state:
443 */
440 if (head->next != next) { 444 if (head->next != next) {
441 spin_unlock(&hb->lock); 445 spin_unlock(&hb->lock);
442 continue; 446 continue;
443 } 447 }
444 448
445 list_del_init(&pi_state->list);
446
447 WARN_ON(pi_state->owner != curr); 449 WARN_ON(pi_state->owner != curr);
448 450 WARN_ON(list_empty(&pi_state->list));
451 list_del_init(&pi_state->list);
449 pi_state->owner = NULL; 452 pi_state->owner = NULL;
450 spin_unlock_irq(&curr->pi_lock); 453 spin_unlock_irq(&curr->pi_lock);
451 454
@@ -470,12 +473,20 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
470 head = &hb->chain; 473 head = &hb->chain;
471 474
472 list_for_each_entry_safe(this, next, head, list) { 475 list_for_each_entry_safe(this, next, head, list) {
473 if (match_futex (&this->key, &me->key)) { 476 if (match_futex(&this->key, &me->key)) {
474 /* 477 /*
475 * Another waiter already exists - bump up 478 * Another waiter already exists - bump up
476 * the refcount and return its pi_state: 479 * the refcount and return its pi_state:
477 */ 480 */
478 pi_state = this->pi_state; 481 pi_state = this->pi_state;
482 /*
483 * Userspace might have messed up non PI and PI futexes
484 */
485 if (unlikely(!pi_state))
486 return -EINVAL;
487
488 WARN_ON(!atomic_read(&pi_state->refcount));
489
479 atomic_inc(&pi_state->refcount); 490 atomic_inc(&pi_state->refcount);
480 me->pi_state = pi_state; 491 me->pi_state = pi_state;
481 492
@@ -484,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
484 } 495 }
485 496
486 /* 497 /*
487 * We are the first waiter - try to look up the real owner and 498 * We are the first waiter - try to look up the real owner and attach
488 * attach the new pi_state to it: 499 * the new pi_state to it, but bail out when the owner died bit is set
500 * and TID = 0:
489 */ 501 */
490 pid = uval & FUTEX_TID_MASK; 502 pid = uval & FUTEX_TID_MASK;
503 if (!pid && (uval & FUTEX_OWNER_DIED))
504 return -ESRCH;
491 p = futex_find_get_task(pid); 505 p = futex_find_get_task(pid);
492 if (!p) 506 if (!p)
493 return -ESRCH; 507 return -ESRCH;
@@ -504,6 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
504 pi_state->key = me->key; 518 pi_state->key = me->key;
505 519
506 spin_lock_irq(&p->pi_lock); 520 spin_lock_irq(&p->pi_lock);
521 WARN_ON(!list_empty(&pi_state->list));
507 list_add(&pi_state->list, &p->pi_state_list); 522 list_add(&pi_state->list, &p->pi_state_list);
508 pi_state->owner = p; 523 pi_state->owner = p;
509 spin_unlock_irq(&p->pi_lock); 524 spin_unlock_irq(&p->pi_lock);
@@ -567,20 +582,29 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
567 * kept enabled while there is PI state around. We must also 582 * kept enabled while there is PI state around. We must also
568 * preserve the owner died bit.) 583 * preserve the owner died bit.)
569 */ 584 */
570 newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; 585 if (!(uval & FUTEX_OWNER_DIED)) {
586 newval = FUTEX_WAITERS | new_owner->pid;
571 587
572 inc_preempt_count(); 588 inc_preempt_count();
573 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 589 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
574 dec_preempt_count(); 590 dec_preempt_count();
591 if (curval == -EFAULT)
592 return -EFAULT;
593 if (curval != uval)
594 return -EINVAL;
595 }
575 596
576 if (curval == -EFAULT) 597 spin_lock_irq(&pi_state->owner->pi_lock);
577 return -EFAULT; 598 WARN_ON(list_empty(&pi_state->list));
578 if (curval != uval) 599 list_del_init(&pi_state->list);
579 return -EINVAL; 600 spin_unlock_irq(&pi_state->owner->pi_lock);
580 601
581 list_del_init(&pi_state->owner->pi_state_list); 602 spin_lock_irq(&new_owner->pi_lock);
603 WARN_ON(!list_empty(&pi_state->list));
582 list_add(&pi_state->list, &new_owner->pi_state_list); 604 list_add(&pi_state->list, &new_owner->pi_state_list);
583 pi_state->owner = new_owner; 605 pi_state->owner = new_owner;
606 spin_unlock_irq(&new_owner->pi_lock);
607
584 rt_mutex_unlock(&pi_state->pi_mutex); 608 rt_mutex_unlock(&pi_state->pi_mutex);
585 609
586 return 0; 610 return 0;
@@ -1230,6 +1254,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
1230 /* Owner died? */ 1254 /* Owner died? */
1231 if (q.pi_state->owner != NULL) { 1255 if (q.pi_state->owner != NULL) {
1232 spin_lock_irq(&q.pi_state->owner->pi_lock); 1256 spin_lock_irq(&q.pi_state->owner->pi_lock);
1257 WARN_ON(list_empty(&q.pi_state->list));
1233 list_del_init(&q.pi_state->list); 1258 list_del_init(&q.pi_state->list);
1234 spin_unlock_irq(&q.pi_state->owner->pi_lock); 1259 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1235 } else 1260 } else
@@ -1238,6 +1263,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
1238 q.pi_state->owner = current; 1263 q.pi_state->owner = current;
1239 1264
1240 spin_lock_irq(&current->pi_lock); 1265 spin_lock_irq(&current->pi_lock);
1266 WARN_ON(!list_empty(&q.pi_state->list));
1241 list_add(&q.pi_state->list, &current->pi_state_list); 1267 list_add(&q.pi_state->list, &current->pi_state_list);
1242 spin_unlock_irq(&current->pi_lock); 1268 spin_unlock_irq(&current->pi_lock);
1243 1269
@@ -1421,9 +1447,11 @@ retry_locked:
1421 * again. If it succeeds then we can return without waking 1447 * again. If it succeeds then we can return without waking
1422 * anyone else up: 1448 * anyone else up:
1423 */ 1449 */
1424 inc_preempt_count(); 1450 if (!(uval & FUTEX_OWNER_DIED)) {
1425 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1451 inc_preempt_count();
1426 dec_preempt_count(); 1452 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1453 dec_preempt_count();
1454 }
1427 1455
1428 if (unlikely(uval == -EFAULT)) 1456 if (unlikely(uval == -EFAULT))
1429 goto pi_faulted; 1457 goto pi_faulted;
@@ -1456,9 +1484,11 @@ retry_locked:
1456 /* 1484 /*
1457 * No waiters - kernel unlocks the futex: 1485 * No waiters - kernel unlocks the futex:
1458 */ 1486 */
1459 ret = unlock_futex_pi(uaddr, uval); 1487 if (!(uval & FUTEX_OWNER_DIED)) {
1460 if (ret == -EFAULT) 1488 ret = unlock_futex_pi(uaddr, uval);
1461 goto pi_faulted; 1489 if (ret == -EFAULT)
1490 goto pi_faulted;
1491 }
1462 1492
1463out_unlock: 1493out_unlock:
1464 spin_unlock(&hb->lock); 1494 spin_unlock(&hb->lock);
@@ -1677,9 +1707,9 @@ err_unlock:
1677 * Process a futex-list entry, check whether it's owned by the 1707 * Process a futex-list entry, check whether it's owned by the
1678 * dying task, and do notification if so: 1708 * dying task, and do notification if so:
1679 */ 1709 */
1680int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) 1710int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
1681{ 1711{
1682 u32 uval, nval; 1712 u32 uval, nval, mval;
1683 1713
1684retry: 1714retry:
1685 if (get_user(uval, uaddr)) 1715 if (get_user(uval, uaddr))
@@ -1696,21 +1726,45 @@ retry:
1696 * thread-death.) The rest of the cleanup is done in 1726 * thread-death.) The rest of the cleanup is done in
1697 * userspace. 1727 * userspace.
1698 */ 1728 */
1699 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 1729 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
1700 uval | FUTEX_OWNER_DIED); 1730 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1731
1701 if (nval == -EFAULT) 1732 if (nval == -EFAULT)
1702 return -1; 1733 return -1;
1703 1734
1704 if (nval != uval) 1735 if (nval != uval)
1705 goto retry; 1736 goto retry;
1706 1737
1707 if (uval & FUTEX_WAITERS) 1738 /*
1708 futex_wake(uaddr, 1); 1739 * Wake robust non-PI futexes here. The wakeup of
1740 * PI futexes happens in exit_pi_state():
1741 */
1742 if (!pi) {
1743 if (uval & FUTEX_WAITERS)
1744 futex_wake(uaddr, 1);
1745 }
1709 } 1746 }
1710 return 0; 1747 return 0;
1711} 1748}
1712 1749
1713/* 1750/*
1751 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
1752 */
1753static inline int fetch_robust_entry(struct robust_list __user **entry,
1754 struct robust_list __user **head, int *pi)
1755{
1756 unsigned long uentry;
1757
1758 if (get_user(uentry, (unsigned long *)head))
1759 return -EFAULT;
1760
1761 *entry = (void *)(uentry & ~1UL);
1762 *pi = uentry & 1;
1763
1764 return 0;
1765}
1766
1767/*
1714 * Walk curr->robust_list (very carefully, it's a userspace list!) 1768 * Walk curr->robust_list (very carefully, it's a userspace list!)
1715 * and mark any locks found there dead, and notify any waiters. 1769 * and mark any locks found there dead, and notify any waiters.
1716 * 1770 *
@@ -1720,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr)
1720{ 1774{
1721 struct robust_list_head __user *head = curr->robust_list; 1775 struct robust_list_head __user *head = curr->robust_list;
1722 struct robust_list __user *entry, *pending; 1776 struct robust_list __user *entry, *pending;
1723 unsigned int limit = ROBUST_LIST_LIMIT; 1777 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
1724 unsigned long futex_offset; 1778 unsigned long futex_offset;
1725 1779
1726 /* 1780 /*
1727 * Fetch the list head (which was registered earlier, via 1781 * Fetch the list head (which was registered earlier, via
1728 * sys_set_robust_list()): 1782 * sys_set_robust_list()):
1729 */ 1783 */
1730 if (get_user(entry, &head->list.next)) 1784 if (fetch_robust_entry(&entry, &head->list.next, &pi))
1731 return; 1785 return;
1732 /* 1786 /*
1733 * Fetch the relative futex offset: 1787 * Fetch the relative futex offset:
@@ -1738,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr)
1738 * Fetch any possibly pending lock-add first, and handle it 1792 * Fetch any possibly pending lock-add first, and handle it
1739 * if it exists: 1793 * if it exists:
1740 */ 1794 */
1741 if (get_user(pending, &head->list_op_pending)) 1795 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
1742 return; 1796 return;
1797
1743 if (pending) 1798 if (pending)
1744 handle_futex_death((void *)pending + futex_offset, curr); 1799 handle_futex_death((void *)pending + futex_offset, curr, pip);
1745 1800
1746 while (entry != &head->list) { 1801 while (entry != &head->list) {
1747 /* 1802 /*
@@ -1750,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr)
1750 */ 1805 */
1751 if (entry != pending) 1806 if (entry != pending)
1752 if (handle_futex_death((void *)entry + futex_offset, 1807 if (handle_futex_death((void *)entry + futex_offset,
1753 curr)) 1808 curr, pi))
1754 return; 1809 return;
1755 /* 1810 /*
1756 * Fetch the next entry in the list: 1811 * Fetch the next entry in the list:
1757 */ 1812 */
1758 if (get_user(entry, &entry->next)) 1813 if (fetch_robust_entry(&entry, &entry->next, &pi))
1759 return; 1814 return;
1760 /* 1815 /*
1761 * Avoid excessively long or circular lists: 1816 * Avoid excessively long or circular lists:
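The futex.c changes above introduce fetch_robust_entry(), which decodes a user-space pointer whose low bit doubles as a "this is a PI futex" flag; that works because the list entries are word-aligned, so bit 0 is otherwise always zero. A stand-alone sketch of that tagged-pointer encoding, with hypothetical pack/unpack helpers rather than the kernel's functions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct robust_entry { struct robust_entry *next; };

/* Pack an aligned pointer and a 1-bit flag into one word. */
static uintptr_t pack(struct robust_entry *e, int pi)
{
        assert(((uintptr_t)e & 1) == 0);   /* alignment frees up bit 0 */
        return (uintptr_t)e | (pi ? 1 : 0);
}

/* Unpack: bit 0 is the flag, the rest is the pointer. */
static struct robust_entry *unpack(uintptr_t word, int *pi)
{
        *pi = (int)(word & 1);
        return (struct robust_entry *)(word & ~(uintptr_t)1);
}

int main(void)
{
        struct robust_entry e;
        int pi;
        uintptr_t word = pack(&e, 1);
        struct robust_entry *p = unpack(word, &pi);

        printf("pointer ok: %d, pi flag: %d\n", p == &e, pi);
        return 0;
}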
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d1d92b441fb7..d1aab1a452cc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -12,6 +12,23 @@
12 12
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14 14
15
16/*
17 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
18 */
19static inline int
20fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
21 compat_uptr_t *head, int *pi)
22{
23 if (get_user(*uentry, head))
24 return -EFAULT;
25
26 *entry = compat_ptr((*uentry) & ~1);
27 *pi = (unsigned int)(*uentry) & 1;
28
29 return 0;
30}
31
15/* 32/*
16 * Walk curr->robust_list (very carefully, it's a userspace list!) 33 * Walk curr->robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters. 34 * and mark any locks found there dead, and notify any waiters.
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr)
22{ 39{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list; 40 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending; 41 struct robust_list __user *entry, *pending;
42 unsigned int limit = ROBUST_LIST_LIMIT, pi;
25 compat_uptr_t uentry, upending; 43 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset; 44 compat_long_t futex_offset;
28 45
29 /* 46 /*
30 * Fetch the list head (which was registered earlier, via 47 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()): 48 * sys_set_robust_list()):
32 */ 49 */
33 if (get_user(uentry, &head->list.next)) 50 if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
34 return; 51 return;
35 entry = compat_ptr(uentry);
36 /* 52 /*
37 * Fetch the relative futex offset: 53 * Fetch the relative futex offset:
38 */ 54 */
@@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr)
42 * Fetch any possibly pending lock-add first, and handle it 58 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists: 59 * if it exists:
44 */ 60 */
45 if (get_user(upending, &head->list_op_pending)) 61 if (fetch_robust_entry(&upending, &pending,
62 &head->list_op_pending, &pi))
46 return; 63 return;
47 pending = compat_ptr(upending);
48 if (upending) 64 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr); 65 handle_futex_death((void *)pending + futex_offset, curr, pi);
50 66
51 while (compat_ptr(uentry) != &head->list) { 67 while (compat_ptr(uentry) != &head->list) {
52 /* 68 /*
@@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr)
55 */ 71 */
56 if (entry != pending) 72 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset, 73 if (handle_futex_death((void *)entry + futex_offset,
58 curr)) 74 curr, pi))
59 return; 75 return;
60 76
61 /* 77 /*
62 * Fetch the next entry in the list: 78 * Fetch the next entry in the list:
63 */ 79 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next)) 80 if (fetch_robust_entry(&uentry, &entry,
81 (compat_uptr_t *)&entry->next, &pi))
65 return; 82 return;
66 entry = compat_ptr(uentry);
67 /* 83 /*
68 * Avoid excessively long or circular lists: 84 * Avoid excessively long or circular lists:
69 */ 85 */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d17766d40dab..be989efc7856 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -835,7 +835,7 @@ static void migrate_hrtimers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
+static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
                                         unsigned long action, void *hcpu)
 {
         long cpu = (long)hcpu;
@@ -859,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
         return NOTIFY_OK;
 }
 
-static struct notifier_block __devinitdata hrtimers_nb = {
+static struct notifier_block __cpuinitdata hrtimers_nb = {
         .notifier_call = hrtimer_cpu_notify,
 };
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4e461438e48b..92be519eff26 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq);
  * @irq: interrupt to control
  * @on: enable/disable power management wakeup
  *
- * Enable/disable power management wakeup mode
+ * Enable/disable power management wakeup mode, which is
+ * disabled by default. Enables and disables must match,
+ * just as they match for non-wakeup mode support.
+ *
+ * Wakeup mode lets this IRQ wake the system from sleep
+ * states like "suspend to RAM".
  */
 int set_irq_wake(unsigned int irq, unsigned int on)
 {
         struct irq_desc *desc = irq_desc + irq;
         unsigned long flags;
         int ret = -ENXIO;
+        int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
 
+        /* wakeup-capable irqs can be shared between drivers that
+         * don't need to have the same sleep mode behaviors.
+         */
         spin_lock_irqsave(&desc->lock, flags);
-        if (desc->chip->set_wake)
+        if (on) {
+                if (desc->wake_depth++ == 0)
+                        desc->status |= IRQ_WAKEUP;
+                else
+                        set_wake = NULL;
+        } else {
+                if (desc->wake_depth == 0) {
+                        printk(KERN_WARNING "Unbalanced IRQ %d "
+                                        "wake disable\n", irq);
+                        WARN_ON(1);
+                } else if (--desc->wake_depth == 0)
+                        desc->status &= ~IRQ_WAKEUP;
+                else
+                        set_wake = NULL;
+        }
+        if (set_wake)
                 ret = desc->chip->set_wake(irq, on);
         spin_unlock_irqrestore(&desc->lock, flags);
         return ret;
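The new set_irq_wake() logic above is a depth counter: the first enable flips the status flag and calls into the chip, later enables only bump the count, and disables must balance the enables, with a loud warning for an unbalanced disable. A small sketch of that balanced enable/disable pattern with invented names (wake_enable/wake_disable), not the kernel API:

#include <stdio.h>

static unsigned int wake_depth;   /* outstanding enables */
static int wake_active;           /* plays the role of the IRQ_WAKEUP status bit */

static void wake_enable(void)
{
        if (wake_depth++ == 0)
                wake_active = 1;          /* only the first enable does real work */
}

static void wake_disable(void)
{
        if (wake_depth == 0) {
                fprintf(stderr, "unbalanced wake disable\n");
                return;
        }
        if (--wake_depth == 0)
                wake_active = 0;          /* only the last disable does real work */
}

int main(void)
{
        wake_enable();
        wake_enable();
        wake_disable();
        printf("still active: %d\n", wake_active);  /* 1: one enable remains */
        wake_disable();
        printf("still active: %d\n", wake_active);  /* 0: fully disabled */
        wake_disable();                              /* warns: unbalanced */
        return 0;
}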
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 39277dd6bf90..ab16a5a4cfe9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter)
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
         iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
-                                         &iter->value,
-                                         &iter->type, iter->name);
+                                         &iter->value, &iter->type,
+                                         iter->name, sizeof(iter->name));
         if (iter->owner == NULL)
                 return 0;
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 64aab081153b..3f57dfdc8f92 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 {
         copy_kprobe(p, ap);
+        flush_insn_slot(ap);
         ap->addr = p->addr;
         ap->pre_handler = aggr_pre_handler;
         ap->fault_handler = aggr_fault_handler;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 24be714b04c7..4f9c60ef95e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -216,23 +216,6 @@ EXPORT_SYMBOL(kthread_bind);
  */
 int kthread_stop(struct task_struct *k)
 {
-        return kthread_stop_sem(k, NULL);
-}
-EXPORT_SYMBOL(kthread_stop);
-
-/**
- * kthread_stop_sem - stop a thread created by kthread_create().
- * @k: thread created by kthread_create().
- * @s: semaphore that @k waits on while idle.
- *
- * Does essentially the same thing as kthread_stop() above, but wakes
- * @k by calling up(@s).
- *
- * Returns the result of threadfn(), or %-EINTR if wake_up_process()
- * was never called.
- */
-int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
-{
         int ret;
 
         mutex_lock(&kthread_stop_lock);
@@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 
         /* Now set kthread_should_stop() to true, and wake it up. */
         kthread_stop_info.k = k;
-        if (s)
-                up(s);
-        else
-                wake_up_process(k);
+        wake_up_process(k);
         put_task_struct(k);
 
         /* Once it dies, reset stop ptr, gather result and we're done. */
@@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 
         return ret;
 }
-EXPORT_SYMBOL(kthread_stop_sem);
+EXPORT_SYMBOL(kthread_stop);
 
 static __init int helper_init(void)
 {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f32ca78c198d..9bad17884513 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -169,22 +169,17 @@ EXPORT_SYMBOL(lockdep_internal);
169 */ 169 */
170static int class_filter(struct lock_class *class) 170static int class_filter(struct lock_class *class)
171{ 171{
172#if 0
173 /* Example */
172 if (class->name_version == 1 && 174 if (class->name_version == 1 &&
173 !strcmp(class->name, "&rl->lock")) 175 !strcmp(class->name, "lockname"))
174 return 1; 176 return 1;
175 if (class->name_version == 1 && 177 if (class->name_version == 1 &&
176 !strcmp(class->name, "&ni->mrec_lock")) 178 !strcmp(class->name, "&struct->lockfield"))
177 return 1; 179 return 1;
178 if (class->name_version == 1 && 180#endif
179 !strcmp(class->name, "mft_ni_runlist_lock")) 181 /* Allow everything else. 0 would be filter everything else */
180 return 1; 182 return 1;
181 if (class->name_version == 1 &&
182 !strcmp(class->name, "mft_ni_mrec_lock"))
183 return 1;
184 if (class->name_version == 1 &&
185 !strcmp(class->name, "&vol->lcnbmp_lock"))
186 return 1;
187 return 0;
188} 183}
189#endif 184#endif
190 185
@@ -408,23 +403,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
408 print_lock(curr->held_locks + i); 403 print_lock(curr->held_locks + i);
409 } 404 }
410} 405}
411/*
412 * Helper to print a nice hierarchy of lock dependencies:
413 */
414static void print_spaces(int nr)
415{
416 int i;
417
418 for (i = 0; i < nr; i++)
419 printk(" ");
420}
421 406
422static void print_lock_class_header(struct lock_class *class, int depth) 407static void print_lock_class_header(struct lock_class *class, int depth)
423{ 408{
424 int bit; 409 int bit;
425 410
426 print_spaces(depth); 411 printk("%*s->", depth, "");
427 printk("->");
428 print_lock_name(class); 412 print_lock_name(class);
429 printk(" ops: %lu", class->ops); 413 printk(" ops: %lu", class->ops);
430 printk(" {\n"); 414 printk(" {\n");
@@ -433,17 +417,14 @@ static void print_lock_class_header(struct lock_class *class, int depth)
433 if (class->usage_mask & (1 << bit)) { 417 if (class->usage_mask & (1 << bit)) {
434 int len = depth; 418 int len = depth;
435 419
436 print_spaces(depth); 420 len += printk("%*s %s", depth, "", usage_str[bit]);
437 len += printk(" %s", usage_str[bit]);
438 len += printk(" at:\n"); 421 len += printk(" at:\n");
439 print_stack_trace(class->usage_traces + bit, len); 422 print_stack_trace(class->usage_traces + bit, len);
440 } 423 }
441 } 424 }
442 print_spaces(depth); 425 printk("%*s }\n", depth, "");
443 printk(" }\n");
444 426
445 print_spaces(depth); 427 printk("%*s ... key at: ",depth,"");
446 printk(" ... key at: ");
447 print_ip_sym((unsigned long)class->key); 428 print_ip_sym((unsigned long)class->key);
448} 429}
449 430
@@ -463,8 +444,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
463 DEBUG_LOCKS_WARN_ON(!entry->class); 444 DEBUG_LOCKS_WARN_ON(!entry->class);
464 print_lock_dependencies(entry->class, depth + 1); 445 print_lock_dependencies(entry->class, depth + 1);
465 446
466 print_spaces(depth); 447 printk("%*s ... acquired at:\n",depth,"");
467 printk(" ... acquired at:\n");
468 print_stack_trace(&entry->trace, 2); 448 print_stack_trace(&entry->trace, 2);
469 printk("\n"); 449 printk("\n");
470 } 450 }
@@ -1124,7 +1104,7 @@ extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
1124 * itself, so actual lookup of the hash should be once per lock object. 1104 * itself, so actual lookup of the hash should be once per lock object.
1125 */ 1105 */
1126static inline struct lock_class * 1106static inline struct lock_class *
1127register_lock_class(struct lockdep_map *lock, unsigned int subclass) 1107look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
1128{ 1108{
1129 struct lockdep_subclass_key *key; 1109 struct lockdep_subclass_key *key;
1130 struct list_head *hash_head; 1110 struct list_head *hash_head;
@@ -1168,7 +1148,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1168 */ 1148 */
1169 list_for_each_entry(class, hash_head, hash_entry) 1149 list_for_each_entry(class, hash_head, hash_entry)
1170 if (class->key == key) 1150 if (class->key == key)
1171 goto out_set; 1151 return class;
1152
1153 return NULL;
1154}
1155
1156/*
1157 * Register a lock's class in the hash-table, if the class is not present
1158 * yet. Otherwise we look it up. We cache the result in the lock object
1159 * itself, so actual lookup of the hash should be once per lock object.
1160 */
1161static inline struct lock_class *
1162register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1163{
1164 struct lockdep_subclass_key *key;
1165 struct list_head *hash_head;
1166 struct lock_class *class;
1167
1168 class = look_up_lock_class(lock, subclass);
1169 if (likely(class))
1170 return class;
1172 1171
1173 /* 1172 /*
1174 * Debug-check: all keys must be persistent! 1173 * Debug-check: all keys must be persistent!
@@ -1183,6 +1182,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1183 return NULL; 1182 return NULL;
1184 } 1183 }
1185 1184
1185 key = lock->key->subkeys + subclass;
1186 hash_head = classhashentry(key);
1187
1186 __raw_spin_lock(&hash_lock); 1188 __raw_spin_lock(&hash_lock);
1187 /* 1189 /*
1188 * We have to do the hash-walk again, to avoid races 1190 * We have to do the hash-walk again, to avoid races
@@ -1229,8 +1231,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1229out_unlock_set: 1231out_unlock_set:
1230 __raw_spin_unlock(&hash_lock); 1232 __raw_spin_unlock(&hash_lock);
1231 1233
1232out_set: 1234 if (!subclass)
1233 lock->class[subclass] = class; 1235 lock->class_cache = class;
1234 1236
1235 DEBUG_LOCKS_WARN_ON(class->subclass != subclass); 1237 DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
1236 1238
@@ -1934,7 +1936,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
1934 } 1936 }
1935 lock->name = name; 1937 lock->name = name;
1936 lock->key = key; 1938 lock->key = key;
1937 memset(lock->class, 0, sizeof(lock->class[0])*MAX_LOCKDEP_SUBCLASSES); 1939 lock->class_cache = NULL;
1938} 1940}
1939 1941
1940EXPORT_SYMBOL_GPL(lockdep_init_map); 1942EXPORT_SYMBOL_GPL(lockdep_init_map);
@@ -1948,8 +1950,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
1948 unsigned long ip) 1950 unsigned long ip)
1949{ 1951{
1950 struct task_struct *curr = current; 1952 struct task_struct *curr = current;
1953 struct lock_class *class = NULL;
1951 struct held_lock *hlock; 1954 struct held_lock *hlock;
1952 struct lock_class *class;
1953 unsigned int depth, id; 1955 unsigned int depth, id;
1954 int chain_head = 0; 1956 int chain_head = 0;
1955 u64 chain_key; 1957 u64 chain_key;
@@ -1967,8 +1969,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
1967 return 0; 1969 return 0;
1968 } 1970 }
1969 1971
1970 class = lock->class[subclass]; 1972 if (!subclass)
1971 /* not cached yet? */ 1973 class = lock->class_cache;
1974 /*
1975 * Not cached yet or subclass?
1976 */
1972 if (unlikely(!class)) { 1977 if (unlikely(!class)) {
1973 class = register_lock_class(lock, subclass); 1978 class = register_lock_class(lock, subclass);
1974 if (!class) 1979 if (!class)
@@ -2469,48 +2474,44 @@ void lockdep_free_key_range(void *start, unsigned long size)
2469 2474
2470void lockdep_reset_lock(struct lockdep_map *lock) 2475void lockdep_reset_lock(struct lockdep_map *lock)
2471{ 2476{
2472 struct lock_class *class, *next, *entry; 2477 struct lock_class *class, *next;
2473 struct list_head *head; 2478 struct list_head *head;
2474 unsigned long flags; 2479 unsigned long flags;
2475 int i, j; 2480 int i, j;
2476 2481
2477 raw_local_irq_save(flags); 2482 raw_local_irq_save(flags);
2478 __raw_spin_lock(&hash_lock);
2479 2483
2480 /* 2484 /*
2481 * Remove all classes this lock has: 2485 * Remove all classes this lock might have:
2486 */
2487 for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
2488 /*
2489 * If the class exists we look it up and zap it:
2490 */
2491 class = look_up_lock_class(lock, j);
2492 if (class)
2493 zap_class(class);
2494 }
2495 /*
2496 * Debug check: in the end all mapped classes should
2497 * be gone.
2482 */ 2498 */
2499 __raw_spin_lock(&hash_lock);
2483 for (i = 0; i < CLASSHASH_SIZE; i++) { 2500 for (i = 0; i < CLASSHASH_SIZE; i++) {
2484 head = classhash_table + i; 2501 head = classhash_table + i;
2485 if (list_empty(head)) 2502 if (list_empty(head))
2486 continue; 2503 continue;
2487 list_for_each_entry_safe(class, next, head, hash_entry) { 2504 list_for_each_entry_safe(class, next, head, hash_entry) {
2488 for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { 2505 if (unlikely(class == lock->class_cache)) {
2489 entry = lock->class[j]; 2506 __raw_spin_unlock(&hash_lock);
2490 if (class == entry) { 2507 DEBUG_LOCKS_WARN_ON(1);
2491 zap_class(class); 2508 goto out_restore;
2492 lock->class[j] = NULL;
2493 break;
2494 }
2495 } 2509 }
2496 } 2510 }
2497 } 2511 }
2498
2499 /*
2500 * Debug check: in the end all mapped classes should
2501 * be gone.
2502 */
2503 for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
2504 entry = lock->class[j];
2505 if (!entry)
2506 continue;
2507 __raw_spin_unlock(&hash_lock);
2508 DEBUG_LOCKS_WARN_ON(1);
2509 raw_local_irq_restore(flags);
2510 return;
2511 }
2512
2513 __raw_spin_unlock(&hash_lock); 2512 __raw_spin_unlock(&hash_lock);
2513
2514out_restore:
2514 raw_local_irq_restore(flags); 2515 raw_local_irq_restore(flags);
2515} 2516}
2516 2517
@@ -2571,7 +2572,7 @@ static inline int in_range(const void *start, const void *addr, const void *end)
2571 2572
2572static void 2573static void
2573print_freed_lock_bug(struct task_struct *curr, const void *mem_from, 2574print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
2574 const void *mem_to) 2575 const void *mem_to, struct held_lock *hlock)
2575{ 2576{
2576 if (!debug_locks_off()) 2577 if (!debug_locks_off())
2577 return; 2578 return;
@@ -2583,6 +2584,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
2583 printk( "-------------------------\n"); 2584 printk( "-------------------------\n");
2584 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 2585 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
2585 curr->comm, curr->pid, mem_from, mem_to-1); 2586 curr->comm, curr->pid, mem_from, mem_to-1);
2587 print_lock(hlock);
2586 lockdep_print_held_locks(curr); 2588 lockdep_print_held_locks(curr);
2587 2589
2588 printk("\nstack backtrace:\n"); 2590 printk("\nstack backtrace:\n");
@@ -2616,7 +2618,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
2616 !in_range(mem_from, lock_to, mem_to)) 2618 !in_range(mem_from, lock_to, mem_to))
2617 continue; 2619 continue;
2618 2620
2619 print_freed_lock_bug(curr, mem_from, mem_to); 2621 print_freed_lock_bug(curr, mem_from, mem_to, hlock);
2620 break; 2622 break;
2621 } 2623 }
2622 local_irq_restore(flags); 2624 local_irq_restore(flags);
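Several lockdep hunks above drop the print_spaces() helper in favour of printk's "%*s" field-width idiom, which pads an empty string out to the requested depth. A tiny illustration with printf, which uses the same format semantics; the function name print_header is made up for the example:

#include <stdio.h>

static void print_header(const char *name, int depth)
{
        /* "%*s" prints "" in a field 'depth' wide, i.e. depth spaces of indent. */
        printf("%*s-> %s {\n", depth, "", name);
        printf("%*s }\n", depth, "");
}

int main(void)
{
        print_header("outer_lock", 0);
        print_header("inner_lock", 4);
        return 0;
}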
diff --git a/kernel/module.c b/kernel/module.c
index 35e1b1f859d7..2a19cd47c046 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2019,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr,
         return NULL;
 }
 
-struct module *module_get_kallsym(unsigned int symnum,
-                                  unsigned long *value,
-                                  char *type,
-                                  char namebuf[128])
+struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
+                                  char *type, char *name, size_t namelen)
 {
         struct module *mod;
 
@@ -2031,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum,
                 if (symnum < mod->num_symtab) {
                         *value = mod->symtab[symnum].st_value;
                         *type = mod->symtab[symnum].st_info;
-                        strncpy(namebuf,
-                                mod->strtab + mod->symtab[symnum].st_name,
-                                127);
+                        strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
+                                namelen);
                         mutex_unlock(&module_mutex);
                         return mod;
                 }
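module_get_kallsym() now takes the destination size and uses strlcpy(), which, unlike the strncpy() it replaces, always NUL-terminates and never writes past the given bound. A sketch of that bounded-copy behaviour using a local helper, since strlcpy itself is not in ISO C; bounded_copy is an invented name:

#include <stdio.h>
#include <string.h>

/* Minimal strlcpy-style copy: may truncate, but always NUL-terminates. */
static size_t bounded_copy(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t n = len < size - 1 ? len : size - 1;
                memcpy(dst, src, n);
                dst[n] = '\0';
        }
        return len;     /* callers can detect truncation: len >= size */
}

int main(void)
{
        char name[8];

        bounded_copy(name, "a_rather_long_symbol_name", sizeof(name));
        printf("'%s'\n", name);   /* 'a_rathe' -- truncated, still terminated */
        return 0;
}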
diff --git a/kernel/panic.c b/kernel/panic.c
index ab13f0f668b5..d8a0bca21233 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -172,6 +172,7 @@ const char *print_tainted(void)
 
 void add_taint(unsigned flag)
 {
+        debug_locks_off(); /* can't trust the integrity of the kernel anymore */
         tainted |= flag;
 }
 EXPORT_SYMBOL(add_taint);
@@ -256,6 +257,7 @@ int oops_may_print(void)
  */
 void oops_enter(void)
 {
+        debug_locks_off(); /* can't trust the integrity of the kernel anymore */
         do_oops_enter_exit();
 }
 
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 84063ac8fcfc..c50d15266c10 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
75 return dev; 75 return dev;
76} 76}
77 77
78static void __pm_unregister(struct pm_dev *dev)
79{
80 if (dev) {
81 list_del(&dev->entry);
82 kfree(dev);
83 }
84}
85
86/**
87 * pm_unregister_all - unregister all devices with matching callback
88 * @callback: callback function pointer
89 *
90 * Unregister every device that would call the callback passed. This
91 * is primarily meant as a helper function for loadable modules. It
92 * enables a module to give up all its managed devices without keeping
93 * its own private list.
94 */
95
96void pm_unregister_all(pm_callback callback)
97{
98 struct list_head *entry;
99
100 if (!callback)
101 return;
102
103 mutex_lock(&pm_devs_lock);
104 entry = pm_devs.next;
105 while (entry != &pm_devs) {
106 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
107 entry = entry->next;
108 if (dev->callback == callback)
109 __pm_unregister(dev);
110 }
111 mutex_unlock(&pm_devs_lock);
112}
113
114/** 78/**
115 * pm_send - send request to a single device 79 * pm_send - send request to a single device
116 * @dev: device to send to 80 * @dev: device to send to
@@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data)
239} 203}
240 204
241EXPORT_SYMBOL(pm_register); 205EXPORT_SYMBOL(pm_register);
242EXPORT_SYMBOL(pm_unregister_all);
243EXPORT_SYMBOL(pm_send_all); 206EXPORT_SYMBOL(pm_send_all);
244EXPORT_SYMBOL(pm_active); 207EXPORT_SYMBOL(pm_active);
245 208
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 24c96f354231..75d4886e648e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -227,11 +227,17 @@ static void copy_data_pages(struct pbe *pblist)
227 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { 227 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
228 if (saveable(zone, &zone_pfn)) { 228 if (saveable(zone, &zone_pfn)) {
229 struct page *page; 229 struct page *page;
230 long *src, *dst;
231 int n;
232
230 page = pfn_to_page(zone_pfn + zone->zone_start_pfn); 233 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
231 BUG_ON(!pbe); 234 BUG_ON(!pbe);
232 pbe->orig_address = (unsigned long)page_address(page); 235 pbe->orig_address = (unsigned long)page_address(page);
233 /* copy_page is not usable for copying task structs. */ 236 /* copy_page and memcpy are not usable for copying task structs. */
234 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); 237 dst = (long *)pbe->address;
238 src = (long *)pbe->orig_address;
239 for (n = PAGE_SIZE / sizeof(long); n; n--)
240 *dst++ = *src++;
235 pbe = pbe->next; 241 pbe = pbe->next;
236 } 242 }
237 } 243 }
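
The copy_data_pages() hunk replaces memcpy() with a long-at-a-time loop because, per the in-line comment, neither copy_page() nor memcpy() is safe for pages holding task structs. A userspace sketch of the loop shape; PAGE_SIZE is hard-coded to 4096 here purely for illustration:

#include <string.h>
#include <assert.h>

#define PAGE_SIZE 4096          /* assumed here; the kernel gets it per-arch */

/* Same long-at-a-time copy as the hunk above (shown only for its shape). */
static void copy_page_by_longs(void *dst_page, const void *src_page)
{
        long *dst = dst_page;
        const long *src = src_page;
        int n;

        for (n = PAGE_SIZE / sizeof(long); n; n--)
                *dst++ = *src++;
}

int main(void)
{
        static long src[PAGE_SIZE / sizeof(long)], dst[PAGE_SIZE / sizeof(long)];

        src[0] = 42;
        copy_page_by_longs(dst, src);
        assert(memcmp(dst, src, PAGE_SIZE) == 0);
        return 0;
}
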
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 044b8e0c1025..f1dd146bd64d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -263,7 +263,6 @@ int swsusp_write(void)
263 struct swap_map_handle handle; 263 struct swap_map_handle handle;
264 struct snapshot_handle snapshot; 264 struct snapshot_handle snapshot;
265 struct swsusp_info *header; 265 struct swsusp_info *header;
266 unsigned long start;
267 int error; 266 int error;
268 267
269 if ((error = swsusp_swap_check())) { 268 if ((error = swsusp_swap_check())) {
@@ -281,16 +280,17 @@ int swsusp_write(void)
281 } 280 }
282 error = get_swap_writer(&handle); 281 error = get_swap_writer(&handle);
283 if (!error) { 282 if (!error) {
284 start = handle.cur_swap; 283 unsigned long start = handle.cur_swap;
285 error = swap_write_page(&handle, header); 284 error = swap_write_page(&handle, header);
286 } 285 if (!error)
287 if (!error) 286 error = save_image(&handle, &snapshot,
288 error = save_image(&handle, &snapshot, header->pages - 1); 287 header->pages - 1);
289 if (!error) { 288 if (!error) {
290 flush_swap_writer(&handle); 289 flush_swap_writer(&handle);
291 printk("S"); 290 printk("S");
292 error = mark_swapfiles(swp_entry(root_swap, start)); 291 error = mark_swapfiles(swp_entry(root_swap, start));
293 printk("|\n"); 292 printk("|\n");
293 }
294 } 294 }
295 if (error) 295 if (error)
296 free_all_swap_pages(root_swap, handle.bitmap); 296 free_all_swap_pages(root_swap, handle.bitmap);
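
The swsusp_write() reshuffle scopes start into the branch where the header was actually written and chains each remaining step on the absence of an earlier error. The idiom, reduced to userspace with placeholder step names (write_header/write_image/commit_swap are analogues, not kernel functions):

#include <stdio.h>

static int write_header(void)  { return 0; }    /* swap_write_page() analogue */
static int write_image(void)   { return 0; }    /* save_image() analogue      */
static int commit_swap(int start)
{
        printf("committed, start=%d\n", start);
        return 0;                               /* mark_swapfiles() analogue  */
}

int main(void)
{
        int error = 0;                          /* get_swap_writer() assumed OK    */

        if (!error) {
                int start = 42;                 /* only the success path needs it  */

                error = write_header();
                if (!error)
                        error = write_image();
                if (!error)
                        error = commit_swap(start);
        }
        if (error)
                fprintf(stderr, "writing the image failed: %d\n", error);
        return error;
}
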
@@ -311,8 +311,10 @@ static atomic_t io_done = ATOMIC_INIT(0);
311 311
312static int end_io(struct bio *bio, unsigned int num, int err) 312static int end_io(struct bio *bio, unsigned int num, int err)
313{ 313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
315 panic("I/O error reading memory image"); 315 printk(KERN_ERR "I/O error reading swsusp image.\n");
316 return -EIO;
317 }
316 atomic_set(&io_done, 0); 318 atomic_set(&io_done, 0);
317 return 0; 319 return 0;
318} 320}
diff --git a/kernel/printk.c b/kernel/printk.c
index bdba5d80496c..65ca0688f86f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -52,7 +52,7 @@ int console_printk[4] = {
52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
53}; 53};
54 54
55EXPORT_SYMBOL(console_printk); 55EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
56 56
57/* 57/*
58 * Low level drivers may need that to know if they can schedule in 58 * Low level drivers may need that to know if they can schedule in
@@ -773,7 +773,7 @@ int is_console_locked(void)
773{ 773{
774 return console_locked; 774 return console_locked;
775} 775}
776EXPORT_SYMBOL(is_console_locked); 776EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
777 777
778/** 778/**
779 * release_console_sem - unlock the console system 779 * release_console_sem - unlock the console system
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 759805c9859a..436ab35f6fa7 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
549} 549}
550 550
551static int __devinit rcu_cpu_notify(struct notifier_block *self, 551static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
552 unsigned long action, void *hcpu) 552 unsigned long action, void *hcpu)
553{ 553{
554 long cpu = (long)hcpu; 554 long cpu = (long)hcpu;
@@ -565,7 +565,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self,
565 return NOTIFY_OK; 565 return NOTIFY_OK;
566} 566}
567 567
568static struct notifier_block __devinitdata rcu_nb = { 568static struct notifier_block __cpuinitdata rcu_nb = {
569 .notifier_call = rcu_cpu_notify, 569 .notifier_call = rcu_cpu_notify,
570}; 570};
571 571
diff --git a/kernel/resource.c b/kernel/resource.c
index 129cf046e561..0dd3a857579e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -404,8 +404,6 @@ int insert_resource(struct resource *parent, struct resource *new)
404 return result; 404 return result;
405} 405}
406 406
407EXPORT_SYMBOL(insert_resource);
408
409/* 407/*
410 * Given an existing resource, change its start and size to match the 408 * Given an existing resource, change its start and size to match the
411 * arguments. Returns -EBUSY if it can't fit. Existing children of 409 * arguments. Returns -EBUSY if it can't fit. Existing children of
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 494dac872a13..948bd8f643e2 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -275,6 +275,7 @@ static int test_func(void *data)
275 275
276 /* Wait for the next command to be executed */ 276 /* Wait for the next command to be executed */
277 schedule(); 277 schedule();
278 try_to_freeze();
278 279
279 if (signal_pending(current)) 280 if (signal_pending(current))
280 flush_signals(current); 281 flush_signals(current);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index d2ef13b485e7..3e13a1e5856f 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -7,6 +7,8 @@
7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt 8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen 9 * Copyright (C) 2006 Esben Nielsen
10 *
11 * See Documentation/rt-mutex-design.txt for details.
10 */ 12 */
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/module.h> 14#include <linux/module.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index 4ee400f9d56b..a2be2d055299 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -51,6 +51,7 @@
51#include <linux/times.h> 51#include <linux/times.h>
52#include <linux/acct.h> 52#include <linux/acct.h>
53#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/delayacct.h>
54#include <asm/tlb.h> 55#include <asm/tlb.h>
55 56
56#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -501,9 +502,36 @@ struct file_operations proc_schedstat_operations = {
501 .release = single_release, 502 .release = single_release,
502}; 503};
503 504
505/*
506 * Expects runqueue lock to be held for atomicity of update
507 */
508static inline void
509rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
510{
511 if (rq) {
512 rq->rq_sched_info.run_delay += delta_jiffies;
513 rq->rq_sched_info.pcnt++;
514 }
515}
516
517/*
518 * Expects runqueue lock to be held for atomicity of update
519 */
520static inline void
521rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
522{
523 if (rq)
524 rq->rq_sched_info.cpu_time += delta_jiffies;
525}
504# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 526# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
505# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 527# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
506#else /* !CONFIG_SCHEDSTATS */ 528#else /* !CONFIG_SCHEDSTATS */
529static inline void
530rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
531{}
532static inline void
533rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
534{}
507# define schedstat_inc(rq, field) do { } while (0) 535# define schedstat_inc(rq, field) do { } while (0)
508# define schedstat_add(rq, field, amt) do { } while (0) 536# define schedstat_add(rq, field, amt) do { } while (0)
509#endif 537#endif
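
The new rq_sched_info_arrive()/rq_sched_info_depart() helpers follow the usual real-function-or-empty-stub pattern, so sched_info_arrive() and sched_info_depart() can call them unconditionally with no #ifdefs at the call sites. A userspace reduction of that pattern; the CONFIG knob and rq_stats struct are stand-ins:

#include <stdio.h>

#define CONFIG_SCHEDSTATS 1     /* flip to 0 to compile the empty stub instead */

struct rq_stats { unsigned long run_delay, pcnt; };

#if CONFIG_SCHEDSTATS
static inline void rq_sched_info_arrive(struct rq_stats *rq, unsigned long delta)
{
        if (rq) {
                rq->run_delay += delta;
                rq->pcnt++;
        }
}
#else
static inline void rq_sched_info_arrive(struct rq_stats *rq, unsigned long delta) {}
#endif

int main(void)
{
        struct rq_stats rq = { 0, 0 };

        rq_sched_info_arrive(&rq, 5);   /* the caller is identical either way */
        printf("run_delay=%lu pcnt=%lu\n", rq.run_delay, rq.pcnt);
        return 0;
}
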
@@ -523,7 +551,7 @@ static inline struct rq *this_rq_lock(void)
523 return rq; 551 return rq;
524} 552}
525 553
526#ifdef CONFIG_SCHEDSTATS 554#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
527/* 555/*
528 * Called when a process is dequeued from the active array and given 556 * Called when a process is dequeued from the active array and given
529 * the cpu. We should note that with the exception of interactive 557 * the cpu. We should note that with the exception of interactive
@@ -551,21 +579,16 @@ static inline void sched_info_dequeued(struct task_struct *t)
551 */ 579 */
552static void sched_info_arrive(struct task_struct *t) 580static void sched_info_arrive(struct task_struct *t)
553{ 581{
554 unsigned long now = jiffies, diff = 0; 582 unsigned long now = jiffies, delta_jiffies = 0;
555 struct rq *rq = task_rq(t);
556 583
557 if (t->sched_info.last_queued) 584 if (t->sched_info.last_queued)
558 diff = now - t->sched_info.last_queued; 585 delta_jiffies = now - t->sched_info.last_queued;
559 sched_info_dequeued(t); 586 sched_info_dequeued(t);
560 t->sched_info.run_delay += diff; 587 t->sched_info.run_delay += delta_jiffies;
561 t->sched_info.last_arrival = now; 588 t->sched_info.last_arrival = now;
562 t->sched_info.pcnt++; 589 t->sched_info.pcnt++;
563 590
564 if (!rq) 591 rq_sched_info_arrive(task_rq(t), delta_jiffies);
565 return;
566
567 rq->rq_sched_info.run_delay += diff;
568 rq->rq_sched_info.pcnt++;
569} 592}
570 593
571/* 594/*
@@ -585,8 +608,9 @@ static void sched_info_arrive(struct task_struct *t)
585 */ 608 */
586static inline void sched_info_queued(struct task_struct *t) 609static inline void sched_info_queued(struct task_struct *t)
587{ 610{
588 if (!t->sched_info.last_queued) 611 if (unlikely(sched_info_on()))
589 t->sched_info.last_queued = jiffies; 612 if (!t->sched_info.last_queued)
613 t->sched_info.last_queued = jiffies;
590} 614}
591 615
592/* 616/*
@@ -595,13 +619,10 @@ static inline void sched_info_queued(struct task_struct *t)
595 */ 619 */
596static inline void sched_info_depart(struct task_struct *t) 620static inline void sched_info_depart(struct task_struct *t)
597{ 621{
598 struct rq *rq = task_rq(t); 622 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
599 unsigned long diff = jiffies - t->sched_info.last_arrival;
600
601 t->sched_info.cpu_time += diff;
602 623
603 if (rq) 624 t->sched_info.cpu_time += delta_jiffies;
604 rq->rq_sched_info.cpu_time += diff; 625 rq_sched_info_depart(task_rq(t), delta_jiffies);
605} 626}
606 627
607/* 628/*
@@ -610,7 +631,7 @@ static inline void sched_info_depart(struct task_struct *t)
610 * the idle task.) We are only called when prev != next. 631 * the idle task.) We are only called when prev != next.
611 */ 632 */
612static inline void 633static inline void
613sched_info_switch(struct task_struct *prev, struct task_struct *next) 634__sched_info_switch(struct task_struct *prev, struct task_struct *next)
614{ 635{
615 struct rq *rq = task_rq(prev); 636 struct rq *rq = task_rq(prev);
616 637
@@ -625,10 +646,16 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
625 if (next != rq->idle) 646 if (next != rq->idle)
626 sched_info_arrive(next); 647 sched_info_arrive(next);
627} 648}
649static inline void
650sched_info_switch(struct task_struct *prev, struct task_struct *next)
651{
652 if (unlikely(sched_info_on()))
653 __sched_info_switch(prev, next);
654}
628#else 655#else
629#define sched_info_queued(t) do { } while (0) 656#define sched_info_queued(t) do { } while (0)
630#define sched_info_switch(t, next) do { } while (0) 657#define sched_info_switch(t, next) do { } while (0)
631#endif /* CONFIG_SCHEDSTATS */ 658#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
632 659
633/* 660/*
634 * Adding/removing a task to/from a priority array: 661 * Adding/removing a task to/from a priority array:
@@ -1530,8 +1557,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1530 1557
1531 INIT_LIST_HEAD(&p->run_list); 1558 INIT_LIST_HEAD(&p->run_list);
1532 p->array = NULL; 1559 p->array = NULL;
1533#ifdef CONFIG_SCHEDSTATS 1560#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1534 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1561 if (unlikely(sched_info_on()))
1562 memset(&p->sched_info, 0, sizeof(p->sched_info));
1535#endif 1563#endif
1536#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1564#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1537 p->oncpu = 0; 1565 p->oncpu = 0;
@@ -1788,7 +1816,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1788 WARN_ON(rq->prev_mm); 1816 WARN_ON(rq->prev_mm);
1789 rq->prev_mm = oldmm; 1817 rq->prev_mm = oldmm;
1790 } 1818 }
1819 /*
1820 * Since the runqueue lock will be released by the next
1821 * task (which is an invalid locking op but in the case
 1822 * of the scheduler it's an obvious special-case), we
1823 * do an early lockdep release here:
1824 */
1825#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1791 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1826 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1827#endif
1792 1828
1793 /* Here we just switch the register state and the stack. */ 1829 /* Here we just switch the register state and the stack. */
1794 switch_to(prev, next, prev); 1830 switch_to(prev, next, prev);
@@ -3384,7 +3420,7 @@ EXPORT_SYMBOL(schedule);
3384 3420
3385#ifdef CONFIG_PREEMPT 3421#ifdef CONFIG_PREEMPT
3386/* 3422/*
3387 * this is is the entry point to schedule() from in-kernel preemption 3423 * this is the entry point to schedule() from in-kernel preemption
3388 * off of preempt_enable. Kernel preemptions off return from interrupt 3424 * off of preempt_enable. Kernel preemptions off return from interrupt
3389 * occur there and call schedule directly. 3425 * occur there and call schedule directly.
3390 */ 3426 */
@@ -3427,7 +3463,7 @@ need_resched:
3427EXPORT_SYMBOL(preempt_schedule); 3463EXPORT_SYMBOL(preempt_schedule);
3428 3464
3429/* 3465/*
3430 * this is is the entry point to schedule() from kernel preemption 3466 * this is the entry point to schedule() from kernel preemption
3431 * off of irq context. 3467 * off of irq context.
3432 * Note, that this is called and return with irqs disabled. This will 3468 * Note, that this is called and return with irqs disabled. This will
3433 * protect us against recursive calling from irq. 3469 * protect us against recursive calling from irq.
@@ -3439,7 +3475,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3439 struct task_struct *task = current; 3475 struct task_struct *task = current;
3440 int saved_lock_depth; 3476 int saved_lock_depth;
3441#endif 3477#endif
3442 /* Catch callers which need to be fixed*/ 3478 /* Catch callers which need to be fixed */
3443 BUG_ON(ti->preempt_count || !irqs_disabled()); 3479 BUG_ON(ti->preempt_count || !irqs_disabled());
3444 3480
3445need_resched: 3481need_resched:
@@ -4420,9 +4456,9 @@ asmlinkage long sys_sched_yield(void)
4420 return 0; 4456 return 0;
4421} 4457}
4422 4458
4423static inline int __resched_legal(void) 4459static inline int __resched_legal(int expected_preempt_count)
4424{ 4460{
4425 if (unlikely(preempt_count())) 4461 if (unlikely(preempt_count() != expected_preempt_count))
4426 return 0; 4462 return 0;
4427 if (unlikely(system_state != SYSTEM_RUNNING)) 4463 if (unlikely(system_state != SYSTEM_RUNNING))
4428 return 0; 4464 return 0;
@@ -4448,7 +4484,7 @@ static void __cond_resched(void)
4448 4484
4449int __sched cond_resched(void) 4485int __sched cond_resched(void)
4450{ 4486{
4451 if (need_resched() && __resched_legal()) { 4487 if (need_resched() && __resched_legal(0)) {
4452 __cond_resched(); 4488 __cond_resched();
4453 return 1; 4489 return 1;
4454 } 4490 }
@@ -4474,7 +4510,7 @@ int cond_resched_lock(spinlock_t *lock)
4474 ret = 1; 4510 ret = 1;
4475 spin_lock(lock); 4511 spin_lock(lock);
4476 } 4512 }
4477 if (need_resched() && __resched_legal()) { 4513 if (need_resched() && __resched_legal(1)) {
4478 spin_release(&lock->dep_map, 1, _THIS_IP_); 4514 spin_release(&lock->dep_map, 1, _THIS_IP_);
4479 _raw_spin_unlock(lock); 4515 _raw_spin_unlock(lock);
4480 preempt_enable_no_resched(); 4516 preempt_enable_no_resched();
@@ -4490,7 +4526,7 @@ int __sched cond_resched_softirq(void)
4490{ 4526{
4491 BUG_ON(!in_softirq()); 4527 BUG_ON(!in_softirq());
4492 4528
4493 if (need_resched() && __resched_legal()) { 4529 if (need_resched() && __resched_legal(0)) {
4494 raw_local_irq_disable(); 4530 raw_local_irq_disable();
4495 _local_bh_enable(); 4531 _local_bh_enable();
4496 raw_local_irq_enable(); 4532 raw_local_irq_enable();
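
The hunks above give __resched_legal() the preempt count its caller expects to be holding: cond_resched() and cond_resched_softirq() pass 0, while cond_resched_lock() passes 1 because the spinlock it still holds contributes one preemption level. Reduced to userspace arithmetic:

#include <stdio.h>

/* expected_preempt_count: 0 from cond_resched(), 1 from cond_resched_lock(),
 * where the still-held spinlock accounts for one level. */
static int resched_legal(int preempt_count, int expected_preempt_count)
{
        return preempt_count == expected_preempt_count;
}

int main(void)
{
        printf("cond_resched():      %d\n", resched_legal(0, 0));
        printf("cond_resched_lock(): %d\n", resched_legal(1, 1));
        printf("nested lock held:    %d\n", resched_legal(2, 1));
        return 0;
}
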
@@ -4526,9 +4562,11 @@ void __sched io_schedule(void)
4526{ 4562{
4527 struct rq *rq = &__raw_get_cpu_var(runqueues); 4563 struct rq *rq = &__raw_get_cpu_var(runqueues);
4528 4564
4565 delayacct_blkio_start();
4529 atomic_inc(&rq->nr_iowait); 4566 atomic_inc(&rq->nr_iowait);
4530 schedule(); 4567 schedule();
4531 atomic_dec(&rq->nr_iowait); 4568 atomic_dec(&rq->nr_iowait);
4569 delayacct_blkio_end();
4532} 4570}
4533EXPORT_SYMBOL(io_schedule); 4571EXPORT_SYMBOL(io_schedule);
4534 4572
@@ -4537,9 +4575,11 @@ long __sched io_schedule_timeout(long timeout)
4537 struct rq *rq = &__raw_get_cpu_var(runqueues); 4575 struct rq *rq = &__raw_get_cpu_var(runqueues);
4538 long ret; 4576 long ret;
4539 4577
4578 delayacct_blkio_start();
4540 atomic_inc(&rq->nr_iowait); 4579 atomic_inc(&rq->nr_iowait);
4541 ret = schedule_timeout(timeout); 4580 ret = schedule_timeout(timeout);
4542 atomic_dec(&rq->nr_iowait); 4581 atomic_dec(&rq->nr_iowait);
4582 delayacct_blkio_end();
4543 return ret; 4583 return ret;
4544} 4584}
4545 4585
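
The two hunks above bracket io_schedule() and io_schedule_timeout() with delayacct_blkio_start()/end(), so the time a task spends sleeping on block I/O can be accumulated and later reported through taskstats. The bracketing pattern, reduced to a userspace timer with clock_gettime() standing in for the kernel's timestamping:

#include <stdio.h>
#include <time.h>

static struct timespec blkio_start;
static long long blkio_delay_ns;

static void delay_start(void)
{
        clock_gettime(CLOCK_MONOTONIC, &blkio_start);
}

static void delay_end(void)
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        blkio_delay_ns += (now.tv_sec - blkio_start.tv_sec) * 1000000000LL +
                          (now.tv_nsec - blkio_start.tv_nsec);
}

int main(void)
{
        struct timespec nap = { 0, 10 * 1000 * 1000 };  /* ~10 ms of "I/O wait" */

        delay_start();
        nanosleep(&nap, NULL);          /* stands in for schedule() during I/O */
        delay_end();
        printf("blkio delay: %lld ns\n", blkio_delay_ns);
        return 0;
}
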
@@ -4650,7 +4690,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
4650 return list_entry(p->sibling.next,struct task_struct,sibling); 4690 return list_entry(p->sibling.next,struct task_struct,sibling);
4651} 4691}
4652 4692
4653static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; 4693static const char stat_nam[] = "RSDTtZX";
4654 4694
4655static void show_task(struct task_struct *p) 4695static void show_task(struct task_struct *p)
4656{ 4696{
@@ -4658,12 +4698,9 @@ static void show_task(struct task_struct *p)
4658 unsigned long free = 0; 4698 unsigned long free = 0;
4659 unsigned state; 4699 unsigned state;
4660 4700
4661 printk("%-13.13s ", p->comm);
4662 state = p->state ? __ffs(p->state) + 1 : 0; 4701 state = p->state ? __ffs(p->state) + 1 : 0;
4663 if (state < ARRAY_SIZE(stat_nam)) 4702 printk("%-13.13s %c", p->comm,
4664 printk(stat_nam[state]); 4703 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4665 else
4666 printk("?");
4667#if (BITS_PER_LONG == 32) 4704#if (BITS_PER_LONG == 32)
4668 if (state == TASK_RUNNING) 4705 if (state == TASK_RUNNING)
4669 printk(" running "); 4706 printk(" running ");
@@ -4877,7 +4914,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4877 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 4914 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4878 + rq_dest->timestamp_last_tick; 4915 + rq_dest->timestamp_last_tick;
4879 deactivate_task(p, rq_src); 4916 deactivate_task(p, rq_src);
4880 activate_task(p, rq_dest, 0); 4917 __activate_task(p, rq_dest);
4881 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4918 if (TASK_PREEMPTS_CURR(p, rq_dest))
4882 resched_task(rq_dest->curr); 4919 resched_task(rq_dest->curr);
4883 } 4920 }
@@ -5776,7 +5813,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5776 cache = vmalloc(max_size); 5813 cache = vmalloc(max_size);
5777 if (!cache) { 5814 if (!cache) {
5778 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 5815 printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
5779 return 1000000; // return 1 msec on very small boxen 5816 return 1000000; /* return 1 msec on very small boxen */
5780 } 5817 }
5781 5818
5782 while (size <= max_size) { 5819 while (size <= max_size) {
@@ -6457,7 +6494,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6457 for (i = 0; i < MAX_NUMNODES; i++) 6494 for (i = 0; i < MAX_NUMNODES; i++)
6458 init_numa_sched_groups_power(sched_group_nodes[i]); 6495 init_numa_sched_groups_power(sched_group_nodes[i]);
6459 6496
6460 init_numa_sched_groups_power(sched_group_allnodes); 6497 if (sched_group_allnodes) {
6498 int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
6499 struct sched_group *sg = &sched_group_allnodes[group];
6500
6501 init_numa_sched_groups_power(sg);
6502 }
6461#endif 6503#endif
6462 6504
6463 /* Attach the domains */ 6505 /* Attach the domains */
@@ -6724,6 +6766,11 @@ void __init sched_init(void)
6724 } 6766 }
6725 6767
6726 set_load_weight(&init_task); 6768 set_load_weight(&init_task);
6769
6770#ifdef CONFIG_RT_MUTEXES
6771 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6772#endif
6773
6727 /* 6774 /*
6728 * The boot idle thread does lazy MMU switching as well: 6775 * The boot idle thread does lazy MMU switching as well:
6729 */ 6776 */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 215541e26c1a..3789ca98197c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -65,6 +65,7 @@ static inline void wakeup_softirqd(void)
65 * This one is for softirq.c-internal use, 65 * This one is for softirq.c-internal use,
66 * where hardirqs are disabled legitimately: 66 * where hardirqs are disabled legitimately:
67 */ 67 */
68#ifdef CONFIG_TRACE_IRQFLAGS
68static void __local_bh_disable(unsigned long ip) 69static void __local_bh_disable(unsigned long ip)
69{ 70{
70 unsigned long flags; 71 unsigned long flags;
@@ -80,6 +81,13 @@ static void __local_bh_disable(unsigned long ip)
80 trace_softirqs_off(ip); 81 trace_softirqs_off(ip);
81 raw_local_irq_restore(flags); 82 raw_local_irq_restore(flags);
82} 83}
84#else /* !CONFIG_TRACE_IRQFLAGS */
85static inline void __local_bh_disable(unsigned long ip)
86{
87 add_preempt_count(SOFTIRQ_OFFSET);
88 barrier();
89}
90#endif /* CONFIG_TRACE_IRQFLAGS */
83 91
84void local_bh_disable(void) 92void local_bh_disable(void)
85{ 93{
@@ -121,12 +129,16 @@ EXPORT_SYMBOL(_local_bh_enable);
121 129
122void local_bh_enable(void) 130void local_bh_enable(void)
123{ 131{
132#ifdef CONFIG_TRACE_IRQFLAGS
124 unsigned long flags; 133 unsigned long flags;
125 134
126 WARN_ON_ONCE(in_irq()); 135 WARN_ON_ONCE(in_irq());
136#endif
127 WARN_ON_ONCE(irqs_disabled()); 137 WARN_ON_ONCE(irqs_disabled());
128 138
139#ifdef CONFIG_TRACE_IRQFLAGS
129 local_irq_save(flags); 140 local_irq_save(flags);
141#endif
130 /* 142 /*
131 * Are softirqs going to be turned on now: 143 * Are softirqs going to be turned on now:
132 */ 144 */
@@ -142,18 +154,22 @@ void local_bh_enable(void)
142 do_softirq(); 154 do_softirq();
143 155
144 dec_preempt_count(); 156 dec_preempt_count();
157#ifdef CONFIG_TRACE_IRQFLAGS
145 local_irq_restore(flags); 158 local_irq_restore(flags);
159#endif
146 preempt_check_resched(); 160 preempt_check_resched();
147} 161}
148EXPORT_SYMBOL(local_bh_enable); 162EXPORT_SYMBOL(local_bh_enable);
149 163
150void local_bh_enable_ip(unsigned long ip) 164void local_bh_enable_ip(unsigned long ip)
151{ 165{
166#ifdef CONFIG_TRACE_IRQFLAGS
152 unsigned long flags; 167 unsigned long flags;
153 168
154 WARN_ON_ONCE(in_irq()); 169 WARN_ON_ONCE(in_irq());
155 170
156 local_irq_save(flags); 171 local_irq_save(flags);
172#endif
157 /* 173 /*
158 * Are softirqs going to be turned on now: 174 * Are softirqs going to be turned on now:
159 */ 175 */
@@ -169,7 +185,9 @@ void local_bh_enable_ip(unsigned long ip)
169 do_softirq(); 185 do_softirq();
170 186
171 dec_preempt_count(); 187 dec_preempt_count();
188#ifdef CONFIG_TRACE_IRQFLAGS
172 local_irq_restore(flags); 189 local_irq_restore(flags);
190#endif
173 preempt_check_resched(); 191 preempt_check_resched();
174} 192}
175EXPORT_SYMBOL(local_bh_enable_ip); 193EXPORT_SYMBOL(local_bh_enable_ip);
@@ -311,8 +329,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
311 softirq_vec[nr].action = action; 329 softirq_vec[nr].action = action;
312} 330}
313 331
314EXPORT_SYMBOL(open_softirq);
315
316/* Tasklets */ 332/* Tasklets */
317struct tasklet_head 333struct tasklet_head
318{ 334{
@@ -549,7 +565,7 @@ static void takeover_tasklets(unsigned int cpu)
549} 565}
550#endif /* CONFIG_HOTPLUG_CPU */ 566#endif /* CONFIG_HOTPLUG_CPU */
551 567
552static int __devinit cpu_callback(struct notifier_block *nfb, 568static int __cpuinit cpu_callback(struct notifier_block *nfb,
553 unsigned long action, 569 unsigned long action,
554 void *hcpu) 570 void *hcpu)
555{ 571{
@@ -589,7 +605,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
589 return NOTIFY_OK; 605 return NOTIFY_OK;
590} 606}
591 607
592static struct notifier_block __devinitdata cpu_nfb = { 608static struct notifier_block __cpuinitdata cpu_nfb = {
593 .notifier_call = cpu_callback 609 .notifier_call = cpu_callback
594}; 610};
595 611
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 6b76caa22981..03e6a2b0b787 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int __devinit 107static int __cpuinit
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
142 return NOTIFY_OK; 142 return NOTIFY_OK;
143} 143}
144 144
145static struct notifier_block __devinitdata cpu_nfb = { 145static struct notifier_block __cpuinitdata cpu_nfb = {
146 .notifier_call = cpu_callback 146 .notifier_call = cpu_callback
147}; 147};
148 148
diff --git a/kernel/sys.c b/kernel/sys.c
index dbb3b9c7ea64..e236f98f7ec5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1983,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1983 error = current->mm->dumpable; 1983 error = current->mm->dumpable;
1984 break; 1984 break;
1985 case PR_SET_DUMPABLE: 1985 case PR_SET_DUMPABLE:
1986 if (arg2 < 0 || arg2 > 2) { 1986 if (arg2 < 0 || arg2 > 1) {
1987 error = -EINVAL; 1987 error = -EINVAL;
1988 break; 1988 break;
1989 } 1989 }
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..e78187657330
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,564 @@
1/*
2 * taskstats.c - Export per-task statistics to userland
3 *
4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
5 * (C) Balbir Singh, IBM Corp. 2006
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/taskstats_kern.h>
21#include <linux/delayacct.h>
22#include <linux/cpumask.h>
23#include <linux/percpu.h>
24#include <net/genetlink.h>
25#include <asm/atomic.h>
26
27/*
28 * Maximum length of a cpumask that can be specified in
29 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
30 */
31#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
32
33static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
34static int family_registered;
35kmem_cache_t *taskstats_cache;
36
37static struct genl_family family = {
38 .id = GENL_ID_GENERATE,
39 .name = TASKSTATS_GENL_NAME,
40 .version = TASKSTATS_GENL_VERSION,
41 .maxattr = TASKSTATS_CMD_ATTR_MAX,
42};
43
44static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
45__read_mostly = {
46 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
47 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
48 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
49 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
50
51struct listener {
52 struct list_head list;
53 pid_t pid;
54 char valid;
55};
56
57struct listener_list {
58 struct rw_semaphore sem;
59 struct list_head list;
60};
61static DEFINE_PER_CPU(struct listener_list, listener_array);
62
63enum actions {
64 REGISTER,
65 DEREGISTER,
66 CPU_DONT_CARE
67};
68
69static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
70 void **replyp, size_t size)
71{
72 struct sk_buff *skb;
73 void *reply;
74
75 /*
76 * If new attributes are added, please revisit this allocation
77 */
78 skb = nlmsg_new(size);
79 if (!skb)
80 return -ENOMEM;
81
82 if (!info) {
83 int seq = get_cpu_var(taskstats_seqnum)++;
84 put_cpu_var(taskstats_seqnum);
85
86 reply = genlmsg_put(skb, 0, seq,
87 family.id, 0, 0,
88 cmd, family.version);
89 } else
90 reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
91 family.id, 0, 0,
92 cmd, family.version);
93 if (reply == NULL) {
94 nlmsg_free(skb);
95 return -EINVAL;
96 }
97
98 *skbp = skb;
99 *replyp = reply;
100 return 0;
101}
102
103/*
104 * Send taskstats data in @skb to listener with nl_pid @pid
105 */
106static int send_reply(struct sk_buff *skb, pid_t pid)
107{
108 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
109 void *reply = genlmsg_data(genlhdr);
110 int rc;
111
112 rc = genlmsg_end(skb, reply);
113 if (rc < 0) {
114 nlmsg_free(skb);
115 return rc;
116 }
117
118 return genlmsg_unicast(skb, pid);
119}
120
121/*
122 * Send taskstats data in @skb to listeners registered for @cpu's exit data
123 */
124static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
125{
126 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
127 struct listener_list *listeners;
128 struct listener *s, *tmp;
129 struct sk_buff *skb_next, *skb_cur = skb;
130 void *reply = genlmsg_data(genlhdr);
131 int rc, delcount = 0;
132
133 rc = genlmsg_end(skb, reply);
134 if (rc < 0) {
135 nlmsg_free(skb);
136 return;
137 }
138
139 rc = 0;
140 listeners = &per_cpu(listener_array, cpu);
141 down_read(&listeners->sem);
142 list_for_each_entry(s, &listeners->list, list) {
143 skb_next = NULL;
144 if (!list_is_last(&s->list, &listeners->list)) {
145 skb_next = skb_clone(skb_cur, GFP_KERNEL);
146 if (!skb_next)
147 break;
148 }
149 rc = genlmsg_unicast(skb_cur, s->pid);
150 if (rc == -ECONNREFUSED) {
151 s->valid = 0;
152 delcount++;
153 }
154 skb_cur = skb_next;
155 }
156 up_read(&listeners->sem);
157
158 if (skb_cur)
159 nlmsg_free(skb_cur);
160
161 if (!delcount)
162 return;
163
164 /* Delete invalidated entries */
165 down_write(&listeners->sem);
166 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
167 if (!s->valid) {
168 list_del(&s->list);
169 kfree(s);
170 }
171 }
172 up_write(&listeners->sem);
173}
174
175static int fill_pid(pid_t pid, struct task_struct *pidtsk,
176 struct taskstats *stats)
177{
178 int rc = 0;
179 struct task_struct *tsk = pidtsk;
180
181 if (!pidtsk) {
182 read_lock(&tasklist_lock);
183 tsk = find_task_by_pid(pid);
184 if (!tsk) {
185 read_unlock(&tasklist_lock);
186 return -ESRCH;
187 }
188 get_task_struct(tsk);
189 read_unlock(&tasklist_lock);
190 } else
191 get_task_struct(tsk);
192
193 /*
194 * Each accounting subsystem adds calls to its functions to
 195 * fill in relevant parts of struct taskstats as follows
196 *
197 * per-task-foo(stats, tsk);
198 */
199
200 delayacct_add_tsk(stats, tsk);
201 stats->version = TASKSTATS_VERSION;
202
203 /* Define err: label here if needed */
204 put_task_struct(tsk);
205 return rc;
206
207}
208
209static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
210 struct taskstats *stats)
211{
212 struct task_struct *tsk, *first;
213 unsigned long flags;
214
215 /*
216 * Add additional stats from live tasks except zombie thread group
217 * leaders who are already counted with the dead tasks
218 */
219 first = tgidtsk;
220 if (!first) {
221 read_lock(&tasklist_lock);
222 first = find_task_by_pid(tgid);
223 if (!first) {
224 read_unlock(&tasklist_lock);
225 return -ESRCH;
226 }
227 get_task_struct(first);
228 read_unlock(&tasklist_lock);
229 } else
230 get_task_struct(first);
231
232 /* Start with stats from dead tasks */
233 spin_lock_irqsave(&first->signal->stats_lock, flags);
234 if (first->signal->stats)
235 memcpy(stats, first->signal->stats, sizeof(*stats));
236 spin_unlock_irqrestore(&first->signal->stats_lock, flags);
237
238 tsk = first;
239 read_lock(&tasklist_lock);
240 do {
241 if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
242 continue;
243 /*
244 * Accounting subsystem can call its functions here to
 245 * fill in relevant parts of struct taskstats as follows
246 *
247 * per-task-foo(stats, tsk);
248 */
249 delayacct_add_tsk(stats, tsk);
250
251 } while_each_thread(first, tsk);
252 read_unlock(&tasklist_lock);
253 stats->version = TASKSTATS_VERSION;
254
255 /*
 256 * Accounting subsystems can also add calls here to modify
257 * fields of taskstats.
258 */
259
260 return 0;
261}
262
263
264static void fill_tgid_exit(struct task_struct *tsk)
265{
266 unsigned long flags;
267
268 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
269 if (!tsk->signal->stats)
270 goto ret;
271
272 /*
273 * Each accounting subsystem calls its functions here to
 274 * accumulate its per-task stats for tsk, into the per-tgid structure
275 *
276 * per-task-foo(tsk->signal->stats, tsk);
277 */
278 delayacct_add_tsk(tsk->signal->stats, tsk);
279ret:
280 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
281 return;
282}
283
284static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
285{
286 struct listener_list *listeners;
287 struct listener *s, *tmp;
288 unsigned int cpu;
289 cpumask_t mask = *maskp;
290
291 if (!cpus_subset(mask, cpu_possible_map))
292 return -EINVAL;
293
294 if (isadd == REGISTER) {
295 for_each_cpu_mask(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
297 cpu_to_node(cpu));
298 if (!s)
299 goto cleanup;
300 s->pid = pid;
301 INIT_LIST_HEAD(&s->list);
302 s->valid = 1;
303
304 listeners = &per_cpu(listener_array, cpu);
305 down_write(&listeners->sem);
306 list_add(&s->list, &listeners->list);
307 up_write(&listeners->sem);
308 }
309 return 0;
310 }
311
312 /* Deregister or cleanup */
313cleanup:
314 for_each_cpu_mask(cpu, mask) {
315 listeners = &per_cpu(listener_array, cpu);
316 down_write(&listeners->sem);
317 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
318 if (s->pid == pid) {
319 list_del(&s->list);
320 kfree(s);
321 break;
322 }
323 }
324 up_write(&listeners->sem);
325 }
326 return 0;
327}
328
329static int parse(struct nlattr *na, cpumask_t *mask)
330{
331 char *data;
332 int len;
333 int ret;
334
335 if (na == NULL)
336 return 1;
337 len = nla_len(na);
338 if (len > TASKSTATS_CPUMASK_MAXLEN)
339 return -E2BIG;
340 if (len < 1)
341 return -EINVAL;
342 data = kmalloc(len, GFP_KERNEL);
343 if (!data)
344 return -ENOMEM;
345 nla_strlcpy(data, na, len);
346 ret = cpulist_parse(data, *mask);
347 kfree(data);
348 return ret;
349}
350
351static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
352{
353 int rc = 0;
354 struct sk_buff *rep_skb;
355 struct taskstats stats;
356 void *reply;
357 size_t size;
358 struct nlattr *na;
359 cpumask_t mask;
360
361 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
362 if (rc < 0)
363 return rc;
364 if (rc == 0)
365 return add_del_listener(info->snd_pid, &mask, REGISTER);
366
367 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
368 if (rc < 0)
369 return rc;
370 if (rc == 0)
371 return add_del_listener(info->snd_pid, &mask, DEREGISTER);
372
373 /*
374 * Size includes space for nested attributes
375 */
376 size = nla_total_size(sizeof(u32)) +
377 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
378
379 memset(&stats, 0, sizeof(stats));
380 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
381 if (rc < 0)
382 return rc;
383
384 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
385 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
386 rc = fill_pid(pid, NULL, &stats);
387 if (rc < 0)
388 goto err;
389
390 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
391 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
392 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
393 stats);
394 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
395 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
396 rc = fill_tgid(tgid, NULL, &stats);
397 if (rc < 0)
398 goto err;
399
400 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
401 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
402 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
403 stats);
404 } else {
405 rc = -EINVAL;
406 goto err;
407 }
408
409 nla_nest_end(rep_skb, na);
410
411 return send_reply(rep_skb, info->snd_pid);
412
413nla_put_failure:
414 return genlmsg_cancel(rep_skb, reply);
415err:
416 nlmsg_free(rep_skb);
417 return rc;
418}
419
420void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
421{
422 struct listener_list *listeners;
423 struct taskstats *tmp;
424 /*
425 * This is the cpu on which the task is exiting currently and will
426 * be the one for which the exit event is sent, even if the cpu
427 * on which this function is running changes later.
428 */
429 *mycpu = raw_smp_processor_id();
430
431 *ptidstats = NULL;
432 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
433 if (!tmp)
434 return;
435
436 listeners = &per_cpu(listener_array, *mycpu);
437 down_read(&listeners->sem);
438 if (!list_empty(&listeners->list)) {
439 *ptidstats = tmp;
440 tmp = NULL;
441 }
442 up_read(&listeners->sem);
443 kfree(tmp);
444}
445
446/* Send pid data out on exit */
447void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
448 int group_dead, unsigned int mycpu)
449{
450 int rc;
451 struct sk_buff *rep_skb;
452 void *reply;
453 size_t size;
454 int is_thread_group;
455 struct nlattr *na;
456 unsigned long flags;
457
458 if (!family_registered || !tidstats)
459 return;
460
461 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
462 is_thread_group = tsk->signal->stats ? 1 : 0;
463 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
464
465 rc = 0;
466 /*
467 * Size includes space for nested attributes
468 */
469 size = nla_total_size(sizeof(u32)) +
470 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
471
472 if (is_thread_group)
473 size = 2 * size; /* PID + STATS + TGID + STATS */
474
475 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
476 if (rc < 0)
477 goto ret;
478
479 rc = fill_pid(tsk->pid, tsk, tidstats);
480 if (rc < 0)
481 goto err_skb;
482
483 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
484 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
485 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
486 *tidstats);
487 nla_nest_end(rep_skb, na);
488
489 if (!is_thread_group)
490 goto send;
491
492 /*
 493 * tsk has/had a thread group so fill the tsk->signal->stats structure.
494 * Doesn't matter if tsk is the leader or the last group member leaving
495 */
496
497 fill_tgid_exit(tsk);
498 if (!group_dead)
499 goto send;
500
501 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
502 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
503 /* No locking needed for tsk->signal->stats since group is dead */
504 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
505 *tsk->signal->stats);
506 nla_nest_end(rep_skb, na);
507
508send:
509 send_cpu_listeners(rep_skb, mycpu);
510 return;
511
512nla_put_failure:
513 genlmsg_cancel(rep_skb, reply);
514 goto ret;
515err_skb:
516 nlmsg_free(rep_skb);
517ret:
518 return;
519}
520
521static struct genl_ops taskstats_ops = {
522 .cmd = TASKSTATS_CMD_GET,
523 .doit = taskstats_user_cmd,
524 .policy = taskstats_cmd_get_policy,
525};
526
527/* Needed early in initialization */
528void __init taskstats_init_early(void)
529{
530 unsigned int i;
531
532 taskstats_cache = kmem_cache_create("taskstats_cache",
533 sizeof(struct taskstats),
534 0, SLAB_PANIC, NULL, NULL);
535 for_each_possible_cpu(i) {
536 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
537 init_rwsem(&(per_cpu(listener_array, i).sem));
538 }
539}
540
541static int __init taskstats_init(void)
542{
543 int rc;
544
545 rc = genl_register_family(&family);
546 if (rc)
547 return rc;
548
549 rc = genl_register_ops(&family, &taskstats_ops);
550 if (rc < 0)
551 goto err;
552
553 family_registered = 1;
554 return 0;
555err:
556 genl_unregister_family(&family);
557 return rc;
558}
559
560/*
561 * late initcall ensures initialization of statistics collection
562 * mechanisms precedes initialization of the taskstats interface
563 */
564late_initcall(taskstats_init);
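
In the new file above, send_cpu_listeners() hands the original skb to the final listener and clones it for every earlier one, so delivering to N listeners costs N-1 clones rather than N copies, and whatever buffer is left after a clone failure is freed at the end. A userspace reduction of that hand-off, with strdup()/free() standing in for skb_clone()/nlmsg_free():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void deliver(const char *msg, int listener)
{
        printf("listener %d got: %s\n", listener, msg);
}

int main(void)
{
        int nr_listeners = 3;
        char *cur = strdup("exit record");

        for (int i = 0; cur && i < nr_listeners; i++) {
                char *next = NULL;

                if (i != nr_listeners - 1) {
                        next = strdup(cur);     /* clone for later listeners    */
                        if (!next)
                                break;          /* mirrors the skb_clone check  */
                }
                deliver(cur, i);                /* genlmsg_unicast() analogue   */
                free(cur);                      /* ...which consumes the buffer */
                cur = next;
        }
        free(cur);                              /* leftover clone, if any       */
        return 0;
}
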
diff --git a/kernel/timer.c b/kernel/timer.c
index 396a3c024c2c..b650f04888ed 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t;
84 84
85tvec_base_t boot_tvec_bases; 85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases); 86EXPORT_SYMBOL(boot_tvec_bases);
87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; 87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
88 88
89static inline void set_running_timer(tvec_base_t *base, 89static inline void set_running_timer(tvec_base_t *base,
90 struct timer_list *timer) 90 struct timer_list *timer)
@@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer)
374 int ret = try_to_del_timer_sync(timer); 374 int ret = try_to_del_timer_sync(timer);
375 if (ret >= 0) 375 if (ret >= 0)
376 return ret; 376 return ret;
377 cpu_relax();
377 } 378 }
378} 379}
379 380
@@ -407,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
407 * This function cascades all vectors and executes all expired timer 408 * This function cascades all vectors and executes all expired timer
408 * vectors. 409 * vectors.
409 */ 410 */
410#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK 411#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
411 412
412static inline void __run_timers(tvec_base_t *base) 413static inline void __run_timers(tvec_base_t *base)
413{ 414{
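
The INDEX() change is a macro-hygiene fix: both the parameter and the whole expression gain parentheses so that expression arguments and surrounding operators cannot re-associate the shift and mask. A self-contained illustration using the same default constants; timer_jiffies here is just a local stand-in for base->timer_jiffies:

#include <stdio.h>

#define TVR_BITS 8
#define TVN_BITS 6
#define TVN_MASK 63

/* Old form: neither N nor the whole expression is parenthesized. */
#define INDEX_OLD(N) (timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
/* New form from the hunk above. */
#define INDEX_NEW(N) ((timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)

int main(void)
{
        unsigned long timer_jiffies = 0x12345678UL;

        /* With the argument "1 + 1", the old macro shifts by
         * TVR_BITS + 1 + 1 * TVN_BITS = 15 instead of TVR_BITS + 2 * TVN_BITS = 20. */
        printf("old: %lu  new: %lu\n", (INDEX_OLD(1 + 1)), (INDEX_NEW(1 + 1)));
        return 0;
}
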
@@ -891,6 +892,7 @@ int do_settimeofday(struct timespec *tv)
891 set_normalized_timespec(&xtime, sec, nsec); 892 set_normalized_timespec(&xtime, sec, nsec);
892 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); 893 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
893 894
895 clock->error = 0;
894 ntp_clear(); 896 ntp_clear();
895 897
896 write_sequnlock_irqrestore(&xtime_lock, flags); 898 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -967,6 +969,7 @@ void __init timekeeping_init(void)
967} 969}
968 970
969 971
972static int timekeeping_suspended;
970/* 973/*
971 * timekeeping_resume - Resumes the generic timekeeping subsystem. 974 * timekeeping_resume - Resumes the generic timekeeping subsystem.
972 * @dev: unused 975 * @dev: unused
@@ -982,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev)
982 write_seqlock_irqsave(&xtime_lock, flags); 985 write_seqlock_irqsave(&xtime_lock, flags);
983 /* restart the last cycle value */ 986 /* restart the last cycle value */
984 clock->cycle_last = clocksource_read(clock); 987 clock->cycle_last = clocksource_read(clock);
988 clock->error = 0;
989 timekeeping_suspended = 0;
990 write_sequnlock_irqrestore(&xtime_lock, flags);
991 return 0;
992}
993
994static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
995{
996 unsigned long flags;
997
998 write_seqlock_irqsave(&xtime_lock, flags);
999 timekeeping_suspended = 1;
985 write_sequnlock_irqrestore(&xtime_lock, flags); 1000 write_sequnlock_irqrestore(&xtime_lock, flags);
986 return 0; 1001 return 0;
987} 1002}
@@ -989,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev)
989/* sysfs resume/suspend bits for timekeeping */ 1004/* sysfs resume/suspend bits for timekeeping */
990static struct sysdev_class timekeeping_sysclass = { 1005static struct sysdev_class timekeeping_sysclass = {
991 .resume = timekeeping_resume, 1006 .resume = timekeeping_resume,
1007 .suspend = timekeeping_suspend,
992 set_kset_name("timekeeping"), 1008 set_kset_name("timekeeping"),
993}; 1009};
994 1010
@@ -1008,52 +1024,52 @@ static int __init timekeeping_init_device(void)
1008device_initcall(timekeeping_init_device); 1024device_initcall(timekeeping_init_device);
1009 1025
1010/* 1026/*
1011 * If the error is already larger, we look ahead another tick, 1027 * If the error is already larger, we look ahead even further
1012 * to compensate for late or lost adjustments. 1028 * to compensate for late or lost adjustments.
1013 */ 1029 */
1014static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) 1030static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
1015{ 1031{
1016 int adj; 1032 s64 tick_error, i;
1033 u32 look_ahead, adj;
1034 s32 error2, mult;
1017 1035
1018 /* 1036 /*
1019 * As soon as the machine is synchronized to the external time 1037 * Use the current error value to determine how much to look ahead.
1020 * source this should be the common case. 1038 * The larger the error the slower we adjust for it to avoid problems
1039 * with losing too many ticks, otherwise we would overadjust and
1040 * produce an even larger error. The smaller the adjustment the
1041 * faster we try to adjust for it, as lost ticks can do less harm
 1042 * here. This is tuned so that an error of about 1 msec is adjusted
1043 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1021 */ 1044 */
1022 error >>= 2; 1045 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
1023 if (likely(sign > 0 ? error <= *interval : error >= *interval)) 1046 error2 = abs(error2);
1024 return sign; 1047 for (look_ahead = 0; error2 > 0; look_ahead++)
1048 error2 >>= 2;
1025 1049
1026 /* 1050 /*
1027 * An extra look ahead dampens the effect of the current error, 1051 * Now calculate the error in (1 << look_ahead) ticks, but first
1028 * which can grow quite large with continously late updates, as 1052 * remove the single look ahead already included in the error.
1029 * it would dominate the adjustment value and can lead to
1030 * oscillation.
1031 */ 1053 */
1032 error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); 1054 tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
1033 error -= clock->xtime_interval >> 1; 1055 tick_error -= clock->xtime_interval >> 1;
1034 1056 error = ((error - tick_error) >> look_ahead) + tick_error;
1035 adj = 0; 1057
1036 while (1) { 1058 /* Finally calculate the adjustment shift value. */
1037 error >>= 1; 1059 i = *interval;
1038 if (sign > 0 ? error <= *interval : error >= *interval) 1060 mult = 1;
1039 break; 1061 if (error < 0) {
1040 adj++; 1062 error = -error;
1063 *interval = -*interval;
1064 *offset = -*offset;
1065 mult = -1;
1041 } 1066 }
1042 1067 for (adj = 0; error > i; adj++)
1043 /* 1068 error >>= 1;
1044 * Add the current adjustments to the error and take the offset
1045 * into account, the latter can cause the error to be hardly
1046 * reduced at the next tick. Check the error again if there's
1047 * room for another adjustment, thus further reducing the error
1048 * which otherwise had to be corrected at the next update.
1049 */
1050 error = (error << 1) - *interval + *offset;
1051 if (sign > 0 ? error > *interval : error < *interval)
1052 adj++;
1053 1069
1054 *interval <<= adj; 1070 *interval <<= adj;
1055 *offset <<= adj; 1071 *offset <<= adj;
1056 return sign << adj; 1072 return mult << adj;
1057} 1073}
1058 1074
1059/* 1075/*
@@ -1068,11 +1084,19 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
1068 1084
1069 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); 1085 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1070 if (error > interval) { 1086 if (error > interval) {
1071 adj = clocksource_bigadjust(1, error, &interval, &offset); 1087 error >>= 2;
1088 if (likely(error <= interval))
1089 adj = 1;
1090 else
1091 adj = clocksource_bigadjust(error, &interval, &offset);
1072 } else if (error < -interval) { 1092 } else if (error < -interval) {
1073 interval = -interval; 1093 error >>= 2;
1074 offset = -offset; 1094 if (likely(error >= -interval)) {
1075 adj = clocksource_bigadjust(-1, error, &interval, &offset); 1095 adj = -1;
1096 interval = -interval;
1097 offset = -offset;
1098 } else
1099 adj = clocksource_bigadjust(error, &interval, &offset);
1076 } else 1100 } else
1077 return; 1101 return;
1078 1102
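
Both branches of clocksource_adjust() now fast-path the common case where a single shift keeps the error within one interval and only fall back to the rewritten clocksource_bigadjust() for large errors. The final step of that fallback is finding the power-of-two shift that brings the (look-ahead-scaled) error back under one interval; reduced to userspace arithmetic with made-up numbers:

#include <stdio.h>

/* Smallest shift "adj" such that error >> adj no longer exceeds interval,
 * same loop shape as "for (adj = 0; error > i; adj++) error >>= 1;" above. */
static unsigned int bigadjust_shift(long long error, long long interval)
{
        unsigned int adj = 0;

        while (error > interval) {
                error >>= 1;
                adj++;
        }
        return adj;
}

int main(void)
{
        printf("shift = %u\n", bigadjust_shift(9000000LL, 1000000LL));  /* 4 */
        return 0;
}
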
@@ -1091,13 +1115,16 @@ static void update_wall_time(void)
1091{ 1115{
1092 cycle_t offset; 1116 cycle_t offset;
1093 1117
1094 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; 1118 /* Make sure we're fully resumed: */
1119 if (unlikely(timekeeping_suspended))
1120 return;
1095 1121
1096#ifdef CONFIG_GENERIC_TIME 1122#ifdef CONFIG_GENERIC_TIME
1097 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 1123 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1098#else 1124#else
1099 offset = clock->cycle_interval; 1125 offset = clock->cycle_interval;
1100#endif 1126#endif
1127 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1101 1128
1102 /* normally this loop will run just once, however in the 1129 /* normally this loop will run just once, however in the
1103 * case of lost or late ticks, it will accumulate correctly. 1130 * case of lost or late ticks, it will accumulate correctly.
@@ -1129,7 +1156,7 @@ static void update_wall_time(void)
1129 clocksource_adjust(clock, offset); 1156 clocksource_adjust(clock, offset);
1130 1157
1131 /* store full nanoseconds into xtime */ 1158 /* store full nanoseconds into xtime */
1132 xtime.tv_nsec = clock->xtime_nsec >> clock->shift; 1159 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
1133 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 1160 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1134 1161
1135 /* check to see if there is a new clocksource to use */ 1162 /* check to see if there is a new clocksource to use */
@@ -1661,7 +1688,7 @@ static void __devinit migrate_timers(int cpu)
1661} 1688}
1662#endif /* CONFIG_HOTPLUG_CPU */ 1689#endif /* CONFIG_HOTPLUG_CPU */
1663 1690
1664static int __devinit timer_cpu_notify(struct notifier_block *self, 1691static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1665 unsigned long action, void *hcpu) 1692 unsigned long action, void *hcpu)
1666{ 1693{
1667 long cpu = (long)hcpu; 1694 long cpu = (long)hcpu;
@@ -1681,7 +1708,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
1681 return NOTIFY_OK; 1708 return NOTIFY_OK;
1682} 1709}
1683 1710
1684static struct notifier_block __devinitdata timers_nb = { 1711static struct notifier_block __cpuinitdata timers_nb = {
1685 .notifier_call = timer_cpu_notify, 1712 .notifier_call = timer_cpu_notify,
1686}; 1713};
1687 1714
diff --git a/kernel/wait.c b/kernel/wait.c
index a1d57aeb7f75..59a82f63275d 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,9 +10,13 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13struct lock_class_key waitqueue_lock_key; 13void init_waitqueue_head(wait_queue_head_t *q)
14{
15 spin_lock_init(&q->lock);
16 INIT_LIST_HEAD(&q->task_list);
17}
14 18
15EXPORT_SYMBOL(waitqueue_lock_key); 19EXPORT_SYMBOL(init_waitqueue_head);
16 20
17void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 21void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
18{ 22{
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index eebb1d839235..448e8f7b342d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
93 spin_unlock_irqrestore(&cwq->lock, flags); 93 spin_unlock_irqrestore(&cwq->lock, flags);
94} 94}
95 95
96/* 96/**
97 * Queue work on a workqueue. Return non-zero if it was successfully 97 * queue_work - queue work on a workqueue
98 * added. 98 * @wq: workqueue to use
99 * @work: work to queue
100 *
101 * Returns non-zero if it was successfully added.
99 * 102 *
100 * We queue the work to the CPU it was submitted, but there is no 103 * We queue the work to the CPU it was submitted, but there is no
101 * guarantee that it will be processed by that CPU. 104 * guarantee that it will be processed by that CPU.
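
The hunks in this file convert plain block comments into kernel-doc, which scripts/kernel-doc extracts into the generated API documentation. The canonical layout the patch adds looks like this; my_helper and @arg are placeholder names, not functions added by the patch:

/**
 * my_helper - one-line summary in kernel-doc form
 * @arg: what the parameter means
 *
 * Longer free-form description of behaviour and return value.
 */
static int my_helper(int arg)
{
        return arg;
}

int main(void)
{
        return my_helper(0);
}
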
@@ -128,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data)
128 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 131 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
129} 132}
130 133
134/**
135 * queue_delayed_work - queue work on a workqueue after delay
136 * @wq: workqueue to use
137 * @work: work to queue
138 * @delay: number of jiffies to wait before queueing
139 *
140 * Returns non-zero if it was successfully added.
141 */
131int fastcall queue_delayed_work(struct workqueue_struct *wq, 142int fastcall queue_delayed_work(struct workqueue_struct *wq,
132 struct work_struct *work, unsigned long delay) 143 struct work_struct *work, unsigned long delay)
133{ 144{
@@ -150,6 +161,15 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
150} 161}
151EXPORT_SYMBOL_GPL(queue_delayed_work); 162EXPORT_SYMBOL_GPL(queue_delayed_work);
152 163
164/**
165 * queue_delayed_work_on - queue work on specific CPU after delay
166 * @cpu: CPU number to execute work on
167 * @wq: workqueue to use
168 * @work: work to queue
169 * @delay: number of jiffies to wait before queueing
170 *
171 * Returns non-zero if it was successfully added.
172 */
153int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 173int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
154 struct work_struct *work, unsigned long delay) 174 struct work_struct *work, unsigned long delay)
155{ 175{
@@ -275,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
275 } 295 }
276} 296}
277 297
278/* 298/**
279 * flush_workqueue - ensure that any scheduled work has run to completion. 299 * flush_workqueue - ensure that any scheduled work has run to completion.
300 * @wq: workqueue to flush
280 * 301 *
281 * Forces execution of the workqueue and blocks until its completion. 302 * Forces execution of the workqueue and blocks until its completion.
282 * This is typically used in driver shutdown handlers. 303 * This is typically used in driver shutdown handlers.
@@ -400,6 +421,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
400 kthread_stop(p); 421 kthread_stop(p);
401} 422}
402 423
424/**
425 * destroy_workqueue - safely terminate a workqueue
426 * @wq: target workqueue
427 *
428 * Safely destroy a workqueue. All work currently pending will be done first.
429 */
403void destroy_workqueue(struct workqueue_struct *wq) 430void destroy_workqueue(struct workqueue_struct *wq)
404{ 431{
405 int cpu; 432 int cpu;
@@ -425,18 +452,41 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
425 452
426static struct workqueue_struct *keventd_wq; 453static struct workqueue_struct *keventd_wq;
427 454
455/**
456 * schedule_work - put work task in global workqueue
457 * @work: job to be done
458 *
459 * This puts a job in the kernel-global workqueue.
460 */
428int fastcall schedule_work(struct work_struct *work) 461int fastcall schedule_work(struct work_struct *work)
429{ 462{
430 return queue_work(keventd_wq, work); 463 return queue_work(keventd_wq, work);
431} 464}
432EXPORT_SYMBOL(schedule_work); 465EXPORT_SYMBOL(schedule_work);
433 466
467/**
468 * schedule_delayed_work - put work task in global workqueue after delay
469 * @work: job to be done
470 * @delay: number of jiffies to wait
471 *
472 * After waiting for a given time this puts a job in the kernel-global
473 * workqueue.
474 */
434int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 475int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
435{ 476{
436 return queue_delayed_work(keventd_wq, work, delay); 477 return queue_delayed_work(keventd_wq, work, delay);
437} 478}
438EXPORT_SYMBOL(schedule_delayed_work); 479EXPORT_SYMBOL(schedule_delayed_work);
439 480
481/**
482 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
483 * @cpu: cpu to use
484 * @work: job to be done
485 * @delay: number of jiffies to wait
486 *
487 * After waiting for a given time this puts a job in the kernel-global
488 * workqueue on the specified CPU.
489 */
440int schedule_delayed_work_on(int cpu, 490int schedule_delayed_work_on(int cpu,
441 struct work_struct *work, unsigned long delay) 491 struct work_struct *work, unsigned long delay)
442{ 492{