Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c          |  12
-rw-r--r--  kernel/audit.c         |   2
-rw-r--r--  kernel/cpuset.c        |  69
-rw-r--r--  kernel/exit.c          |  11
-rw-r--r--  kernel/fork.c          |  28
-rw-r--r--  kernel/futex.c         |   4
-rw-r--r--  kernel/futex_compat.c  |   4
-rw-r--r--  kernel/hrtimer.c       |  50
-rw-r--r--  kernel/irq/Makefile    |   3
-rw-r--r--  kernel/irq/migration.c |   5
-rw-r--r--  kernel/module.c        |   1
-rw-r--r--  kernel/panic.c         |   1
-rw-r--r--  kernel/pid.c           | 212
-rw-r--r--  kernel/power/Kconfig   |   2
-rw-r--r--  kernel/power/process.c |   3
-rw-r--r--  kernel/printk.c        |   6
-rw-r--r--  kernel/ptrace.c        |   3
-rw-r--r--  kernel/sched.c         | 144
-rw-r--r--  kernel/signal.c        |   8
-rw-r--r--  kernel/sys.c           |  19
-rw-r--r--  kernel/sys_ni.c        |  12
-rw-r--r--  kernel/time.c          |   8
-rw-r--r--  kernel/timer.c         | 126
23 files changed, 470 insertions, 263 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51ef..b327f4d20104 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
         /* calculate run_time in nsec*/
         do_posix_clock_monotonic_gettime(&uptime);
         run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
-        run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
-                       + current->start_time.tv_nsec;
+        run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
+                       + current->group_leader->start_time.tv_nsec;
         /* convert nsec -> AHZ */
         elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
         do_div(elapsed, AHZ);
         ac.ac_btime = xtime.tv_sec - elapsed;
-        jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+        jiffies = cputime_to_jiffies(cputime_add(current->utime,
                                                  current->signal->utime));
         ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-        jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+        jiffies = cputime_to_jiffies(cputime_add(current->stime,
                                                  current->signal->stime));
         ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
         /* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
         ac.ac_io = encode_comp_t(0 /* current->io_usage */);   /* %% */
         ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
         ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-                                     current->group_leader->min_flt);
+                                     current->min_flt);
         ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-                                     current->group_leader->maj_flt);
+                                     current->maj_flt);
         ac.ac_swaps = encode_comp_t(0);
         ac.ac_exitcode = exitcode;
 
diff --git a/kernel/audit.c b/kernel/audit.c
index 04fe2e301b61..c8ccbd09048f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -578,7 +578,7 @@ static int __init audit_enable(char *str)
                audit_initialized ? "" : " (after initialization)");
         if (audit_initialized)
                 audit_enabled = audit_default;
-        return 0;
+        return 1;
 }
 
 __setup("audit=", audit_enable);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 18aea1bd1284..72248d1b9e3f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -616,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
616 * current->cpuset if a task has its memory placement changed. 616 * current->cpuset if a task has its memory placement changed.
617 * Do not call this routine if in_interrupt(). 617 * Do not call this routine if in_interrupt().
618 * 618 *
619 * Call without callback_mutex or task_lock() held. May be called 619 * Call without callback_mutex or task_lock() held. May be
620 * with or without manage_mutex held. Doesn't need task_lock to guard 620 * called with or without manage_mutex held. Thanks in part to
621 * against another task changing a non-NULL cpuset pointer to NULL, 621 * 'the_top_cpuset_hack', the tasks cpuset pointer will never
622 * as that is only done by a task on itself, and if the current task 622 * be NULL. This routine also might acquire callback_mutex and
623 * is here, it is not simultaneously in the exit code NULL'ing its
624 * cpuset pointer. This routine also might acquire callback_mutex and
625 * current->mm->mmap_sem during call. 623 * current->mm->mmap_sem during call.
626 * 624 *
627 * Reading current->cpuset->mems_generation doesn't need task_lock 625 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -836,6 +834,55 @@ static int update_cpumask(struct cpuset *cs, char *buf)
836} 834}
837 835
838/* 836/*
837 * cpuset_migrate_mm
838 *
839 * Migrate memory region from one set of nodes to another.
840 *
841 * Temporarilly set tasks mems_allowed to target nodes of migration,
842 * so that the migration code can allocate pages on these nodes.
843 *
844 * Call holding manage_mutex, so our current->cpuset won't change
845 * during this call, as manage_mutex holds off any attach_task()
846 * calls. Therefore we don't need to take task_lock around the
847 * call to guarantee_online_mems(), as we know no one is changing
848 * our tasks cpuset.
849 *
850 * Hold callback_mutex around the two modifications of our tasks
851 * mems_allowed to synchronize with cpuset_mems_allowed().
852 *
853 * While the mm_struct we are migrating is typically from some
854 * other task, the task_struct mems_allowed that we are hacking
855 * is for our current task, which must allocate new pages for that
856 * migrating memory region.
857 *
858 * We call cpuset_update_task_memory_state() before hacking
859 * our tasks mems_allowed, so that we are assured of being in
860 * sync with our tasks cpuset, and in particular, callbacks to
861 * cpuset_update_task_memory_state() from nested page allocations
862 * won't see any mismatch of our cpuset and task mems_generation
863 * values, so won't overwrite our hacked tasks mems_allowed
864 * nodemask.
865 */
866
867static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
868 const nodemask_t *to)
869{
870 struct task_struct *tsk = current;
871
872 cpuset_update_task_memory_state();
873
874 mutex_lock(&callback_mutex);
875 tsk->mems_allowed = *to;
876 mutex_unlock(&callback_mutex);
877
878 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
879
880 mutex_lock(&callback_mutex);
881 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
882 mutex_unlock(&callback_mutex);
883}
884
885/*
839 * Handle user request to change the 'mems' memory placement 886 * Handle user request to change the 'mems' memory placement
840 * of a cpuset. Needs to validate the request, update the 887 * of a cpuset. Needs to validate the request, update the
841 * cpusets mems_allowed and mems_generation, and for each 888 * cpusets mems_allowed and mems_generation, and for each
@@ -947,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
947 struct mm_struct *mm = mmarray[i]; 994 struct mm_struct *mm = mmarray[i];
948 995
949 mpol_rebind_mm(mm, &cs->mems_allowed); 996 mpol_rebind_mm(mm, &cs->mems_allowed);
950 if (migrate) { 997 if (migrate)
951 do_migrate_pages(mm, &oldmem, &cs->mems_allowed, 998 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
952 MPOL_MF_MOVE_ALL);
953 }
954 mmput(mm); 999 mmput(mm);
955 } 1000 }
956 1001
@@ -1185,11 +1230,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1185 mm = get_task_mm(tsk); 1230 mm = get_task_mm(tsk);
1186 if (mm) { 1231 if (mm) {
1187 mpol_rebind_mm(mm, &to); 1232 mpol_rebind_mm(mm, &to);
1233 if (is_memory_migrate(cs))
1234 cpuset_migrate_mm(mm, &from, &to);
1188 mmput(mm); 1235 mmput(mm);
1189 } 1236 }
1190 1237
1191 if (is_memory_migrate(cs))
1192 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1193 put_task_struct(tsk); 1238 put_task_struct(tsk);
1194 synchronize_rcu(); 1239 synchronize_rcu();
1195 if (atomic_dec_and_test(&oldcs->count)) 1240 if (atomic_dec_and_test(&oldcs->count))
diff --git a/kernel/exit.c b/kernel/exit.c
index bc0ec674d3f4..1a9787ac6173 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -34,6 +34,7 @@
 #include <linux/mutex.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -127,6 +128,11 @@ static void __exit_signal(struct task_struct *tsk)
         }
 }
 
+static void delayed_put_task_struct(struct rcu_head *rhp)
+{
+        put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
 void release_task(struct task_struct * p)
 {
         int zap_leader;
@@ -168,7 +174,7 @@ repeat:
         spin_unlock(&p->proc_lock);
         proc_pid_flush(proc_dentry);
         release_thread(p);
-        put_task_struct(p);
+        call_rcu(&p->rcu, delayed_put_task_struct);
 
         p = leader;
         if (unlikely(zap_leader))
@@ -936,6 +942,9 @@ fastcall NORET_TYPE void do_exit(long code)
         if (tsk->io_context)
                 exit_io_context();
 
+        if (tsk->splice_pipe)
+                __free_pipe_info(tsk->splice_pipe);
+
         /* PF_DEAD causes final put_task_struct after we schedule. */
         preempt_disable();
         BUG_ON(tsk->flags & PF_DEAD);
diff --git a/kernel/fork.c b/kernel/fork.c
index b3f7a1bb5e55..3384eb89cb1c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -108,10 +108,8 @@ void free_task(struct task_struct *tsk)
108} 108}
109EXPORT_SYMBOL(free_task); 109EXPORT_SYMBOL(free_task);
110 110
111void __put_task_struct_cb(struct rcu_head *rhp) 111void __put_task_struct(struct task_struct *tsk)
112{ 112{
113 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
114
115 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); 113 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
116 WARN_ON(atomic_read(&tsk->usage)); 114 WARN_ON(atomic_read(&tsk->usage));
117 WARN_ON(tsk == current); 115 WARN_ON(tsk == current);
@@ -126,6 +124,12 @@ void __put_task_struct_cb(struct rcu_head *rhp)
126 free_task(tsk); 124 free_task(tsk);
127} 125}
128 126
127void __put_task_struct_cb(struct rcu_head *rhp)
128{
129 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
130 __put_task_struct(tsk);
131}
132
129void __init fork_init(unsigned long mempages) 133void __init fork_init(unsigned long mempages)
130{ 134{
131#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 135#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
@@ -721,7 +725,7 @@ out_release:
721 free_fdset (new_fdt->open_fds, new_fdt->max_fdset); 725 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
722 free_fd_array(new_fdt->fd, new_fdt->max_fds); 726 free_fd_array(new_fdt->fd, new_fdt->max_fds);
723 kmem_cache_free(files_cachep, newf); 727 kmem_cache_free(files_cachep, newf);
724 goto out; 728 return NULL;
725} 729}
726 730
727static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 731static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
@@ -1311,17 +1315,19 @@ long do_fork(unsigned long clone_flags,
1311{ 1315{
1312 struct task_struct *p; 1316 struct task_struct *p;
1313 int trace = 0; 1317 int trace = 0;
1314 long pid = alloc_pidmap(); 1318 struct pid *pid = alloc_pid();
1319 long nr;
1315 1320
1316 if (pid < 0) 1321 if (!pid)
1317 return -EAGAIN; 1322 return -EAGAIN;
1323 nr = pid->nr;
1318 if (unlikely(current->ptrace)) { 1324 if (unlikely(current->ptrace)) {
1319 trace = fork_traceflag (clone_flags); 1325 trace = fork_traceflag (clone_flags);
1320 if (trace) 1326 if (trace)
1321 clone_flags |= CLONE_PTRACE; 1327 clone_flags |= CLONE_PTRACE;
1322 } 1328 }
1323 1329
1324 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1330 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
1325 /* 1331 /*
1326 * Do this prior waking up the new thread - the thread pointer 1332 * Do this prior waking up the new thread - the thread pointer
1327 * might get invalid after that point, if the thread exits quickly. 1333 * might get invalid after that point, if the thread exits quickly.
@@ -1348,7 +1354,7 @@ long do_fork(unsigned long clone_flags,
1348 p->state = TASK_STOPPED; 1354 p->state = TASK_STOPPED;
1349 1355
1350 if (unlikely (trace)) { 1356 if (unlikely (trace)) {
1351 current->ptrace_message = pid; 1357 current->ptrace_message = nr;
1352 ptrace_notify ((trace << 8) | SIGTRAP); 1358 ptrace_notify ((trace << 8) | SIGTRAP);
1353 } 1359 }
1354 1360
@@ -1358,10 +1364,10 @@ long do_fork(unsigned long clone_flags,
1358 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1364 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1359 } 1365 }
1360 } else { 1366 } else {
1361 free_pidmap(pid); 1367 free_pid(pid);
1362 pid = PTR_ERR(p); 1368 nr = PTR_ERR(p);
1363 } 1369 }
1364 return pid; 1370 return nr;
1365} 1371}
1366 1372
1367#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1373#ifndef ARCH_MIN_MMSTRUCT_ALIGN
diff --git a/kernel/futex.c b/kernel/futex.c
index 9c9b2b6b22dd..5699c512057b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1039,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
         unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
         int val2 = 0;
 
-        if ((op == FUTEX_WAIT) && utime) {
+        if (utime && (op == FUTEX_WAIT)) {
                 if (copy_from_user(&t, utime, sizeof(t)) != 0)
                         return -EFAULT;
+                if (!timespec_valid(&t))
+                        return -EINVAL;
                 timeout = timespec_to_jiffies(&t) + 1;
         }
         /*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 54274fc85321..1ab6a0ea3d14 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,9 +129,11 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
         unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
         int val2 = 0;
 
-        if ((op == FUTEX_WAIT) && utime) {
+        if (utime && (op == FUTEX_WAIT)) {
                 if (get_compat_timespec(&t, utime))
                         return -EFAULT;
+                if (!timespec_valid(&t))
+                        return -EINVAL;
                 timeout = timespec_to_jiffies(&t) + 1;
         }
         if (op >= FUTEX_REQUEUE)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0237a556eb1f..d2a7296c8251 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -501,6 +501,7 @@ int hrtimer_cancel(struct hrtimer *timer)
 
                 if (ret >= 0)
                         return ret;
+                cpu_relax();
         }
 }
506 507
@@ -606,6 +607,9 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 {
         struct rb_node *node;
 
+        if (!base->first)
+                return;
+
         if (base->get_softirq_time)
                 base->softirq_time = base->get_softirq_time();
 
@@ -655,29 +659,28 @@ void hrtimer_run_queues(void)
 /*
  * Sleep related functions:
  */
-
-struct sleep_hrtimer {
-        struct hrtimer timer;
-        struct task_struct *task;
-        int expired;
-};
-
-static int nanosleep_wakeup(struct hrtimer *timer)
+static int hrtimer_wakeup(struct hrtimer *timer)
 {
-        struct sleep_hrtimer *t =
-                container_of(timer, struct sleep_hrtimer, timer);
+        struct hrtimer_sleeper *t =
+                container_of(timer, struct hrtimer_sleeper, timer);
+        struct task_struct *task = t->task;
 
-        t->expired = 1;
-        wake_up_process(t->task);
+        t->task = NULL;
+        if (task)
+                wake_up_process(task);
 
         return HRTIMER_NORESTART;
 }
 
-static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
 {
-        t->timer.function = nanosleep_wakeup;
-        t->task = current;
-        t->expired = 0;
+        sl->timer.function = hrtimer_wakeup;
+        sl->task = task;
+}
+
+static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
+{
+        hrtimer_init_sleeper(t, current);
 
         do {
                 set_current_state(TASK_INTERRUPTIBLE);
@@ -685,18 +688,17 @@ static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
 
                 schedule();
 
-                if (unlikely(!t->expired)) {
-                        hrtimer_cancel(&t->timer);
-                        mode = HRTIMER_ABS;
-                }
-        } while (!t->expired && !signal_pending(current));
+                hrtimer_cancel(&t->timer);
+                mode = HRTIMER_ABS;
+
+        } while (t->task && !signal_pending(current));
 
-        return t->expired;
+        return t->task == NULL;
 }
 
 static long __sched nanosleep_restart(struct restart_block *restart)
 {
-        struct sleep_hrtimer t;
+        struct hrtimer_sleeper t;
         struct timespec __user *rmtp;
         struct timespec tu;
         ktime_t time;
@@ -729,7 +731,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
                        const enum hrtimer_mode mode, const clockid_t clockid)
 {
         struct restart_block *restart;
-        struct sleep_hrtimer t;
+        struct hrtimer_sleeper t;
         struct timespec tu;
         ktime_t rem;
 
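The hrtimer.c hunks above swap the private sleep_hrtimer for the shared hrtimer_sleeper. A minimal sketch of the resulting usage pattern follows, assuming the hrtimer API of this kernel series (hrtimer_init, hrtimer_start, hrtimer_cancel, HRTIMER_ABS); the wrapper function itself is illustrative and not part of the patch.

/*
 * Illustrative sketch only -- not part of this patch. The timer callback
 * wakes t.task and clears it, so "t.task == NULL" after the loop means
 * the timeout expired rather than a signal arriving.
 */
static int __sched wait_until(ktime_t abs_time)
{
        struct hrtimer_sleeper t;

        hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_ABS);
        t.timer.expires = abs_time;
        hrtimer_init_sleeper(&t, current);

        do {
                set_current_state(TASK_INTERRUPTIBLE);
                hrtimer_start(&t.timer, t.timer.expires, HRTIMER_ABS);

                schedule();

                hrtimer_cancel(&t.timer);
        } while (t.task && !signal_pending(current));

        __set_current_state(TASK_RUNNING);

        return t.task == NULL;  /* 1: timer fired, 0: interrupted by a signal */
}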
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2b33f852be3e..9f77f50d8143 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,4 +1,5 @@
 
-obj-y := handle.o manage.o spurious.o migration.o
+obj-y := handle.o manage.o spurious.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 52a8655fa080..134f9f2e0e39 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,6 +1,5 @@
-#include <linux/irq.h>
 
-#if defined(CONFIG_GENERIC_PENDING_IRQ)
+#include <linux/irq.h>
 
 void set_pending_irq(unsigned int irq, cpumask_t mask)
 {
@@ -61,5 +60,3 @@ void move_native_irq(int irq)
         }
         cpus_clear(pending_irq_cpumask[irq]);
 }
-
-#endif
diff --git a/kernel/module.c b/kernel/module.c
index bd088a7c1499..d24deb0dbbc9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1254,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license)
             || strcmp(license, "GPL v2") == 0
             || strcmp(license, "GPL and additional rights") == 0
             || strcmp(license, "Dual BSD/GPL") == 0
+            || strcmp(license, "Dual MIT/GPL") == 0
             || strcmp(license, "Dual MPL/GPL") == 0);
 }
 
diff --git a/kernel/panic.c b/kernel/panic.c
index f895c7c01d5b..cc2a4c9c36ac 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,6 @@ static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
 
 int panic_timeout;
-EXPORT_SYMBOL(panic_timeout);
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
diff --git a/kernel/pid.c b/kernel/pid.c
index a9f2dfd006d2..eeb836b65ca4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
28#include <linux/hash.h> 28#include <linux/hash.h>
29 29
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash[PIDTYPE_MAX]; 31static struct hlist_head *pid_hash;
32static int pidhash_shift; 32static int pidhash_shift;
33static kmem_cache_t *pid_cachep;
33 34
34int pid_max = PID_MAX_DEFAULT; 35int pid_max = PID_MAX_DEFAULT;
35int last_pid; 36int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
60static pidmap_t pidmap_array[PIDMAP_ENTRIES] = 61static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
61 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; 62 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
62 63
64/*
65 * Note: disable interrupts while the pidmap_lock is held as an
66 * interrupt might come in and do read_lock(&tasklist_lock).
67 *
68 * If we don't disable interrupts there is a nasty deadlock between
69 * detach_pid()->free_pid() and another cpu that does
70 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
71 * read_lock(&tasklist_lock);
72 *
73 * After we clean up the tasklist_lock and know there are no
74 * irq handlers that take it we can leave the interrupts enabled.
75 * For now it is easier to be safe than to prove it can't happen.
76 */
63static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 77static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
64 78
65fastcall void free_pidmap(int pid) 79static fastcall void free_pidmap(int pid)
66{ 80{
67 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; 81 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
68 int offset = pid & BITS_PER_PAGE_MASK; 82 int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
71 atomic_inc(&map->nr_free); 85 atomic_inc(&map->nr_free);
72} 86}
73 87
74int alloc_pidmap(void) 88static int alloc_pidmap(void)
75{ 89{
76 int i, offset, max_scan, pid, last = last_pid; 90 int i, offset, max_scan, pid, last = last_pid;
77 pidmap_t *map; 91 pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
89 * Free the page if someone raced with us 103 * Free the page if someone raced with us
90 * installing it: 104 * installing it:
91 */ 105 */
92 spin_lock(&pidmap_lock); 106 spin_lock_irq(&pidmap_lock);
93 if (map->page) 107 if (map->page)
94 free_page(page); 108 free_page(page);
95 else 109 else
96 map->page = (void *)page; 110 map->page = (void *)page;
97 spin_unlock(&pidmap_lock); 111 spin_unlock_irq(&pidmap_lock);
98 if (unlikely(!map->page)) 112 if (unlikely(!map->page))
99 break; 113 break;
100 } 114 }
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
131 return -1; 145 return -1;
132} 146}
133 147
134struct pid * fastcall find_pid(enum pid_type type, int nr) 148fastcall void put_pid(struct pid *pid)
149{
150 if (!pid)
151 return;
152 if ((atomic_read(&pid->count) == 1) ||
153 atomic_dec_and_test(&pid->count))
154 kmem_cache_free(pid_cachep, pid);
155}
156
157static void delayed_put_pid(struct rcu_head *rhp)
158{
159 struct pid *pid = container_of(rhp, struct pid, rcu);
160 put_pid(pid);
161}
162
163fastcall void free_pid(struct pid *pid)
164{
165 /* We can be called with write_lock_irq(&tasklist_lock) held */
166 unsigned long flags;
167
168 spin_lock_irqsave(&pidmap_lock, flags);
169 hlist_del_rcu(&pid->pid_chain);
170 spin_unlock_irqrestore(&pidmap_lock, flags);
171
172 free_pidmap(pid->nr);
173 call_rcu(&pid->rcu, delayed_put_pid);
174}
175
176struct pid *alloc_pid(void)
177{
178 struct pid *pid;
179 enum pid_type type;
180 int nr = -1;
181
182 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
183 if (!pid)
184 goto out;
185
186 nr = alloc_pidmap();
187 if (nr < 0)
188 goto out_free;
189
190 atomic_set(&pid->count, 1);
191 pid->nr = nr;
192 for (type = 0; type < PIDTYPE_MAX; ++type)
193 INIT_HLIST_HEAD(&pid->tasks[type]);
194
195 spin_lock_irq(&pidmap_lock);
196 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
197 spin_unlock_irq(&pidmap_lock);
198
199out:
200 return pid;
201
202out_free:
203 kmem_cache_free(pid_cachep, pid);
204 pid = NULL;
205 goto out;
206}
207
208struct pid * fastcall find_pid(int nr)
135{ 209{
136 struct hlist_node *elem; 210 struct hlist_node *elem;
137 struct pid *pid; 211 struct pid *pid;
138 212
139 hlist_for_each_entry_rcu(pid, elem, 213 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 214 &pid_hash[pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 215 if (pid->nr == nr)
142 return pid; 216 return pid;
143 } 217 }
@@ -146,77 +220,82 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
146 220
147int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
148{ 222{
149 struct pid *pid, *task_pid; 223 struct pid_link *link;
150 224 struct pid *pid;
151 task_pid = &task->pids[type]; 225
152 pid = find_pid(type, nr); 226 WARN_ON(!task->pid); /* to be removed soon */
153 task_pid->nr = nr; 227 WARN_ON(!nr); /* to be removed soon */
154 if (pid == NULL) { 228
155 INIT_LIST_HEAD(&task_pid->pid_list); 229 link = &task->pids[type];
156 hlist_add_head_rcu(&task_pid->pid_chain, 230 link->pid = pid = find_pid(nr);
157 &pid_hash[type][pid_hashfn(nr)]); 231 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
158 } else {
159 INIT_HLIST_NODE(&task_pid->pid_chain);
160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
161 }
162 232
163 return 0; 233 return 0;
164} 234}
165 235
166static fastcall int __detach_pid(task_t *task, enum pid_type type) 236void fastcall detach_pid(task_t *task, enum pid_type type)
167{ 237{
168 struct pid *pid, *pid_next; 238 struct pid_link *link;
169 int nr = 0; 239 struct pid *pid;
240 int tmp;
170 241
171 pid = &task->pids[type]; 242 link = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) { 243 pid = link->pid;
173 244
174 if (list_empty(&pid->pid_list)) { 245 hlist_del_rcu(&link->node);
175 nr = pid->nr; 246 link->pid = NULL;
176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */
181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_next->pid_chain);
183 }
184 }
185 247
186 list_del_rcu(&pid->pid_list); 248 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
187 pid->nr = 0; 249 if (!hlist_empty(&pid->tasks[tmp]))
250 return;
188 251
189 return nr; 252 free_pid(pid);
190} 253}
191 254
192void fastcall detach_pid(task_t *task, enum pid_type type) 255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
193{ 256{
194 int tmp, nr; 257 struct task_struct *result = NULL;
258 if (pid) {
259 struct hlist_node *first;
260 first = rcu_dereference(pid->tasks[type].first);
261 if (first)
262 result = hlist_entry(first, struct task_struct, pids[(type)].node);
263 }
264 return result;
265}
195 266
196 nr = __detach_pid(task, type); 267/*
197 if (!nr) 268 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
198 return; 269 */
270task_t *find_task_by_pid_type(int type, int nr)
271{
272 return pid_task(find_pid(nr), type);
273}
199 274
200 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 275EXPORT_SYMBOL(find_task_by_pid_type);
201 if (tmp != type && find_pid(tmp, nr))
202 return;
203 276
204 free_pidmap(nr); 277struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
278{
279 struct task_struct *result;
280 rcu_read_lock();
281 result = pid_task(pid, type);
282 if (result)
283 get_task_struct(result);
284 rcu_read_unlock();
285 return result;
205} 286}
206 287
207task_t *find_task_by_pid_type(int type, int nr) 288struct pid *find_get_pid(pid_t nr)
208{ 289{
209 struct pid *pid; 290 struct pid *pid;
210 291
211 pid = find_pid(type, nr); 292 rcu_read_lock();
212 if (!pid) 293 pid = get_pid(find_pid(nr));
213 return NULL; 294 rcu_read_unlock();
214 295
215 return pid_task(&pid->pid_list, type); 296 return pid;
216} 297}
217 298
218EXPORT_SYMBOL(find_task_by_pid_type);
219
220/* 299/*
221 * The pid hash table is scaled according to the amount of memory in the 300 * The pid hash table is scaled according to the amount of memory in the
222 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 301 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -224,7 +303,7 @@ EXPORT_SYMBOL(find_task_by_pid_type);
224 */ 303 */
225void __init pidhash_init(void) 304void __init pidhash_init(void)
226{ 305{
227 int i, j, pidhash_size; 306 int i, pidhash_size;
228 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); 307 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
229 308
230 pidhash_shift = max(4, fls(megabytes * 4)); 309 pidhash_shift = max(4, fls(megabytes * 4));
@@ -233,16 +312,13 @@ void __init pidhash_init(void)
233 312
234 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", 313 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
235 pidhash_size, pidhash_shift, 314 pidhash_size, pidhash_shift,
236 PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); 315 pidhash_size * sizeof(struct hlist_head));
237 316
238 for (i = 0; i < PIDTYPE_MAX; i++) { 317 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
239 pid_hash[i] = alloc_bootmem(pidhash_size * 318 if (!pid_hash)
240 sizeof(*(pid_hash[i]))); 319 panic("Could not alloc pidhash!\n");
241 if (!pid_hash[i]) 320 for (i = 0; i < pidhash_size; i++)
242 panic("Could not alloc pidhash!\n"); 321 INIT_HLIST_HEAD(&pid_hash[i]);
243 for (j = 0; j < pidhash_size; j++)
244 INIT_HLIST_HEAD(&pid_hash[i][j]);
245 }
246} 322}
247 323
248void __init pidmap_init(void) 324void __init pidmap_init(void)
@@ -251,4 +327,8 @@ void __init pidmap_init(void)
251 /* Reserve PID 0. We never call free_pidmap(0) */ 327 /* Reserve PID 0. We never call free_pidmap(0) */
252 set_bit(0, pidmap_array->page); 328 set_bit(0, pidmap_array->page);
253 atomic_dec(&pidmap_array->nr_free); 329 atomic_dec(&pidmap_array->nr_free);
330
331 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
332 __alignof__(struct pid),
333 SLAB_PANIC, NULL, NULL);
254} 334}
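The pid.c rework above makes struct pid a reference-counted object shared by all pid types. A minimal sketch of how a caller might hold a pid across a sleep using the helpers added or used in this hunk (find_get_pid, get_pid_task, put_pid); the surrounding function and the send_sig call are illustrative assumptions, not part of the patch.

/*
 * Illustrative sketch only -- not from this patch. Take a counted
 * reference on a struct pid under RCU, sleep safely, then resolve it
 * to a task without holding tasklist_lock.
 */
static int signal_pid_later(pid_t nr, int sig)
{
        struct pid *pid;
        struct task_struct *tsk;
        int err = -ESRCH;

        pid = find_get_pid(nr);         /* NULL if no such pid right now */
        if (!pid)
                return -ESRCH;

        /* ... may block here; the struct pid cannot be freed under us ... */

        tsk = get_pid_task(pid, PIDTYPE_PID);   /* takes a task reference, or NULL */
        if (tsk) {
                err = send_sig(sig, tsk, 0);
                put_task_struct(tsk);
        }
        put_pid(pid);

        return err;
}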
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9fd8d4f03595..ce0dfb8f4a4e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -41,7 +41,7 @@ config SOFTWARE_SUSPEND
         depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
         ---help---
           Enable the possibility of suspending the machine.
-          It doesn't need APM.
+          It doesn't need ACPI or APM.
           You may suspend your machine by 'swsusp' or 'shutdown -z <time>'
           (patch for sysvinit needed).
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ac7c35fad77..b2a5f671d6cd 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -26,8 +26,7 @@ static inline int freezeable(struct task_struct * p)
             (p->flags & PF_NOFREEZE) ||
             (p->exit_state == EXIT_ZOMBIE) ||
             (p->exit_state == EXIT_DEAD) ||
-            (p->state == TASK_STOPPED) ||
-            (p->state == TASK_TRACED))
+            (p->state == TASK_STOPPED))
                 return 0;
         return 1;
 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 8cc19431e74b..c056f3324432 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -360,8 +360,7 @@ static void call_console_drivers(unsigned long start, unsigned long end)
         unsigned long cur_index, start_print;
         static int msg_level = -1;
 
-        if (((long)(start - end)) > 0)
-                BUG();
+        BUG_ON(((long)(start - end)) > 0);
 
         cur_index = start;
         start_print = start;
@@ -708,8 +707,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
  */
 void acquire_console_sem(void)
 {
-        if (in_interrupt())
-                BUG();
+        BUG_ON(in_interrupt());
         down(&console_sem);
         console_locked = 1;
         console_may_schedule = 1;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 86a7f6c60cb2..0eeb7e66722c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -30,8 +30,7 @@
  */
 void __ptrace_link(task_t *child, task_t *new_parent)
 {
-        if (!list_empty(&child->ptrace_list))
-                BUG();
+        BUG_ON(!list_empty(&child->ptrace_list));
         if (child->parent == new_parent)
                 return;
         list_add(&child->ptrace_list, &child->parent->ptrace_children);
diff --git a/kernel/sched.c b/kernel/sched.c
index a9ecac398bb9..365f0b90b4de 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -665,11 +665,57 @@ static int effective_prio(task_t *p)
665} 665}
666 666
667/* 667/*
668 * We place interactive tasks back into the active array, if possible.
669 *
670 * To guarantee that this does not starve expired tasks we ignore the
671 * interactivity of a task if the first expired task had to wait more
672 * than a 'reasonable' amount of time. This deadline timeout is
673 * load-dependent, as the frequency of array switched decreases with
674 * increasing number of running tasks. We also ignore the interactivity
675 * if a better static_prio task has expired, and switch periodically
676 * regardless, to ensure that highly interactive tasks do not starve
677 * the less fortunate for unreasonably long periods.
678 */
679static inline int expired_starving(runqueue_t *rq)
680{
681 int limit;
682
683 /*
684 * Arrays were recently switched, all is well
685 */
686 if (!rq->expired_timestamp)
687 return 0;
688
689 limit = STARVATION_LIMIT * rq->nr_running;
690
691 /*
692 * It's time to switch arrays
693 */
694 if (jiffies - rq->expired_timestamp >= limit)
695 return 1;
696
697 /*
698 * There's a better selection in the expired array
699 */
700 if (rq->curr->static_prio > rq->best_expired_prio)
701 return 1;
702
703 /*
704 * All is well
705 */
706 return 0;
707}
708
709/*
668 * __activate_task - move a task to the runqueue. 710 * __activate_task - move a task to the runqueue.
669 */ 711 */
670static inline void __activate_task(task_t *p, runqueue_t *rq) 712static void __activate_task(task_t *p, runqueue_t *rq)
671{ 713{
672 enqueue_task(p, rq->active); 714 prio_array_t *target = rq->active;
715
716 if (unlikely(batch_task(p) || (expired_starving(rq) && !rt_task(p))))
717 target = rq->expired;
718 enqueue_task(p, target);
673 rq->nr_running++; 719 rq->nr_running++;
674} 720}
675 721
@@ -688,7 +734,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
688 unsigned long long __sleep_time = now - p->timestamp; 734 unsigned long long __sleep_time = now - p->timestamp;
689 unsigned long sleep_time; 735 unsigned long sleep_time;
690 736
691 if (unlikely(p->policy == SCHED_BATCH)) 737 if (batch_task(p))
692 sleep_time = 0; 738 sleep_time = 0;
693 else { 739 else {
694 if (__sleep_time > NS_MAX_SLEEP_AVG) 740 if (__sleep_time > NS_MAX_SLEEP_AVG)
@@ -700,21 +746,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
700 if (likely(sleep_time > 0)) { 746 if (likely(sleep_time > 0)) {
701 /* 747 /*
702 * User tasks that sleep a long time are categorised as 748 * User tasks that sleep a long time are categorised as
703 * idle and will get just interactive status to stay active & 749 * idle. They will only have their sleep_avg increased to a
704 * prevent them suddenly becoming cpu hogs and starving 750 * level that makes them just interactive priority to stay
705 * other processes. 751 * active yet prevent them suddenly becoming cpu hogs and
752 * starving other processes.
706 */ 753 */
707 if (p->mm && p->activated != -1 && 754 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
708 sleep_time > INTERACTIVE_SLEEP(p)) { 755 unsigned long ceiling;
709 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - 756
710 DEF_TIMESLICE); 757 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
758 DEF_TIMESLICE);
759 if (p->sleep_avg < ceiling)
760 p->sleep_avg = ceiling;
711 } else { 761 } else {
712 /* 762 /*
713 * Tasks waking from uninterruptible sleep are 763 * Tasks waking from uninterruptible sleep are
714 * limited in their sleep_avg rise as they 764 * limited in their sleep_avg rise as they
715 * are likely to be waiting on I/O 765 * are likely to be waiting on I/O
716 */ 766 */
717 if (p->activated == -1 && p->mm) { 767 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
718 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 768 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
719 sleep_time = 0; 769 sleep_time = 0;
720 else if (p->sleep_avg + sleep_time >= 770 else if (p->sleep_avg + sleep_time >=
@@ -769,7 +819,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
769 * This checks to make sure it's not an uninterruptible task 819 * This checks to make sure it's not an uninterruptible task
770 * that is now waking up. 820 * that is now waking up.
771 */ 821 */
772 if (!p->activated) { 822 if (p->sleep_type == SLEEP_NORMAL) {
773 /* 823 /*
774 * Tasks which were woken up by interrupts (ie. hw events) 824 * Tasks which were woken up by interrupts (ie. hw events)
775 * are most likely of interactive nature. So we give them 825 * are most likely of interactive nature. So we give them
@@ -778,13 +828,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
778 * on a CPU, first time around: 828 * on a CPU, first time around:
779 */ 829 */
780 if (in_interrupt()) 830 if (in_interrupt())
781 p->activated = 2; 831 p->sleep_type = SLEEP_INTERRUPTED;
782 else { 832 else {
783 /* 833 /*
784 * Normal first-time wakeups get a credit too for 834 * Normal first-time wakeups get a credit too for
785 * on-runqueue time, but it will be weighted down: 835 * on-runqueue time, but it will be weighted down:
786 */ 836 */
787 p->activated = 1; 837 p->sleep_type = SLEEP_INTERACTIVE;
788 } 838 }
789 } 839 }
790 p->timestamp = now; 840 p->timestamp = now;
@@ -1272,19 +1322,19 @@ out_activate:
1272 * Tasks on involuntary sleep don't earn 1322 * Tasks on involuntary sleep don't earn
1273 * sleep_avg beyond just interactive state. 1323 * sleep_avg beyond just interactive state.
1274 */ 1324 */
1275 p->activated = -1; 1325 p->sleep_type = SLEEP_NONINTERACTIVE;
1276 } 1326 } else
1277 1327
1278 /* 1328 /*
1279 * Tasks that have marked their sleep as noninteractive get 1329 * Tasks that have marked their sleep as noninteractive get
1280 * woken up without updating their sleep average. (i.e. their 1330 * woken up with their sleep average not weighted in an
1281 * sleep is handled in a priority-neutral manner, no priority 1331 * interactive way.
1282 * boost and no penalty.)
1283 */ 1332 */
1284 if (old_state & TASK_NONINTERACTIVE) 1333 if (old_state & TASK_NONINTERACTIVE)
1285 __activate_task(p, rq); 1334 p->sleep_type = SLEEP_NONINTERACTIVE;
1286 else 1335
1287 activate_task(p, rq, cpu == this_cpu); 1336
1337 activate_task(p, rq, cpu == this_cpu);
1288 /* 1338 /*
1289 * Sync wakeups (i.e. those types of wakeups where the waker 1339 * Sync wakeups (i.e. those types of wakeups where the waker
1290 * has indicated that it will leave the CPU in short order) 1340 * has indicated that it will leave the CPU in short order)
@@ -1658,6 +1708,21 @@ unsigned long nr_iowait(void)
1658 return sum; 1708 return sum;
1659} 1709}
1660 1710
1711unsigned long nr_active(void)
1712{
1713 unsigned long i, running = 0, uninterruptible = 0;
1714
1715 for_each_online_cpu(i) {
1716 running += cpu_rq(i)->nr_running;
1717 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1718 }
1719
1720 if (unlikely((long)uninterruptible < 0))
1721 uninterruptible = 0;
1722
1723 return running + uninterruptible;
1724}
1725
1661#ifdef CONFIG_SMP 1726#ifdef CONFIG_SMP
1662 1727
1663/* 1728/*
@@ -2467,22 +2532,6 @@ unsigned long long current_sched_time(const task_t *tsk)
2467} 2532}
2468 2533
2469/* 2534/*
2470 * We place interactive tasks back into the active array, if possible.
2471 *
2472 * To guarantee that this does not starve expired tasks we ignore the
2473 * interactivity of a task if the first expired task had to wait more
2474 * than a 'reasonable' amount of time. This deadline timeout is
2475 * load-dependent, as the frequency of array switched decreases with
2476 * increasing number of running tasks. We also ignore the interactivity
2477 * if a better static_prio task has expired:
2478 */
2479#define EXPIRED_STARVING(rq) \
2480 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2481 (jiffies - (rq)->expired_timestamp >= \
2482 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2483 ((rq)->curr->static_prio > (rq)->best_expired_prio))
2484
2485/*
2486 * Account user cpu time to a process. 2535 * Account user cpu time to a process.
2487 * @p: the process that the cpu time gets accounted to 2536 * @p: the process that the cpu time gets accounted to
2488 * @hardirq_offset: the offset to subtract from hardirq_count() 2537 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -2617,7 +2666,7 @@ void scheduler_tick(void)
2617 2666
2618 if (!rq->expired_timestamp) 2667 if (!rq->expired_timestamp)
2619 rq->expired_timestamp = jiffies; 2668 rq->expired_timestamp = jiffies;
2620 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 2669 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
2621 enqueue_task(p, rq->expired); 2670 enqueue_task(p, rq->expired);
2622 if (p->static_prio < rq->best_expired_prio) 2671 if (p->static_prio < rq->best_expired_prio)
2623 rq->best_expired_prio = p->static_prio; 2672 rq->best_expired_prio = p->static_prio;
@@ -2860,6 +2909,12 @@ EXPORT_SYMBOL(sub_preempt_count);
2860 2909
2861#endif 2910#endif
2862 2911
2912static inline int interactive_sleep(enum sleep_type sleep_type)
2913{
2914 return (sleep_type == SLEEP_INTERACTIVE ||
2915 sleep_type == SLEEP_INTERRUPTED);
2916}
2917
2863/* 2918/*
2864 * schedule() is the main scheduler function. 2919 * schedule() is the main scheduler function.
2865 */ 2920 */
@@ -2983,12 +3038,12 @@ go_idle:
2983 queue = array->queue + idx; 3038 queue = array->queue + idx;
2984 next = list_entry(queue->next, task_t, run_list); 3039 next = list_entry(queue->next, task_t, run_list);
2985 3040
2986 if (!rt_task(next) && next->activated > 0) { 3041 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
2987 unsigned long long delta = now - next->timestamp; 3042 unsigned long long delta = now - next->timestamp;
2988 if (unlikely((long long)(now - next->timestamp) < 0)) 3043 if (unlikely((long long)(now - next->timestamp) < 0))
2989 delta = 0; 3044 delta = 0;
2990 3045
2991 if (next->activated == 1) 3046 if (next->sleep_type == SLEEP_INTERACTIVE)
2992 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3047 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2993 3048
2994 array = next->array; 3049 array = next->array;
@@ -2998,10 +3053,9 @@ go_idle:
2998 dequeue_task(next, array); 3053 dequeue_task(next, array);
2999 next->prio = new_prio; 3054 next->prio = new_prio;
3000 enqueue_task(next, array); 3055 enqueue_task(next, array);
3001 } else 3056 }
3002 requeue_task(next, array);
3003 } 3057 }
3004 next->activated = 0; 3058 next->sleep_type = SLEEP_NORMAL;
3005switch_tasks: 3059switch_tasks:
3006 if (next == rq->idle) 3060 if (next == rq->idle)
3007 schedstat_inc(rq, sched_goidle); 3061 schedstat_inc(rq, sched_goidle);
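The sched.c hunks above replace the numeric p->activated states with a descriptive p->sleep_type. The enum itself is declared outside this file, presumably in include/linux/sched.h by the same series; as a reading aid only (an assumption, not taken from this diff), it maps the old magic numbers roughly as follows.

enum sleep_type {
        SLEEP_NORMAL,           /* was p->activated ==  0: default state         */
        SLEEP_NONINTERACTIVE,   /* was p->activated == -1: uninterruptible sleep */
        SLEEP_INTERACTIVE,      /* was p->activated ==  1: ordinary wakeup       */
        SLEEP_INTERRUPTED,      /* was p->activated ==  2: woken by an interrupt */
};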
diff --git a/kernel/signal.c b/kernel/signal.c
index 4922928d91f6..b14f895027c3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -769,8 +769,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 {
         int ret = 0;
 
-        if (!irqs_disabled())
-                BUG();
+        BUG_ON(!irqs_disabled());
         assert_spin_locked(&t->sighand->siglock);
 
         /* Short-circuit ignored signals. */
@@ -869,7 +868,6 @@ __group_complete_signal(int sig, struct task_struct *p)
                 if (t == NULL)
                         /* restart balancing at this thread */
                         t = p->signal->curr_target = p;
-                BUG_ON(t->tgid != p->tgid);
 
                 while (!wants_signal(sig, t)) {
                         t = next_thread(t);
@@ -1384,8 +1382,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
                  * the overrun count. Other uses should not try to
                  * send the signal multiple times.
                  */
-                if (q->info.si_code != SI_TIMER)
-                        BUG();
+                BUG_ON(q->info.si_code != SI_TIMER);
                 q->info.si_overrun++;
                 goto out;
         }
@@ -1560,6 +1557,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
         /* Let the debugger run. */
         set_current_state(TASK_TRACED);
         spin_unlock_irq(&current->sighand->siglock);
+        try_to_freeze();
         read_lock(&tasklist_lock);
         if (likely(current->ptrace & PT_PTRACED) &&
             likely(current->parent != current->real_parent ||
diff --git a/kernel/sys.c b/kernel/sys.c
index 7ef7f6054c28..0b6ec0e7936f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1372,18 +1372,29 @@ asmlinkage long sys_getsid(pid_t pid)
 asmlinkage long sys_setsid(void)
 {
         struct task_struct *group_leader = current->group_leader;
-        struct pid *pid;
+        pid_t session;
         int err = -EPERM;
 
         mutex_lock(&tty_mutex);
         write_lock_irq(&tasklist_lock);
 
-        pid = find_pid(PIDTYPE_PGID, group_leader->pid);
-        if (pid)
+        /* Fail if I am already a session leader */
+        if (group_leader->signal->leader)
+                goto out;
+
+        session = group_leader->pid;
+        /* Fail if a process group id already exists that equals the
+         * proposed session id.
+         *
+         * Don't check if session id == 1 because kernel threads use this
+         * session id and so the check will always fail and make it so
+         * init cannot successfully call setsid.
+         */
+        if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session))
                 goto out;
 
         group_leader->signal->leader = 1;
-        __set_special_pids(group_leader->pid, group_leader->pid);
+        __set_special_pids(session, session);
         group_leader->signal->tty = NULL;
         group_leader->signal->tty_old_pgrp = 0;
         err = process_group(group_leader);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d82864c4a617..5433195040f1 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -120,3 +120,15 @@ cond_syscall(sys32_sysctl);
 cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
+
+/* mmu depending weak syscall entries */
+cond_syscall(sys_mprotect);
+cond_syscall(sys_msync);
+cond_syscall(sys_mlock);
+cond_syscall(sys_munlock);
+cond_syscall(sys_mlockall);
+cond_syscall(sys_munlockall);
+cond_syscall(sys_mincore);
+cond_syscall(sys_madvise);
+cond_syscall(sys_mremap);
+cond_syscall(sys_remap_file_pages);
diff --git a/kernel/time.c b/kernel/time.c
index ff8e7019c4c4..b00ddc71cedb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -410,7 +410,7 @@ EXPORT_SYMBOL(current_kernel_time);
  * current_fs_time - Return FS time
  * @sb: Superblock.
  *
- * Return the current time truncated to the time granuality supported by
+ * Return the current time truncated to the time granularity supported by
  * the fs.
  */
 struct timespec current_fs_time(struct super_block *sb)
@@ -421,11 +421,11 @@ struct timespec current_fs_time(struct super_block *sb)
 EXPORT_SYMBOL(current_fs_time);
 
 /**
- * timespec_trunc - Truncate timespec to a granuality
+ * timespec_trunc - Truncate timespec to a granularity
  * @t: Timespec
- * @gran: Granuality in ns.
+ * @gran: Granularity in ns.
  *
- * Truncate a timespec to a granuality. gran must be smaller than a second.
+ * Truncate a timespec to a granularity. gran must be smaller than a second.
  * Always rounds down.
  *
  * This function should be only used for timestamps returned by
diff --git a/kernel/timer.c b/kernel/timer.c
index ab189dd187cb..883773788836 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
54/* 54/*
55 * per-CPU timer vector definitions: 55 * per-CPU timer vector definitions:
56 */ 56 */
57
58#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) 57#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
59#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) 58#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
60#define TVN_SIZE (1 << TVN_BITS) 59#define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
62#define TVN_MASK (TVN_SIZE - 1) 61#define TVN_MASK (TVN_SIZE - 1)
63#define TVR_MASK (TVR_SIZE - 1) 62#define TVR_MASK (TVR_SIZE - 1)
64 63
65struct timer_base_s {
66 spinlock_t lock;
67 struct timer_list *running_timer;
68};
69
70typedef struct tvec_s { 64typedef struct tvec_s {
71 struct list_head vec[TVN_SIZE]; 65 struct list_head vec[TVN_SIZE];
72} tvec_t; 66} tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
76} tvec_root_t; 70} tvec_root_t;
77 71
78struct tvec_t_base_s { 72struct tvec_t_base_s {
79 struct timer_base_s t_base; 73 spinlock_t lock;
74 struct timer_list *running_timer;
80 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
81 tvec_root_t tv1; 76 tvec_root_t tv1;
82 tvec_t tv2; 77 tvec_t tv2;
@@ -86,14 +81,16 @@ struct tvec_t_base_s {
86} ____cacheline_aligned_in_smp; 81} ____cacheline_aligned_in_smp;
87 82
88typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
89static DEFINE_PER_CPU(tvec_base_t *, tvec_bases); 84
90static tvec_base_t boot_tvec_bases; 85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases);
87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
91 88
92static inline void set_running_timer(tvec_base_t *base, 89static inline void set_running_timer(tvec_base_t *base,
93 struct timer_list *timer) 90 struct timer_list *timer)
94{ 91{
95#ifdef CONFIG_SMP 92#ifdef CONFIG_SMP
96 base->t_base.running_timer = timer; 93 base->running_timer = timer;
97#endif 94#endif
98} 95}
99 96
@@ -139,15 +136,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
139 list_add_tail(&timer->entry, vec); 136 list_add_tail(&timer->entry, vec);
140} 137}
141 138
142typedef struct timer_base_s timer_base_t;
143/*
144 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
145 * at compile time, and we need timer->base to lock the timer.
146 */
147timer_base_t __init_timer_base
148 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
149EXPORT_SYMBOL(__init_timer_base);
150
151/*** 139/***
152 * init_timer - initialize a timer. 140 * init_timer - initialize a timer.
153 * @timer: the timer to be initialized 141 * @timer: the timer to be initialized
@@ -158,7 +146,7 @@ EXPORT_SYMBOL(__init_timer_base);
158void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
159{ 147{
160 timer->entry.next = NULL; 148 timer->entry.next = NULL;
161 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base; 149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
162} 150}
163EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
164 152
@@ -174,7 +162,7 @@ static inline void detach_timer(struct timer_list *timer,
174} 162}
175 163
176/* 164/*
177 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock 165 * We are using hashed locking: holding per_cpu(tvec_bases).lock
178 * means that all timers which are tied to this base via timer->base are 166 * means that all timers which are tied to this base via timer->base are
179 * locked, and the base itself is locked too. 167 * locked, and the base itself is locked too.
180 * 168 *
@@ -185,10 +173,10 @@ static inline void detach_timer(struct timer_list *timer,
185 * possible to set timer->base = NULL and drop the lock: the timer remains 173 * possible to set timer->base = NULL and drop the lock: the timer remains
186 * locked. 174 * locked.
187 */ 175 */
188static timer_base_t *lock_timer_base(struct timer_list *timer, 176static tvec_base_t *lock_timer_base(struct timer_list *timer,
189 unsigned long *flags) 177 unsigned long *flags)
190{ 178{
191 timer_base_t *base; 179 tvec_base_t *base;
192 180
193 for (;;) { 181 for (;;) {
194 base = timer->base; 182 base = timer->base;
@@ -205,8 +193,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
205 193
206int __mod_timer(struct timer_list *timer, unsigned long expires) 194int __mod_timer(struct timer_list *timer, unsigned long expires)
207{ 195{
208 timer_base_t *base; 196 tvec_base_t *base, *new_base;
209 tvec_base_t *new_base;
210 unsigned long flags; 197 unsigned long flags;
211 int ret = 0; 198 int ret = 0;
212 199
@@ -221,7 +208,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
221 208
222 new_base = __get_cpu_var(tvec_bases); 209 new_base = __get_cpu_var(tvec_bases);
223 210
224 if (base != &new_base->t_base) { 211 if (base != new_base) {
225 /* 212 /*
226 * We are trying to schedule the timer on the local CPU. 213 * We are trying to schedule the timer on the local CPU.
227 * However we can't change timer's base while it is running, 214 * However we can't change timer's base while it is running,
@@ -229,21 +216,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
229 * handler yet has not finished. This also guarantees that 216 * handler yet has not finished. This also guarantees that
230 * the timer is serialized wrt itself. 217 * the timer is serialized wrt itself.
231 */ 218 */
232 if (unlikely(base->running_timer == timer)) { 219 if (likely(base->running_timer != timer)) {
233 /* The timer remains on a former base */
234 new_base = container_of(base, tvec_base_t, t_base);
235 } else {
236 /* See the comment in lock_timer_base() */ 220 /* See the comment in lock_timer_base() */
237 timer->base = NULL; 221 timer->base = NULL;
238 spin_unlock(&base->lock); 222 spin_unlock(&base->lock);
239 spin_lock(&new_base->t_base.lock); 223 base = new_base;
240 timer->base = &new_base->t_base; 224 spin_lock(&base->lock);
225 timer->base = base;
241 } 226 }
242 } 227 }
243 228
244 timer->expires = expires; 229 timer->expires = expires;
245 internal_add_timer(new_base, timer); 230 internal_add_timer(base, timer);
246 spin_unlock_irqrestore(&new_base->t_base.lock, flags); 231 spin_unlock_irqrestore(&base->lock, flags);
247 232
248 return ret; 233 return ret;
249} 234}
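[Editor's sketch] Callers never see this base migration; re-arming stays a one-liner. A usage sketch, reusing the illustrative timer from above:

/*
 * Re-arm: __mod_timer() above pulls the timer onto the current CPU's base
 * unless its handler is still running on the old one.
 */
mod_timer(&example_timer, jiffies + 2 * HZ);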
@@ -263,10 +248,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
263 unsigned long flags; 248 unsigned long flags;
264 249
265 BUG_ON(timer_pending(timer) || !timer->function); 250 BUG_ON(timer_pending(timer) || !timer->function);
266 spin_lock_irqsave(&base->t_base.lock, flags); 251 spin_lock_irqsave(&base->lock, flags);
267 timer->base = &base->t_base; 252 timer->base = base;
268 internal_add_timer(base, timer); 253 internal_add_timer(base, timer);
269 spin_unlock_irqrestore(&base->t_base.lock, flags); 254 spin_unlock_irqrestore(&base->lock, flags);
270} 255}
271 256
272 257
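[Editor's sketch] add_timer_on() bypasses the migration logic entirely and queues directly on the chosen CPU's base. A usage sketch with invented names, assuming the target CPU is online:

static struct timer_list pinned_timer;

static void pin_example(void)
{
        init_timer(&pinned_timer);
        pinned_timer.function = example_handler;
        pinned_timer.data     = 0;
        pinned_timer.expires  = jiffies + HZ;
        add_timer_on(&pinned_timer, 1);   /* queue on CPU 1's base, not the local one */
}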
@@ -319,7 +304,7 @@ EXPORT_SYMBOL(mod_timer);
319 */ 304 */
320int del_timer(struct timer_list *timer) 305int del_timer(struct timer_list *timer)
321{ 306{
322 timer_base_t *base; 307 tvec_base_t *base;
323 unsigned long flags; 308 unsigned long flags;
324 int ret = 0; 309 int ret = 0;
325 310
@@ -346,7 +331,7 @@ EXPORT_SYMBOL(del_timer);
346 */ 331 */
347int try_to_del_timer_sync(struct timer_list *timer) 332int try_to_del_timer_sync(struct timer_list *timer)
348{ 333{
349 timer_base_t *base; 334 tvec_base_t *base;
350 unsigned long flags; 335 unsigned long flags;
351 int ret = -1; 336 int ret = -1;
352 337
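[Editor's sketch] On teardown the usual call is del_timer_sync(), which on SMP spins on try_to_del_timer_sync() until the -1 "handler still running" case clears. A typical caller, continuing the illustrative example:

static void example_stop(void)
{
        /*
         * Blocks until any handler running on another CPU has returned,
         * then guarantees the timer is no longer queued anywhere.
         */
        del_timer_sync(&example_timer);
}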
@@ -410,7 +395,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
410 struct timer_list *tmp; 395 struct timer_list *tmp;
411 396
412 tmp = list_entry(curr, struct timer_list, entry); 397 tmp = list_entry(curr, struct timer_list, entry);
413 BUG_ON(tmp->base != &base->t_base); 398 BUG_ON(tmp->base != base);
414 curr = curr->next; 399 curr = curr->next;
415 internal_add_timer(base, tmp); 400 internal_add_timer(base, tmp);
416 } 401 }
@@ -432,7 +417,7 @@ static inline void __run_timers(tvec_base_t *base)
432{ 417{
433 struct timer_list *timer; 418 struct timer_list *timer;
434 419
435 spin_lock_irq(&base->t_base.lock); 420 spin_lock_irq(&base->lock);
436 while (time_after_eq(jiffies, base->timer_jiffies)) { 421 while (time_after_eq(jiffies, base->timer_jiffies)) {
437 struct list_head work_list = LIST_HEAD_INIT(work_list); 422 struct list_head work_list = LIST_HEAD_INIT(work_list);
438 struct list_head *head = &work_list; 423 struct list_head *head = &work_list;
@@ -458,7 +443,7 @@ static inline void __run_timers(tvec_base_t *base)
458 443
459 set_running_timer(base, timer); 444 set_running_timer(base, timer);
460 detach_timer(timer, 1); 445 detach_timer(timer, 1);
461 spin_unlock_irq(&base->t_base.lock); 446 spin_unlock_irq(&base->lock);
462 { 447 {
463 int preempt_count = preempt_count(); 448 int preempt_count = preempt_count();
464 fn(data); 449 fn(data);
@@ -471,11 +456,11 @@ static inline void __run_timers(tvec_base_t *base)
471 BUG(); 456 BUG();
472 } 457 }
473 } 458 }
474 spin_lock_irq(&base->t_base.lock); 459 spin_lock_irq(&base->lock);
475 } 460 }
476 } 461 }
477 set_running_timer(base, NULL); 462 set_running_timer(base, NULL);
478 spin_unlock_irq(&base->t_base.lock); 463 spin_unlock_irq(&base->lock);
479} 464}
480 465
481#ifdef CONFIG_NO_IDLE_HZ 466#ifdef CONFIG_NO_IDLE_HZ
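[Editor's sketch] The preempt_count() comparison around fn(data) in this hunk is a debugging aid: it catches handlers that return with a lock still held. An illustrative offender, not taken from the patch:

static DEFINE_SPINLOCK(example_lock);

static void leaky_handler(unsigned long data)
{
        spin_lock(&example_lock);
        /*
         * Missing spin_unlock(): preempt_count() is now higher than when
         * __run_timers() invoked the handler, so the check above reports
         * the handler address and hits BUG().
         */
}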
@@ -506,7 +491,7 @@ unsigned long next_timer_interrupt(void)
506 hr_expires += jiffies; 491 hr_expires += jiffies;
507 492
508 base = __get_cpu_var(tvec_bases); 493 base = __get_cpu_var(tvec_bases);
509 spin_lock(&base->t_base.lock); 494 spin_lock(&base->lock);
510 expires = base->timer_jiffies + (LONG_MAX >> 1); 495 expires = base->timer_jiffies + (LONG_MAX >> 1);
511 list = NULL; 496 list = NULL;
512 497
@@ -554,7 +539,7 @@ found:
554 expires = nte->expires; 539 expires = nte->expires;
555 } 540 }
556 } 541 }
557 spin_unlock(&base->t_base.lock); 542 spin_unlock(&base->lock);
558 543
559 if (time_before(hr_expires, expires)) 544 if (time_before(hr_expires, expires))
560 return hr_expires; 545 return hr_expires;
@@ -841,7 +826,7 @@ void update_process_times(int user_tick)
841 */ 826 */
842static unsigned long count_active_tasks(void) 827static unsigned long count_active_tasks(void)
843{ 828{
844 return (nr_running() + nr_uninterruptible()) * FIXED_1; 829 return nr_active() * FIXED_1;
845} 830}
846 831
847/* 832/*
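[Editor's sketch] nr_active() comes from the kernel/sched.c part of this diff; conceptually it folds the two per-runqueue sums into a single pass. A rough reconstruction follows; field and helper names are from the scheduler of this era and may not match the hunk exactly:

unsigned long nr_active(void)
{
        unsigned long i, running = 0, uninterruptible = 0;

        for_each_online_cpu(i) {
                running += cpu_rq(i)->nr_running;
                uninterruptible += cpu_rq(i)->nr_uninterruptible;
        }

        /*
         * Per-CPU counters can be transiently inconsistent; never report
         * a "negative" uninterruptible count.
         */
        if (unlikely((long)uninterruptible < 0))
                uninterruptible = 0;

        return running + uninterruptible;
}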
@@ -1240,29 +1225,37 @@ static int __devinit init_timers_cpu(int cpu)
1240{ 1225{
1241 int j; 1226 int j;
1242 tvec_base_t *base; 1227 tvec_base_t *base;
1228 static char __devinitdata tvec_base_done[NR_CPUS];
1243 1229
1244 base = per_cpu(tvec_bases, cpu); 1230 if (!tvec_base_done[cpu]) {
1245 if (!base) {
1246 static char boot_done; 1231 static char boot_done;
1247 1232
1248 /*
1249 * Cannot do allocation in init_timers as that runs before the
1250 * allocator initializes (and would waste memory if there are
1251 * more possible CPUs than will ever be installed/brought up).
1252 */
1253 if (boot_done) { 1233 if (boot_done) {
1234 /*
1235 * The APs use this path later in boot
1236 */
1254 base = kmalloc_node(sizeof(*base), GFP_KERNEL, 1237 base = kmalloc_node(sizeof(*base), GFP_KERNEL,
1255 cpu_to_node(cpu)); 1238 cpu_to_node(cpu));
1256 if (!base) 1239 if (!base)
1257 return -ENOMEM; 1240 return -ENOMEM;
1258 memset(base, 0, sizeof(*base)); 1241 memset(base, 0, sizeof(*base));
1242 per_cpu(tvec_bases, cpu) = base;
1259 } else { 1243 } else {
1260 base = &boot_tvec_bases; 1244 /*
1245 * This is for the boot CPU - we use compile-time
1246 * static initialisation because per-cpu memory isn't
1247 * ready yet and because the memory allocators are not
1248 * initialised either.
1249 */
1261 boot_done = 1; 1250 boot_done = 1;
1251 base = &boot_tvec_bases;
1262 } 1252 }
1263 per_cpu(tvec_bases, cpu) = base; 1253 tvec_base_done[cpu] = 1;
1254 } else {
1255 base = per_cpu(tvec_bases, cpu);
1264 } 1256 }
1265 spin_lock_init(&base->t_base.lock); 1257
1258 spin_lock_init(&base->lock);
1266 for (j = 0; j < TVN_SIZE; j++) { 1259 for (j = 0; j < TVN_SIZE; j++) {
1267 INIT_LIST_HEAD(base->tv5.vec + j); 1260 INIT_LIST_HEAD(base->tv5.vec + j);
1268 INIT_LIST_HEAD(base->tv4.vec + j); 1261 INIT_LIST_HEAD(base->tv4.vec + j);
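[Editor's sketch] The shape of this initialisation (a compile-time instance for the boot CPU, kmalloc_node() once the allocators are up, and a done-flag so a re-plugged CPU reuses its old base) is a general pattern. Stripped of the timer specifics it looks like the sketch below; every name here is invented for illustration:

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/topology.h>

struct foo {
        spinlock_t lock;
        /* ... */
};

static struct foo boot_foo;                        /* usable before kmalloc works */
static DEFINE_PER_CPU(struct foo *, foo_bases);

static int __devinit init_foo_cpu(int cpu)
{
        static char __devinitdata foo_done[NR_CPUS];
        struct foo *p;

        if (!foo_done[cpu]) {
                static char boot_done;

                if (boot_done) {
                        /* Secondary CPUs: allocate on the CPU's own node. */
                        p = kmalloc_node(sizeof(*p), GFP_KERNEL,
                                         cpu_to_node(cpu));
                        if (!p)
                                return -ENOMEM;
                        memset(p, 0, sizeof(*p));
                } else {
                        /* Boot CPU: slab and per-cpu areas are not ready. */
                        p = &boot_foo;
                        boot_done = 1;
                }
                per_cpu(foo_bases, cpu) = p;
                foo_done[cpu] = 1;
        } else {
                p = per_cpu(foo_bases, cpu);       /* CPU came back after hotplug */
        }

        spin_lock_init(&p->lock);
        return 0;
}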
@@ -1284,7 +1277,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1284 while (!list_empty(head)) { 1277 while (!list_empty(head)) {
1285 timer = list_entry(head->next, struct timer_list, entry); 1278 timer = list_entry(head->next, struct timer_list, entry);
1286 detach_timer(timer, 0); 1279 detach_timer(timer, 0);
1287 timer->base = &new_base->t_base; 1280 timer->base = new_base;
1288 internal_add_timer(new_base, timer); 1281 internal_add_timer(new_base, timer);
1289 } 1282 }
1290} 1283}
@@ -1300,11 +1293,11 @@ static void __devinit migrate_timers(int cpu)
1300 new_base = get_cpu_var(tvec_bases); 1293 new_base = get_cpu_var(tvec_bases);
1301 1294
1302 local_irq_disable(); 1295 local_irq_disable();
1303 spin_lock(&new_base->t_base.lock); 1296 spin_lock(&new_base->lock);
1304 spin_lock(&old_base->t_base.lock); 1297 spin_lock(&old_base->lock);
1298
1299 BUG_ON(old_base->running_timer);
1305 1300
1306 if (old_base->t_base.running_timer)
1307 BUG();
1308 for (i = 0; i < TVR_SIZE; i++) 1301 for (i = 0; i < TVR_SIZE; i++)
1309 migrate_timer_list(new_base, old_base->tv1.vec + i); 1302 migrate_timer_list(new_base, old_base->tv1.vec + i);
1310 for (i = 0; i < TVN_SIZE; i++) { 1303 for (i = 0; i < TVN_SIZE; i++) {
@@ -1314,8 +1307,8 @@ static void __devinit migrate_timers(int cpu)
1314 migrate_timer_list(new_base, old_base->tv5.vec + i); 1307 migrate_timer_list(new_base, old_base->tv5.vec + i);
1315 } 1308 }
1316 1309
1317 spin_unlock(&old_base->t_base.lock); 1310 spin_unlock(&old_base->lock);
1318 spin_unlock(&new_base->t_base.lock); 1311 spin_unlock(&new_base->lock);
1319 local_irq_enable(); 1312 local_irq_enable();
1320 put_cpu_var(tvec_bases); 1313 put_cpu_var(tvec_bases);
1321} 1314}
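[Editor's sketch] For orientation, init_timers_cpu() and migrate_timers() are driven from the CPU hotplug notifier further down in timer.c, which is not touched by this hunk. From memory it is shaped roughly as follows; constants and return codes may differ in detail:

static int __devinit timer_cpu_notify(struct notifier_block *self,
                                      unsigned long action, void *hcpu)
{
        long cpu = (long)hcpu;

        switch (action) {
        case CPU_UP_PREPARE:
                /* Allocate or reuse the tvec base before the CPU comes up. */
                if (init_timers_cpu(cpu) < 0)
                        return NOTIFY_BAD;
                break;
#ifdef CONFIG_HOTPLUG_CPU
        case CPU_DEAD:
                /* Pull any pending timers over to the surviving CPU. */
                migrate_timers(cpu);
                break;
#endif
        default:
                break;
        }
        return NOTIFY_OK;
}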
@@ -1471,7 +1464,7 @@ static void time_interpolator_update(long delta_nsec)
1471 */ 1464 */
1472 if (jiffies % INTERPOLATOR_ADJUST == 0) 1465 if (jiffies % INTERPOLATOR_ADJUST == 0)
1473 { 1466 {
1474 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) 1467 if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
1475 time_interpolator->nsec_per_cyc--; 1468 time_interpolator->nsec_per_cyc--;
1476 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) 1469 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1477 time_interpolator->nsec_per_cyc++; 1470 time_interpolator->nsec_per_cyc++;
@@ -1495,8 +1488,7 @@ register_time_interpolator(struct time_interpolator *ti)
1495 unsigned long flags; 1488 unsigned long flags;
1496 1489
1497 /* Sanity check */ 1490 /* Sanity check */
1498 if (ti->frequency == 0 || ti->mask == 0) 1491 BUG_ON(ti->frequency == 0 || ti->mask == 0);
1499 BUG();
1500 1492
1501 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; 1493 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1502 spin_lock(&time_interpolator_lock); 1494 spin_lock(&time_interpolator_lock);