Merge branch 'linus' into perfcounters/core

Conflicts:
	arch/x86/include/asm/kmap_types.h
	include/linux/mm.h
	include/asm-generic/kmap_types.h

Merge reason: We crossed changes with kmap_types.h cleanups in mainline.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2009-06-17 07:06:17 -0400
committer: Ingo Molnar <mingo@elte.hu> 2009-06-17 07:06:17 -0400
commit: a3d06cc6aa3e765dc2bf98626f87272dcf641dca (patch)
tree: aa3e49b58f08d6c0ea55cdca4fb5e6c8ba6ae333 /kernel
parent: 0990b1c65729012a63e0eeca93aaaafea4e9a064 (diff)
parent: 65795efbd380a832ae508b04dba8f8e53f0b84d9 (diff)
31 files changed, 804 insertions, 588 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 90b53f6dc226..9df4501cb921 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
            async.o
+obj-y += groups.o
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5a7e17474ee..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
        struct cpuset *parent;          /* my parent */
-        /*
-         * Copy of global cpuset_mems_generation as of the most
-         * recent time this cpuset changed its mems_allowed.
-         */
-        int mems_generation;
        struct fmeter fmeter;           /* memory_pressure filter */
        /* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
        return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value.  Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement.  So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
 static struct cpuset top_cpuset = {
        .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 };
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
 * If a task is only holding callback_mutex, then it has read-only
 * access to cpusets.
 *
- * The task_struct fields mems_allowed and mems_generation may only
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
- * be accessed in the context of that task, so require no locks.
+ * by another task, so we use alloc_lock in the task_struct to protect
+ * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
@@ -331,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
        BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
 }
-/**
+/*
- * cpuset_update_task_memory_state - update task memory placement
+ * update task's spread flag if cpuset's page/slab spread flag is set
- *
+ *
- * If the current tasks cpusets mems_allowed changed behind our
+ * Called with callback_mutex/cgroup_mutex held
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held.  May be
- * called with or without cgroup_mutex held.  Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL.  This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation.  However this really only
- * matters on alpha systems using cpusets heavily.  If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant.  Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
 */
+static void cpuset_update_task_spread_flag(struct cpuset *cs,
-void cpuset_update_task_memory_state(void)
+                                        struct task_struct *tsk)
 {
-        int my_cpusets_mem_gen;
+        if (is_spread_page(cs))
-        struct task_struct *tsk = current;
+                tsk->flags |= PF_SPREAD_PAGE;
-        struct cpuset *cs;
+        else
+                tsk->flags &= ~PF_SPREAD_PAGE;
-        rcu_read_lock();
+        if (is_spread_slab(cs))
-        my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
+                tsk->flags |= PF_SPREAD_SLAB;
-        rcu_read_unlock();
+        else
+                tsk->flags &= ~PF_SPREAD_SLAB;
-        if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
-                mutex_lock(&callback_mutex);
-                task_lock(tsk);
-                cs = task_cs(tsk); /* Maybe changed when task not locked */
-                guarantee_online_mems(cs, &tsk->mems_allowed);
-                tsk->cpuset_mems_generation = cs->mems_generation;
-                if (is_spread_page(cs))
-                        tsk->flags |= PF_SPREAD_PAGE;
-                else
-                        tsk->flags &= ~PF_SPREAD_PAGE;
-                if (is_spread_slab(cs))
-                        tsk->flags |= PF_SPREAD_SLAB;
-                else
-                        tsk->flags &= ~PF_SPREAD_SLAB;
-                task_unlock(tsk);
-                mutex_unlock(&callback_mutex);
-                mpol_rebind_task(tsk, &tsk->mems_allowed);
-        }
 }
 /*
@@ -1007,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
- *
- *    We call cpuset_update_task_memory_state() before hacking
- *    our tasks mems_allowed, so that we are assured of being in
- *    sync with our tasks cpuset, and in particular, callbacks to
- *    cpuset_update_task_memory_state() from nested page allocations
- *    won't see any mismatch of our cpuset and task mems_generation
- *    values, so won't overwrite our hacked tasks mems_allowed
- *    nodemask.
 */
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1022,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 {
        struct task_struct *tsk = current;
-        cpuset_update_task_memory_state();
-        mutex_lock(&callback_mutex);
        tsk->mems_allowed = *to;
-        mutex_unlock(&callback_mutex);
        do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
-        mutex_lock(&callback_mutex);
        guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
-        mutex_unlock(&callback_mutex);
 }
 /*
- * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
- * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ * @tsk: the task to change
+ * @newmems: the new set of nodes the task will be allowed to use
+ *
+ * In order to avoid seeing no nodes if the old and new nodes are disjoint,
+ * we structure updates as setting all new allowed nodes, then clearing newly
+ * disallowed ones.
+ *
+ * Called with task's alloc_lock held
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+                                        nodemask_t *newmems)
+{
+        nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); /* step 1: widen to old | new */
+        mpol_rebind_task(tsk, &tsk->mems_allowed); /* rebind mempolicy against the widened mask */
+        mpol_rebind_task(tsk, newmems); /* step 2: rebind mempolicy against the final mask */
+        tsk->mems_allowed = *newmems; /* only now clear the newly disallowed nodes */
+}
+/*
+ * Update task's mems_allowed, rebind its mempolicy and its vmas'
+ * mempolicies to the cpuset's new mems_allowed, and migrate pages to new
+ * nodes if the memory_migrate flag is set. Called with cgroup_mutex held.
 */
 static void cpuset_change_nodemask(struct task_struct *p,
                                   struct cgroup_scanner *scan)
@@ -1046,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
        struct cpuset *cs;
        int migrate;
        const nodemask_t *oldmem = scan->data;
+        nodemask_t newmems;
+        cs = cgroup_cs(scan->cg);
+        guarantee_online_mems(cs, &newmems);
+        task_lock(p);
+        cpuset_change_task_nodemask(p, &newmems);
+        task_unlock(p);
        mm = get_task_mm(p);
        if (!mm)
                return;
-        cs = cgroup_cs(scan->cg);
        migrate = is_memory_migrate(cs);
        mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1104,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 /*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
+ * cpusets mems_allowed, and for each task in the cpuset,
- * task in the cpuset, rebind any vma mempolicies and if
+ * update mems_allowed and rebind task's mempolicy and any vma
- * the cpuset is marked 'memory_migrate', migrate the tasks
+ * mempolicies and if the cpuset is marked 'memory_migrate',
- * pages to the new memory.
+ * migrate the tasks pages to the new memory.
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1160,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
        mutex_lock(&callback_mutex);
        cs->mems_allowed = trialcs->mems_allowed;
-        cs->mems_generation = cpuset_mems_generation++;
        mutex_unlock(&callback_mutex);
        update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1193,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 }
 /*
+ * cpuset_change_flag - make a task's spread flags the same as its cpuset's
+ * @tsk: task to be updated
+ * @scan: struct cgroup_scanner containing the cgroup of the task
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ *
+ * We don't need to re-check for the cgroup/cpuset membership, since we're
+ * holding cgroup_lock() at this point.
+ */
+static void cpuset_change_flag(struct task_struct *tsk,
+                                struct cgroup_scanner *scan)
+{
+        struct cpuset *cs = cgroup_cs(scan->cg);
+        /* Mirror the cpuset's page/slab spread bits into this task. */
+        cpuset_update_task_spread_flag(cs, tsk);
+}
+/*
+ * update_tasks_flags - bring every task's spread flags in line with @cs.
+ * @cs: the cpuset whose tasks need their spread flags refreshed
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
+ *
+ * Called with cgroup_mutex held
+ *
+ * Builds a cgroup_scanner for the cpuset's cgroup and hands it to
+ * cgroup_scan_tasks(), which invokes cpuset_change_flag() on each task.
+ *
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
+ */
+static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+{
+        struct cgroup_scanner scan = {
+                .cg             = cs->css.cgroup,
+                .test_task      = NULL,
+                .process_task   = cpuset_change_flag,
+                .heap           = heap,
+        };
+        cgroup_scan_tasks(&scan);
+}
+/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:         the bit to update (see cpuset_flagbits_t)
 * cs:          the cpuset to update
@@ -1205,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
                       int turning_on)
 {
        struct cpuset *trialcs;
-        int err;
        int balance_flag_changed;
+        int spread_flag_changed;
+        struct ptr_heap heap;
+        int err;
        trialcs = alloc_trial_cpuset(cs);
        if (!trialcs)
@@ -1221,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
        if (err < 0)
                goto out;
+        err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+        if (err < 0)
+                goto out;
        balance_flag_changed = (is_sched_load_balance(cs) !=
                                is_sched_load_balance(trialcs));
+        spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
+                        || (is_spread_page(cs) != is_spread_page(trialcs)));
        mutex_lock(&callback_mutex);
        cs->flags = trialcs->flags;
        mutex_unlock(&callback_mutex);
@@ -1231,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
        if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
                async_rebuild_sched_domains();
+        if (spread_flag_changed)
+                update_tasks_flags(cs, &heap);
+        heap_free(&heap);
 out:
        free_trial_cpuset(trialcs);
        return err;
@@ -1372,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
        if (cs == &top_cpuset) {
                cpumask_copy(cpus_attach, cpu_possible_mask);
+                to = node_possible_map;
        } else {
-                mutex_lock(&callback_mutex);
                guarantee_online_cpus(cs, cpus_attach);
-                mutex_unlock(&callback_mutex);
+                guarantee_online_mems(cs, &to);
        }
        err = set_cpus_allowed_ptr(tsk, cpus_attach);
        if (err)
                return;
+        task_lock(tsk);
+        cpuset_change_task_nodemask(tsk, &to);
+        task_unlock(tsk);
+        cpuset_update_task_spread_flag(cs, tsk);
        from = oldcs->mems_allowed;
        to = cs->mems_allowed;
        mm = get_task_mm(tsk);
@@ -1442,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
                break;
        case FILE_SPREAD_PAGE:
                retval = update_flag(CS_SPREAD_PAGE, cs, val);
-                cs->mems_generation = cpuset_mems_generation++;
                break;
        case FILE_SPREAD_SLAB:
                retval = update_flag(CS_SPREAD_SLAB, cs, val);
-                cs->mems_generation = cpuset_mems_generation++;
                break;
        default:
                retval = -EINVAL;
@@ -1786,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
        struct cpuset *parent;
        if (!cont->parent) {
-                /* This is early initialization for the top cgroup */
-                top_cpuset.mems_generation = cpuset_mems_generation++;
                return &top_cpuset.css;
        }
        parent = cgroup_cs(cont->parent);
@@ -1799,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
                return ERR_PTR(-ENOMEM);
        }
-        cpuset_update_task_memory_state();
        cs->flags = 0;
        if (is_spread_page(parent))
                set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1808,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
        cpumask_clear(cs->cpus_allowed);
        nodes_clear(cs->mems_allowed);
-        cs->mems_generation = cpuset_mems_generation++;
        fmeter_init(&cs->fmeter);
        cs->relax_domain_level = -1;
@@ -1827,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 {
        struct cpuset *cs = cgroup_cs(cont);
-        cpuset_update_task_memory_state();
        if (is_sched_load_balance(cs))
                update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
@@ -1849,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
        .early_init = 1,
 };
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-int __init cpuset_init_early(void)
-{
-        alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
-        top_cpuset.mems_generation = cpuset_mems_generation++;
-        return 0;
-}
 /**
 * cpuset_init - initialize cpusets at system boot
 *
@@ -1874,11 +1842,13 @@ int __init cpuset_init(void)
 {
        int err = 0;
+        if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+                BUG();
        cpumask_setall(top_cpuset.cpus_allowed);
        nodes_setall(top_cpuset.mems_allowed);
        fmeter_init(&top_cpuset.fmeter);
-        top_cpuset.mems_generation = cpuset_mems_generation++;
        set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
        top_cpuset.relax_domain_level = -1;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4430eb1376f2..be022c200da6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -178,7 +178,7 @@ void __init fork_init(unsigned long mempages)
        /* create a slab on which task_structs can be allocated */
        task_struct_cachep =
                kmem_cache_create("task_struct", sizeof(struct task_struct),
-                        ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
+                        ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
        /* do the arch specific task caches init */
@@ -1470,20 +1470,20 @@ void __init proc_caches_init(void)
 {
        sighand_cachep = kmem_cache_create("sighand_cache",
                        sizeof(struct sighand_struct), 0,
-                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
+                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
-                        sighand_ctor);
+                        SLAB_NOTRACK, sighand_ctor);
        signal_cachep = kmem_cache_create("signal_cache",
                        sizeof(struct signal_struct), 0,
-                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
        files_cachep = kmem_cache_create("files_cache",
                        sizeof(struct files_struct), 0,
-                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
-                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
        mm_cachep = kmem_cache_create("mm_struct",
                        sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
        vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
        mmap_init();
 }
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 000000000000..2b45b2ee3964
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
+/*
+ * Supplementary group IDs
+ */
+#include <linux/cred.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <asm/uaccess.h>
+/*
+ * init to 2 - one for init_task, one to ensure it is never freed
+ *
+ * The extra reference keeps the usage count of this statically defined
+ * object from ever reaching zero.
+ */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+/*
+ * groups_alloc - allocate a group_info able to hold @gidsetsize gids
+ *
+ * Lists of up to NGROUPS_SMALL entries live in the embedded small_block;
+ * larger lists get one free page per block of NGROUPS_PER_BLOCK gids.
+ * The returned object has a usage count of 1.  Returns NULL if any
+ * allocation fails, after releasing whatever was already obtained.
+ */
+struct group_info *groups_alloc(int gidsetsize)
+{
+        struct group_info *gi;
+        int nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
+        int i;
+        /* Make sure we always allocate at least one indirect block pointer */
+        if (!nblocks)
+                nblocks = 1;
+        gi = kmalloc(sizeof(*gi) + nblocks * sizeof(gid_t *), GFP_USER);
+        if (!gi)
+                return NULL;
+        gi->ngroups = gidsetsize;
+        gi->nblocks = nblocks;
+        atomic_set(&gi->usage, 1);
+        if (gidsetsize <= NGROUPS_SMALL) {
+                gi->blocks[0] = gi->small_block;
+                return gi;
+        }
+        for (i = 0; i < nblocks; i++) {
+                gid_t *page = (void *)__get_free_page(GFP_USER);
+                if (!page) {
+                        /* Undo the partial allocation: pages 0..i-1 */
+                        while (--i >= 0)
+                                free_page((unsigned long)gi->blocks[i]);
+                        kfree(gi);
+                        return NULL;
+                }
+                gi->blocks[i] = page;
+        }
+        return gi;
+}
+EXPORT_SYMBOL(groups_alloc);
+/*
+ * groups_free - release a group_info and any pages backing its blocks
+ *
+ * When blocks[0] points at the embedded small_block no pages were
+ * allocated, so only the structure itself is freed.
+ */
+void groups_free(struct group_info *group_info)
+{
+        const int has_pages = group_info->blocks[0] != group_info->small_block;
+        int blk;
+        for (blk = 0; has_pages && blk < group_info->nblocks; blk++)
+                free_page((unsigned long)group_info->blocks[blk]);
+        kfree(group_info);
+}
+EXPORT_SYMBOL(groups_free);
+/*
+ * Copy every gid held in @group_info out to the user-space array
+ * @grouplist, one block at a time.  Returns 0, or -EFAULT if a copy fails.
+ */
+static int groups_to_user(gid_t __user *grouplist,
+                          const struct group_info *group_info)
+{
+        unsigned int remaining = group_info->ngroups;
+        int blk = 0;
+        while (blk < group_info->nblocks) {
+                /* The last block may be only partially filled */
+                unsigned int chunk = min(NGROUPS_PER_BLOCK, remaining);
+                unsigned int bytes = chunk * sizeof(*grouplist);
+                if (copy_to_user(grouplist, group_info->blocks[blk], bytes))
+                        return -EFAULT;
+                remaining -= chunk;
+                grouplist += NGROUPS_PER_BLOCK;
+                blk++;
+        }
+        return 0;
+}
+/*
+ * Fill an already allocated @group_info from the user-space array
+ * @grouplist, one block at a time.  Returns 0, or -EFAULT if a copy fails.
+ */
+static int groups_from_user(struct group_info *group_info,
+    gid_t __user *grouplist)
+{
+        unsigned int remaining = group_info->ngroups;
+        int blk;
+        for (blk = 0; blk < group_info->nblocks;
+             blk++, grouplist += NGROUPS_PER_BLOCK) {
+                /* The last block may be only partially filled */
+                unsigned int chunk = min(NGROUPS_PER_BLOCK, remaining);
+                unsigned int bytes = chunk * sizeof(*grouplist);
+                if (copy_from_user(group_info->blocks[blk], grouplist, bytes))
+                        return -EFAULT;
+                remaining -= chunk;
+        }
+        return 0;
+}
+/*
+ * a simple Shell sort
+ *
+ * Sorts the gids of @group_info in place into ascending order, using the
+ * gap sequence generated by stride = 3 * stride + 1 (1, 4, 13, 40, ...).
+ */
+static void groups_sort(struct group_info *group_info)
+{
+        int base, max, stride;
+        int gidsetsize = group_info->ngroups;
+        for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+                ; /* nothing */
+        stride /= 3; /* step back to the last gap below gidsetsize (0: nothing to sort) */
+        while (stride) {
+                max = gidsetsize - stride;
+                for (base = 0; base < max; base++) {
+                        int left = base;
+                        int right = left + stride;
+                        gid_t tmp = GROUP_AT(group_info, right); /* entry being inserted */
+                        while (left >= 0 && GROUP_AT(group_info, left) > tmp) { /* shift larger entries up one stride */
+                                GROUP_AT(group_info, right) =
+                                    GROUP_AT(group_info, left);
+                                right = left;
+                                left -= stride;
+                        }
+                        GROUP_AT(group_info, right) = tmp; /* drop it into the hole left behind */
+                }
+                stride /= 3; /* next smaller gap; reaches 0 after the stride-1 pass */
+        }
+}
+/*
+ * a simple bsearch
+ *
+ * Look for @grp in the sorted group list @group_info.  Returns 1 if it is
+ * present, 0 if it is not or if @group_info is NULL.
+ *
+ * The gids are compared directly rather than by the sign of their
+ * difference: a subtraction squeezed into an int gives the wrong sign for
+ * values far enough apart, which would disagree with the ordering that
+ * groups_sort() establishes with a plain '>' comparison.
+ */
+int groups_search(const struct group_info *group_info, gid_t grp)
+{
+        unsigned int left, right;
+        if (!group_info)
+                return 0;
+        left = 0;
+        right = group_info->ngroups;
+        while (left < right) {
+                /* written this way so left + right cannot overflow */
+                unsigned int mid = left + (right - left) / 2;
+                gid_t cur = GROUP_AT(group_info, mid);
+                if (grp > cur)
+                        left = mid + 1;
+                else if (grp < cur)
+                        right = mid;
+                else
+                        return 1;
+        }
+        return 0;
+}
+/**
+ * set_groups - Change a group subscription in a set of credentials
+ * @new: The newly prepared set of credentials to alter
+ * @group_info: The group list to install
+ *
+ * Validate a group subscription and, if valid, insert it into a set
+ * of credentials.
+ *
+ * On success @group_info is sorted in place, get_group_info() is called
+ * on it on behalf of @new, and put_group_info() is called on the list
+ * @new pointed to before.
+ *
+ * Returns 0 on success.  If security_task_setgroups() returns non-zero,
+ * that value is returned and @new is left untouched.
+ */
+int set_groups(struct cred *new, struct group_info *group_info)
+{
+        int retval;
+        retval = security_task_setgroups(group_info);
+        if (retval)
+                return retval;
+        put_group_info(new->group_info);
+        groups_sort(group_info);
+        get_group_info(group_info);
+        new->group_info = group_info;
+        return 0;
+}
+EXPORT_SYMBOL(set_groups);
+/**
+ * set_current_groups - Change current's group subscription
+ * @group_info: The group list to impose
+ *
+ * Prepare a fresh set of credentials, install @group_info into it with
+ * set_groups() and commit the result.  If set_groups() fails the prepared
+ * credentials are aborted and its error is returned; -ENOMEM is returned
+ * if the credentials cannot be prepared.
+ */
+int set_current_groups(struct group_info *group_info)
+{
+        struct cred *new = prepare_creds();
+        int ret;
+        if (!new)
+                return -ENOMEM;
+        ret = set_groups(new, group_info);
+        if (ret >= 0)
+                return commit_creds(new);
+        abort_creds(new);
+        return ret;
+}
+EXPORT_SYMBOL(set_current_groups);
+/*
+ * getgroups - report the caller's supplementary group list
+ *
+ * Returns the number of groups.  With a non-zero @gidsetsize the list is
+ * also copied to @grouplist; -EINVAL if @gidsetsize is negative or too
+ * small to hold the list, -EFAULT if the copy to user space fails.
+ */
+SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+        const struct cred *cred = current_cred();
+        int ngroups;
+        if (gidsetsize < 0)
+                return -EINVAL;
+        /* no need to grab task_lock here; it cannot change */
+        ngroups = cred->group_info->ngroups;
+        /* A zero size is a pure query for the count */
+        if (!gidsetsize)
+                return ngroups;
+        if (ngroups > gidsetsize)
+                return -EINVAL;
+        if (groups_to_user(grouplist, cred->group_info))
+                return -EFAULT;
+        return ngroups;
+}
+/*
+ *      SMP: Our groups are copy-on-write. We can set them safely
+ *      without another task interfering.
+ *
+ *      Requires CAP_SETGID (-EPERM otherwise); @gidsetsize must not
+ *      exceed NGROUPS_MAX (-EINVAL otherwise).
+ */
+SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+        struct group_info *group_info;
+        int retval;
+        if (!capable(CAP_SETGID))
+                return -EPERM;
+        if ((unsigned)gidsetsize > NGROUPS_MAX)
+                return -EINVAL;
+        group_info = groups_alloc(gidsetsize);
+        if (!group_info)
+                return -ENOMEM;
+        retval = groups_from_user(group_info, grouplist);
+        if (!retval)
+                retval = set_current_groups(group_info);
+        /* Release the reference from groups_alloc() on every path */
+        put_group_info(group_info);
+        return retval;
+}
+/*
+ * Check whether we're fsgid/egid or in the supplemental group..
+ *
+ * in_group_p() returns 1 if @grp is the caller's fsgid, otherwise the
+ * result of searching the caller's supplementary group list.
+ */
+int in_group_p(gid_t grp)
+{
+        const struct cred *cred = current_cred();
+        if (grp == cred->fsgid)
+                return 1;
+        return groups_search(cred->group_info, grp);
+}
+EXPORT_SYMBOL(in_group_p);
+/*
+ * in_egroup_p() returns 1 if @grp is the caller's egid, otherwise the
+ * result of searching the caller's supplementary group list.
+ */
+int in_egroup_p(gid_t grp)
+{
+        const struct cred *cred = current_cred();
+        if (grp == cred->egid)
+                return 1;
+        return groups_search(cred->group_info, grp);
+}
+EXPORT_SYMBOL(in_egroup_p);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb8a15c19583..b675a67c9ac3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,8 @@
 #include <linux/seq_file.h>
 #include <linux/err.h>
 #include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
 #include <asm/uaccess.h>
@@ -193,12 +195,24 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 * Switch the timer base to the current CPU when possible.
 */
 static inline struct hrtimer_clock_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+                    int pinned)
 {
        struct hrtimer_clock_base *new_base;
        struct hrtimer_cpu_base *new_cpu_base;
+        int cpu, preferred_cpu = -1;
+        cpu = smp_processor_id();
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+        if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
+                preferred_cpu = get_nohz_load_balancer();
+                if (preferred_cpu >= 0)
+                        cpu = preferred_cpu;
+        }
+#endif
-        new_cpu_base = &__get_cpu_var(hrtimer_bases);
+again:
+        new_cpu_base = &per_cpu(hrtimer_bases, cpu);
        new_base = &new_cpu_base->clock_base[base->index];
        if (base != new_base) {
@@ -218,6 +232,40 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
                timer->base = NULL;
                spin_unlock(&base->cpu_base->lock);
                spin_lock(&new_base->cpu_base->lock);
+                /* Optimized away for NOHZ=n SMP=n */
+                if (cpu == preferred_cpu) {
+                        /* Calculate clock monotonic expiry time */
+#ifdef CONFIG_HIGH_RES_TIMERS
+                        ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
+                                                        new_base->offset);
+#else
+                        ktime_t expires = hrtimer_get_expires(timer);
+#endif
+                        /*
+                         * Get the next event on target cpu from the
+                         * clock events layer.
+                         * This covers the highres=off nohz=on case as well.
+                         */
+                        ktime_t next = clockevents_get_next_event(cpu);
+                        ktime_t delta = ktime_sub(expires, next);
+                        /*
+                         * We do not migrate the timer when it is expiring
+                         * before the next event on the target cpu because
+                         * we cannot reprogram the target cpu hardware and
+                         * we would cause it to fire late.
+                         */
+                        if (delta.tv64 < 0) {
+                                cpu = smp_processor_id();
+                                spin_unlock(&new_base->cpu_base->lock);
+                                spin_lock(&base->cpu_base->lock);
+                                timer->base = base;
+                                goto again;
+                        }
+                }
                timer->base = new_base;
        }
        return new_base;
@@ -235,7 +283,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
        return base;
 }
-# define switch_hrtimer_base(t, b)      (b)
+# define switch_hrtimer_base(t, b, p)   (b)
 #endif  /* !CONFIG_SMP */
@@ -907,9 +955,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
        ret = remove_hrtimer(timer, base);
        /* Switch the timer base, if necessary: */
-        new_base = switch_hrtimer_base(timer, base);
+        new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
-        if (mode == HRTIMER_MODE_REL) {
+        if (mode & HRTIMER_MODE_REL) {
                tim = ktime_add_safe(tim, new_base->get_time());
                /*
                 * CONFIG_TIME_LOW_RES is a temporary way for architectures
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 374faf9bfdc7..3a29dbe7898e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,12 +30,16 @@
 #define all_var 0
 #endif
-/* These will be re-linked against their real values during the second link stage */
+/*
+ * These will be re-linked against their real values
+ * during the second link stage.
+ */
 extern const unsigned long kallsyms_addresses[] __attribute__((weak));
 extern const u8 kallsyms_names[] __attribute__((weak));
-/* tell the compiler that the count isn't in the small data section if the arch
+/*
- * has one (eg: FRV)
+ * Tell the compiler that the count isn't in the small data section if the arch
+ * has one (eg: FRV).
 */
 extern const unsigned long kallsyms_num_syms
 __attribute__((weak, section(".rodata")));
@@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)
        return is_kernel_text(addr) || is_kernel_inittext(addr);
 }
-/* expand a compressed symbol data into the resulting uncompressed string,
+/*
-   given the offset to where the symbol is in the compressed stream */
+ * Expand compressed symbol data into the resulting uncompressed string,
+ * given the offset to where the symbol is in the compressed stream.
+ */
 static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
 {
        int len, skipped_first = 0;
        const u8 *tptr, *data;
-        /* get the compressed symbol length from the first symbol byte */
+        /* Get the compressed symbol length from the first symbol byte. */
        data = &kallsyms_names[off];
        len = *data;
        data++;
-        /* update the offset to return the offset for the next symbol on
+        /*
-         * the compressed stream */
+         * Update the offset to return the offset for the next symbol on
+         * the compressed stream.
+         */
        off += len + 1;
-        /* for every byte on the compressed symbol data, copy the table
+        /*
-           entry for that byte */
+         * For every byte on the compressed symbol data, copy the table
-        while(len) {
+         * entry for that byte.
-                tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
+         */
+        while (len) {
+                tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
                data++;
                len--;
                while (*tptr) {
-                        if(skipped_first) {
+                        if (skipped_first) {
                                *result = *tptr;
                                result++;
                        } else
@@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
        *result = '\0';
-        /* return to offset to the next symbol */
+        /* Return the offset of the next symbol. */
        return off;
 }
-/* get symbol type information. This is encoded as a single char at the
+/*
- * begining of the symbol name */
+ * Get symbol type information. This is encoded as a single char at the
+ * beginning of the symbol name.
+ */
 static char kallsyms_get_symbol_type(unsigned int off)
 {
-        /* get just the first code, look it up in the token table, and return the
+        /*
-         * first char from this token */
+         * Get just the first code, look it up in the token table,
-        return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
+         * and return the first char from this token.
+         */
+        return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
 }
-/* find the offset on the compressed stream given and index in the
+/*
- * kallsyms array */
+ * Find the offset on the compressed stream given an index in the
+ * kallsyms array.
+ */
 static unsigned int get_symbol_offset(unsigned long pos)
 {
        const u8 *name;
        int i;
-        /* use the closest marker we have. We have markers every 256 positions,
+        /*
-         * so that should be close enough */
+         * Use the closest marker we have. We have markers every 256 positions,
-        name = &kallsyms_names[ kallsyms_markers[pos>>8] ];
+         * so that should be close enough.
+         */
+        name = &kallsyms_names[kallsyms_markers[pos >> 8]];
-        /* sequentially scan all the symbols up to the point we're searching for.
+        /*
-         * Every symbol is stored in a [<len>][<len> bytes of data] format, so we
+         * Sequentially scan all the symbols up to the point we're searching
-         * just need to add the len to the current pointer for every symbol we
+         * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
-         * wish to skip */
+         * so we just need to add the len to the current pointer for every
-        for(i = 0; i < (pos&0xFF); i++)
+         * symbol we wish to skip.
+         */
+        for (i = 0; i < (pos & 0xFF); i++)
                name = name + (*name) + 1;
        return name - kallsyms_names;
@@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
        /* This kernel should never had been booted. */
        BUG_ON(!kallsyms_addresses);
-        /* do a binary search on the sorted kallsyms_addresses array */
+        /* Do a binary search on the sorted kallsyms_addresses array. */
        low = 0;
        high = kallsyms_num_syms;
@@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
        }
        /*
-         * search for the first aliased symbol. Aliased
+         * Search for the first aliased symbol. Aliased
-         * symbols are symbols with the same address
+         * symbols are symbols with the same address.
         */
        while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
                --low;
        symbol_start = kallsyms_addresses[low];
-        /* Search for next non-aliased symbol */
+        /* Search for next non-aliased symbol. */
        for (i = low + 1; i < kallsyms_num_syms; i++) {
                if (kallsyms_addresses[i] > symbol_start) {
                        symbol_end = kallsyms_addresses[i];
@@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
                }
        }
-        /* if we found no next symbol, we use the end of the section */
+        /* If we found no next symbol, we use the end of the section. */
        if (!symbol_end) {
                if (is_kernel_inittext(addr))
                        symbol_end = (unsigned long)_einittext;
@@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
 /*
 * Lookup an address
- * - modname is set to NULL if it's in the kernel
+ * - modname is set to NULL if it's in the kernel.
- * - we guarantee that the returned name is valid until we reschedule even if
+ * - We guarantee that the returned name is valid until we reschedule even if
- *   it resides in a module
+ *   it resides in a module.
- * - we also guarantee that modname will be valid until rescheduled
+ * - We also guarantee that modname will be valid until rescheduled.
 */
 const char *kallsyms_lookup(unsigned long addr,
                            unsigned long *symbolsize,
@@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,
                return namebuf;
        }
-        /* see if it's in a module */
+        /* See if it's in a module. */
        return module_address_lookup(addr, symbolsize, offset, modname,
                                     namebuf);
 }
@@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)
                kallsyms_expand_symbol(get_symbol_offset(pos), symname);
                return 0;
        }
-        /* see if it's in a module */
+        /* See if it's in a module. */
        return lookup_module_symbol_name(addr, symname);
 }
@@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
                modname[0] = '\0';
                return 0;
        }
-        /* see if it's in a module */
+        /* See if it's in a module. */
        return lookup_module_symbol_attrs(addr, size, offset, modname, name);
 }
@@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)
        return len;
 }
+EXPORT_SYMBOL_GPL(sprint_symbol);
 /* Look up a kernel symbol and print it to the kernel messages. */
 void __print_symbol(const char *fmt, unsigned long address)
@@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)
        printk(fmt, buffer);
 }
+EXPORT_SYMBOL(__print_symbol);
 /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
-struct kallsym_iter
+struct kallsym_iter {
-{
        loff_t pos;
        unsigned long value;
-        unsigned int nameoff; /* If iterating in core kernel symbols */
+        unsigned int nameoff; /* If iterating in core kernel symbols. */
        char type;
        char name[KSYM_NAME_LEN];
        char module_name[MODULE_NAME_LEN];
@@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)
                iter->pos = pos;
                return get_ksymbol_mod(iter);
        }
-        
        /* If we're not on the desired position, reset to new position. */
        if (pos != iter->pos)
                reset_iter(iter, pos);
@@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)
 {
        struct kallsym_iter *iter = m->private;
-        /* Some debugging symbols have no name.  Ignore them. */ 
+        /* Some debugging symbols have no name.  Ignore them. */
        if (!iter->name[0])
                return 0;
        if (iter->module_name[0]) {
                char type;
-                /* Label it "global" if it is exported,
+                /*
-                 * "local" if not exported. */
+                 * Label it "global" if it is exported,
+                 * "local" if not exported.
+                 */
                type = iter->exported ? toupper(iter->type) :
                                        tolower(iter->type);
                seq_printf(m, "%0*lx %c %s\t[%s]\n",
-                           (int)(2*sizeof(void*)),
+                           (int)(2 * sizeof(void *)),
                           iter->value, type, iter->name, iter->module_name);
        } else
                seq_printf(m, "%0*lx %c %s\n",
-                           (int)(2*sizeof(void*)),
+                           (int)(2 * sizeof(void *)),
                           iter->value, iter->type, iter->name);
        return 0;
 }
@@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = {
 static int kallsyms_open(struct inode *inode, struct file *file)
 {
-        /* We keep iterator in m->private, since normal case is to
+        /*
+         * We keep iterator in m->private, since normal case is to
         * s_start from where we left off, so we avoid doing
-         * using get_symbol_offset for every symbol */
+         * a get_symbol_offset lookup for every symbol.
+         */
        struct kallsym_iter *iter;
        int ret;
@@ -500,7 +525,4 @@ static int __init kallsyms_init(void)
        proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
        return 0;
 }
-__initcall(kallsyms_init);
+device_initcall(kallsyms_init);
-EXPORT_SYMBOL(__print_symbol);
-EXPORT_SYMBOL_GPL(sprint_symbol);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f8..26539e3228e5 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
        /*
         * round up to the next power of 2, since our 'let the indices
-         * wrap' tachnique works only in this case.
+         * wrap' technique works only in this case.
         */
-        if (size & (size - 1)) {
+        if (!is_power_of_2(size)) {
                BUG_ON(size > 0x80000000);
                size = roundup_pow_of_two(size);
        }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 41c88fe40500..7fa441333529 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,6 +9,7 @@
 #include <linux/kthread.h>
 #include <linux/completion.h>
 #include <linux/err.h>
+#include <linux/cpuset.h>
 #include <linux/unistd.h>
 #include <linux/file.h>
 #include <linux/module.h>
@@ -236,6 +237,7 @@ int kthreadd(void *unused)
        ignore_signals(tsk);
        set_user_nice(tsk, KTHREAD_NICE_LEVEL);
        set_cpus_allowed_ptr(tsk, cpu_all_mask);
+        set_mems_allowed(node_possible_map);
        current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
diff --git a/kernel/module.c b/kernel/module.c
index e4ab36ce7672..215aaab09e91 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2899,7 +2899,7 @@ void print_modules(void)
        struct module *mod;
        char buf[8];
-        printk("Modules linked in:");
+        printk(KERN_DEFAULT "Modules linked in:");
        /* Most callers should already have preempt disabled, but make sure */
        preempt_disable();
        list_for_each_entry_rcu(mod, &modules, list)
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 97890831e1b5..e8b337006276 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -34,7 +34,7 @@ static struct sysrq_key_op	sysrq_poweroff_op = {
        .handler        = handle_poweroff,
        .help_msg       = "powerOff",
        .action_msg     = "Power Off",
-        .enable_mask    = SYSRQ_ENABLE_BOOT,
+        .enable_mask    = SYSRQ_ENABLE_BOOT,
 };
 static int pm_sysrq_init(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497a..da2072d73811 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
        if (error)
                goto Exit;
        printk("done.");
+        oom_killer_disable();
 Exit:
        BUG_ON(in_atomic());
        printk("\n");
        return error;
 }
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
 void thaw_processes(void)
 {
+        oom_killer_enable();
        printk("Restarting tasks ... ");
        thaw_tasks(true);
        thaw_tasks(false);
diff --git a/kernel/printk.c b/kernel/printk.c
index 5052b5497c67..b4d97b54c1ec 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -687,20 +687,35 @@ asmlinkage int vprintk(const char *fmt, va_list args)
                                  sizeof(printk_buf) - printed_len, fmt, args);
+        p = printk_buf;
+        /* Do we have a loglevel in the string? */
+        if (p[0] == '<') {
+                unsigned char c = p[1];
+                if (c && p[2] == '>') {
+                        switch (c) {
+                        case '0' ... '7': /* loglevel */
+                                current_log_level = c - '0';
+                        /* Fallthrough - make sure we're on a new line */
+                        case 'd': /* KERN_DEFAULT */
+                                if (!new_text_line) {
+                                        emit_log_char('\n');
+                                        new_text_line = 1;
+                                }
+                        /* Fallthrough - skip the loglevel */
+                        case 'c': /* KERN_CONT */
+                                p += 3;
+                                break;
+                        }
+                }
+        }
        /*
         * Copy the output into log_buf.  If the caller didn't provide
         * appropriate log level tags, we insert them here
         */
-        for (p = printk_buf; *p; p++) {
+        for ( ; *p; p++) {
                if (new_text_line) {
-                        /* If a token, set current_log_level and skip over */
-                        if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
-                            p[2] == '>') {
-                                current_log_level = p[1] - '0';
-                                p += 3;
-                                printed_len -= 3;
-                        }
                        /* Always output the token */
                        emit_log_char('<');
                        emit_log_char(current_log_level + '0');
diff --git a/kernel/profile.c b/kernel/profile.c
index 28cf26ad2d24..69911b5745eb 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -365,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
                node = cpu_to_node(cpu);
                per_cpu(cpu_profile_flip, cpu) = 0;
                if (!per_cpu(cpu_profile_hits, cpu)[1]) {
-                        page = alloc_pages_node(node,
+                        page = alloc_pages_exact_node(node,
                                        GFP_KERNEL | __GFP_ZERO,
                                        0);
                        if (!page)
@@ -373,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
                        per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
                }
                if (!per_cpu(cpu_profile_hits, cpu)[0]) {
-                        page = alloc_pages_node(node,
+                        page = alloc_pages_exact_node(node,
                                        GFP_KERNEL | __GFP_ZERO,
                                        0);
                        if (!page)
@@ -564,14 +564,14 @@ static int create_hash_tables(void)
                int node = cpu_to_node(cpu);
                struct page *page;
-                page = alloc_pages_node(node,
+                page = alloc_pages_exact_node(node,
                                GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
                                0);
                if (!page)
                        goto out_cleanup;
                per_cpu(cpu_profile_hits, cpu)[1]
                                = (struct profile_hit *)page_address(page);
-                page = alloc_pages_node(node,
+                page = alloc_pages_exact_node(node,
                                GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
                                0);
                if (!page)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 820c5af44f3e..fcd107a78c5a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -902,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 * Returns:
 *  0           on success
 * -EINTR       when interrupted by a signal
- * -ETIMEOUT    when the timeout expired
+ * -ETIMEDOUT   when the timeout expired
 * -EDEADLK     when the lock would deadlock (when deadlock detection is on)
 */
 int
diff --git a/kernel/sched.c b/kernel/sched.c
index 8ec9d13140be..8fb88a906aaa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -240,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
                hard = hrtimer_get_expires(&rt_b->rt_period_timer);
                delta = ktime_to_ns(ktime_sub(hard, soft));
                __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-                                HRTIMER_MODE_ABS, 0);
+                                HRTIMER_MODE_ABS_PINNED, 0);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -1155,7 +1155,7 @@ static __init void init_hrtick(void)
 static void hrtick_start(struct rq *rq, u64 delay)
 {
        __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                        HRTIMER_MODE_REL, 0);
+                        HRTIMER_MODE_REL_PINNED, 0);
 }
 static inline void init_hrtick(void)
@@ -4397,6 +4397,11 @@ static struct {
        .load_balancer = ATOMIC_INIT(-1),
 };
+int get_nohz_load_balancer(void)
+{
+        return atomic_read(&nohz.load_balancer);
+}
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 /**
 * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -9029,6 +9034,8 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
+const_debug unsigned int sysctl_timer_migration = 1;
 int in_sched_functions(unsigned long addr)
 {
        return in_lock_functions(addr) ||
diff --git a/kernel/signal.c b/kernel/signal.c
index 809a228019ad..d81f4952eebb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -832,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 {
        struct sigpending *pending;
        struct sigqueue *q;
+        int override_rlimit;
        trace_sched_signal_send(sig, t);
@@ -863,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
           make sure at least one signal gets delivered and don't
           pass on the info struct.  */
-        q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
+        if (sig < SIGRTMIN)
-                                             (is_si_special(info) ||
+                override_rlimit = (is_si_special(info) || info->si_code >= 0);
-                                              info->si_code >= 0)));
+        else
+                override_rlimit = 0;
+        q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
+                override_rlimit);
        if (q) {
                list_add_tail(&q->list, &pending->list);
                switch ((unsigned long) info) {
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 521ed2004d63..09d7519557d3 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -319,6 +319,15 @@ cant_get_ref:
 EXPORT_SYMBOL(slow_work_enqueue);
 /*
+ * Schedule a cull of the thread pool at some time in the near future
+ */
+static void slow_work_schedule_cull(void)
+{
+        mod_timer(&slow_work_cull_timer,
+                  round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
+}
+/*
 * Worker thread culling algorithm
 */
 static bool slow_work_cull_thread(void)
@@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void)
                    list_empty(&vslow_work_queue) &&
                    atomic_read(&slow_work_thread_count) >
                    slow_work_min_threads) {
-                        mod_timer(&slow_work_cull_timer,
+                        slow_work_schedule_cull();
-                                  jiffies + SLOW_WORK_CULL_TIMEOUT);
                        do_cull = true;
                }
        }
@@ -393,8 +401,7 @@ static int slow_work_thread(void *_data)
                            list_empty(&vslow_work_queue) &&
                            atomic_read(&slow_work_thread_count) >
                            slow_work_min_threads)
-                                mod_timer(&slow_work_cull_timer,
+                                slow_work_schedule_cull();
-                                          jiffies + SLOW_WORK_CULL_TIMEOUT);
                        continue;
                }
@@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
                if (atomic_dec_and_test(&slow_work_thread_count))
                        BUG(); /* we're running on a slow work thread... */
                mod_timer(&slow_work_oom_timer,
-                          jiffies + SLOW_WORK_OOM_TIMEOUT);
+                          round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
        } else {
                /* ratelimit the starting of new threads */
                mod_timer(&slow_work_oom_timer, jiffies + 1);
@@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
                        if (n < 0 && !slow_work_may_not_start_new_thread)
                                slow_work_enqueue(&slow_work_new_thread);
                        else if (n > 0)
-                                mod_timer(&slow_work_cull_timer,
+                                slow_work_schedule_cull();
-                                          jiffies + SLOW_WORK_CULL_TIMEOUT);
                }
                mutex_unlock(&slow_work_user_lock);
        }
@@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
                                atomic_read(&slow_work_thread_count);
                        if (n < 0)
-                                mod_timer(&slow_work_cull_timer,
+                                slow_work_schedule_cull();
-                                          jiffies + SLOW_WORK_CULL_TIMEOUT);
                }
                mutex_unlock(&slow_work_user_lock);
        }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 258885a543db..b41fb710e114 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -382,6 +382,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
 EXPORT_SYMBOL(__tasklet_hi_schedule);
+void __tasklet_hi_schedule_first(struct tasklet_struct *t)
+{
+        BUG_ON(!irqs_disabled());
+        t->next = __get_cpu_var(tasklet_hi_vec).head;
+        __get_cpu_var(tasklet_hi_vec).head = t;
+        __raise_softirq_irqoff(HI_SOFTIRQ);
+}
+EXPORT_SYMBOL(__tasklet_hi_schedule_first);
 static void tasklet_action(struct softirq_action *a)
 {
        struct tasklet_struct *list;
diff --git a/kernel/sys.c b/kernel/sys.c
index 438d99a38c87..b3f1097c76fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1113,289 +1113,6 @@ out:
        return err;
 }
-/*
- * Supplementary group IDs
- */
-/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-struct group_info *groups_alloc(int gidsetsize)
-{
-        struct group_info *group_info;
-        int nblocks;
-        int i;
-        nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
-        /* Make sure we always allocate at least one indirect block pointer */
-        nblocks = nblocks ? : 1;
-        group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
-        if (!group_info)
-                return NULL;
-        group_info->ngroups = gidsetsize;
-        group_info->nblocks = nblocks;
-        atomic_set(&group_info->usage, 1);
-        if (gidsetsize <= NGROUPS_SMALL)
-                group_info->blocks[0] = group_info->small_block;
-        else {
-                for (i = 0; i < nblocks; i++) {
-                        gid_t *b;
-                        b = (void *)__get_free_page(GFP_USER);
-                        if (!b)
-                                goto out_undo_partial_alloc;
-                        group_info->blocks[i] = b;
-                }
-        }
-        return group_info;
-out_undo_partial_alloc:
-        while (--i >= 0) {
-                free_page((unsigned long)group_info->blocks[i]);
-        }
-        kfree(group_info);
-        return NULL;
-}
-EXPORT_SYMBOL(groups_alloc);
-void groups_free(struct group_info *group_info)
-{
-        if (group_info->blocks[0] != group_info->small_block) {
-                int i;
-                for (i = 0; i < group_info->nblocks; i++)
-                        free_page((unsigned long)group_info->blocks[i]);
-        }
-        kfree(group_info);
-}
-EXPORT_SYMBOL(groups_free);
-/* export the group_info to a user-space array */
-static int groups_to_user(gid_t __user *grouplist,
-                          const struct group_info *group_info)
-{
-        int i;
-        unsigned int count = group_info->ngroups;
-        for (i = 0; i < group_info->nblocks; i++) {
-                unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
-                unsigned int len = cp_count * sizeof(*grouplist);
-                if (copy_to_user(grouplist, group_info->blocks[i], len))
-                        return -EFAULT;
-                grouplist += NGROUPS_PER_BLOCK;
-                count -= cp_count;
-        }
-        return 0;
-}
-/* fill a group_info from a user-space array - it must be allocated already */
-static int groups_from_user(struct group_info *group_info,
-    gid_t __user *grouplist)
-{
-        int i;
-        unsigned int count = group_info->ngroups;
-        for (i = 0; i < group_info->nblocks; i++) {
-                unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
-                unsigned int len = cp_count * sizeof(*grouplist);
-                if (copy_from_user(group_info->blocks[i], grouplist, len))
-                        return -EFAULT;
-                grouplist += NGROUPS_PER_BLOCK;
-                count -= cp_count;
-        }
-        return 0;
-}
-/* a simple Shell sort */
-static void groups_sort(struct group_info *group_info)
-{
-        int base, max, stride;
-        int gidsetsize = group_info->ngroups;
-        for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
-                ; /* nothing */
-        stride /= 3;
-        while (stride) {
-                max = gidsetsize - stride;
-                for (base = 0; base < max; base++) {
-                        int left = base;
-                        int right = left + stride;
-                        gid_t tmp = GROUP_AT(group_info, right);
-                        while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
-                                GROUP_AT(group_info, right) =
-                                    GROUP_AT(group_info, left);
-                                right = left;
-                                left -= stride;
-                        }
-                        GROUP_AT(group_info, right) = tmp;
-                }
-                stride /= 3;
-        }
-}
-/* a simple bsearch */
-int groups_search(const struct group_info *group_info, gid_t grp)
-{
-        unsigned int left, right;
-        if (!group_info)
-                return 0;
-        left = 0;
-        right = group_info->ngroups;
-        while (left < right) {
-                unsigned int mid = (left+right)/2;
-                int cmp = grp - GROUP_AT(group_info, mid);
-                if (cmp > 0)
-                        left = mid + 1;
-                else if (cmp < 0)
-                        right = mid;
-                else
-                        return 1;
-        }
-        return 0;
-}
-/**
- * set_groups - Change a group subscription in a set of credentials
- * @new: The newly prepared set of credentials to alter
- * @group_info: The group list to install
- *
- * Validate a group subscription and, if valid, insert it into a set
- * of credentials.
- */
-int set_groups(struct cred *new, struct group_info *group_info)
-{
-        int retval;
-        retval = security_task_setgroups(group_info);
-        if (retval)
-                return retval;
-        put_group_info(new->group_info);
-        groups_sort(group_info);
-        get_group_info(group_info);
-        new->group_info = group_info;
-        return 0;
-}
-EXPORT_SYMBOL(set_groups);
-/**
- * set_current_groups - Change current's group subscription
- * @group_info: The group list to impose
- *
- * Validate a group subscription and, if valid, impose it upon current's task
- * security record.
- */
-int set_current_groups(struct group_info *group_info)
-{
-        struct cred *new;
-        int ret;
-        new = prepare_creds();
-        if (!new)
-                return -ENOMEM;
-        ret = set_groups(new, group_info);
-        if (ret < 0) {
-                abort_creds(new);
-                return ret;
-        }
-        return commit_creds(new);
-}
-EXPORT_SYMBOL(set_current_groups);
-SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
-{
-        const struct cred *cred = current_cred();
-        int i;
-        if (gidsetsize < 0)
-                return -EINVAL;
-        /* no need to grab task_lock here; it cannot change */
-        i = cred->group_info->ngroups;
-        if (gidsetsize) {
-                if (i > gidsetsize) {
-                        i = -EINVAL;
-                        goto out;
-                }
-                if (groups_to_user(grouplist, cred->group_info)) {
-                        i = -EFAULT;
-                        goto out;
-                }
-        }
-out:
-        return i;
-}
-/*
- *      SMP: Our groups are copy-on-write. We can set them safely
- *      without another task interfering.
- */
- 
-SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
-{
-        struct group_info *group_info;
-        int retval;
-        if (!capable(CAP_SETGID))
-                return -EPERM;
-        if ((unsigned)gidsetsize > NGROUPS_MAX)
-                return -EINVAL;
-        group_info = groups_alloc(gidsetsize);
-        if (!group_info)
-                return -ENOMEM;
-        retval = groups_from_user(group_info, grouplist);
-        if (retval) {
-                put_group_info(group_info);
-                return retval;
-        }
-        retval = set_current_groups(group_info);
-        put_group_info(group_info);
-        return retval;
-}
-/*
- * Check whether we're fsgid/egid or in the supplemental group..
- */
-int in_group_p(gid_t grp)
-{
-        const struct cred *cred = current_cred();
-        int retval = 1;
-        if (grp != cred->fsgid)
-                retval = groups_search(cred->group_info, grp);
-        return retval;
-}
-EXPORT_SYMBOL(in_group_p);
-int in_egroup_p(gid_t grp)
-{
-        const struct cred *cred = current_cred();
-        int retval = 1;
-        if (grp != cred->egid)
-                retval = groups_search(cred->group_info, grp);
-        return retval;
-}
-EXPORT_SYMBOL(in_egroup_p);
 DECLARE_RWSEM(uts_sem);
 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ce664f98e3fb..ab462b9968d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
 #include <linux/security.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
+#include <linux/kmemcheck.h>
 #include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/init.h>
@@ -328,6 +329,14 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "timer_migration",
+                .data           = &sysctl_timer_migration,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
 #endif
        {
                .ctl_name       = CTL_UNNUMBERED,
@@ -959,6 +968,17 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &proc_dointvec,
        },
 #endif
+#ifdef CONFIG_KMEMCHECK
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "kmemcheck",
+                .data           = &kmemcheck_enabled,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
+#endif
 /*
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1317,7 +1337,6 @@ static struct ctl_table vm_table[] = {
                .extra2         = &one,
        },
 #endif
-#ifdef CONFIG_UNEVICTABLE_LRU
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "scan_unevictable_pages",
@@ -1326,7 +1345,6 @@ static struct ctl_table vm_table[] = {
                .mode           = 0644,
                .proc_handler   = &scan_unevictable_handler,
        },
-#endif
 /*
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d13be216a790..1ad6dd461119 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
 #include <linux/notifier.h>
 #include <linux/smp.h>
 #include <linux/sysdev.h>
+#include <linux/tick.h>
 /* The registered clock event devices */
 static LIST_HEAD(clockevent_devices);
@@ -54,6 +55,7 @@ unsigned long clockevent_delta2ns(unsigned long latch,
        return (unsigned long) clc;
 }
+EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 /**
 * clockevents_set_mode - set the operating mode of a clock event device
@@ -187,6 +189,7 @@ void clockevents_register_device(struct clock_event_device *dev)
        spin_unlock(&clockevents_lock);
 }
+EXPORT_SYMBOL_GPL(clockevents_register_device);
 /*
 * Noop handler when we shut down an event device
@@ -251,4 +254,15 @@ void clockevents_notify(unsigned long reason, void *arg)
        spin_unlock(&clockevents_lock);
 }
 EXPORT_SYMBOL_GPL(clockevents_notify);
+ktime_t clockevents_get_next_event(int cpu)
+{
+        struct tick_device *td;
+        struct clock_event_device *dev;
+        td = &per_cpu(tick_cpu_device, cpu);
+        dev = td->evtdev;
+        return dev->next_event;
+}
 #endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 80189f6f1c5a..592bf584d1d2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -509,6 +509,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
                }
        }
+        /*
+         * Check to make sure we don't switch to a non-highres capable
+         * clocksource if the tick code is in oneshot mode (highres or nohz)
+         */
+        if (tick_oneshot_mode_active() &&
+            !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
+                printk(KERN_WARNING "%s clocksource is not HRT compatible. "
+                        "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
+                ovr = NULL;
+                override_name[0] = 0;
+        }
        /* Reselect, when the override name has changed */
        if (ovr != clocksource_override) {
                clocksource_override = ovr;
@@ -537,7 +549,13 @@ sysfs_show_available_clocksources(struct sys_device *dev,
        spin_lock_irq(&clocksource_lock);
        list_for_each_entry(src, &clocksource_list, list) {
-                count += snprintf(buf + count,
+                /*
+                 * Don't show non-HRES clocksource if the tick code is
+                 * in one shot mode (highres=on or nohz=on)
+                 */
+                if (!tick_oneshot_mode_active() ||
+                    (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+                        count += snprintf(buf + count,
                                  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
                                  "%s ", src->name);
        }
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 118a3b3b3f9a..877dbedc3118 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -27,7 +27,7 @@
 * timer stops in C3 state.
 */
-struct tick_device tick_broadcast_device;
+static struct tick_device tick_broadcast_device;
 /* FIXME: Use cpumask_var_t. */
 static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
 static DECLARE_BITMAP(tmpmask, NR_CPUS);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8de678e767..a96c0e2b89cf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
        return 0;
 }
+/**
+ * tick_oneshot_mode_active - check whether the system is in oneshot mode
+ *
+ * Returns 1 when either nohz or highres is enabled, otherwise 0.
+ */
+int tick_oneshot_mode_active(void)
+{
+        unsigned long flags;
+        int ret;
+        local_irq_save(flags);
+        ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
+        local_irq_restore(flags);
+        return ret;
+}
 #ifdef CONFIG_HIGH_RES_TIMERS
 /**
 * tick_init_highres - switch to high resolution mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d3f1ef4d5cbe..2aff39c6f10c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -349,7 +349,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
                        hrtimer_start(&ts->sched_timer, expires,
-                                      HRTIMER_MODE_ABS);
+                                      HRTIMER_MODE_ABS_PINNED);
                        /* Check, if the timer was already in the past */
                        if (hrtimer_active(&ts->sched_timer))
                                goto out;
@@ -395,7 +395,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
                        hrtimer_start_expires(&ts->sched_timer,
-                                      HRTIMER_MODE_ABS);
+                                              HRTIMER_MODE_ABS_PINNED);
                        /* Check, if the timer was already in the past */
                        if (hrtimer_active(&ts->sched_timer))
                                break;
@@ -698,7 +698,8 @@ void tick_setup_sched_timer(void)
        for (;;) {
                hrtimer_forward(&ts->sched_timer, now, tick_period);
-                hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
+                hrtimer_start_expires(&ts->sched_timer,
+                                      HRTIMER_MODE_ABS_PINNED);
                /* Check, if the timer was already in the past */
                if (hrtimer_active(&ts->sched_timer))
                        break;
diff --git a/kernel/timer.c b/kernel/timer.c
index c01e568935ea..54d3912f8cad 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -38,6 +38,7 @@
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
 #include <linux/perf_counter.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -605,13 +606,12 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
 }
 static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
+__mod_timer(struct timer_list *timer, unsigned long expires,
+                                                bool pending_only, int pinned)
 {
        struct tvec_base *base, *new_base;
        unsigned long flags;
-        int ret;
+        int ret = 0 , cpu;
-        ret = 0;
        timer_stats_timer_set_start_info(timer);
        BUG_ON(!timer->function);
@@ -630,6 +630,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
        new_base = __get_cpu_var(tvec_bases);
+        cpu = smp_processor_id();
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+        if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
+                int preferred_cpu = get_nohz_load_balancer();
+                if (preferred_cpu >= 0)
+                        cpu = preferred_cpu;
+        }
+#endif
+        new_base = per_cpu(tvec_bases, cpu);
        if (base != new_base) {
                /*
                 * We are trying to schedule the timer on the local CPU.
@@ -669,7 +681,7 @@ out_unlock:
 */
 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-        return __mod_timer(timer, expires, true);
+        return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer_pending);
@@ -703,11 +715,33 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
        if (timer->expires == expires && timer_pending(timer))
                return 1;
-        return __mod_timer(timer, expires, false);
+        return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer);
 /**
+ * mod_timer_pinned - modify a timer's timeout
+ * @timer: the timer to be modified
+ * @expires: new timeout in jiffies
+ *
+ * mod_timer_pinned() is a way to update the expires field of an
+ * active timer (if the timer is inactive it will be activated)
+ * and not allow the timer to be migrated to a different CPU.
+ *
+ * Apart from the pinning, mod_timer_pinned(timer, expires) is equivalent to:
+ *
+ *     del_timer(timer); timer->expires = expires; add_timer(timer);
+ */
+int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
+{
+        if (timer->expires == expires && timer_pending(timer))
+                return 1;
+        return __mod_timer(timer, expires, false, TIMER_PINNED);
+}
+EXPORT_SYMBOL(mod_timer_pinned);
+/**
 * add_timer - start a timer
 * @timer: the timer to be added
 *
@@ -757,6 +791,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
        wake_up_idle_cpu(cpu);
        spin_unlock_irqrestore(&base->lock, flags);
 }
+EXPORT_SYMBOL_GPL(add_timer_on);
 /**
 * del_timer - deactive a timer.
@@ -1016,6 +1051,9 @@ cascade:
                index = slot = timer_jiffies & TVN_MASK;
                do {
                        list_for_each_entry(nte, varp->vec + slot, entry) {
+                                if (tbase_get_deferrable(nte->base))
+                                        continue;
                                found = 1;
                                if (time_before(nte->expires, expires))
                                        expires = nte->expires;
@@ -1306,7 +1344,7 @@ signed long __sched schedule_timeout(signed long timeout)
        expire = timeout + jiffies;
        setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-        __mod_timer(&timer, expire, false);
+        __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
        schedule();
        del_singleshot_timer_sync(&timer);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4a13e5a01ce3..61071fecc82e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -147,7 +147,7 @@ config IRQSOFF_TRACER
          disabled by default and can be runtime (re-)started
          via:
-              echo 0 > /debugfs/tracing/tracing_max_latency
+              echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
          (Note that kernel size and overhead increases with this option
          enabled. This option and the preempt-off timing option can be
@@ -168,7 +168,7 @@ config PREEMPT_TRACER
          disabled by default and can be runtime (re-)started
          via:
-              echo 0 > /debugfs/tracing/tracing_max_latency
+              echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
          (Note that kernel size and overhead increases with this option
          enabled. This option and the irqs-off timing option can be
@@ -261,7 +261,7 @@ config PROFILE_ANNOTATED_BRANCHES
          This tracer profiles all the the likely and unlikely macros
          in the kernel. It will display the results in:
-          /debugfs/tracing/profile_annotated_branch
+          /sys/kernel/debug/tracing/profile_annotated_branch
          Note: this will add a significant overhead, only turn this
          on if you need to profile the system's use of these macros.
@@ -274,7 +274,7 @@ config PROFILE_ALL_BRANCHES
          taken in the kernel is recorded whether it hit or miss.
          The results will be displayed in:
-          /debugfs/tracing/profile_branch
+          /sys/kernel/debug/tracing/profile_branch
          This option also enables the likely/unlikely profiler.
@@ -323,7 +323,7 @@ config STACK_TRACER
        select KALLSYMS
        help
          This special tracer records the maximum stack footprint of the
-          kernel and displays it in debugfs/tracing/stack_trace.
+          kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
          This tracer works by hooking into every function call that the
          kernel executes, and keeping a maximum stack depth value and
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2e642b2b7253..dc4dc70171ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
+#include <linux/kmemcheck.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
@@ -1270,6 +1271,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
        if (tail < BUF_PAGE_SIZE) {
                /* Mark the rest of the page with padding */
                event = __rb_page_index(tail_page, tail);
+                kmemcheck_annotate_bitfield(event, bitfield);
                rb_event_set_padding(event);
        }
@@ -1327,6 +1329,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                return NULL;
        event = __rb_page_index(tail_page, tail);
+        kmemcheck_annotate_bitfield(event, bitfield);
        rb_update_event(event, type, length);
        /* The passed in type is zero for DATA */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8acd9b81a5d7..c1878bfb2e1e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -344,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock =
 /*
 * Copy the new maximum trace into the separate maximum-trace
 * structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /debugfs/tracing/latency_trace)
+ * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
 */
 static void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2414,21 +2414,20 @@ static const struct file_operations tracing_iter_fops = {
 static const char readme_msg[] =
        "tracing mini-HOWTO:\n\n"
-        "# mkdir /debug\n"
+        "# mount -t debugfs nodev /sys/kernel/debug\n\n"
-        "# mount -t debugfs nodev /debug\n\n"
+        "# cat /sys/kernel/debug/tracing/available_tracers\n"
-        "# cat /debug/tracing/available_tracers\n"
        "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
-        "# cat /debug/tracing/current_tracer\n"
+        "# cat /sys/kernel/debug/tracing/current_tracer\n"
        "nop\n"
-        "# echo sched_switch > /debug/tracing/current_tracer\n"
+        "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
-        "# cat /debug/tracing/current_tracer\n"
+        "# cat /sys/kernel/debug/tracing/current_tracer\n"
        "sched_switch\n"
-        "# cat /debug/tracing/trace_options\n"
+        "# cat /sys/kernel/debug/tracing/trace_options\n"
        "noprint-parent nosym-offset nosym-addr noverbose\n"
-        "# echo print-parent > /debug/tracing/trace_options\n"
+        "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
-        "# echo 1 > /debug/tracing/tracing_enabled\n"
+        "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
-        "# cat /debug/tracing/trace > /tmp/trace.txt\n"
+        "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
-        "# echo 0 > /debug/tracing/tracing_enabled\n"
+        "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
 ;
 static ssize_t
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index e04b76cc238a..f6693969287d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hrtimer->function = stack_trace_timer_fn;
-        hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+        hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+                      HRTIMER_MODE_REL_PINNED);
 }
 static void start_stack_timers(void)
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba41c1e..2c000e7132ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up)
        put_user_ns(up->user_ns);
 }
-static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
-{
-        struct user_struct *user;
-        struct hlist_node *h;
-        hlist_for_each_entry(user, h, hashent, uidhash_node) {
-                if (user->uid == uid) {
-                        atomic_inc(&user->__count);
-                        return user;
-                }
-        }
-        return NULL;
-}
 #ifdef CONFIG_USER_SCHED
 static void sched_destroy_user(struct user_struct *up)
@@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
 #if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
+static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+{
+        struct user_struct *user;
+        struct hlist_node *h;
+        hlist_for_each_entry(user, h, hashent, uidhash_node) {
+                if (user->uid == uid) {
+                        /* possibly resurrect an "almost deleted" object */
+                        if (atomic_inc_return(&user->__count) == 1)
+                                cancel_delayed_work(&user->work);
+                        return user;
+                }
+        }
+        return NULL;
+}
 static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
 static DEFINE_MUTEX(uids_mutex);
@@ -283,12 +285,12 @@ int __init uids_sysfs_init(void)
        return uids_user_create(&root_user);
 }
-/* work function to remove sysfs directory for a user and free up
+/* delayed work function to remove sysfs directory for a user and free up
 * corresponding structures.
 */
 static void cleanup_user_struct(struct work_struct *w)
 {
-        struct user_struct *up = container_of(w, struct user_struct, work);
+        struct user_struct *up = container_of(w, struct user_struct, work.work);
        unsigned long flags;
        int remove_user = 0;
@@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w)
         */
        uids_mutex_lock();
-        local_irq_save(flags);
+        spin_lock_irqsave(&uidhash_lock, flags);
+        if (atomic_read(&up->__count) == 0) {
-        if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
                uid_hash_remove(up);
                remove_user = 1;
-                spin_unlock_irqrestore(&uidhash_lock, flags);
-        } else {
-                local_irq_restore(flags);
        }
+        spin_unlock_irqrestore(&uidhash_lock, flags);
        if (!remove_user)
                goto done;
@@ -331,16 +330,28 @@ done:
 */
 static void free_user(struct user_struct *up, unsigned long flags)
 {
-        /* restore back the count */
-        atomic_inc(&up->__count);
        spin_unlock_irqrestore(&uidhash_lock, flags);
+        INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
-        INIT_WORK(&up->work, cleanup_user_struct);
+        schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
-        schedule_work(&up->work);
 }
 #else   /* CONFIG_USER_SCHED && CONFIG_SYSFS */
+static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+{
+        struct user_struct *user;
+        struct hlist_node *h;
+        hlist_for_each_entry(user, h, hashent, uidhash_node) {
+                if (user->uid == uid) {
+                        atomic_inc(&user->__count);
+                        return user;
+                }
+        }
+        return NULL;
+}
 int uids_sysfs_init(void) { return 0; }
 static inline int uids_user_create(struct user_struct *up) { return 0; }
 static inline void uids_mutex_lock(void) { }
author	Ingo Molnar <mingo@elte.hu>	2009-06-17 07:06:17 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-06-17 07:06:17 -0400
commit	a3d06cc6aa3e765dc2bf98626f87272dcf641dca (patch)
tree	aa3e49b58f08d6c0ea55cdca4fb5e6c8ba6ae333 /kernel
parent	0990b1c65729012a63e0eeca93aaaafea4e9a064 (diff)
parent	65795efbd380a832ae508b04dba8f8e53f0b84d9 (diff)