author    Benjamin Herrenschmidt <benh@kernel.crashing.org>  2009-06-17 21:16:55 -0400
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>  2009-06-17 21:16:55 -0400
commit    4b337c5f245b6587ba844ac7bb13c313a2912f7b (patch)
tree      999c6a6580b76a083c8efb9dabff709d1c49fcd0 /kernel
parent    492b057c426e4aa747484958e18e9da29003985d (diff)
parent    3fe0344faf7fdcb158bd5c1a9aec960a8d70c8e8 (diff)
Merge commit 'origin/master' into next
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile             |   1
-rw-r--r--  kernel/cpuset.c             | 260
-rw-r--r--  kernel/fork.c               |  14
-rw-r--r--  kernel/groups.c             | 288
-rw-r--r--  kernel/hrtimer.c            |   2
-rw-r--r--  kernel/kfifo.c              |   4
-rw-r--r--  kernel/kthread.c            |   2
-rw-r--r--  kernel/power/process.c      |   5
-rw-r--r--  kernel/profile.c            |   8
-rw-r--r--  kernel/signal.c             |  11
-rw-r--r--  kernel/slow-work.c          |  23
-rw-r--r--  kernel/softirq.c            |  11
-rw-r--r--  kernel/sys.c                | 283
-rw-r--r--  kernel/sysctl.c             |  14
-rw-r--r--  kernel/trace/Kconfig        |  10
-rw-r--r--  kernel/trace/ring_buffer.c  |   3
-rw-r--r--  kernel/trace/trace.c        |  23
-rw-r--r--  kernel/user.c               |  67
18 files changed, 529 insertions(+), 500 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 90b53f6dc226..9df4501cb921 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o
14obj-y += groups.o
14 15
15ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 17# Do not trace debug files and internal ftrace files
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5a7e17474ee..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
97 97
98 struct cpuset *parent; /* my parent */ 98 struct cpuset *parent; /* my parent */
99 99
100 /*
101 * Copy of global cpuset_mems_generation as of the most
102 * recent time this cpuset changed its mems_allowed.
103 */
104 int mems_generation;
105
106 struct fmeter fmeter; /* memory_pressure filter */ 100 struct fmeter fmeter; /* memory_pressure filter */
107 101
108 /* partition number for rebuild_sched_domains() */ 102 /* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
176 return test_bit(CS_SPREAD_SLAB, &cs->flags); 170 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177} 171}
178 172
179/*
180 * Increment this integer everytime any cpuset changes its
181 * mems_allowed value. Users of cpusets can track this generation
182 * number, and avoid having to lock and reload mems_allowed unless
183 * the cpuset they're using changes generation.
184 *
185 * A single, global generation is needed because cpuset_attach_task() could
186 * reattach a task to a different cpuset, which must not have its
187 * generation numbers aliased with those of that tasks previous cpuset.
188 *
189 * Generations are needed for mems_allowed because one task cannot
190 * modify another's memory placement. So we must enable every task,
191 * on every visit to __alloc_pages(), to efficiently check whether
192 * its current->cpuset->mems_allowed has changed, requiring an update
193 * of its current->mems_allowed.
194 *
195 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
196 * there is no need to mark it atomic.
197 */
198static int cpuset_mems_generation;
199
200static struct cpuset top_cpuset = { 173static struct cpuset top_cpuset = {
201 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 174 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202}; 175};
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
228 * If a task is only holding callback_mutex, then it has read-only 201 * If a task is only holding callback_mutex, then it has read-only
229 * access to cpusets. 202 * access to cpusets.
230 * 203 *
231 * The task_struct fields mems_allowed and mems_generation may only 204 * Now, the task_struct fields mems_allowed and mempolicy may be changed
232 * be accessed in the context of that task, so require no locks. 205 * by other task, we use alloc_lock in the task_struct fields to protect
206 * them.
233 * 207 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 208 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 209 * small pieces of code, such as when reading out possibly multi-word
@@ -331,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
331 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 305 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
332} 306}
333 307
334/** 308/*
335 * cpuset_update_task_memory_state - update task memory placement 309 * update task's spread flag if cpuset's page/slab spread flag is set
336 * 310 *
337 * If the current tasks cpusets mems_allowed changed behind our 311 * Called with callback_mutex/cgroup_mutex held
338 * backs, update current->mems_allowed, mems_generation and task NUMA
339 * mempolicy to the new value.
340 *
341 * Task mempolicy is updated by rebinding it relative to the
342 * current->cpuset if a task has its memory placement changed.
343 * Do not call this routine if in_interrupt().
344 *
345 * Call without callback_mutex or task_lock() held. May be
346 * called with or without cgroup_mutex held. Thanks in part to
347 * 'the_top_cpuset_hack', the task's cpuset pointer will never
348 * be NULL. This routine also might acquire callback_mutex during
349 * call.
350 *
351 * Reading current->cpuset->mems_generation doesn't need task_lock
352 * to guard the current->cpuset derefence, because it is guarded
353 * from concurrent freeing of current->cpuset using RCU.
354 *
355 * The rcu_dereference() is technically probably not needed,
356 * as I don't actually mind if I see a new cpuset pointer but
357 * an old value of mems_generation. However this really only
358 * matters on alpha systems using cpusets heavily. If I dropped
359 * that rcu_dereference(), it would save them a memory barrier.
360 * For all other arch's, rcu_dereference is a no-op anyway, and for
361 * alpha systems not using cpusets, another planned optimization,
362 * avoiding the rcu critical section for tasks in the root cpuset
363 * which is statically allocated, so can't vanish, will make this
364 * irrelevant. Better to use RCU as intended, than to engage in
365 * some cute trick to save a memory barrier that is impossible to
366 * test, for alpha systems using cpusets heavily, which might not
367 * even exist.
368 *
369 * This routine is needed to update the per-task mems_allowed data,
370 * within the tasks context, when it is trying to allocate memory
371 * (in various mm/mempolicy.c routines) and notices that some other
372 * task has been modifying its cpuset.
373 */ 312 */
374 313static void cpuset_update_task_spread_flag(struct cpuset *cs,
375void cpuset_update_task_memory_state(void) 314 struct task_struct *tsk)
376{ 315{
377 int my_cpusets_mem_gen; 316 if (is_spread_page(cs))
378 struct task_struct *tsk = current; 317 tsk->flags |= PF_SPREAD_PAGE;
379 struct cpuset *cs; 318 else
380 319 tsk->flags &= ~PF_SPREAD_PAGE;
381 rcu_read_lock(); 320 if (is_spread_slab(cs))
382 my_cpusets_mem_gen = task_cs(tsk)->mems_generation; 321 tsk->flags |= PF_SPREAD_SLAB;
383 rcu_read_unlock(); 322 else
384 323 tsk->flags &= ~PF_SPREAD_SLAB;
385 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
386 mutex_lock(&callback_mutex);
387 task_lock(tsk);
388 cs = task_cs(tsk); /* Maybe changed when task not locked */
389 guarantee_online_mems(cs, &tsk->mems_allowed);
390 tsk->cpuset_mems_generation = cs->mems_generation;
391 if (is_spread_page(cs))
392 tsk->flags |= PF_SPREAD_PAGE;
393 else
394 tsk->flags &= ~PF_SPREAD_PAGE;
395 if (is_spread_slab(cs))
396 tsk->flags |= PF_SPREAD_SLAB;
397 else
398 tsk->flags &= ~PF_SPREAD_SLAB;
399 task_unlock(tsk);
400 mutex_unlock(&callback_mutex);
401 mpol_rebind_task(tsk, &tsk->mems_allowed);
402 }
403} 324}
404 325
405/* 326/*
@@ -1007,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1007 * other task, the task_struct mems_allowed that we are hacking 928 * other task, the task_struct mems_allowed that we are hacking
1008 * is for our current task, which must allocate new pages for that 929 * is for our current task, which must allocate new pages for that
1009 * migrating memory region. 930 * migrating memory region.
1010 *
1011 * We call cpuset_update_task_memory_state() before hacking
1012 * our tasks mems_allowed, so that we are assured of being in
1013 * sync with our tasks cpuset, and in particular, callbacks to
1014 * cpuset_update_task_memory_state() from nested page allocations
1015 * won't see any mismatch of our cpuset and task mems_generation
1016 * values, so won't overwrite our hacked tasks mems_allowed
1017 * nodemask.
1018 */ 931 */
1019 932
1020static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 933static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1022,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1022{ 935{
1023 struct task_struct *tsk = current; 936 struct task_struct *tsk = current;
1024 937
1025 cpuset_update_task_memory_state();
1026
1027 mutex_lock(&callback_mutex);
1028 tsk->mems_allowed = *to; 938 tsk->mems_allowed = *to;
1029 mutex_unlock(&callback_mutex);
1030 939
1031 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 940 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1032 941
1033 mutex_lock(&callback_mutex);
1034 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 942 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
1035 mutex_unlock(&callback_mutex);
1036} 943}
1037 944
1038/* 945/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new 946 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held. 947 * @tsk: the task to change
948 * @newmems: new nodes that the task will be set
949 *
950 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
951 * we structure updates as setting all new allowed nodes, then clearing newly
952 * disallowed ones.
953 *
954 * Called with task's alloc_lock held
955 */
956static void cpuset_change_task_nodemask(struct task_struct *tsk,
957 nodemask_t *newmems)
958{
959 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
960 mpol_rebind_task(tsk, &tsk->mems_allowed);
961 mpol_rebind_task(tsk, newmems);
962 tsk->mems_allowed = *newmems;
963}
964
965/*
966 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
967 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
968 * memory_migrate flag is set. Called with cgroup_mutex held.
1041 */ 969 */
1042static void cpuset_change_nodemask(struct task_struct *p, 970static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan) 971 struct cgroup_scanner *scan)
@@ -1046,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
1046 struct cpuset *cs; 974 struct cpuset *cs;
1047 int migrate; 975 int migrate;
1048 const nodemask_t *oldmem = scan->data; 976 const nodemask_t *oldmem = scan->data;
977 nodemask_t newmems;
978
979 cs = cgroup_cs(scan->cg);
980 guarantee_online_mems(cs, &newmems);
981
982 task_lock(p);
983 cpuset_change_task_nodemask(p, &newmems);
984 task_unlock(p);
1049 985
1050 mm = get_task_mm(p); 986 mm = get_task_mm(p);
1051 if (!mm) 987 if (!mm)
1052 return; 988 return;
1053 989
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs); 990 migrate = is_memory_migrate(cs);
1056 991
1057 mpol_rebind_mm(mm, &cs->mems_allowed); 992 mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1104,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1104/* 1039/*
1105 * Handle user request to change the 'mems' memory placement 1040 * Handle user request to change the 'mems' memory placement
1106 * of a cpuset. Needs to validate the request, update the 1041 * of a cpuset. Needs to validate the request, update the
1107 * cpusets mems_allowed and mems_generation, and for each 1042 * cpusets mems_allowed, and for each task in the cpuset,
1108 * task in the cpuset, rebind any vma mempolicies and if 1043 * update mems_allowed and rebind task's mempolicy and any vma
1109 * the cpuset is marked 'memory_migrate', migrate the tasks 1044 * mempolicies and if the cpuset is marked 'memory_migrate',
1110 * pages to the new memory. 1045 * migrate the tasks pages to the new memory.
1111 * 1046 *
1112 * Call with cgroup_mutex held. May take callback_mutex during call. 1047 * Call with cgroup_mutex held. May take callback_mutex during call.
1113 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1048 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1160,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1160 1095
1161 mutex_lock(&callback_mutex); 1096 mutex_lock(&callback_mutex);
1162 cs->mems_allowed = trialcs->mems_allowed; 1097 cs->mems_allowed = trialcs->mems_allowed;
1163 cs->mems_generation = cpuset_mems_generation++;
1164 mutex_unlock(&callback_mutex); 1098 mutex_unlock(&callback_mutex);
1165 1099
1166 update_tasks_nodemask(cs, &oldmem, &heap); 1100 update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1193,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1193} 1127}
1194 1128
1195/* 1129/*
1130 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1131 * @tsk: task to be updated
1132 * @scan: struct cgroup_scanner containing the cgroup of the task
1133 *
1134 * Called by cgroup_scan_tasks() for each task in a cgroup.
1135 *
1136 * We don't need to re-check for the cgroup/cpuset membership, since we're
1137 * holding cgroup_lock() at this point.
1138 */
1139static void cpuset_change_flag(struct task_struct *tsk,
1140 struct cgroup_scanner *scan)
1141{
1142 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1143}
1144
1145/*
1146 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1147 * @cs: the cpuset in which each task's spread flags needs to be changed
1148 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1149 *
1150 * Called with cgroup_mutex held
1151 *
1152 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1153 * calling callback functions for each.
1154 *
1155 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1156 * if @heap != NULL.
1157 */
1158static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1159{
1160 struct cgroup_scanner scan;
1161
1162 scan.cg = cs->css.cgroup;
1163 scan.test_task = NULL;
1164 scan.process_task = cpuset_change_flag;
1165 scan.heap = heap;
1166 cgroup_scan_tasks(&scan);
1167}
1168
1169/*
1196 * update_flag - read a 0 or a 1 in a file and update associated flag 1170 * update_flag - read a 0 or a 1 in a file and update associated flag
1197 * bit: the bit to update (see cpuset_flagbits_t) 1171 * bit: the bit to update (see cpuset_flagbits_t)
1198 * cs: the cpuset to update 1172 * cs: the cpuset to update
@@ -1205,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1205 int turning_on) 1179 int turning_on)
1206{ 1180{
1207 struct cpuset *trialcs; 1181 struct cpuset *trialcs;
1208 int err;
1209 int balance_flag_changed; 1182 int balance_flag_changed;
1183 int spread_flag_changed;
1184 struct ptr_heap heap;
1185 int err;
1210 1186
1211 trialcs = alloc_trial_cpuset(cs); 1187 trialcs = alloc_trial_cpuset(cs);
1212 if (!trialcs) 1188 if (!trialcs)
@@ -1221,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1221 if (err < 0) 1197 if (err < 0)
1222 goto out; 1198 goto out;
1223 1199
1200 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1201 if (err < 0)
1202 goto out;
1203
1224 balance_flag_changed = (is_sched_load_balance(cs) != 1204 balance_flag_changed = (is_sched_load_balance(cs) !=
1225 is_sched_load_balance(trialcs)); 1205 is_sched_load_balance(trialcs));
1226 1206
1207 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1208 || (is_spread_page(cs) != is_spread_page(trialcs)));
1209
1227 mutex_lock(&callback_mutex); 1210 mutex_lock(&callback_mutex);
1228 cs->flags = trialcs->flags; 1211 cs->flags = trialcs->flags;
1229 mutex_unlock(&callback_mutex); 1212 mutex_unlock(&callback_mutex);
@@ -1231,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1231 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1214 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1232 async_rebuild_sched_domains(); 1215 async_rebuild_sched_domains();
1233 1216
1217 if (spread_flag_changed)
1218 update_tasks_flags(cs, &heap);
1219 heap_free(&heap);
1234out: 1220out:
1235 free_trial_cpuset(trialcs); 1221 free_trial_cpuset(trialcs);
1236 return err; 1222 return err;
@@ -1372,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1372 1358
1373 if (cs == &top_cpuset) { 1359 if (cs == &top_cpuset) {
1374 cpumask_copy(cpus_attach, cpu_possible_mask); 1360 cpumask_copy(cpus_attach, cpu_possible_mask);
1361 to = node_possible_map;
1375 } else { 1362 } else {
1376 mutex_lock(&callback_mutex);
1377 guarantee_online_cpus(cs, cpus_attach); 1363 guarantee_online_cpus(cs, cpus_attach);
1378 mutex_unlock(&callback_mutex); 1364 guarantee_online_mems(cs, &to);
1379 } 1365 }
1380 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1381 if (err) 1367 if (err)
1382 return; 1368 return;
1383 1369
1370 task_lock(tsk);
1371 cpuset_change_task_nodemask(tsk, &to);
1372 task_unlock(tsk);
1373 cpuset_update_task_spread_flag(cs, tsk);
1374
1384 from = oldcs->mems_allowed; 1375 from = oldcs->mems_allowed;
1385 to = cs->mems_allowed; 1376 to = cs->mems_allowed;
1386 mm = get_task_mm(tsk); 1377 mm = get_task_mm(tsk);
@@ -1442,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1442 break; 1433 break;
1443 case FILE_SPREAD_PAGE: 1434 case FILE_SPREAD_PAGE:
1444 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1435 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1445 cs->mems_generation = cpuset_mems_generation++;
1446 break; 1436 break;
1447 case FILE_SPREAD_SLAB: 1437 case FILE_SPREAD_SLAB:
1448 retval = update_flag(CS_SPREAD_SLAB, cs, val); 1438 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1449 cs->mems_generation = cpuset_mems_generation++;
1450 break; 1439 break;
1451 default: 1440 default:
1452 retval = -EINVAL; 1441 retval = -EINVAL;
@@ -1786,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
1786 struct cpuset *parent; 1775 struct cpuset *parent;
1787 1776
1788 if (!cont->parent) { 1777 if (!cont->parent) {
1789 /* This is early initialization for the top cgroup */
1790 top_cpuset.mems_generation = cpuset_mems_generation++;
1791 return &top_cpuset.css; 1778 return &top_cpuset.css;
1792 } 1779 }
1793 parent = cgroup_cs(cont->parent); 1780 parent = cgroup_cs(cont->parent);
@@ -1799,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
1799 return ERR_PTR(-ENOMEM); 1786 return ERR_PTR(-ENOMEM);
1800 } 1787 }
1801 1788
1802 cpuset_update_task_memory_state();
1803 cs->flags = 0; 1789 cs->flags = 0;
1804 if (is_spread_page(parent)) 1790 if (is_spread_page(parent))
1805 set_bit(CS_SPREAD_PAGE, &cs->flags); 1791 set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1808,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
1808 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1794 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1809 cpumask_clear(cs->cpus_allowed); 1795 cpumask_clear(cs->cpus_allowed);
1810 nodes_clear(cs->mems_allowed); 1796 nodes_clear(cs->mems_allowed);
1811 cs->mems_generation = cpuset_mems_generation++;
1812 fmeter_init(&cs->fmeter); 1797 fmeter_init(&cs->fmeter);
1813 cs->relax_domain_level = -1; 1798 cs->relax_domain_level = -1;
1814 1799
@@ -1827,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1827{ 1812{
1828 struct cpuset *cs = cgroup_cs(cont); 1813 struct cpuset *cs = cgroup_cs(cont);
1829 1814
1830 cpuset_update_task_memory_state();
1831
1832 if (is_sched_load_balance(cs)) 1815 if (is_sched_load_balance(cs))
1833 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1816 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1834 1817
@@ -1849,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
1849 .early_init = 1, 1832 .early_init = 1,
1850}; 1833};
1851 1834
1852/*
1853 * cpuset_init_early - just enough so that the calls to
1854 * cpuset_update_task_memory_state() in early init code
1855 * are harmless.
1856 */
1857
1858int __init cpuset_init_early(void)
1859{
1860 alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0;
1864}
1865
1866
1867/** 1835/**
1868 * cpuset_init - initialize cpusets at system boot 1836 * cpuset_init - initialize cpusets at system boot
1869 * 1837 *
@@ -1874,11 +1842,13 @@ int __init cpuset_init(void)
1874{ 1842{
1875 int err = 0; 1843 int err = 0;
1876 1844
1845 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1846 BUG();
1847
1877 cpumask_setall(top_cpuset.cpus_allowed); 1848 cpumask_setall(top_cpuset.cpus_allowed);
1878 nodes_setall(top_cpuset.mems_allowed); 1849 nodes_setall(top_cpuset.mems_allowed);
1879 1850
1880 fmeter_init(&top_cpuset.fmeter); 1851 fmeter_init(&top_cpuset.fmeter);
1881 top_cpuset.mems_generation = cpuset_mems_generation++;
1882 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1852 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1883 top_cpuset.relax_domain_level = -1; 1853 top_cpuset.relax_domain_level = -1;
1884 1854
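
The cpuset.c hunks above replace the mems_generation scheme with direct, task-locked nodemask updates. Below is a minimal stand-alone C sketch, not part of the patch, of the ordering used by cpuset_change_task_nodemask(): widen the allowed mask to the union first, then assign the new mask, so a reader of the mask never sees an empty set even when the old and new masks are disjoint. The bitmask and function names here are illustrative only.

#include <stdio.h>

static unsigned long mems_allowed;      /* stand-in for tsk->mems_allowed */

static void change_nodemask(unsigned long newmems)
{
        mems_allowed |= newmems;        /* step 1: also allow every new node */
        /* ... mempolicies are rebound against the widened mask here ... */
        mems_allowed = newmems;         /* step 2: drop newly disallowed nodes */
}

int main(void)
{
        mems_allowed = 0x3;             /* nodes 0-1 */
        change_nodemask(0xc);           /* disjoint target: nodes 2-3 */
        printf("mems_allowed = %#lx\n", mems_allowed);
        return 0;
}

At no point between the two steps is the mask empty, which is exactly the property the new in-tree comment calls out.
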
diff --git a/kernel/fork.c b/kernel/fork.c
index 4430eb1376f2..be022c200da6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -178,7 +178,7 @@ void __init fork_init(unsigned long mempages)
178 /* create a slab on which task_structs can be allocated */ 178 /* create a slab on which task_structs can be allocated */
179 task_struct_cachep = 179 task_struct_cachep =
180 kmem_cache_create("task_struct", sizeof(struct task_struct), 180 kmem_cache_create("task_struct", sizeof(struct task_struct),
181 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 181 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
182#endif 182#endif
183 183
184 /* do the arch specific task caches init */ 184 /* do the arch specific task caches init */
@@ -1470,20 +1470,20 @@ void __init proc_caches_init(void)
1470{ 1470{
1471 sighand_cachep = kmem_cache_create("sighand_cache", 1471 sighand_cachep = kmem_cache_create("sighand_cache",
1472 sizeof(struct sighand_struct), 0, 1472 sizeof(struct sighand_struct), 0,
1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
1474 sighand_ctor); 1474 SLAB_NOTRACK, sighand_ctor);
1475 signal_cachep = kmem_cache_create("signal_cache", 1475 signal_cachep = kmem_cache_create("signal_cache",
1476 sizeof(struct signal_struct), 0, 1476 sizeof(struct signal_struct), 0,
1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1478 files_cachep = kmem_cache_create("files_cache", 1478 files_cachep = kmem_cache_create("files_cache",
1479 sizeof(struct files_struct), 0, 1479 sizeof(struct files_struct), 0,
1480 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1480 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1481 fs_cachep = kmem_cache_create("fs_cache", 1481 fs_cachep = kmem_cache_create("fs_cache",
1482 sizeof(struct fs_struct), 0, 1482 sizeof(struct fs_struct), 0,
1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1484 mm_cachep = kmem_cache_create("mm_struct", 1484 mm_cachep = kmem_cache_create("mm_struct",
1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1487 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1487 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1488 mmap_init(); 1488 mmap_init();
1489} 1489}
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 000000000000..2b45b2ee3964
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
1/*
2 * Supplementary group IDs
3 */
4#include <linux/cred.h>
5#include <linux/module.h>
6#include <linux/slab.h>
7#include <linux/security.h>
8#include <linux/syscalls.h>
9#include <asm/uaccess.h>
10
11/* init to 2 - one for init_task, one to ensure it is never freed */
12struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
13
14struct group_info *groups_alloc(int gidsetsize)
15{
16 struct group_info *group_info;
17 int nblocks;
18 int i;
19
20 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
21 /* Make sure we always allocate at least one indirect block pointer */
22 nblocks = nblocks ? : 1;
23 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
24 if (!group_info)
25 return NULL;
26 group_info->ngroups = gidsetsize;
27 group_info->nblocks = nblocks;
28 atomic_set(&group_info->usage, 1);
29
30 if (gidsetsize <= NGROUPS_SMALL)
31 group_info->blocks[0] = group_info->small_block;
32 else {
33 for (i = 0; i < nblocks; i++) {
34 gid_t *b;
35 b = (void *)__get_free_page(GFP_USER);
36 if (!b)
37 goto out_undo_partial_alloc;
38 group_info->blocks[i] = b;
39 }
40 }
41 return group_info;
42
43out_undo_partial_alloc:
44 while (--i >= 0) {
45 free_page((unsigned long)group_info->blocks[i]);
46 }
47 kfree(group_info);
48 return NULL;
49}
50
51EXPORT_SYMBOL(groups_alloc);
52
53void groups_free(struct group_info *group_info)
54{
55 if (group_info->blocks[0] != group_info->small_block) {
56 int i;
57 for (i = 0; i < group_info->nblocks; i++)
58 free_page((unsigned long)group_info->blocks[i]);
59 }
60 kfree(group_info);
61}
62
63EXPORT_SYMBOL(groups_free);
64
65/* export the group_info to a user-space array */
66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info)
68{
69 int i;
70 unsigned int count = group_info->ngroups;
71
72 for (i = 0; i < group_info->nblocks; i++) {
73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
74 unsigned int len = cp_count * sizeof(*grouplist);
75
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
81 }
82 return 0;
83}
84
85/* fill a group_info from a user-space array - it must be allocated already */
86static int groups_from_user(struct group_info *group_info,
87 gid_t __user *grouplist)
88{
89 int i;
90 unsigned int count = group_info->ngroups;
91
92 for (i = 0; i < group_info->nblocks; i++) {
93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
94 unsigned int len = cp_count * sizeof(*grouplist);
95
96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 return -EFAULT;
98
99 grouplist += NGROUPS_PER_BLOCK;
100 count -= cp_count;
101 }
102 return 0;
103}
104
105/* a simple Shell sort */
106static void groups_sort(struct group_info *group_info)
107{
108 int base, max, stride;
109 int gidsetsize = group_info->ngroups;
110
111 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
112 ; /* nothing */
113 stride /= 3;
114
115 while (stride) {
116 max = gidsetsize - stride;
117 for (base = 0; base < max; base++) {
118 int left = base;
119 int right = left + stride;
120 gid_t tmp = GROUP_AT(group_info, right);
121
122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left);
125 right = left;
126 left -= stride;
127 }
128 GROUP_AT(group_info, right) = tmp;
129 }
130 stride /= 3;
131 }
132}
133
134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, gid_t grp)
136{
137 unsigned int left, right;
138
139 if (!group_info)
140 return 0;
141
142 left = 0;
143 right = group_info->ngroups;
144 while (left < right) {
145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid);
147 if (cmp > 0)
148 left = mid + 1;
149 else if (cmp < 0)
150 right = mid;
151 else
152 return 1;
153 }
154 return 0;
155}
156
157/**
158 * set_groups - Change a group subscription in a set of credentials
159 * @new: The newly prepared set of credentials to alter
160 * @group_info: The group list to install
161 *
162 * Validate a group subscription and, if valid, insert it into a set
163 * of credentials.
164 */
165int set_groups(struct cred *new, struct group_info *group_info)
166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info);
174 groups_sort(group_info);
175 get_group_info(group_info);
176 new->group_info = group_info;
177 return 0;
178}
179
180EXPORT_SYMBOL(set_groups);
181
182/**
183 * set_current_groups - Change current's group subscription
184 * @group_info: The group list to impose
185 *
186 * Validate a group subscription and, if valid, impose it upon current's task
187 * security record.
188 */
189int set_current_groups(struct group_info *group_info)
190{
191 struct cred *new;
192 int ret;
193
194 new = prepare_creds();
195 if (!new)
196 return -ENOMEM;
197
198 ret = set_groups(new, group_info);
199 if (ret < 0) {
200 abort_creds(new);
201 return ret;
202 }
203
204 return commit_creds(new);
205}
206
207EXPORT_SYMBOL(set_current_groups);
208
209SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
210{
211 const struct cred *cred = current_cred();
212 int i;
213
214 if (gidsetsize < 0)
215 return -EINVAL;
216
217 /* no need to grab task_lock here; it cannot change */
218 i = cred->group_info->ngroups;
219 if (gidsetsize) {
220 if (i > gidsetsize) {
221 i = -EINVAL;
222 goto out;
223 }
224 if (groups_to_user(grouplist, cred->group_info)) {
225 i = -EFAULT;
226 goto out;
227 }
228 }
229out:
230 return i;
231}
232
233/*
234 * SMP: Our groups are copy-on-write. We can set them safely
235 * without another task interfering.
236 */
237
238SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
239{
240 struct group_info *group_info;
241 int retval;
242
243 if (!capable(CAP_SETGID))
244 return -EPERM;
245 if ((unsigned)gidsetsize > NGROUPS_MAX)
246 return -EINVAL;
247
248 group_info = groups_alloc(gidsetsize);
249 if (!group_info)
250 return -ENOMEM;
251 retval = groups_from_user(group_info, grouplist);
252 if (retval) {
253 put_group_info(group_info);
254 return retval;
255 }
256
257 retval = set_current_groups(group_info);
258 put_group_info(group_info);
259
260 return retval;
261}
262
263/*
264 * Check whether we're fsgid/egid or in the supplemental group..
265 */
266int in_group_p(gid_t grp)
267{
268 const struct cred *cred = current_cred();
269 int retval = 1;
270
271 if (grp != cred->fsgid)
272 retval = groups_search(cred->group_info, grp);
273 return retval;
274}
275
276EXPORT_SYMBOL(in_group_p);
277
278int in_egroup_p(gid_t grp)
279{
280 const struct cred *cred = current_cred();
281 int retval = 1;
282
283 if (grp != cred->egid)
284 retval = groups_search(cred->group_info, grp);
285 return retval;
286}
287
288EXPORT_SYMBOL(in_egroup_p);
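
kernel/groups.c above is the supplementary-group code moved out of kernel/sys.c; it still implements getgroups(2) and setgroups(2). A short user-space sketch, not part of the patch, that exercises this interface (error handling trimmed):

#include <grp.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        int n = getgroups(0, NULL);     /* ask sys_getgroups() for the count */
        if (n < 0) {
                perror("getgroups");
                return 1;
        }

        gid_t *groups = calloc(n ? n : 1, sizeof(gid_t));
        if (!groups)
                return 1;

        n = getgroups(n, groups);       /* copied out via groups_to_user() */
        for (int i = 0; i < n; i++)
                printf("supplementary gid: %d\n", (int)groups[i]);
        free(groups);

        /* The write side, sys_setgroups(), requires CAP_SETGID; a daemon
         * dropping privileges would call e.g. setgroups(1, &target_gid)
         * before switching its UID. */
        return 0;
}
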
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b675a67c9ac3..9002958a96e7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -380,6 +380,8 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
380 return res; 380 return res;
381} 381}
382 382
383EXPORT_SYMBOL_GPL(ktime_add_safe);
384
383#ifdef CONFIG_DEBUG_OBJECTS_TIMERS 385#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
384 386
385static struct debug_obj_descr hrtimer_debug_descr; 387static struct debug_obj_descr hrtimer_debug_descr;
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f8..26539e3228e5 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
72 72
73 /* 73 /*
74 * round up to the next power of 2, since our 'let the indices 74 * round up to the next power of 2, since our 'let the indices
75 * wrap' tachnique works only in this case. 75 * wrap' technique works only in this case.
76 */ 76 */
77 if (size & (size - 1)) { 77 if (!is_power_of_2(size)) {
78 BUG_ON(size > 0x80000000); 78 BUG_ON(size > 0x80000000);
79 size = roundup_pow_of_two(size); 79 size = roundup_pow_of_two(size);
80 } 80 }
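
The kfifo.c hunk only swaps an open-coded test for is_power_of_2(), but the comment it fixes names the underlying constraint: the "let the indices wrap" technique masks a free-running index with size - 1, which matches index % size only when size is a power of two. A tiny stand-alone demonstration, not part of the patch:

#include <stdio.h>

int main(void)
{
        unsigned int in = 4100;                 /* a free-running fifo index */
        unsigned int pow2 = 4096;               /* (pow2 & (pow2 - 1)) == 0  */
        unsigned int not_pow2 = 4000;

        printf("%u %u\n", in & (pow2 - 1), in % pow2);          /* 4 4   */
        printf("%u %u\n", in & (not_pow2 - 1), in % not_pow2);  /* 4 100 */
        return 0;
}
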
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 41c88fe40500..7fa441333529 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,6 +9,7 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/completion.h> 10#include <linux/completion.h>
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/cpuset.h>
12#include <linux/unistd.h> 13#include <linux/unistd.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/module.h> 15#include <linux/module.h>
@@ -236,6 +237,7 @@ int kthreadd(void *unused)
236 ignore_signals(tsk); 237 ignore_signals(tsk);
237 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 238 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
238 set_cpus_allowed_ptr(tsk, cpu_all_mask); 239 set_cpus_allowed_ptr(tsk, cpu_all_mask);
240 set_mems_allowed(node_possible_map);
239 241
240 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 242 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
241 243
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497a..da2072d73811 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
117 if (error) 117 if (error)
118 goto Exit; 118 goto Exit;
119 printk("done."); 119 printk("done.");
120
121 oom_killer_disable();
120 Exit: 122 Exit:
121 BUG_ON(in_atomic()); 123 BUG_ON(in_atomic());
122 printk("\n"); 124 printk("\n");
125
123 return error; 126 return error;
124} 127}
125 128
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
145 148
146void thaw_processes(void) 149void thaw_processes(void)
147{ 150{
151 oom_killer_enable();
152
148 printk("Restarting tasks ... "); 153 printk("Restarting tasks ... ");
149 thaw_tasks(true); 154 thaw_tasks(true);
150 thaw_tasks(false); 155 thaw_tasks(false);
diff --git a/kernel/profile.c b/kernel/profile.c
index 28cf26ad2d24..69911b5745eb 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -365,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
365 node = cpu_to_node(cpu); 365 node = cpu_to_node(cpu);
366 per_cpu(cpu_profile_flip, cpu) = 0; 366 per_cpu(cpu_profile_flip, cpu) = 0;
367 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 367 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
368 page = alloc_pages_node(node, 368 page = alloc_pages_exact_node(node,
369 GFP_KERNEL | __GFP_ZERO, 369 GFP_KERNEL | __GFP_ZERO,
370 0); 370 0);
371 if (!page) 371 if (!page)
@@ -373,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
373 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 373 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
374 } 374 }
375 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 375 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
376 page = alloc_pages_node(node, 376 page = alloc_pages_exact_node(node,
377 GFP_KERNEL | __GFP_ZERO, 377 GFP_KERNEL | __GFP_ZERO,
378 0); 378 0);
379 if (!page) 379 if (!page)
@@ -564,14 +564,14 @@ static int create_hash_tables(void)
564 int node = cpu_to_node(cpu); 564 int node = cpu_to_node(cpu);
565 struct page *page; 565 struct page *page;
566 566
567 page = alloc_pages_node(node, 567 page = alloc_pages_exact_node(node,
568 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 568 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
569 0); 569 0);
570 if (!page) 570 if (!page)
571 goto out_cleanup; 571 goto out_cleanup;
572 per_cpu(cpu_profile_hits, cpu)[1] 572 per_cpu(cpu_profile_hits, cpu)[1]
573 = (struct profile_hit *)page_address(page); 573 = (struct profile_hit *)page_address(page);
574 page = alloc_pages_node(node, 574 page = alloc_pages_exact_node(node,
575 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 575 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
576 0); 576 0);
577 if (!page) 577 if (!page)
diff --git a/kernel/signal.c b/kernel/signal.c
index 809a228019ad..d81f4952eebb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -832,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
832{ 832{
833 struct sigpending *pending; 833 struct sigpending *pending;
834 struct sigqueue *q; 834 struct sigqueue *q;
835 int override_rlimit;
835 836
836 trace_sched_signal_send(sig, t); 837 trace_sched_signal_send(sig, t);
837 838
@@ -863,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
863 make sure at least one signal gets delivered and don't 864 make sure at least one signal gets delivered and don't
864 pass on the info struct. */ 865 pass on the info struct. */
865 866
866 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 867 if (sig < SIGRTMIN)
867 (is_si_special(info) || 868 override_rlimit = (is_si_special(info) || info->si_code >= 0);
868 info->si_code >= 0))); 869 else
870 override_rlimit = 0;
871
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit);
869 if (q) { 874 if (q) {
870 list_add_tail(&q->list, &pending->list); 875 list_add_tail(&q->list, &pending->list);
871 switch ((unsigned long) info) { 876 switch ((unsigned long) info) {
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 521ed2004d63..09d7519557d3 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -319,6 +319,15 @@ cant_get_ref:
319EXPORT_SYMBOL(slow_work_enqueue); 319EXPORT_SYMBOL(slow_work_enqueue);
320 320
321/* 321/*
322 * Schedule a cull of the thread pool at some time in the near future
323 */
324static void slow_work_schedule_cull(void)
325{
326 mod_timer(&slow_work_cull_timer,
327 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
328}
329
330/*
322 * Worker thread culling algorithm 331 * Worker thread culling algorithm
323 */ 332 */
324static bool slow_work_cull_thread(void) 333static bool slow_work_cull_thread(void)
@@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void)
335 list_empty(&vslow_work_queue) && 344 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) > 345 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) { 346 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer, 347 slow_work_schedule_cull();
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true; 348 do_cull = true;
341 } 349 }
342 } 350 }
@@ -393,8 +401,7 @@ static int slow_work_thread(void *_data)
393 list_empty(&vslow_work_queue) && 401 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) > 402 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads) 403 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer, 404 slow_work_schedule_cull();
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue; 405 continue;
399 } 406 }
400 407
@@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
458 if (atomic_dec_and_test(&slow_work_thread_count)) 465 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */ 466 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer, 467 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT); 468 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
462 } else { 469 } else {
463 /* ratelimit the starting of new threads */ 470 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1); 471 mod_timer(&slow_work_oom_timer, jiffies + 1);
@@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
502 if (n < 0 && !slow_work_may_not_start_new_thread) 509 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread); 510 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0) 511 else if (n > 0)
505 mod_timer(&slow_work_cull_timer, 512 slow_work_schedule_cull();
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 } 513 }
508 mutex_unlock(&slow_work_user_lock); 514 mutex_unlock(&slow_work_user_lock);
509 } 515 }
@@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
529 atomic_read(&slow_work_thread_count); 535 atomic_read(&slow_work_thread_count);
530 536
531 if (n < 0) 537 if (n < 0)
532 mod_timer(&slow_work_cull_timer, 538 slow_work_schedule_cull();
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 } 539 }
535 mutex_unlock(&slow_work_user_lock); 540 mutex_unlock(&slow_work_user_lock);
536 } 541 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 258885a543db..b41fb710e114 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -382,6 +382,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
382 382
383EXPORT_SYMBOL(__tasklet_hi_schedule); 383EXPORT_SYMBOL(__tasklet_hi_schedule);
384 384
385void __tasklet_hi_schedule_first(struct tasklet_struct *t)
386{
387 BUG_ON(!irqs_disabled());
388
389 t->next = __get_cpu_var(tasklet_hi_vec).head;
390 __get_cpu_var(tasklet_hi_vec).head = t;
391 __raise_softirq_irqoff(HI_SOFTIRQ);
392}
393
394EXPORT_SYMBOL(__tasklet_hi_schedule_first);
395
385static void tasklet_action(struct softirq_action *a) 396static void tasklet_action(struct softirq_action *a)
386{ 397{
387 struct tasklet_struct *list; 398 struct tasklet_struct *list;
diff --git a/kernel/sys.c b/kernel/sys.c
index 438d99a38c87..b3f1097c76fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1113,289 +1113,6 @@ out:
1113 return err; 1113 return err;
1114} 1114}
1115 1115
1116/*
1117 * Supplementary group IDs
1118 */
1119
1120/* init to 2 - one for init_task, one to ensure it is never freed */
1121struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
1122
1123struct group_info *groups_alloc(int gidsetsize)
1124{
1125 struct group_info *group_info;
1126 int nblocks;
1127 int i;
1128
1129 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
1130 /* Make sure we always allocate at least one indirect block pointer */
1131 nblocks = nblocks ? : 1;
1132 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
1133 if (!group_info)
1134 return NULL;
1135 group_info->ngroups = gidsetsize;
1136 group_info->nblocks = nblocks;
1137 atomic_set(&group_info->usage, 1);
1138
1139 if (gidsetsize <= NGROUPS_SMALL)
1140 group_info->blocks[0] = group_info->small_block;
1141 else {
1142 for (i = 0; i < nblocks; i++) {
1143 gid_t *b;
1144 b = (void *)__get_free_page(GFP_USER);
1145 if (!b)
1146 goto out_undo_partial_alloc;
1147 group_info->blocks[i] = b;
1148 }
1149 }
1150 return group_info;
1151
1152out_undo_partial_alloc:
1153 while (--i >= 0) {
1154 free_page((unsigned long)group_info->blocks[i]);
1155 }
1156 kfree(group_info);
1157 return NULL;
1158}
1159
1160EXPORT_SYMBOL(groups_alloc);
1161
1162void groups_free(struct group_info *group_info)
1163{
1164 if (group_info->blocks[0] != group_info->small_block) {
1165 int i;
1166 for (i = 0; i < group_info->nblocks; i++)
1167 free_page((unsigned long)group_info->blocks[i]);
1168 }
1169 kfree(group_info);
1170}
1171
1172EXPORT_SYMBOL(groups_free);
1173
1174/* export the group_info to a user-space array */
1175static int groups_to_user(gid_t __user *grouplist,
1176 const struct group_info *group_info)
1177{
1178 int i;
1179 unsigned int count = group_info->ngroups;
1180
1181 for (i = 0; i < group_info->nblocks; i++) {
1182 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1183 unsigned int len = cp_count * sizeof(*grouplist);
1184
1185 if (copy_to_user(grouplist, group_info->blocks[i], len))
1186 return -EFAULT;
1187
1188 grouplist += NGROUPS_PER_BLOCK;
1189 count -= cp_count;
1190 }
1191 return 0;
1192}
1193
1194/* fill a group_info from a user-space array - it must be allocated already */
1195static int groups_from_user(struct group_info *group_info,
1196 gid_t __user *grouplist)
1197{
1198 int i;
1199 unsigned int count = group_info->ngroups;
1200
1201 for (i = 0; i < group_info->nblocks; i++) {
1202 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1203 unsigned int len = cp_count * sizeof(*grouplist);
1204
1205 if (copy_from_user(group_info->blocks[i], grouplist, len))
1206 return -EFAULT;
1207
1208 grouplist += NGROUPS_PER_BLOCK;
1209 count -= cp_count;
1210 }
1211 return 0;
1212}
1213
1214/* a simple Shell sort */
1215static void groups_sort(struct group_info *group_info)
1216{
1217 int base, max, stride;
1218 int gidsetsize = group_info->ngroups;
1219
1220 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
1221 ; /* nothing */
1222 stride /= 3;
1223
1224 while (stride) {
1225 max = gidsetsize - stride;
1226 for (base = 0; base < max; base++) {
1227 int left = base;
1228 int right = left + stride;
1229 gid_t tmp = GROUP_AT(group_info, right);
1230
1231 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
1232 GROUP_AT(group_info, right) =
1233 GROUP_AT(group_info, left);
1234 right = left;
1235 left -= stride;
1236 }
1237 GROUP_AT(group_info, right) = tmp;
1238 }
1239 stride /= 3;
1240 }
1241}
1242
1243/* a simple bsearch */
1244int groups_search(const struct group_info *group_info, gid_t grp)
1245{
1246 unsigned int left, right;
1247
1248 if (!group_info)
1249 return 0;
1250
1251 left = 0;
1252 right = group_info->ngroups;
1253 while (left < right) {
1254 unsigned int mid = (left+right)/2;
1255 int cmp = grp - GROUP_AT(group_info, mid);
1256 if (cmp > 0)
1257 left = mid + 1;
1258 else if (cmp < 0)
1259 right = mid;
1260 else
1261 return 1;
1262 }
1263 return 0;
1264}
1265
1266/**
1267 * set_groups - Change a group subscription in a set of credentials
1268 * @new: The newly prepared set of credentials to alter
1269 * @group_info: The group list to install
1270 *
1271 * Validate a group subscription and, if valid, insert it into a set
1272 * of credentials.
1273 */
1274int set_groups(struct cred *new, struct group_info *group_info)
1275{
1276 int retval;
1277
1278 retval = security_task_setgroups(group_info);
1279 if (retval)
1280 return retval;
1281
1282 put_group_info(new->group_info);
1283 groups_sort(group_info);
1284 get_group_info(group_info);
1285 new->group_info = group_info;
1286 return 0;
1287}
1288
1289EXPORT_SYMBOL(set_groups);
1290
1291/**
1292 * set_current_groups - Change current's group subscription
1293 * @group_info: The group list to impose
1294 *
1295 * Validate a group subscription and, if valid, impose it upon current's task
1296 * security record.
1297 */
1298int set_current_groups(struct group_info *group_info)
1299{
1300 struct cred *new;
1301 int ret;
1302
1303 new = prepare_creds();
1304 if (!new)
1305 return -ENOMEM;
1306
1307 ret = set_groups(new, group_info);
1308 if (ret < 0) {
1309 abort_creds(new);
1310 return ret;
1311 }
1312
1313 return commit_creds(new);
1314}
1315
1316EXPORT_SYMBOL(set_current_groups);
1317
1318SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
1319{
1320 const struct cred *cred = current_cred();
1321 int i;
1322
1323 if (gidsetsize < 0)
1324 return -EINVAL;
1325
1326 /* no need to grab task_lock here; it cannot change */
1327 i = cred->group_info->ngroups;
1328 if (gidsetsize) {
1329 if (i > gidsetsize) {
1330 i = -EINVAL;
1331 goto out;
1332 }
1333 if (groups_to_user(grouplist, cred->group_info)) {
1334 i = -EFAULT;
1335 goto out;
1336 }
1337 }
1338out:
1339 return i;
1340}
1341
1342/*
1343 * SMP: Our groups are copy-on-write. We can set them safely
1344 * without another task interfering.
1345 */
1346
1347SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
1348{
1349 struct group_info *group_info;
1350 int retval;
1351
1352 if (!capable(CAP_SETGID))
1353 return -EPERM;
1354 if ((unsigned)gidsetsize > NGROUPS_MAX)
1355 return -EINVAL;
1356
1357 group_info = groups_alloc(gidsetsize);
1358 if (!group_info)
1359 return -ENOMEM;
1360 retval = groups_from_user(group_info, grouplist);
1361 if (retval) {
1362 put_group_info(group_info);
1363 return retval;
1364 }
1365
1366 retval = set_current_groups(group_info);
1367 put_group_info(group_info);
1368
1369 return retval;
1370}
1371
1372/*
1373 * Check whether we're fsgid/egid or in the supplemental group..
1374 */
1375int in_group_p(gid_t grp)
1376{
1377 const struct cred *cred = current_cred();
1378 int retval = 1;
1379
1380 if (grp != cred->fsgid)
1381 retval = groups_search(cred->group_info, grp);
1382 return retval;
1383}
1384
1385EXPORT_SYMBOL(in_group_p);
1386
1387int in_egroup_p(gid_t grp)
1388{
1389 const struct cred *cred = current_cred();
1390 int retval = 1;
1391
1392 if (grp != cred->egid)
1393 retval = groups_search(cred->group_info, grp);
1394 return retval;
1395}
1396
1397EXPORT_SYMBOL(in_egroup_p);
1398
1399DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1400 1117
1401SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1118SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0e51a35a4486..ab462b9968d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/fs.h> 32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
@@ -967,6 +968,17 @@ static struct ctl_table kern_table[] = {
967 .proc_handler = &proc_dointvec, 968 .proc_handler = &proc_dointvec,
968 }, 969 },
969#endif 970#endif
971#ifdef CONFIG_KMEMCHECK
972 {
973 .ctl_name = CTL_UNNUMBERED,
974 .procname = "kmemcheck",
975 .data = &kmemcheck_enabled,
976 .maxlen = sizeof(int),
977 .mode = 0644,
978 .proc_handler = &proc_dointvec,
979 },
980#endif
981
970/* 982/*
971 * NOTE: do not add new entries to this table unless you have read 983 * NOTE: do not add new entries to this table unless you have read
972 * Documentation/sysctl/ctl_unnumbered.txt 984 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1325,7 +1337,6 @@ static struct ctl_table vm_table[] = {
1325 .extra2 = &one, 1337 .extra2 = &one,
1326 }, 1338 },
1327#endif 1339#endif
1328#ifdef CONFIG_UNEVICTABLE_LRU
1329 { 1340 {
1330 .ctl_name = CTL_UNNUMBERED, 1341 .ctl_name = CTL_UNNUMBERED,
1331 .procname = "scan_unevictable_pages", 1342 .procname = "scan_unevictable_pages",
@@ -1334,7 +1345,6 @@ static struct ctl_table vm_table[] = {
1334 .mode = 0644, 1345 .mode = 0644,
1335 .proc_handler = &scan_unevictable_handler, 1346 .proc_handler = &scan_unevictable_handler,
1336 }, 1347 },
1337#endif
1338/* 1348/*
1339 * NOTE: do not add new entries to this table unless you have read 1349 * NOTE: do not add new entries to this table unless you have read
1340 * Documentation/sysctl/ctl_unnumbered.txt 1350 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4a13e5a01ce3..61071fecc82e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -147,7 +147,7 @@ config IRQSOFF_TRACER
147 disabled by default and can be runtime (re-)started 147 disabled by default and can be runtime (re-)started
148 via: 148 via:
149 149
150 echo 0 > /debugfs/tracing/tracing_max_latency 150 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
151 151
152 (Note that kernel size and overhead increases with this option 152 (Note that kernel size and overhead increases with this option
153 enabled. This option and the preempt-off timing option can be 153 enabled. This option and the preempt-off timing option can be
@@ -168,7 +168,7 @@ config PREEMPT_TRACER
168 disabled by default and can be runtime (re-)started 168 disabled by default and can be runtime (re-)started
169 via: 169 via:
170 170
171 echo 0 > /debugfs/tracing/tracing_max_latency 171 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
172 172
173 (Note that kernel size and overhead increases with this option 173 (Note that kernel size and overhead increases with this option
174 enabled. This option and the irqs-off timing option can be 174 enabled. This option and the irqs-off timing option can be
@@ -261,7 +261,7 @@ config PROFILE_ANNOTATED_BRANCHES
261 This tracer profiles all the the likely and unlikely macros 261 This tracer profiles all the the likely and unlikely macros
262 in the kernel. It will display the results in: 262 in the kernel. It will display the results in:
263 263
264 /debugfs/tracing/profile_annotated_branch 264 /sys/kernel/debug/tracing/profile_annotated_branch
265 265
266 Note: this will add a significant overhead, only turn this 266 Note: this will add a significant overhead, only turn this
267 on if you need to profile the system's use of these macros. 267 on if you need to profile the system's use of these macros.
@@ -274,7 +274,7 @@ config PROFILE_ALL_BRANCHES
274 taken in the kernel is recorded whether it hit or miss. 274 taken in the kernel is recorded whether it hit or miss.
275 The results will be displayed in: 275 The results will be displayed in:
276 276
277 /debugfs/tracing/profile_branch 277 /sys/kernel/debug/tracing/profile_branch
278 278
279 This option also enables the likely/unlikely profiler. 279 This option also enables the likely/unlikely profiler.
280 280
@@ -323,7 +323,7 @@ config STACK_TRACER
323 select KALLSYMS 323 select KALLSYMS
324 help 324 help
325 This special tracer records the maximum stack footprint of the 325 This special tracer records the maximum stack footprint of the
326 kernel and displays it in debugfs/tracing/stack_trace. 326 kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
327 327
328 This tracer works by hooking into every function call that the 328 This tracer works by hooking into every function call that the
329 kernel executes, and keeping a maximum stack depth value and 329 kernel executes, and keeping a maximum stack depth value and
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2e642b2b7253..dc4dc70171ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <linux/kmemcheck.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/percpu.h> 15#include <linux/percpu.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
@@ -1270,6 +1271,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1270 if (tail < BUF_PAGE_SIZE) { 1271 if (tail < BUF_PAGE_SIZE) {
1271 /* Mark the rest of the page with padding */ 1272 /* Mark the rest of the page with padding */
1272 event = __rb_page_index(tail_page, tail); 1273 event = __rb_page_index(tail_page, tail);
1274 kmemcheck_annotate_bitfield(event, bitfield);
1273 rb_event_set_padding(event); 1275 rb_event_set_padding(event);
1274 } 1276 }
1275 1277
@@ -1327,6 +1329,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1327 return NULL; 1329 return NULL;
1328 1330
1329 event = __rb_page_index(tail_page, tail); 1331 event = __rb_page_index(tail_page, tail);
1332 kmemcheck_annotate_bitfield(event, bitfield);
1330 rb_update_event(event, type, length); 1333 rb_update_event(event, type, length);
1331 1334
1332 /* The passed in type is zero for DATA */ 1335 /* The passed in type is zero for DATA */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8acd9b81a5d7..c1878bfb2e1e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -344,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock =
344/* 344/*
345 * Copy the new maximum trace into the separate maximum-trace 345 * Copy the new maximum trace into the separate maximum-trace
346 * structure. (this way the maximum trace is permanently saved, 346 * structure. (this way the maximum trace is permanently saved,
347 * for later retrieval via /debugfs/tracing/latency_trace) 347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
348 */ 348 */
349static void 349static void
350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2414,21 +2414,20 @@ static const struct file_operations tracing_iter_fops = {
2414 2414
2415static const char readme_msg[] = 2415static const char readme_msg[] =
2416 "tracing mini-HOWTO:\n\n" 2416 "tracing mini-HOWTO:\n\n"
2417 "# mkdir /debug\n" 2417 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2418 "# mount -t debugfs nodev /debug\n\n" 2418 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2419 "# cat /debug/tracing/available_tracers\n"
2420 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2419 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2421 "# cat /debug/tracing/current_tracer\n" 2420 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2422 "nop\n" 2421 "nop\n"
2423 "# echo sched_switch > /debug/tracing/current_tracer\n" 2422 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
2424 "# cat /debug/tracing/current_tracer\n" 2423 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2425 "sched_switch\n" 2424 "sched_switch\n"
2426 "# cat /debug/tracing/trace_options\n" 2425 "# cat /sys/kernel/debug/tracing/trace_options\n"
2427 "noprint-parent nosym-offset nosym-addr noverbose\n" 2426 "noprint-parent nosym-offset nosym-addr noverbose\n"
2428 "# echo print-parent > /debug/tracing/trace_options\n" 2427 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2429 "# echo 1 > /debug/tracing/tracing_enabled\n" 2428 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
2430 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2429 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2431 "# echo 0 > /debug/tracing/tracing_enabled\n" 2430 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
2432; 2431;
2433 2432
2434static ssize_t 2433static ssize_t
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba41c1e..2c000e7132ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 75 put_user_ns(up->user_ns);
76} 76}
77 77
78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
79{
80 struct user_struct *user;
81 struct hlist_node *h;
82
83 hlist_for_each_entry(user, h, hashent, uidhash_node) {
84 if (user->uid == uid) {
85 atomic_inc(&user->__count);
86 return user;
87 }
88 }
89
90 return NULL;
91}
92
93#ifdef CONFIG_USER_SCHED 78#ifdef CONFIG_USER_SCHED
94 79
95static void sched_destroy_user(struct user_struct *up) 80static void sched_destroy_user(struct user_struct *up)
@@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
119 104
120#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) 105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
121 106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
122static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
123static DEFINE_MUTEX(uids_mutex); 125static DEFINE_MUTEX(uids_mutex);
124 126
@@ -283,12 +285,12 @@ int __init uids_sysfs_init(void)
283 return uids_user_create(&root_user); 285 return uids_user_create(&root_user);
284} 286}
285 287
286/* work function to remove sysfs directory for a user and free up 288/* delayed work function to remove sysfs directory for a user and free up
287 * corresponding structures. 289 * corresponding structures.
288 */ 290 */
289static void cleanup_user_struct(struct work_struct *w) 291static void cleanup_user_struct(struct work_struct *w)
290{ 292{
291 struct user_struct *up = container_of(w, struct user_struct, work); 293 struct user_struct *up = container_of(w, struct user_struct, work.work);
292 unsigned long flags; 294 unsigned long flags;
293 int remove_user = 0; 295 int remove_user = 0;
294 296
@@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w)
297 */ 299 */
298 uids_mutex_lock(); 300 uids_mutex_lock();
299 301
300 local_irq_save(flags); 302 spin_lock_irqsave(&uidhash_lock, flags);
301 303 if (atomic_read(&up->__count) == 0) {
302 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
303 uid_hash_remove(up); 304 uid_hash_remove(up);
304 remove_user = 1; 305 remove_user = 1;
305 spin_unlock_irqrestore(&uidhash_lock, flags);
306 } else {
307 local_irq_restore(flags);
308 } 306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
309 308
310 if (!remove_user) 309 if (!remove_user)
311 goto done; 310 goto done;
@@ -331,16 +330,28 @@ done:
331 */ 330 */
332static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
333{ 332{
334 /* restore back the count */
335 atomic_inc(&up->__count);
336 spin_unlock_irqrestore(&uidhash_lock, flags); 333 spin_unlock_irqrestore(&uidhash_lock, flags);
337 334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
338 INIT_WORK(&up->work, cleanup_user_struct); 335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
339 schedule_work(&up->work);
340} 336}
341 337
342#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
343 339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{
342 struct user_struct *user;
343 struct hlist_node *h;
344
345 hlist_for_each_entry(user, h, hashent, uidhash_node) {
346 if (user->uid == uid) {
347 atomic_inc(&user->__count);
348 return user;
349 }
350 }
351
352 return NULL;
353}
354
344int uids_sysfs_init(void) { return 0; } 355int uids_sysfs_init(void) { return 0; }
345static inline int uids_user_create(struct user_struct *up) { return 0; } 356static inline int uids_user_create(struct user_struct *up) { return 0; }
346static inline void uids_mutex_lock(void) { } 357static inline void uids_mutex_lock(void) { }
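
The user.c hunks switch user_struct teardown to a delayed work item and teach uid_hash_find() to "resurrect" an entry whose refcount already reached zero by cancelling the pending cleanup. A simplified, single-threaded sketch of that pattern, not part of the patch (the kernel version uses atomic_t, uidhash_lock and schedule_delayed_work(); the names below are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct object {
        int refcount;            /* kernel code: atomic_t guarded by uidhash_lock */
        bool cleanup_scheduled;  /* stands in for the delayed work item */
};

/* lookup path: take a reference; if the count was zero, the object was on
 * its way out, so cancel the pending deferred cleanup ("resurrect" it). */
static struct object *obj_get(struct object *o)
{
        if (++o->refcount == 1 && o->cleanup_scheduled) {
                o->cleanup_scheduled = false;   /* cancel_delayed_work() */
                printf("resurrected\n");
        }
        return o;
}

/* release path: instead of freeing immediately at refcount zero, schedule
 * cleanup for later; the cleanup handler re-checks the refcount under the
 * same lock before really tearing the object down. */
static void obj_put(struct object *o)
{
        if (--o->refcount == 0)
                o->cleanup_scheduled = true;    /* schedule_delayed_work() */
}

int main(void)
{
        struct object o = { .refcount = 1 };

        obj_put(&o);            /* refcount 0: cleanup queued, not freed yet */
        obj_get(&o);            /* reused before the delay expired */
        printf("refcount=%d scheduled=%d\n", o.refcount, o.cleanup_scheduled);
        return 0;
}
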